In [86]:
from itertools import product

import numpy as np
import pandas as pd

from core import filter_clientes_importantes, filter_productos_importantes

In [87]:
# Load the input tables: two tab-separated, gzip-compressed stock files plus the
# sell-out and product-master CSVs.
stocks_02 = pd.read_csv(
    "./datasets/originales_clase_3/tb_stocks_02.txt.gz",
    sep="\t",
)
# This file is read with header=None, so its first row is kept as data.
stocks_01 = pd.read_csv(
    "./datasets/originales_clase_2/tb_stocks_01.txt.gz",
    sep="\t",
    header=None,
)
productos_todos = pd.read_csv("./datasets/tb_sellout_02_productos_todos.csv")
maestro = pd.read_csv("./datasets/maestro_productos.csv")

In [88]:
# Inspect the frame exactly as loaded: a single column with one value per row.
stocks_01

Unnamed: 0,0
0,periodo
1,product_id
2,stock_final
3,201810
4,20524
...,...
41071,20981
41072,2.18491
41073,201912
41074,20853


In [89]:
# The raw frame holds a single column in which the header names and then every
# value are stacked one per row, so the table is rebuilt here by taking the
# first `columns` entries as column names and folding the rest into rows.
columns = 3
data_series = stocks_01.iloc[:, 0]
column_names = data_series[:columns].tolist()
data_series = data_series[columns:]

# Drop any trailing values that do not complete a full row, so the length is a
# multiple of `columns` and the reshape below cannot fail.
data_series = data_series[: len(data_series) - (len(data_series) % columns)]

# Every `columns` consecutive values form one row of the rebuilt table.
reshaped_data = data_series.values.reshape(-1, columns)

# The raw column mixes header text with the values, so it is not numeric;
# convert each rebuilt column to a numeric dtype so it can be compared and
# aggregated like the other stock table.
stocks_01 = pd.DataFrame(
    reshaped_data,
    columns=column_names,
).apply(pd.to_numeric)
stocks_01

Unnamed: 0,periodo,product_id,stock_final
0,201810,20524,1.61267
1,201810,20311,2.93657
2,201810,20654,6.83269
3,201810,21005,1.01338
4,201810,20974,0.34595
...,...,...,...
13686,201912,20453,1.43741
13687,201912,21026,7.26817
13688,201912,21054,0.50833
13689,201912,20981,2.18491


In [90]:
# Number of rows per periodo in the rebuilt stocks_01 table.
stocks_01["periodo"].value_counts()

periodo
201910    949
201911    937
201907    931
201909    929
201908    928
201912    925
201904    915
201906    914
201905    909
201811    897
201810    896
201903    893
201902    892
201812    891
201901    885
Name: count, dtype: int64

In [91]:
# Number of rows per periodo in stocks_02, for comparison with the stocks_01 counts.
stocks_02["periodo"].value_counts()

periodo
201910    949
201911    937
201907    931
201909    929
201908    928
201912    925
201904    915
201906    914
201905    909
201811    897
201810    896
201903    893
201902    892
201812    891
201901    885
Name: count, dtype: int64

In [92]:
# Collapse stocks_02 to one row per (periodo, product_id) pair, adding up
# stock_final within each pair.
stocks_02 = (
    stocks_02
    .groupby(['periodo', 'product_id'], as_index=False)['stock_final']
    .sum()
)
stocks_02

Unnamed: 0,periodo,product_id,stock_final
0,201810,20001,33.65111
1,201810,20002,13.11456
2,201810,20003,916.34190
3,201810,20004,16.04067
4,201810,20005,415.71523
...,...,...,...
13686,201912,21265,0.22068
13687,201912,21266,0.11603
13688,201912,21267,0.54007
13689,201912,21271,0.22128


In [93]:
# Distinct product ids present in the product master, the sell-out table and
# the stock table, in that order.
product_ids_maestro, product_ids_sellout, product_ids_stock = (
    frame["product_id"].unique().tolist()
    for frame in (maestro, productos_todos, stocks_02)
)

# Products referenced by sell-out or the master that never appear in stock.
ids_referenciados = set(product_ids_sellout + product_ids_maestro)
faltantes_en_stock = list(ids_referenciados - set(product_ids_stock))
len(faltantes_en_stock)

201

In [94]:
# Distinct periods in the sell-out table and in the stock table.
periodos, periodos_stock_02 = (
    frame["periodo"].unique().tolist()
    for frame in (productos_todos, stocks_02)
)

# Periods that exist in sell-out but have no stock rows at all.
periodos_faltantes = list(set(periodos) - set(periodos_stock_02))
len(periodos_faltantes)

21

In [95]:
# Register every product id in faltantes_en_stock by adding one zero-stock
# placeholder row for it in periodo 201701. The rows are built as a single
# frame and concatenated once: concatenating inside a loop copies the whole of
# stocks_02 on every iteration.
if faltantes_en_stock:  # skip entirely when no product is missing
    filas_productos_faltantes = pd.DataFrame(
        {
            'product_id': faltantes_en_stock,
            'periodo': 201701,
            'stock_final': 0,
        }
    )
    stocks_02 = pd.concat(
        [stocks_02, filas_productos_faltantes],
        axis=0,
        ignore_index=True,
    )
stocks_02["product_id"].nunique()

1296

In [96]:
# Register every period in periodos_faltantes by adding one zero-stock
# placeholder row for it. Product id 20001 is only the carrier for these rows
# (presumably chosen because it already exists in the stock data - confirm).
# The rows are built as a single frame and concatenated once: concatenating
# inside a loop copies the whole of stocks_02 on every iteration.
if periodos_faltantes:  # skip entirely when no period is missing
    filas_periodos_faltantes = pd.DataFrame(
        {
            'product_id': 20001,
            'periodo': periodos_faltantes,
            'stock_final': 0,
        }
    )
    stocks_02 = pd.concat(
        [stocks_02, filas_periodos_faltantes],
        axis=0,
        ignore_index=True,
    )
stocks_02["periodo"].nunique()

36

In [97]:
# Distinct periods and product ids now present in stocks_02; these are the two
# axes of the full grid built next.
unique_periodos, unique_product_ids = (
    stocks_02[key].unique() for key in ('periodo', 'product_id')
)
(len(unique_periodos), len(unique_product_ids))

(36, 1296)

In [98]:
print("Shape antes: ", stocks_02.shape)

# Build every possible (periodo, product_id) pair.
# `product` is itertools.product, imported in the notebook's import cell.
all_combinations = list(product(unique_periodos, unique_product_ids))
all_combinations_df = pd.DataFrame(all_combinations, columns=['periodo', 'product_id'])

# Left-join the existing rows onto the full grid: pairs with no stock row come
# back with NaN in stock_final.
combined_df = all_combinations_df.merge(stocks_02, on=['periodo', 'product_id'], how='left')

# Keep only the pairs that had no row and give them zero stock. `.assign`
# returns a new frame, so nothing is written into a filtered slice of
# combined_df (the original `.loc` assignment triggered SettingWithCopyWarning).
missing_rows = combined_df[combined_df['stock_final'].isna()].assign(stock_final=0.0)

# Append the zero-stock rows, using the same column order as stocks_02.
stocks_02 = pd.concat([stocks_02, missing_rows[stocks_02.columns]], ignore_index=True)

# Report how many rows were added.
num_new_rows = len(missing_rows)
print(num_new_rows)

print("Shape Después: ", stocks_02.shape)


Shape antes:  (13913, 3)
32743
Shape Después:  (46656, 3)


In [99]:
# Keep only the first row seen for each (product_id, periodo) pair.
stocks_02 = stocks_02.drop_duplicates(
    subset=['product_id', 'periodo'],
    keep='first',
)

In [100]:
# The row count should divide evenly by the 36 periods (remainder 0). Enforce
# that with an assertion instead of relying on reading the printed quotient.
assert stocks_02.shape[0] % 36 == 0, "row count is not a multiple of the 36 periods"
print(stocks_02.shape[0] / 36)

1296.0


In [101]:
# Count missing values per column.
stocks_02.isna().sum()

periodo        0
product_id     0
stock_final    0
dtype: int64

In [102]:
# Compare total stock_final in the untouched source file (re-read from disk,
# because stocks_02 has been overwritten since loading) with the completed table.
total_original = pd.read_csv(
    "./datasets/originales_clase_3/tb_stocks_02.txt.gz",
    sep="\t",
)["stock_final"].sum()
total_completado = stocks_02["stock_final"].sum()
print("ORIGINAL", total_original)
print("PRODUCTOS", total_completado)

ORIGINAL 266675.31713
PRODUCTOS 266675.31713


In [103]:
# Write the completed stock table to CSV, without the index column.
stocks_02.to_csv('./datasets/tb_stocks_02_productos_todos.csv', index=False)

In [104]:
# Keep only the rows whose periodo is 201902 or earlier and write that subset
# to its own "anti leak" CSV, without the index column.
hasta_201902 = stocks_02["periodo"] <= 201902
stocks_02 = stocks_02.loc[hasta_201902]
stocks_02.to_csv(
    './datasets/tb_stocks_02_productos_todos_anti_leak.csv',
    index=False,
)

In [105]:
# The row count should divide evenly by 26 (remainder 0); 26 is taken to be the
# number of periods left after the 201902 cut-off - TODO confirm. Enforce it
# with an assertion instead of relying on reading the printed quotient.
assert stocks_02.shape[0] % 26 == 0, "row count is not a multiple of the 26 periods"
print(stocks_02.shape[0] / 26)

1296.0
