In [1]:
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm

In [2]:
# Load every raw table of the competition from the local data/ folder.
item_categories = pd.read_csv('data/item_categories.csv')
items = pd.read_csv('data/items.csv')
sales_train = pd.read_csv('data/sales_train.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')
shops = pd.read_csv('data/shops.csv')
test = pd.read_csv('data/test.csv')

# Map each table name to the array of its column names, to get a
# side-by-side overview of the schemas.
datasets = {
    'item_categories': item_categories.columns.values,
    'items': items.columns.values,
    'sales_train': sales_train.columns.values,
    'sample_submission': sample_submission.columns.values,
    'shops': shops.columns.values,
    'test': test.columns.values
}

# pd.DataFrame needs equally long columns, so pad the shorter column lists
# with '-' up to the length of the longest one. The target length is derived
# from the data instead of being hard-coded, so a table with more columns
# cannot break the overview.
max_columns = max(len(column_names) for column_names in datasets.values())
for dataset, column_names in datasets.items():
    missing = max_columns - len(column_names)
    if missing > 0:
        datasets[dataset] = np.append(column_names, ['-'] * missing)

pd.DataFrame(datasets)

Unnamed: 0,item_categories,items,sales_train,sample_submission,shops,test
0,item_category_name,item_name,date,ID,shop_name,ID
1,item_category_id,item_id,date_block_num,item_cnt_month,shop_id,shop_id
2,-,item_category_id,shop_id,-,-,item_id
3,-,-,item_id,-,-,-
4,-,-,item_price,-,-,-
5,-,-,item_cnt_day,-,-,-


In [36]:
# Tunable parameters of the training window.
WINDOW_DAYS = 730      # length of the date window, counted back from the last sale date
MIN_SALES_ROWS = 20    # a (shop, item) pair is kept only with MORE than this many sale rows

# Parse the 'dd.mm.YYYY' date strings once and reuse the result everywhere
# below instead of re-parsing the full column for every derived feature.
sale_dates = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

max_date = sale_dates.max()
min_date = max_date - pd.Timedelta(days=WINDOW_DAYS)
dates = pd.date_range(start=min_date, end=max_date, freq='D')

# Attach the category of every sold item. The resulting column is aligned with
# sales_train by position, which only holds if each sale matches at most one
# row of `items`; validate='many_to_one' makes pandas raise instead of silently
# misaligning categories if `items` ever contains duplicated item_id values.
sales_with_category = pd.merge(
    sales_train, items, on='item_id', how='left', validate='many_to_one'
)

dfprod = pd.DataFrame({
    'date': sale_dates,
    'day': sale_dates.dt.day,
    'month': sale_dates.dt.month,
    'year': sale_dates.dt.year,
    'weekday': sale_dates.dt.weekday,
    'is_month_start': sale_dates.dt.is_month_start,
    'is_month_end': sale_dates.dt.is_month_end,
    # Number of calendar months elapsed since the month of min_date.
    'acc_month_num': (sale_dates.dt.year - min_date.year) * 12 + sale_dates.dt.month - min_date.month,
    'shop_id': sales_train['shop_id'],
    'item_id': sales_train['item_id'],
    'category_id': sales_with_category['item_category_id'],
    'price': sales_train['item_price'],
    'quantity': sales_train['item_cnt_day']
})

# Count sale rows per (shop, item) pair. The count is taken over the whole
# history, i.e. before the date window is applied further down.
ventas = dfprod.groupby(['shop_id', 'item_id']).size().reset_index(name='size')
ventasfilt = ventas.loc[ventas['size'] > MIN_SALES_ROWS]

# The inner join keeps only the rows of the sufficiently sold pairs; the helper
# 'size' column is dropped again afterwards.
dfprod = pd.merge(dfprod, ventasfilt, on=['item_id', 'shop_id']).drop(['size'], axis=1)

# Finally restrict the rows to the [min_date, max_date] window.
dfprod = dfprod.loc[dfprod['date'] >= min_date]

In [37]:
# Display the filtered sales frame; the rendered output is truncated to the
# first and last rows.
dfprod

Unnamed: 0,date,day,month,year,weekday,is_month_start,is_month_end,acc_month_num,shop_id,item_id,category_id,price,quantity
100,2013-11-24,24,11,2013,6,False,False,1,25,2574,55,399.0,1.0
101,2013-11-27,27,11,2013,2,False,False,1,25,2574,55,399.0,1.0
102,2013-11-01,1,11,2013,4,True,False,1,25,2574,55,399.0,1.0
103,2013-11-05,5,11,2013,1,False,False,1,25,2574,55,399.0,1.0
104,2013-11-08,8,11,2013,4,False,False,1,25,2574,55,399.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288284,2015-10-26,26,10,2015,0,False,False,24,25,15069,28,2499.0,1.0
1288285,2015-10-27,27,10,2015,1,False,False,24,25,15069,28,2499.0,1.0
1288286,2015-10-29,29,10,2015,3,False,False,24,25,15069,28,2499.0,3.0
1288287,2015-10-30,30,10,2015,4,False,False,24,25,15069,28,2499.0,1.0


In [38]:
# Unique (shop_id, item_id) pairs present in dfprod, each with a 1-based
# surrogate key. drop_duplicates() is computed once and reused.
pares_unicos = dfprod[['shop_id', 'item_id']].drop_duplicates()
todos_productos = pares_unicos.assign(key=range(1, len(pares_unicos) + 1))
todas_fechas = pd.DataFrame({'date': dates})

# Split the dates into batches so that the full date x product grid never has
# to be materialised at once.
batch_size = 100  # number of dates per batch; tune to the available memory
fechas_lotes = [todas_fechas[i:i + batch_size] for i in range(0, todas_fechas.shape[0], batch_size)]

shop_ids = todos_productos['shop_id'].to_numpy()
item_ids = todos_productos['item_id'].to_numpy()

# Wrapping the list itself (rather than the enumerate() generator) lets tqdm
# know the number of batches and show a real progress bar.
for i, fechas_lote in enumerate(tqdm(fechas_lotes)):
    # Every (date, shop_id, item_id) combination of this batch, built with
    # vectorised numpy calls: each date is repeated for all products while the
    # product list is tiled once per date (date-major row order).
    fechas = fechas_lote['date'].to_numpy()
    all_combinations = pd.DataFrame({
        'date': np.repeat(fechas, len(shop_ids)),
        'shop_id': np.tile(shop_ids, len(fechas)),
        'item_id': np.tile(item_ids, len(fechas)),
    })

    # Left join with the real sales: combinations without a sale keep NaN in
    # all the columns coming from dfprod.
    merged_df = pd.merge(all_combinations, dfprod, on=['date', 'shop_id', 'item_id'], how='left')

    # No matching sale means zero units sold. Assign the result back instead of
    # calling fillna(inplace=True) on the selected column, which warns on
    # recent pandas versions and has no effect under Copy-on-Write.
    merged_df['quantity'] = merged_df['quantity'].fillna(0)

    # The first batch creates/overwrites the file and writes the header; the
    # following batches are appended without a header.
    if i == 0:
        merged_df.to_csv('dataset.csv', index=False)
    else:
        merged_df.to_csv('dataset.csv', mode='a', header=False, index=False)



8it [01:10,  8.76s/it]
