In [245]:
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm

In [246]:
items = pd.read_csv('data/items.csv')
sales_train = pd.read_csv('data/sales_train.csv')
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
opep = pd.read_csv('barrilOPEP.csv', header=0)
urate = pd.read_csv('desempleo.csv', header=0)
IPCrate = pd.read_csv('inflacion.csv', header=0)

datasets = {
    'items': items.columns.values,
    'sales_train': sales_train.columns.values,
}

# Si cada dataset tiene menos de 6 columnas, completar con NaN.
for dataset in datasets:
    while len(datasets[dataset]) < 6:
        datasets[dataset] = np.append(datasets[dataset], '-')

pd.DataFrame(datasets)

Unnamed: 0,items,sales_train
0,item_name,date
1,item_id,date_block_num
2,item_category_id,shop_id
3,-,item_id
4,-,item_price
5,-,item_cnt_day


In [247]:
max_date = pd.to_datetime(sales_train['date'], format='%d.%m.%Y').max()
min_date = max_date - pd.Timedelta(days=366)
dates = pd.DataFrame({'date': pd.date_range(start=min_date, end=max_date, freq='D')})

sales = sales_train.groupby(['shop_id', 'item_id']).size().reset_index(name='size')
filsales = sales.loc[(sales['size'] > 50)]
prods = pd.merge(sales_train, filsales, on=['item_id', 'shop_id']).drop(['size'], axis=1)
filprods = prods.loc[(pd.to_datetime(prods['date'], format='%d.%m.%Y') >= min_date)]

all_prod = filprods[['shop_id', 'item_id']].drop_duplicates().assign(key=range(1, len(filprods[['shop_id', 'item_id']].drop_duplicates()) + 1))

batch_size = 10
fechas_lotes = [dates[i:i + batch_size] for i in range(0, dates.shape[0], batch_size)]
df = pd.DataFrame()
for i, dates_batch in tqdm(enumerate(fechas_lotes)):
    all_combinations = pd.DataFrame(list(product(dates_batch['date'], all_prod['key'])), columns=['date', 'key'])   # 1. Crea todas las combinaciones de fechas y productos.
    df = pd.merge(all_combinations, all_prod, on='key', how='left').drop('key', axis=1) # 2. Elimina la columna key.
    df = pd.DataFrame({
        'date': df['date'],
        'day': df['date'].dt.day.astype(int),
        'month': df['date'].dt.month.astype(int),
        'year': df['date'].dt.year.astype(int),
        'quarter': df['date'].dt.quarter.astype(int),
        'weekday': df['date'].dt.weekday.astype(int),
        'is_month_start': df['date'].dt.is_month_start.map({True: 1, False: 0}),
        'is_month_end': df['date'].dt.is_month_end.map({True: 1, False: 0}),
        'acc_month_num': ((df['date'].dt.year - min_date.year) * 12 + df['date'].dt.month - min_date.month).astype(int),
        'shop_id': df['shop_id'].astype(int),
        'item_id': df['item_id'].astype(int),
        'category_id': pd.merge(df, items, on='item_id', how='left')['item_category_id'].astype(int),
        'price': pd.merge(df, sales_train, on=['date', 'shop_id', 'item_id'], how='left')['item_price'],
        'quantity': pd.merge(df, sales_train, on=['date', 'shop_id', 'item_id'], how='left')['item_cnt_day'].fillna(0).astype(int)
    })

    pbirusia = pd.DataFrame([
    [396372, 410803, 413837, 341693, 287406, 351676, 299481, 292302]
    ], columns=['2014T1', '2014T2', '2014T3', '2014T4', '2015T1', '2015T2', '2015T3', '2015T4'])
    for quarter, id in enumerate(pbirusia):
        df.loc[df['quarter'] == quarter + 1, 'pbi'] = pbirusia[id].values[0]
    opep['date'] = pd.to_datetime(opep['date'], format='%Y-%m-%d')
    df = pd.merge(df, opep, on='date', how='left')
    df['OPEP_oil_price'] = df['OPEP_oil_price'].ffill().bfill()
    df = pd.merge(df, urate, on=['month', 'year'], how='left')
    df = pd.merge(df, IPCrate, on=['month', 'year'], how='left')

    if i == 0:
        df.to_csv('data/dataset.csv', index=False)
    else:
        df.to_csv('data/dataset.csv', mode='a', header=False, index=False)


df = pd.read_csv('data/dataset.csv')
df = df.sort_values(by=['date', 'shop_id', 'item_id'])
df['price'] = df.groupby(['shop_id', 'item_id'])['price'].bfill()
df['price'] = df.groupby(['shop_id', 'item_id'])['price'].ffill()
df.drop(['date'], axis=1, inplace=True)

df.to_csv('data/dataset.csv', index=False)

37it [00:45,  1.24s/it]


In [248]:
df.loc[(df['shop_id'] == 25) & (df['item_id'] == 2574)]

Unnamed: 0,day,month,year,quarter,weekday,is_month_start,is_month_end,acc_month_num,shop_id,item_id,category_id,price,quantity,pbi,OPEP_oil_price,unemployment_rate,IPC_rate
0,30.0,10.0,2014.0,4.0,3.0,0.0,0.0,0.0,25.0,2574.0,55.0,399.0,0,341693.0,82.79,5.1,8.3
5941,31.0,10.0,2014.0,4.0,4.0,0.0,1.0,0.0,25.0,2574.0,55.0,399.0,1,341693.0,81.97,5.1,8.3
11882,1.0,11.0,2014.0,4.0,5.0,1.0,0.0,1.0,25.0,2574.0,55.0,398.7,0,341693.0,81.97,5.2,9.1
17823,2.0,11.0,2014.0,4.0,6.0,0.0,0.0,1.0,25.0,2574.0,55.0,398.7,1,341693.0,81.97,5.2,9.1
23764,3.0,11.0,2014.0,4.0,0.0,0.0,0.0,1.0,25.0,2574.0,55.0,399.0,0,341693.0,80.64,5.2,9.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2150643,27.0,10.0,2015.0,4.0,1.0,0.0,0.0,12.0,25.0,2574.0,55.0,449.0,0,341693.0,42.40,5.5,15.6
2156584,28.0,10.0,2015.0,4.0,2.0,0.0,0.0,12.0,25.0,2574.0,55.0,449.0,0,341693.0,43.20,5.5,15.6
2162525,29.0,10.0,2015.0,4.0,3.0,0.0,0.0,12.0,25.0,2574.0,55.0,449.0,0,341693.0,44.34,5.5,15.6
2168466,30.0,10.0,2015.0,4.0,4.0,0.0,0.0,12.0,25.0,2574.0,55.0,449.0,0,341693.0,43.66,5.5,15.6
