In [32]:
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm

In [33]:
# Load the raw inputs. Sales dates are stored as 'dd.mm.YYYY' strings and are
# parsed once here so later cells can treat the column as datetime.
items = pd.read_csv('data/items.csv')
sales_train = pd.read_csv('data/sales_train.csv')
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
opep = pd.read_csv('barrilOPEP.csv', header=0)
urate = pd.read_csv('desempleo.csv', header=0)
IPCrate = pd.read_csv('inflacion.csv', header=0)

# Side-by-side overview of the column names of the main datasets.
datasets = {
    'items': items.columns.values,
    'sales_train': sales_train.columns.values,
}

# A DataFrame needs every column to have the same length, so pad the shorter
# name lists with '-' up to the length of the longest one. Using the longest
# list (instead of a fixed width) keeps this working when a dataset gains
# columns.
n_rows = max(len(columns) for columns in datasets.values())
datasets = {
    name: list(columns) + ['-'] * (n_rows - len(columns))
    for name, columns in datasets.items()
}

pd.DataFrame(datasets)

Unnamed: 0,items,sales_train
0,item_name,date
1,item_id,date_block_num
2,item_category_id,shop_id
3,-,item_id
4,-,item_price
5,-,item_cnt_day


In [34]:
# Length of the modelling window (counted back from the last sale) and the
# minimum units a (shop, item) pair must have sold inside it to be kept.
WINDOW_DAYS = 730
MIN_TOTAL_SALES = 50

# Parse the sale dates a single time and reuse the result. If the column is
# already datetime this is a cheap no-op; if it is still a 'dd.mm.YYYY' string
# it is converted here.
sale_dates = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

max_date = sale_dates.max()
min_date = max_date - pd.Timedelta(days=WINDOW_DAYS)
# Daily calendar covering the window, both end points included.
dates = pd.DataFrame({'date': pd.date_range(start=min_date, end=max_date, freq='D')})

# Keep sales inside the window. The comparison is inclusive because `dates`
# contains `min_date`; a strict '>' would discard that day's sales while still
# generating rows for it, so the day would always show zero quantity.
sales_train = sales_train.loc[sale_dates >= min_date]
# Drop rows whose daily count is zero or negative.
sales_train = sales_train.loc[sales_train['item_cnt_day'] > 0]

# Total units per (shop, item) inside the window; keep only the pairs above
# the threshold.
sales = sales_train.groupby(['shop_id', 'item_id'])['item_cnt_day'].sum().reset_index(name='sales')
filsales = sales.loc[sales['sales'] > MIN_TOTAL_SALES]
prods = pd.merge(sales_train, filsales, on=['item_id', 'shop_id']).drop(['sales'], axis=1)

# One row per retained (shop, item) pair, with a 1-based sequential `key`.
unique_prods = prods[['shop_id', 'item_id']].drop_duplicates()
all_prod = unique_prods.assign(key=range(1, len(unique_prods) + 1))

# Build the daily (date x shop x item) panel in small date batches so each
# intermediate cross product stays small.
batch_size = 10
fechas_lotes = [dates[i:i + batch_size] for i in range(0, dates.shape[0], batch_size)]

# ---- Loop-invariant lookups, built once ------------------------------------

# Quarterly GDP table; every column label is '<year>T<quarter>'.
pbirusia = pd.DataFrame([
    [396372, 410803, 413837, 341693, 287406, 351676, 299481, 292302]
], columns=['2014T1', '2014T2', '2014T3', '2014T4', '2015T1', '2015T2', '2015T3', '2015T4'])
pbi_by_period = pbirusia.iloc[0]

# Exactly one row per (date, shop, item). Several raw rows for the same key
# are collapsed (units summed, price averaged) so the left-merge below can
# never return more rows than the grid it is merged into.
daily_sales = (
    sales_train
    .groupby(['date', 'shop_id', 'item_id'], as_index=False)
    .agg({'item_price': 'mean', 'item_cnt_day': 'sum'})
)

# item_id -> item_category_id lookup.
category_by_item = items.drop_duplicates('item_id').set_index('item_id')['item_category_id']

# Oil price per calendar day. Gaps are filled on the full calendar (next known
# value first, then previous) so the result does not depend on `batch_size`.
opep['date'] = pd.to_datetime(opep['date'], format='%Y-%m-%d')
oil_by_date = pd.merge(dates, opep, on='date', how='left')
oil_by_date['OPEP_oil_price'] = oil_by_date['OPEP_oil_price'].bfill().ffill()

shop_ids = all_prod['shop_id'].to_numpy()
item_ids = all_prod['item_id'].to_numpy()
n_prods = len(all_prod)

# ---- Per-batch assembly -----------------------------------------------------

frames = []
for dates_batch in tqdm(fechas_lotes):
    n_dates = len(dates_batch)

    # 1. Every combination of the batch's dates with the retained products
    #    (date-major order: all products for day 1, then day 2, ...).
    grid = pd.DataFrame({
        'date': np.repeat(dates_batch['date'].to_numpy(), n_prods),
        'shop_id': np.tile(shop_ids, n_dates),
        'item_id': np.tile(item_ids, n_dates),
    })

    # 2. Attach that day's sales in a single merge; `validate` makes pandas
    #    raise if the right side ever stops being unique per key.
    grid = pd.merge(grid, daily_sales, on=['date', 'shop_id', 'item_id'],
                    how='left', validate='many_to_one')

    when = grid['date'].dt
    df = pd.DataFrame({
        'date': grid['date'],
        'day': when.day.astype(int),
        'month': when.month.astype(int),
        'year': when.year.astype(int),
        'quarter': when.quarter.astype(int),
        'weekday': when.weekday.astype(int),
        'is_month_start': when.is_month_start.astype(int),
        'is_month_end': when.is_month_end.astype(int),
        # Months elapsed since the month of `min_date`.
        'acc_month_num': ((when.year - min_date.year) * 12 + when.month - min_date.month).astype(int),
        'shop_id': grid['shop_id'].astype(int),
        'item_id': grid['item_id'].astype(int),
        'category_id': grid['item_id'].map(category_by_item).astype(int),
        'price': grid['item_price'],                          # NaN on days without a sale
        'quantity': grid['item_cnt_day'].fillna(0).astype(int),
    })

    # GDP is looked up by year AND quarter. Matching on the quarter number
    # alone would give every year the 2014 figures and never use the 2015
    # ones. Periods missing from the table are left as NaN instead of
    # borrowing another year's value.
    period = df['year'].astype(str) + 'T' + df['quarter'].astype(str)
    df['pbi'] = period.map(pbi_by_period).astype(float)

    df = pd.merge(df, oil_by_date, on='date', how='left')
    df = pd.merge(df, urate, on=['month', 'year'], how='left')
    df = pd.merge(df, IPCrate, on=['month', 'year'], how='left')

    frames.append(df)

# Concatenate once at the end (growing a frame inside the loop re-copies all
# previous batches every iteration); a fresh index keeps row labels unique.
save = pd.concat(frames, ignore_index=True)

# Final clean-up of the assembled panel before it is written to disk.
product_keys = ['shop_id', 'item_id']
integer_columns = [
    'day', 'month', 'year', 'quarter', 'weekday',
    'is_month_start', 'is_month_end', 'acc_month_num',
    'shop_id', 'item_id', 'category_id', 'quantity',
]

# Chronological order first, so the fills below walk each product's history.
save = save.sort_values(by=['date'] + product_keys)

# Within each (shop, item) series, fill missing prices from the next known
# price, then fill whatever is still missing from the previous known price.
save['price'] = save.groupby(product_keys)['price'].bfill()
save['price'] = save.groupby(product_keys)['price'].ffill()

# The raw date is not exported; its components are already separate columns.
save = save.drop(columns=['date'])
# Discard rows that have no item_id, then store the listed columns as integers.
save = save.dropna(subset=['item_id'])
save[integer_columns] = save[integer_columns].astype(int)

save.to_csv('data/dataset.csv', index=False)

74it [00:35,  2.09it/s]
