In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from numpy.fft import rfft, irfft, rfftfreq
from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt

In [2]:
! ls ./data

submission.csv test.csv       train.csv


In [3]:
train = pd.read_csv('./data/train.csv', delimiter=';')
test  = pd.read_csv('./data/test.csv')

In [4]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
    new_df = df.copy()
    new_df[df.columns[4:]] = df[df.columns[4:]]\
                                              .apply(lambda x : x.str.replace('\xa0', '').str.replace(',', '.'))\
                                              .astype('float32')
    return new_df

In [5]:
train = clean(train)

In [6]:
train['date'] = pd.to_datetime(train['date'],infer_datetime_format=True)
test['date'] = pd.to_datetime(test['date'],infer_datetime_format=True)

In [7]:
columns = test.columns

targets = ['pasta', 'legumes', 'bread', 'flour', 'rice', 'groats', 'potatoes', 'сucumbers_tomatoes',
           'vegetables', 'roots', 'сabbage', 'fruit', 'sugar', 'candy', 'biscuits',
           'mutton', 'beef', 'chicken', 'pork', 'fish', 'herring', 'curd',
           'sour_creame', 'butter', 'milk_kefir', 'cheese', 'egg', 'margarine',
           'oil', 'salt', 'spice', 'tea', 'cpi_1', 'cpi_2', 'cpi_3', 'pasta_value',
           'legumes_value', 'bread_value', 'flour_value', 'rice_value',
           'groats_value', 'potatoes_value', 'сucumbers_tomatoes_value',
           'vegetables_value', 'roots_value', 'сabbage_value', 'fruit_value',
           'sugar_value', 'candy_value', 'biscuits_value', 'mutton_value',
           'beef_value', 'chicken_value', 'pork_value', 'fish_value',
           'herring_value', 'curd_value', 'sour_creame_value', 'butter_value',
           'milk_kefir_value', 'cheese_value', 'egg_value', 'margarine_value',
           'oil_value', 'salt_value', 'spice_value', 'tea_value', 'ai92', 'ai95',
           'ai98', 'dt', 'ai92_value', 'ai95_value', 'ai98_value', 'dt_value']

In [8]:
def get_date_feats(data: pd.DataFrame, series: pd.Series) -> pd.DataFrame:
    new_data = data.copy()
    
    new_data['day_month'] = series.dt.day
    new_data['day_week'] = series.dt.dayofweek
    new_data['day_year'] = series.dt.dayofyear
    new_data['month'] = series.dt.month
    new_data['year'] = series.dt.year
    
    new_data['sin_day_year'] = np.sin(series.dt.dayofyear)
    new_data['sin_month'] = np.sin(series.dt.month)   
    new_data['sin_year'] = np.sin(series.dt.year)
    
    return new_data


def stats_per_period(tr_data: pd.DataFrame,
                     te_data: pd.DataFrame,
                     groupby: list,
                     targets: list) -> tuple:

    prefix = groupby[-1] + '_'
    new_tr = tr_data.copy()
    new_te = te_data.copy()

    stds = tr_data.groupby(groupby).std().reset_index()[groupby + targets]
    stds.columns = groupby + [prefix + col + '_std' for col in stds.columns if col not in groupby]

    means = tr_data.groupby(groupby).mean().reset_index()[groupby + targets]
    means.columns = groupby + [prefix + col + '_mean' for col in means.columns if col not in groupby]

    medians = tr_data.groupby(groupby).median().reset_index()[groupby + targets]
    medians.columns = groupby + [prefix + col + '_median' for col in medians.columns if col not in groupby]

    maxs = tr_data.groupby(groupby).max().reset_index()[groupby + targets]
    maxs.columns = groupby + [prefix + col + '_max' for col in maxs.columns if col not in groupby]

    mins = tr_data.groupby(groupby).min().reset_index()[groupby + targets]
    mins.columns = groupby + [prefix + col + '_min' for col in mins.columns if col not in groupby]

    var_s = tr_data.groupby(groupby).var().reset_index()[groupby + targets]
    var_s.columns = groupby + [prefix + col + '_var' for col in mins.columns if col not in groupby]

    new_tr = tr_data.merge(var_s, how='left', on=groupby)
    new_te = te_data.merge(var_s, how='left', on=groupby)

    new_tr = new_tr.merge(mins, how='left', on=groupby)
    new_te = new_te.merge(mins, how='left', on=groupby)

    new_tr = new_tr.merge(maxs, how='left', on=groupby)
    new_te = new_te.merge(maxs, how='left', on=groupby)

    new_tr = new_tr.merge(means, how='left', on=groupby)
    new_te = new_te.merge(means, how='left', on=groupby)

    new_tr = new_tr.merge(stds, how='left', on=groupby)
    new_te = new_te.merge(stds, how='left', on=groupby)

    return new_tr, new_te

In [9]:
# median = tr_feats.groupby(['region',"day_year"])[targets].transform('median')
# mean = tr_feats.groupby(['region',"day_year"])[targets].transform('mean')

In [10]:
tr_feats = get_date_feats(train, train['date'])
te_feats = get_date_feats(test, test['date'])

In [11]:
tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'month'], targets=targets)

tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'sin_year'], targets=targets)
tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'sin_month'], targets=targets)
tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'sin_day_year'], targets=targets)

tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'day_month'], targets=targets)
tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'day_week'], targets=targets)
tr_feats, te_feats = stats_per_period(tr_feats, te_feats, ['region', 'day_year'], targets=targets)

In [12]:
tr_feats.head()

Unnamed: 0,region,oktmo,okato,date,pasta,legumes,bread,flour,rice,groats,...,day_year_spice_value_std,day_year_tea_value_std,day_year_ai92_std,day_year_ai95_std,day_year_ai98_std,day_year_dt_std,day_year_ai92_value_std,day_year_ai95_value_std,day_year_ai98_value_std,day_year_dt_value_std
0,72,71000000000,71000000000,2021-03-31,86.360001,108.629997,75.099998,34.060001,81.139999,65.370003,...,3850.218926,12602.785424,1.497698,1.912625,2.571225,1.18951,165688.418777,92848.488139,554.253832,11384.350561
1,75,76000000000,76000000000,2021-03-31,83.480003,123.529999,73.870003,42.34,65.980003,50.66,...,2963.711997,5980.915565,2.434837,2.497445,2.147463,1.254844,64368.93295,37857.37757,2518.607275,8074.896824
2,74,75000000000,75000000000,2021-03-31,75.589996,103.610001,81.43,33.27,77.010002,70.489998,...,4608.081842,19570.129628,1.268081,1.149144,0.424421,0.924067,226579.766715,126345.157487,329.632896,16923.41249
3,73,73000000000,73000000000,2021-03-31,73.769997,87.809998,76.339996,32.330002,72.690002,69.959999,...,2270.057384,4205.018543,1.319129,1.676433,0.420159,1.136091,60132.069725,25724.936011,3468.80001,15819.405396
4,65,64000000000,64000000000,2021-03-31,132.149994,123.599998,109.730003,50.709999,24.65,20.25,...,2009.641306,2049.357604,3.600339,2.785935,3.663118,2.728266,44221.221308,31611.459956,10935.277605,3796.443531


In [13]:
tr_feats = tr_feats.drop(columns=targets+['date']).copy()
te_feats = te_feats.drop(columns=targets+['date']).copy()

In [14]:
bubble = np.zeros(len(test))
mse_res = np.ones(len(targets))
mae_res = np.ones(len(targets))

for i, t in enumerate(targets):
    model = lgb.LGBMRegressor()
    try:
        target = train[t]
        model.fit(tr_feats, target)
        pred = model.predict(tr_feats)
        
        mse = mean_squared_error(target, pred)
        mae = mean_absolute_error(target, pred)
        
        mse_res[i] = mse
        mae_res[i] = mae
        
        test[t] = model.predict(te_feats)
        
        print('[INFO] For {} MSE={:2.6f}, MAE={:2.6f}'.format(t, mse, mae))
        
    except ValueError:
        print('[ERROR] {}'.format(t))
        test[t] = bubble
        
print('[INFO] MSE: mean={:2.6f}, std={:2.6f}'.format(np.mean(mse_res), np.std(mse_res)))
print('[INFO] MAE: mean={:2.6f}, std={:2.6f}'.format(np.mean(mae_res), np.std(mae_res)))

[INFO] For pasta MSE=3.623093, MAE=1.060649
[INFO] For legumes MSE=15.340278, MAE=2.014404
[INFO] For bread MSE=1.282962, MAE=0.696431
[INFO] For flour MSE=1.400502, MAE=0.553143
[INFO] For rice MSE=6.113641, MAE=1.285507
[INFO] For groats MSE=3.999361, MAE=0.987414
[INFO] For potatoes MSE=1.637988, MAE=0.796150
[INFO] For сucumbers_tomatoes MSE=81.191539, MAE=4.913577
[INFO] For vegetables MSE=2.998717, MAE=1.066737
[INFO] For roots MSE=5.536946, MAE=1.412186
[INFO] For сabbage MSE=2.233310, MAE=0.840475
[INFO] For fruit MSE=4.988003, MAE=1.111048
[INFO] For sugar MSE=1.667787, MAE=0.779655
[INFO] For candy MSE=137.346997, MAE=7.020575
[INFO] For biscuits MSE=23.616456, MAE=2.971617
[INFO] For mutton MSE=1162.566949, MAE=11.882577
[INFO] For beef MSE=533.448424, MAE=14.834959
[INFO] For chicken MSE=14.676873, MAE=2.646527
[INFO] For pork MSE=121.337099, MAE=6.142768
[INFO] For fish MSE=986.066557, MAE=19.495919
[INFO] For herring MSE=142.770293, MAE=7.107381
[INFO] For curd MSE=89.072

In [15]:
test.to_csv('./data/submission.csv')