In [43]:
import pandas as pd
from pandas._libs import algos, lib
from pandas._libs.tslibs import conversion
from pandas.compat import PY36
import pandas.core.dtypes.common
import os
import math
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [6]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    import numpy as np
    import re
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [45]:
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

In [9]:
PATH = "data/"

In [10]:
!ls {PATH}

item_categories.csv      sales_train.csv          shops.csv
items.csv                sample_submission.csv.gz test.csv.gz


In [11]:
!python --version

Python 3.6.8 :: Anaconda, Inc.


In [12]:
!head {PATH}/sales_train.csv

date,date_block_num,shop_id,item_id,item_price,item_cnt_day
02.01.2013,0,59,22154,999.0,1.0
03.01.2013,0,25,2552,899.0,1.0
05.01.2013,0,25,2552,899.0,-1.0
06.01.2013,0,25,2554,1709.05,1.0
15.01.2013,0,25,2555,1099.0,1.0
10.01.2013,0,25,2564,349.0,1.0
02.01.2013,0,25,2565,549.0,1.0
04.01.2013,0,25,2572,239.0,1.0
11.01.2013,0,25,2572,299.0,1.0


In [9]:
df_raw = pd.read_csv(F"{PATH}/sales_train.csv", low_memory=False,
                     parse_dates=["date"])

In [10]:
df_raw.tail().T

Unnamed: 0,2935844,2935845,2935846,2935847,2935848
date,2015-10-10 00:00:00,2015-09-10 00:00:00,2015-10-14 00:00:00,2015-10-22 00:00:00,2015-03-10 00:00:00
date_block_num,33,33,33,33,33
shop_id,25,25,25,25,25
item_id,7409,7460,7459,7440,7460
item_price,299,299,349,299,299
item_cnt_day,1,1,1,1,1


In [11]:
df_raw.describe(include='all').T

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
date,2935849.0,1034.0,2013-12-28 00:00:00,9434.0,2013-01-01 00:00:00,2015-12-10 00:00:00,,,,,,,
date_block_num,2935850.0,,,,,,14.5699,9.42299,0.0,7.0,14.0,23.0,33.0
shop_id,2935850.0,,,,,,33.0017,16.227,0.0,22.0,31.0,47.0,59.0
item_id,2935850.0,,,,,,10197.2,6324.3,0.0,4476.0,9343.0,15684.0,22169.0
item_price,2935850.0,,,,,,890.853,1729.8,-1.0,249.0,399.0,999.0,307980.0
item_cnt_day,2935850.0,,,,,,1.24264,2.61883,-22.0,1.0,1.0,1.0,2169.0


In [12]:
??df_raw.drop

In [12]:
add_datepart(df_raw, 'date')

In [15]:
df_raw.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,59,22154,999.0,1.0,2013,2,5,1,4,32,False,True,False,False,False,False,1359676800
1,0,25,2552,899.0,1.0,2013,3,9,1,4,60,False,True,False,False,False,False,1362096000
2,0,25,2552,899.0,-1.0,2013,5,18,1,2,121,False,True,False,False,False,False,1367366400
3,0,25,2554,1709.05,1.0,2013,6,22,1,5,152,False,True,False,False,False,False,1370044800
4,0,25,2555,1099.0,1.0,2013,1,3,15,1,15,False,False,False,False,False,False,1358208000


In [16]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(df_raw.drop('item_cnt_day', axis=1), df_raw.item_cnt_day)



CPU times: user 5min 16s, sys: 2.87 s, total: 5min 19s
Wall time: 1min 1s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [16]:
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/predict-sales-raw')

In [2]:
df_raw = pd.read_feather('tmp/predict-sales-raw')

In [3]:
df_raw.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,59,22154,999.0,1.0,2013,2,5,1,4,32,False,True,False,False,False,False,1359676800
1,0,25,2552,899.0,1.0,2013,3,9,1,4,60,False,True,False,False,False,False,1362096000
2,0,25,2552,899.0,-1.0,2013,5,18,1,2,121,False,True,False,False,False,False,1367366400
3,0,25,2554,1709.05,1.0,2013,6,22,1,5,152,False,True,False,False,False,False,1370044800
4,0,25,2555,1099.0,1.0,2013,1,3,15,1,15,False,False,False,False,False,False,1358208000


In [49]:
df = df_raw.drop('item_cnt_day', axis=1)
y = df_raw.item_cnt_day

In [53]:
df.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,59,22154,999.0,2013,2,5,1,4,32,False,True,False,False,False,False,1359676800
1,0,25,2552,899.0,2013,3,9,1,4,60,False,True,False,False,False,False,1362096000
2,0,25,2552,899.0,2013,5,18,1,2,121,False,True,False,False,False,False,1367366400
3,0,25,2554,1709.05,2013,6,22,1,5,152,False,True,False,False,False,False,1370044800
4,0,25,2555,1099.0,2013,1,3,15,1,15,False,False,False,False,False,False,1358208000


In [54]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 214201
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((2721648, 17), (2721648,), (214201, 17), (214201,))

In [55]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [F"RMSE TRAIN: {rmse(m.predict(X_train), y_train)}",
           F"RMSE VALID: {rmse(m.predict(X_valid), y_valid)}",
                F"SCORE train: {m.score(X_train, y_train)}",
           F"SCORE VALID {m.score(X_valid, y_valid)}"]
    if hasattr(m, 'oob_score_'): res.append(F"OOB SCORE: {m.oob_score_}")
    print(res)

In [56]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)



CPU times: user 4min 48s, sys: 2.72 s, total: 4min 51s
Wall time: 58.2 s
['RMSE TRAIN: 0.7530979610786089', 'RMSE VALID: 5.7390305480788815', 'SCORE train: 0.8811612085495595', 'SCORE VALID 0.012701214860079468']
