In [1]:
import numpy as np
import pandas as pd
import random as rd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
# Import all csv file data
# All files are read from the relative '../input' directory.

# Daily sales records; this is the training data used throughout the notebook
sales = pd.read_csv( '../input/sales_train.csv')

# Warnings
# NOTE: the 'ignore' filter silences every warning for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Lookup tables, the sample submission and the test set
item_cat = pd.read_csv( '../input/item_categories.csv')
item = pd.read_csv( '../input/items.csv')
sub = pd.read_csv( '../input/sample_submission.csv')
shops = pd.read_csv( '../input/shops.csv')
test = pd.read_csv( '../input/test.csv')

In [3]:
# Parse the 'dd.mm.YYYY' date strings with one vectorized call instead of a
# per-row strptime
sales['date'] = pd.to_datetime( sales['date'], format='%d.%m.%Y' )

# Add month and year columns, plus a combined YYYYMM key
sales['month'] = sales['date'].dt.month
sales['year'] = sales['date'].dt.year
sales['year_month'] = sales.year * 100 + sales.month

# Add the item_category_id to the training and test sets (looked up by item_id;
# the item_name column that comes along with the join is dropped again)
sales = sales.set_index('item_id').join(item.set_index('item_id')).drop('item_name', axis=1).reset_index()
test = test.set_index('item_id').join(item.set_index('item_id')).drop('item_name', axis=1).reset_index()

# Add a unique id for the shop + item combo and for the shop + category combo.
# The shop id occupies the last two decimal digits, so it must stay below 100.
sales['shop_item_id'] = sales.shop_id + sales.item_id * 100
test['shop_item_id'] = test.shop_id + test.item_id * 100
sales['shop_cat_id'] = sales.shop_id + sales.item_category_id * 100
test['shop_cat_id'] = test.shop_id + test.item_category_id * 100

# Add the revenue
sales[ "revenue" ] = sales.item_price * sales.item_cnt_day


In [4]:
# Collapse the daily records to one row per month and shop/item combination:
# average price, total revenue and total units sold
agg_rules = dict( item_price="mean", revenue="sum", item_cnt_day="sum" )
monthly_keys = [ "year_month", 'date_block_num', "shop_item_id", "item_id", "shop_id", "item_category_id" ]
monthly_shop_item = sales.groupby( monthly_keys ).agg( agg_rules ).reset_index()

In [5]:

def create_pivot_ts( input_table, pivot_column, val_column, agg_rule, missing_method ):
    """Build a monthly time-series table with one column per `pivot_column` value.

    Parameters
    ----------
    input_table : DataFrame with a 'year_month' column plus `pivot_column`
        and `val_column`.
    pivot_column : name of the column whose values become the output columns.
    val_column : name of the column that is aggregated.
    agg_rule : aggregation handed to `.agg` (e.g. "sum" or "mean").
    missing_method : "zero" fills gaps with 0, 'ffill' carries the previous
        month's value forward.

    Returns
    -------
    DataFrame indexed by the first day of each month (ascending), with
    negative values replaced by 0.

    Raises
    ------
    ValueError
        If `missing_method` is neither "zero" nor 'ffill'.
    """
    # Reject an unknown fill method before doing any work; previously the
    # error object was created but never raised, so bad values went unnoticed
    if missing_method not in ( "zero", 'ffill' ):
        raise ValueError( 'Unsupported value: {}.'.format( missing_method ) )

    # Make time series out of the monthly sales
    tmp_table = input_table.groupby( [ 'year_month', pivot_column ] ).agg( { val_column : agg_rule } ).reset_index()
    ts = tmp_table.pivot_table( val_column, index="year_month", columns=pivot_column )

    # Fill the months in which a column has no observation
    if missing_method == "zero":
        ts = ts.fillna(0)
    else:
        ts = ts.ffill()

    # Set negative values to 0
    ts[ ts < 0 ] = 0

    # Set the index to be the dates
    dates = year_month_to_datetime( ts.index.values )
    ts = ts.set_index( pd.Index( dates ) )

    # Make sure the dates are sorted
    ts.sort_index( axis=0, ascending=True, inplace=True )

    return ts
    

# Define a helper function
def year_month_to_datetime( ym ):
    """Convert YYYYMM-encoded month(s) to `datetime.date` on the first of the month.

    Parameters
    ----------
    ym : a single number (Python or numpy int/float, e.g. 201301) or a
        sequence of such numbers (list, numpy array, pandas Series/Index).

    Returns
    -------
    A `datetime.date` for a scalar input, otherwise a list of `datetime.date`
    in the order of the input.
    """
    def to_date( value ):
        # datetime.date only accepts integers, so floats are cast first
        year, month = divmod( int( value ), 100 )
        return datetime.date( year, month, 1 )

    # np.ndim is 0 for every scalar flavour, including numpy scalars such as
    # np.int64, which are not instances of the built-in int
    if np.ndim( ym ) == 0:
        return to_date( ym )

    # Iterating a Series yields its values, so no positional indexing is needed
    return [ to_date( value ) for value in ym ]

def rmse( x1, x2 ):
    """Root-mean-square error between two arrays.

    Both inputs are flattened first, so only the number of elements has to
    agree, not the shapes."""
    col1 = x1.ravel().reshape( -1, 1 )
    col2 = x2.ravel().reshape( -1, 1 )
    squared_error = ( col1 - col2 ) ** 2
    return np.sqrt( np.mean( squared_error ) )


def decompose_shop_item_id( shop_item_id ):
    """Split a combined id (item_id * 100 + shop_id) back into its two parts.

    Returns the pair (shop_id, item_id)."""
    return shop_item_id % 100, shop_item_id // 100

In [6]:
# Monthly units sold, one column per shop/item combination; months without a
# record for a combination are filled with 0
ts_item_day = create_pivot_ts(
    input_table=monthly_shop_item,
    pivot_column="shop_item_id",
    val_column="item_cnt_day",
    agg_rule="sum",
    missing_method='zero',
)

In [7]:
# Create feature matrices shaped like ts_item_day (months x shop/item combos)
# for the shop id, category id, date block number, month and year

n_months, n_series = ts_item_day.shape

def _like_ts_item_day( values ):
    """Wrap a (months x combos) array in a frame labelled like ts_item_day."""
    return pd.DataFrame( values, index=ts_item_day.index, columns=ts_item_day.columns )

shop_id, item_id = decompose_shop_item_id( ts_item_day.columns )

# Shop id: the same value in every month of a column
ts_shop_id = _like_ts_item_day( np.tile( shop_id.values, ( n_months, 1 ) ) )

# Category id: look every item up in an item_id -> category index built once,
# instead of scanning the whole item table for each unique item.
# drop_duplicates keeps the first row per item_id, matching the former .iloc[0].
uniq_items = item_id.unique()
category_by_item = item.drop_duplicates( 'item_id' ).set_index( 'item_id' )[ 'item_category_id' ]
cat_ids_for_uniq_ids = list( category_by_item.loc[ uniq_items ] )
id_map = dict(zip( list(uniq_items), cat_ids_for_uniq_ids ))
cat_ids = pd.Series( [ id_map[x] for x in item_id ] )

ts_cat_id = _like_ts_item_day( np.tile( cat_ids.values, ( n_months, 1 ) ) )

# Date block number: 0, 1, 2, ... down the rows, identical across columns
date_block_nums = np.arange( n_months )[:,np.newaxis]
ts_date_num_block = _like_ts_item_day( np.tile( date_block_nums, ( 1, n_series ) ) )

# Calendar month and year of every row, taken from the sorted YYYYMM keys
year_months = sorted( sales.year_month.unique() )

months = np.array( [ x % 100 for x in year_months ] )[:,np.newaxis]
ts_month = _like_ts_item_day( np.tile( months, ( 1, n_series ) ) )

years = np.array( [ x // 100 for x in year_months ] )[:,np.newaxis]
ts_year = _like_ts_item_day( np.tile( years, ( 1, n_series ) ) )

In [105]:
import scipy.sparse

def convert_to_sparse( xx_input ):
    """Return a new feature dictionary with every data set stored as a CSR matrix.

    Parameters
    ----------
    xx_input : dict keyed by TRAIN, VALID and TEST (feature matrices) and by
        DESC (the feature descriptions).

    Returns
    -------
    dict with the same keys; DESC holds a copy of the input descriptions.
    The input dictionary is not modified.
    """
    xx_output = { DESC : xx_input[DESC].copy() }

    for ds in [ TRAIN, VALID, TEST ]:
        # csr_matrix is taken from the public scipy.sparse namespace; the
        # scipy.sparse.csr submodule path is deprecated
        xx_output[ds] = scipy.sparse.csr_matrix( xx_input[ds] )

    return xx_output
    

def add_one_hot_encoding( xx_input, feature_name, binarizer ):
    """Replace one feature column by its one-hot encoding.

    The column named `feature_name` is removed from every data set and the
    encoded columns are appended on the right-hand side.

    Parameters
    ----------
    xx_input : dict keyed by TRAIN, VALID, TEST (sparse feature matrices) and
        DESC (list of feature descriptions). It is not modified.
    feature_name : description of the column to encode; must occur in DESC.
    binarizer : fitted binarizer exposing `classes_` and `transform`.

    Returns
    -------
    dict with the same keys holding CSR matrices and the updated descriptions.

    Raises
    ------
    ValueError
        If `feature_name` is not in the descriptions.
    """
    # Get the column with the target feature
    idx = xx_input[DESC].index(feature_name)

    # Update the descriptions of the features
    new_cols = [ feature_name + "_{:02d}".format(x) for x in binarizer.classes_ ]
    xx_output = dict()
    xx_output[DESC] = xx_input[DESC][:idx] + xx_input[DESC][idx+1:] + new_cols

    # (1) Remove the target column from the data sets
    # (2) One-hot encode it
    # (3) Append to the right side of the data sets
    for ds in [ TRAIN, VALID, TEST ]:
        # Hand the binarizer a plain 1-D array of labels: .todense() would
        # produce an np.matrix, which current scikit-learn releases reject
        target_data = xx_input[ds][:,idx].toarray().ravel()
        one_hot = binarizer.transform( target_data )
        stacked = scipy.sparse.hstack( [ xx_input[ds][:,:idx], xx_input[ds][:,idx+1:], one_hot ] )
        xx_output[ds] = scipy.sparse.csr_matrix( stacked )

    return xx_output
    

In [10]:
# Containers for the features (X) and the labels (Y) of each data set
X = { TRAIN : [], VALID : [], TEST : [], DESC : [] }
Y = { TRAIN : [], VALID : [], TEST : [] }

# The labels come straight from the monthly sales matrix
Y = get_regression_vectors_from_matrix( Y, ts_item_day, is_X=False )

# Lagged sales: the feature named lag L is the sales matrix shifted by L-1 periods
lags = [1, 2, 3, 6, 12 ]
for L in lags:
    x_lag_mtx = ts_item_day.shift(periods=L-1)
    X = get_regression_vectors_from_matrix( X, x_lag_mtx, is_X=True, descrip='sales_lag_{:02}'.format(L) )

# Rolling means of the sales over windows of M months
means = [2, 3, 6, 12]
for M in means:
    x_mean_mtx = ts_item_day.rolling(window=M).mean()
    X = get_regression_vectors_from_matrix( X, x_mean_mtx, is_X=True, descrip='sales_mean_{:02}'.format(M) )

# Shop id, category id, date block number, month and year, appended in this order
static_features = [
    ( ts_shop_id, 'shop_id' ),
    ( ts_cat_id, 'cat_id' ),
    ( ts_date_num_block, 'date_block_num' ),
    ( ts_month, 'month' ),
    ( ts_year, 'year' ),
]
for feature_mtx, feature_name in static_features:
    X = get_regression_vectors_from_matrix( X, feature_mtx, is_X=True, descrip=feature_name )


In [11]:
# Get the combined features in matrix form for the train, validation and test sets.
# combine_features is not defined in this section; its two return values are the
# feature collection (xx_raw) and the labels (yy).
xx_raw, yy = combine_features( X, Y )

In [12]:
# Preprocess the features (e.g. winsorize, clip values, etc.)
# preprocess_features is not defined in this section. The result, xx, is the
# feature set passed to the benchmark and to the models that take dense input.
xx = preprocess_features(xx_raw)

In [106]:
# Sparse copy of the preprocessed features; this is the input to the one-hot encoding step
xx_sparse = convert_to_sparse( xx )

In [107]:
# Replace shop_id with one-hot encodings
shop_binarizer = LabelBinarizer(sparse_output=True)
shop_binarizer.fit(sales.shop_id.unique())
xx_sparse = add_one_hot_encoding( xx_sparse, "shop_id", binarizer=shop_binarizer )

# Replace category_id with one-hot encodings (each feature needs the binarizer
# that was fitted on its own set of values)
cat_binarizer = LabelBinarizer(sparse_output=True)
cat_binarizer.fit(sales.item_category_id.unique())
xx_sparse = add_one_hot_encoding( xx_sparse, "cat_id", binarizer=cat_binarizer )

# Replace month with one-hot encodings; np.arange excludes its end point, so
# 13 is needed to cover all twelve months
month_binarizer = LabelBinarizer(sparse_output=True)
month_binarizer.fit(np.arange(1,13))
xx_sparse = add_one_hot_encoding( xx_sparse, "month", binarizer=month_binarizer )

In [140]:
def forecast_sklearn( model_constructor_fun, xx_input, yy_input, output_file,
                                     date_block_cutoff=0, clip_forecasts=(0,20)):
    """A function that performs the validation testing and writes the formatted
    predictions to a csv file.

    The forecasts will be based on the combined train and validation sets.

    Parameters
    ----------
    model_constructor_fun : callable taking no arguments that returns a new
        model with 'fit' and 'predict' methods (sklearn style).
    xx_input : dict keyed by TRAIN, VALID, TEST (dense or sparse feature
        matrices) and DESC (list of feature descriptions).
    yy_input : dict keyed by TRAIN, VALID, TEST holding the labels.
    output_file : file name of the csv, written below '../forecasts/'.
    date_block_cutoff : when greater than 0, only rows whose 'date_block_num'
        feature is strictly greater than this value are kept, in all three
        data sets.
    clip_forecasts : (lower, upper) bounds applied to every prediction.

    The validation RMSE is printed. Nothing is returned.

    Relies on the module-level names ts_item_day, test and format_forecast to
    label and format the test-set forecast.

    Raises
    ------
    ValueError
        If a cutoff is requested but 'date_block_num' is not among the features.
    """

    # Exclude date blocks before the cutoff
    if date_block_cutoff > 0:
        db_col = xx_input[DESC].index( 'date_block_num' )

        # Row selection with an index array returns new matrices, so the
        # inputs are left untouched without making explicit copies first
        xx = { DESC : xx_input[DESC].copy() }
        yy = dict()
        for ds in [ TRAIN, VALID, TEST ]:
            date_blocks = xx_input[ds][:,db_col]
            if scipy.sparse.issparse( date_blocks ):
                date_blocks = date_blocks.toarray()

            # Flat integer row positions work for dense and sparse matrices alike
            keep = np.flatnonzero( np.asarray( date_blocks ).ravel() > date_block_cutoff )
            xx[ds] = xx_input[ds][keep]
            yy[ds] = yy_input[ds][keep]
    else:
        # No need to make copies if we are not removing rows
        xx = xx_input
        yy = yy_input

    lower, upper = clip_forecasts[0], clip_forecasts[1]

    # Fit the model using the training data
    model = model_constructor_fun()
    model.fit( xx[TRAIN], yy[TRAIN] )

    # Run the model on the validation set, clipped to the allowed interval
    yhat_valid = np.clip( model.predict( xx[VALID] ), lower, upper )

    res = rmse( yhat_valid, yy[VALID] )
    print( 'Validation RMSE {}'.format( res ) )

    ###############################################################
    # Write the forecast for the test set to csv

    # Fit a fresh model using the training plus validation data
    model = model_constructor_fun()
    stack_rows = scipy.sparse.vstack if scipy.sparse.issparse( xx[TRAIN] ) else np.vstack
    model.fit( stack_rows( [ xx[TRAIN], xx[VALID] ] ),
               np.concatenate( [ yy[TRAIN], yy[VALID] ] ) )

    # Forecast values for the test set, clipped to the allowed interval
    yhat_test = np.clip( model.predict( xx[TEST] ), lower, upper )

    # Format the forecast as a Pandas Series and write the output to .csv
    fcst = pd.Series( yhat_test.ravel(), index=pd.Index( ts_item_day.columns, dtype="int64") )
    fcst_df = format_forecast( test, fcst, fill_na=True )

    # Write the results to the output file
    fcst_df.to_csv( '../forecasts/'+output_file, index=False, header=True )

In [461]:
# Benchmark forecast - use the previous month's sales
# Column 0 of the feature matrix is used as the prediction without any model.
# NOTE(review): presumably column 0 is 'sales_lag_01' (the first feature that is
# added) - confirm that combine_features / preprocess_features keep that order.

yhat_valid = xx[VALID][:,0]

res = rmse( yhat_valid, yy[VALID] )
print( 'Validation RMSE {}'.format( res ) )

###############################################################
# Write the forecast for the test set to csv

yhat_test = xx[TEST][:,0]

# Format the forecast as a Pandas Series and write the output to .csv
# (unlike forecast_sklearn, the values are not clipped here)
fcst = pd.Series( yhat_test.ravel(), index=pd.Index( ts_item_day.columns, dtype="int64") )
fcst_df = format_forecast( test, fcst, fill_na=True )

fcst_df.to_csv( '../forecasts/benchmark_04.csv', index=False, header=True )

Validation RMSE 3.942574988430639


In [None]:
# Linear Regression
# Uses the dense features and the default date_block_cutoff (no rows removed);
# the test-set forecast is written to '../forecasts/linreg_02.csv'
model_constructor_fun = lambda : LinearRegression()
forecast_sklearn( model_constructor_fun, xx, yy, 'linreg_02.csv' )

In [None]:
# Ridge Regression
# Uses the sparse, one-hot encoded features and keeps only rows with
# date_block_num > 32
# NOTE(review): Ridge's `normalize` argument was deprecated and later removed in
# newer scikit-learn releases - confirm the installed version still accepts it
model_constructor_fun = lambda : Ridge(alpha=0.2, normalize=True)
forecast_sklearn( model_constructor_fun, xx_sparse, yy, 'ridge_03.csv', date_block_cutoff=32 )

In [None]:
# Support Vector Regression
# Uses the dense features and keeps only rows with date_block_num > 30
from sklearn.svm import SVR

model_constructor_fun = lambda : SVR(C=1)
forecast_sklearn( model_constructor_fun, xx, yy, 'svr_01.csv', date_block_cutoff=30 )

In [137]:
# Try to forecast using Random forests
from sklearn.ensemble import RandomForestRegressor

# random_state pins the forest so that a rerun reproduces the same forecast
model_constructor_fun = lambda : RandomForestRegressor(n_estimators=10, max_depth=2, random_state=0)

# Written to its own file so the Ridge forecast in 'ridge_03.csv' is not overwritten
forecast_sklearn( model_constructor_fun, xx_sparse, yy, 'random_forest_01.csv', date_block_cutoff=32 )

KeyboardInterrupt: 

In [54]:
# Try to forecast using Adaboost
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# NOTE(review): xx_train, xx_valid, xx_test, yy_train and yy_valid are not
# defined in the cells above - presumably left over from an earlier version of
# the notebook; confirm before running on a fresh kernel.

# Restrict the model to a subset of the feature columns
sub_cols = [0,2,5,11,17]
xx_train_sub = xx_train[:,sub_cols]
xx_valid_sub = xx_valid[:,sub_cols]
xx_test_sub = xx_test[:,sub_cols]

# Fit the model using the training data.
# The base estimator is a linear model (the DecisionTreeRegressor that used to
# be assigned first was overwritten immediately and never used).
BE = LinearRegression()
model = AdaBoostRegressor(n_estimators=50, random_state=0, base_estimator=BE, loss="linear", learning_rate=0.0005 )
model.fit(xx_train_sub, yy_train)

# Run the model on the validation set and see the score
yhat_valid = model.predict(xx_valid_sub).reshape(xx_valid_sub.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

Validation RMSE 1.965791369627067


In [33]:
# Try to forecast using Ridge regression (RandomForestRegressor is imported but not used here)
# NOTE(review): xx_train, xx_valid, xx_test, yy_train and yy_valid are not defined
# in the cells above, and format_forecast is called with different arguments than
# in forecast_sklearn - presumably this cell predates the dict-based helpers; confirm.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge

# Fit the model using the training data
model = Ridge(alpha=0.6, normalize=True)
model.fit(xx_train, yy_train)

# Run the model on the validation set and see the score
# (predictions are floored at 0; no upper bound is applied)
yhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

# Make a forecast using the test data
yhat_test = model.predict(xx_test).reshape(xx_test.shape[0],1)
yhat_test = np.maximum(0, yhat_test)

# Format the forecast as a Pandas Series and write the output to .csv
fcst = pd.Series( yhat_test.ravel(), index=pd.Index(ts_item_day.columns) )
output = format_forecast( test.ID, fcst )

output.to_csv( '../forecasts/forecast_10.csv', index=True, header=True )

Validation RMSE 1.7383397340041775


In [146]:
# Preview the first rows of the monthly shop/item aggregates
monthly_shop_item.head()

Unnamed: 0,year_month,date_block_num,shop_item_id,item_id,shop_id,item_category_id,item_price,revenue,item_cnt_day
0,201301,0,1925,19,25,40,28.0,28.0,1.0
1,201301,0,2701,27,1,19,1890.0,1890.0,1.0
2,201301,0,2702,27,2,19,2499.0,2499.0,1.0
3,201301,0,2710,27,10,19,1890.0,1890.0,1.0
4,201301,0,2719,27,19,19,2499.0,2499.0,1.0


'price_lag_12'