In [1]:
import numpy as np
import pandas as pd
import random as rd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import all csv file data

sales = pd.read_csv( 'data/sales_train.csv')

# Warnings
import warnings
warnings.filterwarnings('ignore')

item_cat = pd.read_csv( 'data/item_categories.csv')
item = pd.read_csv( 'data/items.csv')
sub = pd.read_csv( 'data/sample_submission.csv')
shops = pd.read_csv( 'data/shops.csv')
test = pd.read_csv( 'data/test.csv')

In [3]:
# Reformat the date column
sales.date = sales.date.apply( lambda x: datetime.datetime.strptime( x, '%d.%m.%Y' ) )

# Add month and year columns
sales['month'] = [ x.month for x in sales.date ]
sales['year'] = [ x.year for x in sales.date ]
sales['year_month'] = sales.year * 100 + sales.month

# Add the item_category_id to the training set
sales = sales.set_index('item_id').join(item.set_index('item_id')).drop('item_name', axis=1).reset_index()
test = test.set_index('item_id').join(item.set_index('item_id')).drop('item_name', axis=1).reset_index()

# Add a unique id for the shop + item combo
sales['shop_item_id'] = sales.shop_id + sales.item_id * 100
test['shop_item_id'] = test.shop_id + test.item_id * 100
sales['shop_cat_id'] = sales.shop_id + sales.item_category_id * 100
test['shop_cat_id'] = test.shop_id + test.item_category_id * 100

# Add the revenue
sales[ "revenue" ] = sales.item_price * sales.item_cnt_day

In [4]:
agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum" }
sales_monthly = sales.groupby([ "year_month", "shop_id", "item_id", "item_category_id", "shop_item_id", "shop_cat_id" ] ).agg( agg_rules ).reset_index()

agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum", "shop_id" : "count", "item_id" : "count" }
sales_monthly_cat = sales_monthly.groupby([ "year_month", "item_category_id" ] ).agg( agg_rules ).reset_index()

agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum", "item_category_id" : "count", "item_id" : "count" }
sales_monthly_shop = sales_monthly.groupby([ "year_month", "shop_id" ] ).agg( agg_rules ).reset_index()

agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum", "shop_id" : "count" }
sales_monthly_item = sales_monthly.groupby([ "year_month", "item_id" ] ).agg( agg_rules ).reset_index()

agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum" }
sales_monthly_shop_item = sales_monthly.groupby([ "year_month", "shop_item_id", "item_id", "shop_id", "item_category_id" ] ).agg( agg_rules ).reset_index()

agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum", "item_id" : "sum" }
sales_monthly_shop_cat = sales_monthly.groupby([ "year_month", "shop_cat_id", "item_category_id", "shop_id" ] ).agg( agg_rules ).reset_index()

In [5]:

def create_pivot_ts( input_table, pivot_column ):
    
        # Make time series out of the monthly  sales
        tmp_table = input_table.groupby( [ 'year_month', pivot_column ] ).agg( { 'item_cnt_day' : 'sum' } ).reset_index()
        ts = tmp_table.pivot_table( "item_cnt_day", index="year_month", columns=pivot_column )
        
        # Fill missing values with 0
        ts = ts.fillna(0)
        
        # Set negative values to 0
        ts[ ts < 0 ] = 0
        
        # Set the index to be the dates
        dates = year_month_to_datetime(ts.index.values )
        ts = ts.set_index( pd.Index( dates ) )
    
        # Make sure the dates are sorted
        ts.sort_index(axis=0, ascending=True, inplace=True )
        
        return(ts)
        

# Define a helper function
def year_month_to_datetime( ym ):

    if isinstance(ym, float ) or isinstance(ym, int):
        m = ym % 100
        y = ym // 100

        output = datetime.date( y, m, 1 )
    else:
        if isinstance( ym, pd.Series):
            ym = list(ym)

        output = []
        for j in range(len(ym)):
            m = ym[j] % 100
            y = ym[j] // 100

            output.append( datetime.date( y, m, 1 ) )

    return output    

def rmse( x1, x2 ):
    
    res = np.sqrt( np.mean( (x1.ravel()-x2.ravel()) ** 2 ) )
    return(res)


In [6]:
# Create the pivot table containing sales for all shop and item combinations
sales_monthly_shop_item = create_pivot_ts( sales_monthly, 'shop_item_id' )

In [7]:
# Only keep the shop/item combinations that appear in the test set
Z = sales_monthly_shop_item.loc[ :, [ x in test.shop_item_id for x in sales_monthly_shop_item ] ]

In [8]:
def format_forecast( test_ids, fcst ):
    
    output = pd.Series( np.zeros_like(test_ids), index=test_ids, name="item_cnt_month" )

    # Copy the forecasts into the same order as the test IDs
    output[ fcst.index.values ] = fcst.values
    
    return(output)

In [9]:
SEED = 42
VALIDATION_PROPORTION = 0.1

def get_regression_vectors_from_matrix( input_mtx, is_X, seed=SEED, valid_prop=VALIDATION_PROPORTION ):

    N = input_mtx.shape[1]
    new_row = np.array( [np.NaN] * N ).reshape(1,N)    
    if is_X:
        M = np.vstack( [ new_row, input_mtx.to_numpy() ] )
    else:
        M = np.vstack( [ input_mtx.to_numpy(), new_row ] )

    # Randomly split the validation and training sets
    s_train_valid = M[:-1].ravel()[:,np.newaxis]
    s_train, s_valid = split_train_validation_sets( s_train_valid, seed=seed, valid_prop=valid_prop )
    
    # Get the test set
    s_test = M[-1].ravel()[:,np.newaxis]    

    return s_train, s_valid, s_test


def split_train_validation_sets( s, seed=SEED, valid_prop=VALIDATION_PROPORTION ):
    
    rng = np.random.RandomState(42)
    N = len(s)
    n_valid = int( np.floor( N * valid_prop ))
    
    idx = rng.choice(N, N, replace=False)
    s_valid = s[ idx[:n_valid] ]
    s_train = s[ idx[n_valid:] ]
    
    return s_train, s_valid
    

def remove_nan_rows( xx_input, yy_input ):
    idx = np.any( np.isnan( xx_input ), axis=1 ).ravel() | np.any( np.isnan( yy_input ), axis=1 ).ravel() 
    xx_output = xx_input[~idx,:]
    yy_output = yy_input[~idx,:]    
    return xx_output, yy_output
    
    
def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:,:] - cumsum[:-N,:]) / float(N)

In [44]:
# Only keep the shop/item combinations that appear in the test set
W = sales_monthly_shop_item.loc[ :, [ x in test.shop_item_id for x in sales_monthly_shop_item ] ]
dW = W - W.shift(periods=1)
Z = W

In [88]:
# Create the test and train sets from the observation matrices
yy_train, yy_valid, yy_test = get_regression_vectors_from_matrix( Z, is_X=False )
W_train, W_valid, W_test = get_regression_vectors_from_matrix( W, is_X=True)

# Get feature vectors
X_train = [None] * 23
X_test = [None] * 23
X_valid = [None] * 23

# Use different lags as features
c = 0
for j in range(0, 12):
    X_train[c], X_valid[c], X_test[c] = get_regression_vectors_from_matrix( Z.shift(periods=j), is_X=True)
    c += 1
    
# Get rolling means
for j in range(1, 12):
    # Calculate the rolling mean
    x_mean_mtx = Z.rolling(window=j+1).mean()
    X_train[c], X_valid[c], X_test[c] = get_regression_vectors_from_matrix(x_mean_mtx, is_X=True )
    c += 1

# Combine all of the X vectors into a single feature matrix
xx_train = np.hstack(X_train)
xx_valid = np.hstack(X_valid)
xx_test = np.hstack(X_test)

# Remove all rows with NaNs in the Train and Validation sets
# In the test set, all Y values are unknown, so we leave these alone
W_train, _ = remove_nan_rows( W_train, np.hstack( [ xx_train, yy_train ] ) )
W_valid, _ = remove_nan_rows( W_valid, np.hstack( [ xx_valid, yy_valid ] ) )

xx_train, yy_train = remove_nan_rows( xx_train, yy_train )
xx_valid, yy_valid = remove_nan_rows( xx_valid, yy_valid )


In [106]:
# Predict with average of previous observation
from sklearn.linear_model import LinearRegression, Ridge

# Run the model on the validation set and see the score
yhat_valid = xx_valid[:,19].copy()

# Set the forecast to 0 if the past few observations have been 0
idx = xx_valid[:,0:1].sum(axis=1) == 0
yhat_valid[idx] = 0
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

Validation RMSE 1.4687934117681465


In [52]:
# Linear Regression
from sklearn.linear_model import LinearRegression, Ridge

# Fit the model using the training data
model = LinearRegression()
#model = Ridge(alpha=0.2, normalize=True)
model.fit(xx_train, yy_train)

# Run the model on the validation set and see the score
yhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

Validation RMSE 1.3694612664495778


In [108]:
%time

# Try to forecast using Random forests
from sklearn.ensemble import RandomForestRegressor

# Fit the model using the training data
model = RandomForestRegressor(n_estimators=100, max_depth=2)
model.fit(xx_train, yy_train)

# Run the model on the validation set and see the score
yhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
Validation RMSE 1.3903701542426856


In [152]:
%time

# Try to forecast using Adaboost
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

sub_cols = [0,2,5,11,17]
sub_cols = [0,2,5,11,17]
xx_train_sub = xx_train[:,sub_cols]
xx_valid_sub = xx_valid[:,sub_cols]
xx_test_sub = xx_test[:,sub_cols]

# Fit the model using the training data
BE = DecisionTreeRegressor(max_depth=3)
BE = LinearRegression()
model = AdaBoostRegressor(n_estimators=80, random_state=0, base_estimator=BE, loss="linear", learning_rate=0.001)
model.fit(xx_train_sub, yy_train)

# Run the model on the validation set and see the score
yhat_valid = model.predict(xx_valid_sub).reshape(xx_valid_sub.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.34 µs
Validation RMSE 1.369535774506645


In [42]:
%time

# Try to forecast using Random forests
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge

# Fit the model using the training data
model = RandomForestRegressor(n_estimators=10, max_depth=2)
model.fit(xx_train, yy_train)

# Run the model on the validation set and see the score
dyhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = W_valid + dyhat_valid
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, W_valid + yy_valid)
print( 'Validation RMSE {}'.format( res ) )

# Make a forecast using the test data
dyhat_test = model.predict(xx_test).reshape(xx_test.shape[0],1)
yhat_test = W_test + dyhat_test
yhat_test = np.maximum(0, yhat_test)

# Format the forecast as a Pandas Series and write the output to .csv
fcst = pd.Series( yhat_test.ravel(), index=pd.Index(Z.columns) )
output = format_forecast( test.ID, fcst )

# Make sure all values that have been 0 for the past 2 observations are predicted to be 0
idx = ( 0.1 > Z.iloc[-2:-1,:].sum() )
output[idx.index] = 0

output.to_csv( 'forecast_7.csv', index=True, header=True )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
Validation RMSE 1.4045803663300878
