In [1]:
import numpy as np
import pandas as pd
import random as rd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelBinarizer

In [2]:
# Scratch calculation on the logs of 0.3 and 0.7; the bare expression on the
# last line is what the cell displays.
alpha, beta = np.log(0.3), np.log(0.7)

(beta + 1.01) / (beta - alpha)

0.7710689317245613

In [2]:
# Load every CSV input from ../input into its own DataFrame

sales = pd.read_csv( '../input/sales_train.csv')

# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by numpy / pandas /
# scikit-learn further down - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

item_cat = pd.read_csv( '../input/item_categories.csv')
item = pd.read_csv( '../input/items.csv')
sub = pd.read_csv( '../input/sample_submission.csv')
shops = pd.read_csv( '../input/shops.csv')
test = pd.read_csv( '../input/test.csv')

In [3]:
# Parse the 'dd.mm.YYYY' date strings in a single vectorised call rather than
# one strptime call per row
sales['date'] = pd.to_datetime( sales['date'], format='%d.%m.%Y' )

# Add month and year columns (plus a combined YYYYMM key)
sales['month'] = sales['date'].dt.month.astype('int64')
sales['year'] = sales['date'].dt.year.astype('int64')
sales['year_month'] = sales.year * 100 + sales.month

# Add the item_category_id to the training and test sets by joining on item_id;
# item_name comes along with the join and is dropped again
sales = sales.set_index('item_id').join(item.set_index('item_id')).drop('item_name', axis=1).reset_index()
test = test.set_index('item_id').join(item.set_index('item_id')).drop('item_name', axis=1).reset_index()

# Add a single id for each shop + item combo and each shop + category combo.
# The shop id occupies the last two decimal digits, so the encoding is only
# reversible while shop_id < 100.
sales['shop_item_id'] = sales.shop_id + sales.item_id * 100
test['shop_item_id'] = test.shop_id + test.item_id * 100
sales['shop_cat_id'] = sales.shop_id + sales.item_category_id * 100
test['shop_cat_id'] = test.shop_id + test.item_category_id * 100

# Add the revenue
sales[ "revenue" ] = sales.item_price * sales.item_cnt_day


In [4]:
# Collapse the sales rows to one row per combination of the grouping keys:
# mean price, summed revenue and summed item count
agg_rules = {'item_price' : "mean", "revenue" : "sum", "item_cnt_day" : "sum" }
group_keys = [ "year_month", 'date_block_num', "shop_item_id", "item_id", "shop_id", "item_category_id" ]
monthly_shop_item = sales.groupby( group_keys ).agg( agg_rules ).reset_index()

In [5]:

def create_pivot_ts( input_table, pivot_column, val_column, agg_rule, missing_method ):
    """Pivot a long table into a wide monthly time-series table.

    Parameters
    ----------
    input_table : DataFrame
        Must contain a 'year_month' column (YYYYMM numbers), `pivot_column`
        and `val_column`.
    pivot_column : str
        Column whose distinct values become the output columns.
    val_column : str
        Column that is aggregated into the cells.
    agg_rule : str or callable
        Aggregation handed to ``groupby(...).agg`` for `val_column`.
    missing_method : {"zero", "ffill"}
        "zero" fills missing cells with 0; "ffill" carries the previous row's
        value forward (cells before the first observation stay NaN).

    Returns
    -------
    DataFrame
        One row per month, indexed by ``datetime.date`` (first of the month)
        in ascending order, with negative values replaced by 0.

    Raises
    ------
    ValueError
        If `missing_method` is not one of the supported options.
    """
    # Reject a bad option up front. Previously the exception object was built
    # but never raised, so an unsupported value was silently ignored.
    if missing_method not in ( "zero", "ffill" ):
        raise ValueError( 'Unsupported value: ' + str( missing_method ) )

    # Make time series out of the monthly sales
    tmp_table = input_table.groupby( [ 'year_month', pivot_column ] ).agg( { val_column : agg_rule } ).reset_index()
    ts = tmp_table.pivot_table( val_column, index="year_month", columns=pivot_column )

    # Fill the missing values
    if missing_method == "zero":
        ts = ts.fillna(0)
    else:
        # .ffill() replaces the deprecated fillna(method='ffill')
        ts = ts.ffill()

    # Set negative values to 0 (NaN cells are left as they are)
    ts = ts.clip( lower=0 )

    # Set the index to be the dates
    dates = year_month_to_datetime( ts.index.values )
    ts = ts.set_index( pd.Index( dates ) )

    # Make sure the dates are sorted
    ts = ts.sort_index( axis=0, ascending=True )

    return ts
    

# Define a helper function
def year_month_to_datetime( ym ):
    """Convert YYYYMM-encoded month(s) to ``datetime.date`` on the 1st of the month.

    Parameters
    ----------
    ym : scalar or iterable
        A single YYYYMM number (Python or numpy int/float), or a
        list / array / Index / Series of them.

    Returns
    -------
    datetime.date for a scalar input, otherwise a list of datetime.date in
    the same order as the input.

    Raises
    ------
    ValueError
        From ``datetime.date`` when the encoded month is outside 1..12.
    """
    def _to_date( value ):
        # datetime.date only accepts integers, so floats such as 201301.0 are
        # converted first
        value = int( value )
        return datetime.date( value // 100, value % 100, 1 )

    # np.isscalar also recognises numpy scalars (e.g. np.int64), which are not
    # instances of the built-in int / float
    if np.isscalar( ym ):
        return _to_date( ym )

    return [ _to_date( value ) for value in ym ]

def rmse( x1, x2 ):
    """Root-mean-square error between two arrays, compared element-wise after flattening."""
    diff = x1.ravel() - x2.ravel()
    return np.sqrt( np.mean( np.square( diff ) ) )


def decompose_shop_item_id( shop_item_id ):
    """Split a combined id back into ``(shop_id, item_id)``.

    The shop id is the value modulo 100 and the item id is the integer
    quotient by 100. Works element-wise on anything that supports % and //.
    """
    return shop_item_id % 100, shop_item_id // 100

In [6]:
# Create time series of the variables that we will use for prediction:
# item counts with gaps filled by 0, and prices with gaps forward-filled
ts_item_day = create_pivot_ts(
    input_table=monthly_shop_item,
    pivot_column="shop_item_id",
    val_column="item_cnt_day",
    agg_rule="sum",
    missing_method='zero',
)
ts_item_prc = create_pivot_ts(
    input_table=monthly_shop_item,
    pivot_column="shop_item_id",
    val_column="item_price",
    agg_rule="sum",
    missing_method="ffill",
)

In [7]:
# Create time series of the shop id and category id for the different shop/item combinations.
# Every table below has the same index and columns as ts_item_day, so cells line up one to one.

n_months, n_pairs = ts_item_day.shape
shop_id, item_id = decompose_shop_item_id( ts_item_day.columns )

# Shop id: the same value repeated down each column
ts_shop_id = pd.DataFrame( np.tile( shop_id.values, ( n_months, 1 ) ),
                           index=ts_item_day.index, columns=ts_item_day.columns )

# Category id: look each unique item up once in an item_id -> category Series
# instead of filtering the whole `item` table once per item.
# drop_duplicates keeps the first row per item_id, matching the old .iloc[0].
uniq_items = item_id.unique()
cat_by_item = item.drop_duplicates( 'item_id' ).set_index( 'item_id' )[ 'item_category_id' ]
cat_ids_for_uniq_ids = cat_by_item.loc[ uniq_items ].tolist()
id_map = dict(zip( list(uniq_items), cat_ids_for_uniq_ids ))
cat_ids = pd.Series( [ id_map[x] for x in item_id ] )

ts_cat_id = pd.DataFrame( np.tile( cat_ids.values, ( n_months, 1 ) ),
                          index=ts_item_day.index, columns=ts_item_day.columns )

# Date block number: the row position 0..n_months-1 repeated across every column.
# NOTE(review): this equals date_block_num only if ts_item_day has one row for
# every consecutive month starting at block 0 - confirm.
date_block_nums = np.arange( n_months )[:,np.newaxis]
ts_date_num_block = pd.DataFrame( np.tile( date_block_nums, ( 1, n_pairs ) ),
                                  index=ts_item_day.index, columns=ts_item_day.columns )

In [17]:
# Default seed for the random train / validation split
SEED = 42
# Default fraction of the rows that is held out as the validation set
VALIDATION_PROPORTION = 0.1

def get_regression_vectors_from_matrix( input_mtx, is_X, seed=SEED, valid_prop=VALIDATION_PROPORTION ):
    """Flatten a (rows x columns) table into train / validation / test column vectors.

    One all-NaN row is added so that feature and target tables pair up with a
    one-row offset:

    * ``is_X=True``  - the NaN row is prepended, so row t holds the value the
      table had at row t-1, and the table's last row becomes the test vector.
    * ``is_X=False`` - the NaN row is appended, so the test vector is all NaN.

    Every row except the last is flattened in row-major order and handed to
    ``split_train_validation_sets``; the last row is returned as the test set.

    Parameters
    ----------
    input_mtx : DataFrame
    is_X : bool
        True for a feature table, False for the target table.
    seed, valid_prop
        Forwarded to ``split_train_validation_sets``.

    Returns
    -------
    (s_train, s_valid, s_test) : three 2-D arrays with a single column each.
    """
    n_cols = input_mtx.shape[1]
    # np.nan, not the np.NaN alias (the alias was removed in NumPy 2.0)
    pad_row = np.full( ( 1, n_cols ), np.nan )
    values = input_mtx.to_numpy()

    if is_X:
        M = np.vstack( [ pad_row, values ] )
    else:
        M = np.vstack( [ values, pad_row ] )

    # Randomly split the validation and training sets
    s_train_valid = M[:-1].ravel()[:,np.newaxis]
    s_train, s_valid = split_train_validation_sets( s_train_valid, seed=seed, valid_prop=valid_prop )

    # Get the test set
    s_test = M[-1].ravel()[:,np.newaxis]

    return s_train, s_valid, s_test


def split_train_validation_sets( s, seed=SEED, valid_prop=VALIDATION_PROPORTION ):
    """Randomly split the rows of `s` into a training part and a validation part.

    Parameters
    ----------
    s : array
        Rows are shuffled with a permutation that depends only on `seed` and
        ``len(s)``, so arrays of equal length split with the same seed stay
        row-aligned.
    seed : int
        Seed for the shuffle. (It used to be ignored in favour of a
        hard-coded 42.)
    valid_prop : float
        Fraction of rows, in [0, 1], assigned to the validation part
        (rounded down).

    Returns
    -------
    (s_train, s_valid)

    Raises
    ------
    ValueError
        If `valid_prop` is outside [0, 1].
    """
    if not 0 <= valid_prop <= 1:
        raise ValueError( 'valid_prop must be between 0 and 1, got {}'.format( valid_prop ) )

    rng = np.random.RandomState( seed )
    n_rows = len(s)
    n_valid = int( np.floor( n_rows * valid_prop ) )

    # The first n_valid shuffled positions form the validation set
    order = rng.choice( n_rows, n_rows, replace=False )
    s_valid = s[ order[:n_valid] ]
    s_train = s[ order[n_valid:] ]

    return s_train, s_valid
    

def remove_nan_rows( xx_input, yy_input ):
    """Drop every row in which either 2-D array contains a NaN.

    Returns the filtered ``(xx, yy)`` pair; the two outputs keep the same rows.
    """
    x_has_nan = np.isnan( xx_input ).any( axis=1 ).ravel()
    y_has_nan = np.isnan( yy_input ).any( axis=1 ).ravel()
    keep = ~x_has_nan & ~y_has_nan
    return xx_input[keep,:], yy_input[keep,:]
    
    
def running_mean(x, N):
    """Moving average over N consecutive entries along the first axis.

    Parameters
    ----------
    x : array-like, 1-D or 2-D
        For 2-D input each column is averaged independently.
    N : int
        Window length, at least 1.

    Returns
    -------
    ndarray with ``len(x) - N + 1`` entries along the first axis; entry i is
    the mean of ``x[i:i+N]``.

    Raises
    ------
    ValueError
        If N is smaller than 1.
    """
    if N < 1:
        raise ValueError( 'N must be at least 1, got {}'.format( N ) )

    x = np.asarray( x, dtype=float )
    # Work along axis 0 explicitly: without an axis both insert and cumsum
    # flatten the input, after which the old 2-D slicing could not work.
    cumsum = np.cumsum( np.insert( x, 0, 0, axis=0 ), axis=0 )
    return ( cumsum[N:] - cumsum[:-N] ) / float(N)

def format_forecast( test_ids, fcst ):
    """Arrange forecast values in the order of `test_ids`.

    Parameters
    ----------
    test_ids : list, array, Index or Series
        The ids wanted in the output, in output order. For a Series the
        *values* are used as ids (its own index is ignored).
    fcst : Series
        Forecast values indexed by id. Ids that are not in `test_ids` are
        dropped; if an id occurs more than once the last value wins.

    Returns
    -------
    Series named "item_cnt_month", indexed by `test_ids`, holding the
    forecast for each id and 0.0 for ids without a forecast.
    """
    ids = pd.Index( test_ids )

    # reindex needs unique source labels; keep the last duplicate, which is
    # what label-based assignment would have ended up with
    fcst = fcst[ ~fcst.index.duplicated( keep='last' ) ]

    # fill_value only applies to ids missing from fcst. Building the result
    # as float avoids writing float forecasts into an integer Series.
    output = fcst.reindex( ids, fill_value=0.0 ).astype( float )
    output.index = ids
    output.name = "item_cnt_month"

    return output

In [None]:
# Create the test and train sets from the observation matrices
yy_train, yy_valid, yy_test = get_regression_vectors_from_matrix( ts_item_day, is_X=False )

# Feature vectors: one entry per feature, in the same order in all four lists
X_train, X_valid, X_test, X_descrip = [], [], [], []


def _add_feature( descrip, feature_mtx ):
    """Split one feature table and append its pieces and its name to the feature lists."""
    f_train, f_valid, f_test = get_regression_vectors_from_matrix( feature_mtx, is_X=True )
    X_train.append( f_train )
    X_valid.append( f_valid )
    X_test.append( f_test )
    X_descrip.append( descrip )


# Use different sales lags as features.
# The table is shifted by L-1 rows here; is_X=True adds one more row of offset.
lags = [1, 2, 3, 6, 12 ]
for L in lags:
    x_lag_mtx = ts_item_day.shift(periods=L-1)
    _add_feature( 'sales_lag_{:02}'.format(L), x_lag_mtx )

# Use different means as features
means = [2, 3, 6, 12]
for M in means:
    x_mean_mtx = ts_item_day.rolling(window=M).mean()
    _add_feature( 'sales_mean_{:02}'.format(M), x_mean_mtx )

# Use different price lags as features
lags = [1]
for L in lags:
    x_lag_mtx = ts_item_prc.shift(periods=L-1)
    _add_feature( 'price_lag_{:02}'.format(L), x_lag_mtx )

# Add the shop id, the category id and the date block number as features
_add_feature( 'shop_id', ts_shop_id )
_add_feature( 'cat_id', ts_cat_id )
_add_feature( 'date_block_num', ts_date_num_block )

# Number of features collected
c = len( X_descrip )

In [21]:
# Combine all of the X vectors into a single feature matrix (one column per feature)
xx_train, xx_valid, xx_test = ( np.hstack( parts ) for parts in ( X_train, X_valid, X_test ) )

# Remove all rows with NaNs in the Train and Validation sets
# In the test set, all Y values are unknown, so we leave these alone
xx_train, yy_train = remove_nan_rows( xx_train, yy_train )
xx_valid, yy_valid = remove_nan_rows( xx_valid, yy_valid )

In [22]:
def add_one_hot_encoding( descrip, train, valid, test, feature_name, binarizer ):
    """Replace one feature column by its one-hot encoding in all three matrices.

    The column named `feature_name` is removed and one indicator column per
    entry of ``binarizer.classes_`` is appended at the right-hand end.

    Parameters
    ----------
    descrip : list of str
        Column names of `train`, `valid` and `test`, in column order.
    train, valid, test : 2-D arrays
        Feature matrices sharing the column layout given by `descrip`.
    feature_name : str
        Name of the column to encode; must be present in `descrip`.
    binarizer : fitted label binarizer
        Object with a ``classes_`` attribute and a ``transform`` method.

    Returns
    -------
    (new_descrip, new_train, new_valid, new_test)

    Raises
    ------
    ValueError
        From ``list.index`` when `feature_name` is not in `descrip`.
    """
    # Get the column with the target feature
    idx = descrip.index(feature_name)

    # Update the descriptions of the features
    classes = list( binarizer.classes_ )
    new_cols = [ feature_name + "_{:02d}".format(x) for x in classes ]
    new_descrip = descrip[:idx] + descrip[idx+1:] + new_cols

    def _encode( data ):
        # Swap column `idx` of one matrix for its one-hot columns
        one_hot = binarizer.transform( data[:,idx] )

        # With exactly two classes a label binarizer returns a single column
        # (the indicator of the second class). Expand it to two columns so the
        # matrix matches the two names added to the description.
        if len( classes ) == 2 and one_hot.shape[1] == 1:
            pos = getattr( binarizer, "pos_label", 1 )
            neg = getattr( binarizer, "neg_label", 0 )
            first_class = np.where( one_hot == pos, neg, pos )
            one_hot = np.hstack( [ first_class, one_hot ] )

        remaining = np.delete( data, idx, axis=1 )
        return np.hstack( [ remaining, one_hot ] )

    new_train = _encode( train )
    new_valid = _encode( valid )
    new_test = _encode( test )

    return new_descrip, new_train, new_valid, new_test
    

In [23]:
# Replace shop_id with one-hot encodings (classes are the shop ids seen in `sales`)
shop_binarizer = LabelBinarizer()
shop_binarizer.fit( sales.shop_id.unique() )
X_descrip, xx_train, xx_valid, xx_test = add_one_hot_encoding(
    X_descrip, xx_train, xx_valid, xx_test,
    feature_name="shop_id", binarizer=shop_binarizer,
)

# Replace category_id with one-hot encodings (classes are the category ids seen in `sales`)
cat_binarizer = LabelBinarizer()
cat_binarizer.fit( sales.item_category_id.unique() )
X_descrip, xx_train, xx_valid, xx_test = add_one_hot_encoding(
    X_descrip, xx_train, xx_valid, xx_test,
    feature_name="cat_id", binarizer=cat_binarizer,
)


In [24]:
# Keep handles on the complete training arrays so that xx_train / yy_train can
# be re-bound to a sub-sample afterwards.
# NOTE(review): these are plain aliases, not copies. If this cell is re-run
# after xx_train has been re-bound to the sub-sample, the "full" names will
# point at the sub-sample - confirm the intended run order.
xx_train_full = xx_train
yy_train_full = yy_train

In [67]:
# Train on a random tenth of the full training rows
N_full = xx_train_full.shape[0]
N_train = N_full // 10

# Fixed seed so the same rows are picked on every run
rng = np.random.RandomState(42)
idx = rng.choice( N_full, N_full, replace=False )
sub_rows = idx[:N_train]
xx_train, yy_train = xx_train_full[sub_rows,:], yy_train_full[sub_rows,:]

In [68]:
# Ridge regression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fit the model using the training data.
# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# The replacement given in the deprecation notice is a StandardScaler step
# (with_mean=False) followed by Ridge with alpha multiplied by n_samples.
model = make_pipeline( StandardScaler( with_mean=False ),
                       Ridge( alpha=0.6 * xx_train.shape[0] ) )
model.fit(xx_train, yy_train)

# Run the model on the validation set; counts cannot be negative, so clip at 0
yhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

Validation RMSE 1.7459644921217206


In [69]:
# Winsorize the non-categorical features

from scipy.stats.mstats import winsorize

# Work on copies so the un-winsorized matrices stay available
xx_train_w, xx_valid_w, xx_test_w = xx_train.copy(), xx_valid.copy(), xx_test.copy()

# limits=(0, p): leave the lower tail alone and cap the top fraction p.
# Each matrix is capped using its own values.
p = 0.000001
idx_last = X_descrip.index('price_lag_01')
for j in range(0,idx_last+1):
    # every column up to and including price_lag_01
    for mtx_w in ( xx_train_w, xx_valid_w, xx_test_w ):
        mtx_w[:,j] = winsorize( mtx_w[:,j], (0, p) )


In [None]:
# Support Vector Regression
from sklearn.svm import SVR

# Fit the model using the training data
# NOTE(review): this fits on xx_train, not on the winsorized xx_train_w - confirm which was intended.
model = SVR(C=1)
model.fit(xx_train, yy_train)

# Run the model on the validation set and floor the predictions at 0
raw_valid = model.predict(xx_valid)
yhat_valid = np.maximum( 0, raw_valid.reshape(xx_valid.shape[0],1) )

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

In [34]:
# Try to forecast using Random forests
from sklearn.ensemble import RandomForestRegressor

# Fit the model using the training data.
# random_state pins the forest's bootstrap / feature sampling so that the
# validation score is the same on every run.
model = RandomForestRegressor(n_estimators=10, max_depth=2, random_state=SEED)
model.fit(xx_train, yy_train)

# Run the model on the validation set and floor the predictions at 0
yhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

Validation RMSE 1.803924967460872


In [54]:
# Try to forecast using Adaboost
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
# Imported here so the cell does not depend on another cell having run first
from sklearn.linear_model import LinearRegression

# Fit on a hand-picked subset of the feature columns
# NOTE(review): these are positional indices into the feature matrix; they
# silently change meaning if the feature order changes - confirm the choice.
sub_cols = [0,2,5,11,17]
xx_train_sub = xx_train[:,sub_cols]
xx_valid_sub = xx_valid[:,sub_cols]
xx_test_sub = xx_test[:,sub_cols]

# Fit the model using the training data.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` between scikit-learn releases.
BE = LinearRegression()
model = AdaBoostRegressor( BE, n_estimators=50, random_state=0, loss="linear", learning_rate=0.0005 )
model.fit(xx_train_sub, yy_train)

# Run the model on the validation set and floor the predictions at 0
yhat_valid = model.predict(xx_valid_sub).reshape(xx_valid_sub.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

Validation RMSE 1.965791369627067


In [33]:
# Final model: ridge regression, validated and then used for the submission forecast
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fit the model using the training data.
# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# The replacement given in the deprecation notice is a StandardScaler step
# (with_mean=False) followed by Ridge with alpha multiplied by n_samples.
model = make_pipeline( StandardScaler( with_mean=False ),
                       Ridge( alpha=0.6 * xx_train.shape[0] ) )
model.fit(xx_train, yy_train)

# Run the model on the validation set and floor the predictions at 0
yhat_valid = model.predict(xx_valid).reshape(xx_valid.shape[0],1)
yhat_valid = np.maximum(0, yhat_valid)

# Check the goodness of fit
res = rmse( yhat_valid, yy_valid)
print( 'Validation RMSE {}'.format( res ) )

# Make a forecast using the test data
yhat_test = model.predict(xx_test).reshape(xx_test.shape[0],1)
yhat_test = np.maximum(0, yhat_test)

# The forecast is keyed by shop_item_id (the columns of ts_item_day)
fcst = pd.Series( yhat_test.ravel(), index=pd.Index(ts_item_day.columns) )

# test.ID is the submission row id, not a shop_item_id, so the forecasts are
# looked up through test.shop_item_id (pairs without a forecast get 0) and
# only then labelled with the matching ID.
# NOTE(review): assumes every shop_item_id occurs once in `test` - confirm.
fcst_by_pair = format_forecast( pd.Index( test.shop_item_id ), fcst )
output = pd.Series( fcst_by_pair.to_numpy(), index=pd.Index( test.ID, name="ID" ), name="item_cnt_month" )

# Write the forecast to .csv
output.to_csv( '../forecasts/forecast_10.csv', index=True, header=True )

Validation RMSE 1.7383397340041775


In [146]:
# Show the first rows of the monthly shop/item aggregate as a sanity check
monthly_shop_item.head()

Unnamed: 0,year_month,date_block_num,shop_item_id,item_id,shop_id,item_category_id,item_price,revenue,item_cnt_day
0,201301,0,1925,19,25,40,28.0,28.0,1.0
1,201301,0,2701,27,1,19,1890.0,1890.0,1.0
2,201301,0,2702,27,2,19,2499.0,2499.0,1.0
3,201301,0,2710,27,10,19,1890.0,1890.0,1.0
4,201301,0,2719,27,19,19,2499.0,2499.0,1.0


'price_lag_12'