## Kaggle: M5_Accuracy Competition _ Predictive Model Baseline

Competition Link: https://www.kaggle.com/c/m5-forecasting-accuracy

After preprocessing the competition files using the preprocessing pipeline notebook, locate the training, test, validation files and run all.

Training file has approximately 65 million rows, and model uses 59 columns for training.

Features are standard features, and besides encoding the categorical variables, no features are engineered. 

MSE = 0.7

For prediction:



### import libraries, adjust the setting

In [1]:
import pandas as pd
import numpy as np
import time
x=time.time()

### Adjust settings

In [2]:
#set precision to 3 for easier reading
np.set_printoptions(precision=3, suppress=True)

#set the number of columns and rows visible with pandas dataframes
pd.set_option('max_column', None)
pd.set_option('max_rows', 999)

### 2. Save Schema and Training columns

In [3]:
#schema of the final processed file
schema = {'date':'O','id':'O', 'item_id':'O', 'dept_id':'O', 'cat_id':'O',
 'store_id':'O', 'state_id':'O', 'd':'O', 'weekday':'O', 'event_name_1':'O',
 'event_type_1':'O', 'event_name_2':'O', 'event_type_2':'O', 'day':'int16',
 'wm_yr_wk':'int16', 'year':'int16', 'month':'int8', 'wday':'int8', 'snap_CA':'int8',
 'snap_TX':'int8', 'snap_WI':'int8', 'Eid al-Fitr':'int8', 'Pesach End':'int8',
 "Father's day":'int8', 'ValentinesDay':'int8', 'NBAFinalsStart':'int8', 'NewYear':'int8',
 'Chanukah End':'int8', 'StPatricksDay':'int8', 'LentWeek2':'int8', 'Cinco De Mayo':'int8',
 'Christmas':'int8', "Mother's day":'int8', 'ColumbusDay':'int8', 'VeteransDay':'int8',
 'IndependenceDay':'int8', 'SuperBowl':'int8', 'Easter':'int8', 'Halloween':'int8',
 'MartinLutherKingDay':'int8', 'OrthodoxChristmas':'int8', 'Thanksgiving':'int8', 'OrthodoxEaster':'int8',
 'EidAlAdha':'int8', 'NBAFinalsEnd':'int8', 'PresidentsDay':'int8', 'LentStart':'int8',
 'Ramadan starts':'int8', 'Purim End':'int8', 'LaborDay':'int8', 'MemorialDay':'int8',
 'Cultural':'int8', 'National':'int8', 'Religious':'int8', 'Sporting':'int8', 'FOODS_2':'int8',
 'FOODS_3':'int8', 'HOBBIES_1':'int8', 'HOBBIES_2':'int8', 'HOUSEHOLD_1':'int8',
 'HOUSEHOLD_2':'int8', 'HOBBIES':'int8', 'HOUSEHOLD':'int8', 'TX':'int8', 'WI':'int8',
 'CA_2':'int8', 'CA_3':'int8', 'CA_4':'int8', 'TX_1':'int8', 'TX_2':'int8',
 'TX_3':'int8', 'WI_1':'int8', 'WI_2':'int8', 'WI_3':'int8', 'sell_price':'float32',
 'quantity_sold':'int32'}

In [4]:
training_cols = ['id','item_id','day', 'month','wday','CA_2', 'CA_3', 'CA_4', 
                 'Chanukah End', 'Christmas', 'Cinco De Mayo', 'ColumbusDay', 
                 'Cultural', 'Easter', 'Eid al-Fitr', 'EidAlAdha', 'FOODS_2', 
                 'FOODS_3', "Father's day", 'HOBBIES', 'HOBBIES_1', 'HOBBIES_2',
                 'HOUSEHOLD', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'Halloween', 'IndependenceDay',
                 'LaborDay', 'LentStart', 'LentWeek2', 'MartinLutherKingDay', 'MemorialDay',
                 "Mother's day", 'NBAFinalsEnd', 'NBAFinalsStart', 'National', 
                 'NewYear', 'OrthodoxChristmas', 'OrthodoxEaster', 'Pesach End', 
                 'PresidentsDay', 'Purim End', 'Ramadan starts', 'Religious', 'Sporting',
                 'StPatricksDay', 'SuperBowl', 'TX', 'TX_1', 'TX_2', 'TX_3', 'Thanksgiving',
                 'ValentinesDay', 'VeteransDay', 'WI', 'WI_1', 'WI_2', 'WI_3', 
                 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'quantity_sold']

### identify the input and target columns

In [5]:
#Columns to use for training
input_cols = ['month', 'wday', 'snap_CA', 'snap_TX', 'snap_WI',
       'Eid al-Fitr', 'Pesach End', 'Father\'s day', 'ValentinesDay',
       'NBAFinalsStart', 'NewYear', 'Chanukah End', 'StPatricksDay',
       'LentWeek2', 'Cinco De Mayo', 'Christmas', 'Mother\'s day',
       'ColumbusDay', 'VeteransDay', 'IndependenceDay', 'SuperBowl', 'Easter',
       'Halloween', 'MartinLutherKingDay', 'OrthodoxChristmas', 'Thanksgiving',
       'OrthodoxEaster', 'EidAlAdha', 'NBAFinalsEnd', 'PresidentsDay',
       'LentStart', 'Ramadan starts', 'Purim End', 'LaborDay', 'MemorialDay',
       'Cultural', 'National', 'Religious', 'Sporting', 'FOODS_2', 'FOODS_3',
       'HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'HOBBIES',
       'HOUSEHOLD', 'TX', 'WI', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3',
       'WI_1', 'WI_2', 'WI_3', 'sell_price']

target_col = 'quantity_sold'

In [6]:
#save the submission file column names to a variable
submission_columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
       'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20',
       'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28']

## Import the data 

In [7]:
training_file_path = '../../../../Documents/KAGGLE/WALMART/m5_training_df.csv'

validation_file_path = '../../../../Documents/KAGGLE/WALMART/m5_validation_df.csv'

testing_file_path = '../../../../Documents/KAGGLE/WALMART/m5_testing_df.csv'

In [None]:
#import the training file
training_df = pd.read_csv(training_file_path, index_col=0, dtype=schema, usecols=training_cols)

In [None]:
#import the validation file
validation_df = pd.read_csv(validation_file_path, usecols=training_cols, dtype=schema)

In [None]:
#import the testing file
testing_df = pd.read_csv(testing_file_path, usecols=training_cols, dtype=schema)

### Set up the file locations and names

In [None]:
file_name = input('Enter a name for the results file?: ')
file_path = '../../../../Documents/KAGGLE/WALMART/'
file_name = file_path+file_name

## Build the model

In [None]:
#create dataframes to save the results into a new dataframe

weight_dict = {}  #ths will store the best weights for each item

#make a list of the unique item_ids
items = training_df.item_id.unique()

for item in items:
    
    
    temp_df = training_df[training_df['item_id'] == item]

    X = temp_df[input_cols]   #filter the dataset on the item_id

    target = np.array(temp_df[target_col]).reshape(-1,1)   #labels



    #initialize coefficients and the bias
    np.random.seed(1)
    W = np.random.rand(59, 1)/10
    B = np.random.random(size=(1))/10
    N = np.random.rand(X.shape[0],1)/10

    #Set parameters
    epochs = 30
    early_stop_threshold = 3
    learning_rate = 0.001
    num_observations = X.shape[0]

    #initialize the loss and and a loss list to help with early stopping
    losses = [0]
    loss_score = 0


    for epoch in range(epochs):

        Y =  X.dot(W) +  B + N  #calculate the targets

        deltas = target - Y   #Find the error

        loss = deltas.pow(2).mean()   #Loss function

        

        #set up an early stopping mechanism
        #add the loss to the losses list
        losses.append(loss)
        
        if loss[0] > loss_score:

            loss_score += 1   #keep track of the change in loss function


        print(f'Epoch: {epoch}  Loss: {loss[0]:.3f}')


        if loss_score > early_stop_threshold or epoch == (epochs-1):   #If the loss is increasing 10 times or finished epochs stop learning

            weight_dict[item] = (W,B)
            
            
            #prepare a submission file with the validation file
            
            result = validation_df[validation_df['item_id'] == item][['id', 'item_id', 'day', 'quantity_sold']]   #create a new dataframe to save predictions

            result['predictions_test'] =  validation_df[validation_df.item_id==item][input_cols].dot(weight_dict[item][0])+weight_dict[item][1]#save the predictions

            result.to_csv(file_name + '_validation.csv', mode='a', header=False)
            
            
            #prepare a submission file with the ground truth file
            result = testing_df[testing_df['item_id'] == item][['id', 'item_id', 'day', 'quantity_sold']]   #create a new dataframe to save predictions

            result['predictions_test'] =  testing_df[testing_df.item_id==item][input_cols].dot(weight_dict[item][0])+weight_dict[item][1]#save the predictions

            result.to_csv(file_name + '_test.csv', mode='a', header=False)
          

            break



        #scale the deltas
        deltas_scaled = deltas / num_observations

        #Optimize
        #update the weights biases
        W = W -learning_rate * np.dot(X.T, deltas_scaled)
        B = B - learning_rate * np.sum(deltas_scaled)
        
        


### Export the results

In [None]:
submission = pd.read_csv(file_name+'_test.csv', index_col=0, header=None)
submission.columns = ['id', 'item_id', 'day', 'quantity_sold', 'predictions']
submission_file = submission.drop(['item_id','quantity_sold'], axis=1)
submission_file = submission_file.pivot_table(index=['id'], columns='day', values='predictions')
submission_file.columns = submission_columns
for col in submission_file:
    submission_file [col] = submission_file[col].apply(lambda x: x if x>0 else 0)
    
submission_file.to_csv(file_path+'m5_test_submission_ML_GD_LR-001_EStop-3_RSeed-1_Loss-MSE_Epoch-30.csv')

In [None]:
submission = pd.read_csv(file_name+'_validation.csv', index_col=0, header=None)
submission.columns = ['id', 'item_id', 'day', 'quantity_sold', 'predictions']
submission_file = submission.drop(['item_id','quantity_sold'], axis=1)
submission_file = submission_file.pivot_table(index=['id'], columns='day', values='predictions')
submission_file.columns = submission_columns
for col in submission_file:
    submission_file [col] = submission_file[col].apply(lambda x: x if x>0 else 0)
    
submission_file.to_csv(file_path+'m5_validation_submission_ML_GD_LR-001_EStop-3_RSeed-1_Loss-MSE_Epoch-30.csv')

In [None]:
y=time.time()

In [None]:
print(f'It took {(y-x)/60: .1f} minutes to complete the run')