<a href="https://colab.research.google.com/github/danielad-k/SPRB/blob/master/Capstone%202/1%2C3%20Walmart%20Modeling%20-%20LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SUMMARY OF FINDINGS**

1. A single layer LSTM model  produced an accuracy of 22%; therefore, we will keep the first model
2. In terms of preprocessing the following steps were taken:
  *   Fineline and UPC , only keep top 50% of most common values
  *   Convert all categorical variables into one hot encoding





In [None]:
import pandas as pd
import tensorflow as ts
import numpy as np
import pickle
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
import seaborn as sns
import matplotlib.pyplot as plt
import keras
from keras.models import Model,Sequential
from keras.layers import Input, LSTM, Dense,Flatten,Dropout,TimeDistributed
from keras import regularizers
from imblearn.under_sampling import RandomUnderSampler
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split

## **Functions**

In [None]:
def convert_df(data,var,ncol):
  #convert category
  data[var] = data[var].astype('category')
  #groupby
  y= data[['VisitNumber','TripType']].dropna().groupby('VisitNumber')['TripType'].agg(max).reset_index()['TripType'].astype(str)

  #add varible name, add cat codes but add plus 2 
  data[var] = data[var].cat.codes.apply(lambda x: x + 2).astype(str)
  #group by #visit number and aggregate such that each row contains all the categories they have visited for each visit
  data_new = data[['VisitNumber',var]].dropna().groupby('VisitNumber')[var].apply('_'.join).reset_index()
  #then split each category and fill na with zero and select n columns
  data_d = data_new.iloc[:,1].str.split('_', expand=True).fillna(0).iloc[:,0:ncol]
  #concat final data between y and new dataset
  data_final = pd.concat([data_new.loc[:,'VisitNumber'],y,data_d],axis = 1)

  return data_final



In [None]:
def create_dataset(dataset, look_back,ncol):
    dataX,dataY = [],[]
    df = dataset.reset_index().index
    df2 = df[df % look_back == 0] 
    dataset = dataset.values
    for i in df2:
        a = dataset[i:(i + look_back), 2:ncol]
        dataX.append(a)
        dataY.append(dataset[i:(i + look_back), 1])
    return np.array(dataX),np.array(dataY)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **1. Data Preprocessing**

In [None]:
#read data
data = pd.read_csv(r'/content/drive/My Drive/Colab Notebooks/data_modeling (1).csv')

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,PLU,COMMODITY,Return,Count_of_Item,Rename_Dep,count
0,0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,,,Returned,Single,Financial Services,1
1,1,30,7,Friday,60538815980,1,SHOES,8931,,,Bought,Single,"Clothing,Shoes,and Accessories",1
2,2,30,7,Friday,7410811099,1,PERSONAL CARE,4504,,,Bought,Single,"Pharmacy,Health,and Beauty",1
3,3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,,,Bought,Multiple,Home Improvement and Patio,1
4,4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,,,Bought,Multiple,Home Improvement and Patio,1


In [None]:
#convert visit number as string
data.VisitNumber = data.VisitNumber.astype(str)

## 1.1 Convert dataframe to wide format

Take dataframe for each column, aggregate by Visit Number and aggregate all the variable in Visit Number and then split across different columns. Thus, each row will be a "visit".

In [None]:
nrow = 8
ncol = 10

**Department Dataset**

In [None]:
data_dep = convert_df(data,"DepartmentDescription",ncol)

**Return Dataset**

In [None]:
data_return = convert_df(data,"Return", ncol)

**Department Rename Dataset**

In [None]:
data_rename = convert_df(data,"Rename_Dep",ncol)

**Upc Dataset**

In [None]:
data_upc = convert_df(data,"Upc",ncol)

**Fineline Dataset**

In [None]:
data_fineline = convert_df(data,"FinelineNumber",ncol)

**Count of Item Dataset**

In [None]:
data_count = convert_df(data,"Count_of_Item",ncol)

**Week Dataset**

In [None]:
data_week = convert_df(data,"Weekday",ncol)

**ScanCount Dataset**

In [None]:
data_scan = convert_df(data,"ScanCount",ncol)

**Join and concatenate **

In [None]:
data_f = pd.concat([data_dep,data_return,data_rename,data_upc,data_fineline,data_count,data_week, data_scan],axis = 0).sort_values(by = 'VisitNumber')

In [None]:
#convert TripType to category
data_f.TripType = data_f.TripType.astype('category')
data_f.TripType = data_f.TripType.cat.codes


# **2.0 Test and Train**

In [None]:
#create dataset for X,y
X,y = create_dataset(data_f,nrow,ncol)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42,
                                                   shuffle = True)

**Reshape**

In [None]:
y_train_2 = y_train.reshape(nrow,y_train.shape[0],1)
y_test_2 = y_test.reshape(nrow,y_test.shape[0],1)

In [None]:
X_train_2 = X_train.reshape((nrow,X_train.shape[0],ncol - 2))
X_test_2 = X_test.reshape((nrow,X_test.shape[0],ncol - 2))

# **3.0 Modeling**

In [None]:
model = Sequential()
model.add(LSTM(600, kernel_regularizer=regularizers.l1(0.00001), input_shape= (X_train_2.shape[1],X_train_2.shape[2]), return_sequences=True))
model.add(Dropout(0.3))

In [None]:

model.add(Dense(38,kernel_regularizer=regularizers.l1(0.00001),  activation='softmax'))


In [None]:
model.compile(loss= 'sparse_categorical_crossentropy', 
              optimizer='adam', metrics=['accuracy']) 
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 47837, 600)        1461600   
_________________________________________________________________
dropout_4 (Dropout)          (None, 47837, 600)        0         
_________________________________________________________________
dense_3 (Dense)              (None, 47837, 38)         22838     
Total params: 1,484,438
Trainable params: 1,484,438
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', 
                                            patience=3, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.00001)

In [None]:
model.fit(X_train_2,y_train_2,validation_split = 0.33, epochs= 20 ,callbacks=[learning_rate_reduction],verbose = 1, batch_size= 64)

Train on 5 samples, validate on 3 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7fd106c58d68>

In [None]:
score = model.evaluate(X_test_2, y_test_2, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 2.688533067703247
Test accuracy: 0.22442618012428284


As the accuracy is too low, then we will keep the original model.