In [0]:
#importing libraries for data loading and data preparation
import numpy as np
import pandas as pd
import os

In [0]:
#read necessary files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
os.chdir('/content/drive/My Drive/black_friday')
train = pd.read_csv('BlackFriday_train.csv')
test = pd.read_csv('BlackFriday_test.csv')

In [0]:
#combining both test and train data to do processing together
train['Type'] = 1
test['Type'] = 0
fullData = pd.concat([train, test], axis = 0)
print(fullData.columns)

Index(['Age', 'City_Category', 'Gender', 'Marital_Status', 'Occupation',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
       'Product_ID', 'Purchase', 'Stay_In_Current_City_Years', 'Type',
       'User_ID'],
      dtype='object')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
#identifying continuous and categorical value columns
ID_col = ['User_ID','Product_ID']
flag_col = ['Type']
target_col = ['Purchase']
cat_cols = ['Age', 'City_Category', 'Gender', 'Marital_Status', 'Occupation',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
       'Stay_In_Current_City_Years']
num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(flag_col))

#combined numerical and categorical variables
num_cat_cols = num_cols + cat_cols

#create a new variable for each variable having missing value with VariableNmae_NA
#flag mission value with 1 and the other with 0

for var in num_cat_cols:
  if fullData[var].isnull().any() == True:
    fullData[var + '_NA'] = fullData[var].isnull()*1
    

#impute numerical missing value with mean
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean())

#impute categorical missing value with -9999
fullData[cat_cols] = fullData[cat_cols].fillna(value = -9999)

In [0]:
#creating label encoders for categorical variables
from sklearn.preprocessing import LabelEncoder

for var in cat_cols:
  number = LabelEncoder()
  fullData[var] = number.fit_transform(fullData[var].astype('str'))

In [0]:
#normalizing the data
features = list(set(list(fullData.columns))-set(ID_col)-set(target_col))
fullData[features] = fullData[features]/fullData[features].max()

In [0]:
#create validation set for the data
from sklearn.model_selection import train_test_split

train = fullData[fullData['Type'] == 1]
test = fullData[fullData['Type'] == 0]
features = list(set(list(fullData.columns)) - set(ID_col) - set(target_col) - set(flag_col))

X = train[features].values
y = train[target_col].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [0]:
import random
from sklearn.ensemble import RandomForestRegressor

random.seed(42)
rf = RandomForestRegressor(n_estimators = 10)
rf.fit(X_train, y_train)


  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
from keras import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [0]:
#regression model using NN(keras)
#define model
model = Sequential()
model.add(Dense(100, input_dim = 11, activation = 'relu'))
model.add(Dense(50, activation = 'relu'))
model.add(Dense(1))
model.summary()
#compile model
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mean_squared_error'])

#fit model
model.fit(X_train, y_train, epochs = 10, validation_data = (X_valid, y_valid))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 100)               1200      
_________________________________________________________________
dense_5 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 51        
Total params: 6,301
Trainable params: 6,301
Non-trainable params: 0
_________________________________________________________________
Train on 338673 samples, validate on 145146 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4f22855fd0>

In [0]:
#evaluation
#evaluation from random forest

from sklearn.metrics import mean_squared_error
pred = rf.predict(X_valid)
score = np.sqrt(mean_squared_error(y_valid, pred))
print(score)

pred = model.predict(X_valid)
score = np.sqrt(mean_squared_error(y_valid, pred))
print(score)

3127.6047101559607
4170.125913933396


In [0]:
#select the independent variables for the test data
X_test = test[features].values

#prediction from random forest
y_test_rf = rf.predict(X_test)

#prediction from nn
y_test_nn = model.predict(X_test)