In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the Black Friday train and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/black-friday/train.csv',
                     '../input/black-friday/test.csv')
)

In [3]:
# Preview the first and last five rows of the training data.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
pd.concat([df_train.head(), df_train.tail()])

In [4]:
# All training rows for product 'P00248942'.
df_train.loc[df_train['Product_ID'] == 'P00248942']


In [5]:
# Smallest and largest Purchase recorded for product 'P00248942'.
purchases = df_train.loc[df_train['Product_ID'] == 'P00248942', 'Purchase']
print(purchases.min(), purchases.max())

In [6]:
# Smallest and largest Purchase recorded for product 'P00371644'.
purchases = df_train.loc[df_train['Product_ID'] == 'P00371644', 'Purchase']
print(purchases.min(), purchases.max())

Product ID seems to be an important feature influencing purchase behavior: for a given product, purchase amounts across different demographics fall within the same range, as seen when comparing product "P00248942" with product "P00371644".

In [7]:
# Rows belonging to the first 10 distinct Product IDs (in order of appearance).
# The slice is [:10]; the previous [1:10] skipped the first ID and selected only 9.
first_product_ids = df_train['Product_ID'].unique()[:10]
df_train[df_train['Product_ID'].isin(first_product_ids)]

In [8]:
# To confirm this, draw a box plot of Purchase for the first 10 distinct Product IDs.
import seaborn as sns
import matplotlib.pyplot as plt

# [:10] really takes the first ten IDs; the previous [1:10] skipped the first one
# and selected only nine.
first_product_ids = df_train['Product_ID'].unique()[:10]

# catplot is the successor of factorplot, which was deprecated in seaborn 0.9
# and is no longer available in current releases.
sns.catplot(
    x="Product_ID",
    y="Purchase",
    data=df_train[df_train['Product_ID'].isin(first_product_ids)],
    kind="box",
)
plt.show()

In [9]:
# Number of missing values in each training column.
df_train.isna().sum()

In [10]:
# Multi_category_availability counts how many product-category columns are filled in
# for a row: 1 is always counted, plus one for each non-null Product_Category_2 and
# Product_Category_3.
# The feature is built on BOTH frames. When it is created on df_test only, the train
# rows receive NaN for it once the frames are combined, and the later fillna(-1)
# turns it into a constant -1 that carries no signal during training.
for frame in (df_train, df_test):
    frame['Multi_category_availability'] = (
        1
        + frame['Product_Category_2'].notna().astype(int)
        + frame['Product_Category_3'].notna().astype(int)
    )
df_test['Multi_category_availability'].value_counts()

In [11]:
# Stack train and test rows so that the same encodings are applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of the two source frames.
df_total = pd.concat([df_train, df_test], ignore_index=True)
df_total

In [12]:
# Show the column labels of the training frame.
df_train.columns

In [13]:
# Ordinal encoding: age brackets become 1..7 and the open-ended '4+' stay value becomes 5.
AGE_CODES = {'0-17': 1, '18-25': 2, '26-35': 3, '36-45': 4,
             '46-50': 5, '51-55': 6, '55+': 7}
STAY_CODES = {'4+': 5}
df_total = df_total.replace({'Age': AGE_CODES,
                             'Stay_In_Current_City_Years': STAY_CODES})

In [14]:
# Frequency of each Stay_In_Current_City_Years value after the replacement.
df_total['Stay_In_Current_City_Years'].value_counts()

In [15]:
# One-hot encode Gender and City_Category and attach the indicator columns to df_total.
category_dummies = pd.get_dummies(df_total[['Gender', 'City_Category']])
df_total = pd.concat([df_total, category_dummies], axis=1)

In [16]:
# Stay_In_Current_City_Years was previously an object column; cast it to int.
df_total = df_total.astype({'Stay_In_Current_City_Years': int})

In [17]:
# The estimators do not accept string input, so Product_ID is mapped to integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df_total['Product_ID'])
df_total['Product_ID'] = le.transform(df_total['Product_ID'])


In [18]:
# Split the combined frame back apart: rows with a Purchase value form the training
# set, rows whose Purchase is null form the test set.
# .copy() makes both frames independent of df_total, so later edits to them do not
# raise SettingWithCopyWarning.
has_purchase = df_total['Purchase'].notnull()
df_train = df_total[has_purchase].copy()
df_test = df_total[~has_purchase].copy()
# Preview the training rows followed by the first test rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
pd.concat([df_train, df_test.head()])

In [19]:
# Replace nulls (e.g. in Product_Category_2 and Product_Category_3) with the sentinel -1.
# The result is reassigned rather than filled in place: df_train was sliced out of
# df_total, and in-place mutation of such a frame triggers SettingWithCopyWarning.
df_train = df_train.fillna(-1)

In [20]:
# Confirm that no missing values remain in the training frame.
df_train.isna().sum()

In [21]:
# Remove the Purchase column from the test frame, then replace remaining nulls with -1.
df_test = df_test.drop(columns='Purchase').fillna(-1)

In [22]:
# Check that the NaN values were removed from the test frame.
df_test.isna().sum()

In [23]:

# Build the model inputs. The raw Gender and City_Category columns are dropped from
# the feature matrices, and Purchase is the training target.
raw_categoricals = ['Gender', 'City_Category']
y_train = df_train['Purchase']
X_train = df_train.drop(columns=['Purchase'] + raw_categoricals)
X_test = df_test.drop(columns=raw_categoricals)

In [24]:
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost

# 4-fold cross-validation splitter shared by the grid searches below.
# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises a ValueError when random_state is set while shuffle is left at False.
k_fold = KFold(n_splits=4, shuffle=True, random_state=42)

In [28]:
# tqdm_notebook is deprecated; tqdm.auto selects the notebook or console progress bar.
from tqdm.auto import tqdm

# Candidate regressors and the hyper-parameter grid searched for each of them.
# Changes for compatibility with current scikit-learn releases:
#   * the 'criterion': ['mse'] entries are omitted. Squared error is the default
#     criterion, and the 'mse' spelling is no longer accepted by recent versions;
#   * 'max_features': ['auto'] became [1.0]. For forest regressors 'auto' meant
#     "use all features", which the float 1.0 expresses in every version.
# random_state is fixed on the tree-based models so that repeated runs give the same
# scores (the 'random' splitter and the forest's bootstrapping are otherwise unseeded).
algos1 = {
    'Decision Tree Regressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'para': {'splitter': ['random'],
                 'min_samples_split': [2, 5, 10, 7]},
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'para': {'n_neighbors': np.arange(1, 38, 10),
                 'weights': ['distance']},
    },
    'Random Forest Regressor': {
        'model': RandomForestRegressor(random_state=42),
        'para': {'max_depth': [1, 5, 7],
                 'n_estimators': [50, 100],
                 'max_features': [1.0]},
    },
}

# One pd.Series per estimator: its name, best parameter combination and best CV score.
score1 = []
for algo, param in tqdm(algos1.items()):
    # Search the grid with the shared k_fold splitter, using the estimator's default scorer.
    grid1 = GridSearchCV(param['model'], param['para'], cv=k_fold)
    grid1.fit(X_train, y_train)
    # NOTE(review): y_pred is overwritten on every pass, so only the last estimator's
    # test predictions survive the loop. The assignment is kept because the name is
    # defined at notebook level and may be read by later cells.
    y_pred = grid1.predict(X_test)

    score1.append(pd.Series({
        'Estimator': algo,
        'best parameter': grid1.best_params_,
        'best score': grid1.best_score_,
    }))

# TODO: xgboost is imported above but no XGBoost model is evaluated in this comparison.
        

In [29]:
# Display the collected results: one pd.Series per estimator holding its name,
# best parameter combination and best cross-validation score.
score1

We then select the best-scoring algorithm together with its best parameters, and fit a fresh instance of that algorithm on the full training set to make the predictions.