In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest,SelectFromModel
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import Lasso
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
profile_train = ProfileReport(train)
profile_test = ProfileReport(test)

In [4]:
profile_train

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
profile_test

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
combined = pd.concat([train,test],0,ignore_index=True)

In [7]:
train['Product_ID_N'] = train.Product_ID.str.replace('([A-Za-z]+)','').astype(int)
train['Product_ID_S'] = train.Product_ID.str.extract('([A-Za-z]+)')

test['Product_ID_N'] = test.Product_ID.str.replace('([A-Za-z]+)','').astype(int)
test['Product_ID_S'] = test.Product_ID.str.extract('([A-Za-z]+)')

In [8]:
# Handling Rare Labels Function
def f_find_frequent_labels (dataset, var, threshold) :
    tmp = dataset.groupby([var]).size()/dataset.shape[0]
    frequent_ls = tmp[tmp > threshold].index
    return frequent_ls

In [9]:
#New column with 0 value for the very rare product_id's
freq_products_lst = f_find_frequent_labels(train, "Product_ID_N", 0.0001)
train["Product_ID_Rare"] = np.where(train["Product_ID_N"].isin(freq_products_lst), train["Product_ID_N"], 0)

freq_products_lst_test = f_find_frequent_labels(test, "Product_ID_N", 0.0001)
test["Product_ID_Rare"] = np.where(test["Product_ID_N"].isin(freq_products_lst_test), test["Product_ID_N"], 0)

In [10]:
#New column with 0 value for the very rare product_category1
freq_prod_category1_lst = f_find_frequent_labels(train, "Product_Category_1", 0.01)
train["Product_Category_1_Rare"] = np.where(train["Product_Category_1"].isin(freq_prod_category1_lst), train["Product_Category_1"], 0 )

freq_prod_category1_lst_test = f_find_frequent_labels(test, "Product_Category_1", 0.01)
test["Product_Category_1_Rare"] = np.where(test["Product_Category_1"].isin(freq_prod_category1_lst_test), test["Product_Category_1"], 0 )

In [11]:
#New column with 0 value for the very rare product_category2
freq_prod_category2_lst = f_find_frequent_labels(train, "Product_Category_2", 0.01)
train["Product_Category_2_Rare"] = np.where(train["Product_Category_2"].isin(freq_prod_category2_lst),train["Product_Category_2"], 0)

freq_prod_category2_lst_test = f_find_frequent_labels(test, "Product_Category_2", 0.01)
test["Product_Category_2_Rare"] = np.where(test["Product_Category_2"].isin(freq_prod_category2_lst_test),test["Product_Category_2"], 0)

In [12]:
#New column with 0 value for the very rare product_category3
freq_prod_category3_lst = f_find_frequent_labels(train, "Product_Category_3", 0.01)
train["Product_Category_3_Rare"] = np.where(train["Product_Category_3"].isin(freq_prod_category3_lst), train["Product_Category_3"], 0)

freq_prod_category3_lst_test = f_find_frequent_labels(test, "Product_Category_3", 0.01)
test["Product_Category_3_Rare"] = np.where(test["Product_Category_3"].isin(freq_prod_category3_lst_test), test["Product_Category_3"], 0)

In [13]:
# Gender Encoder
train['Gender_le'] = np.where(train['Gender'] == 'M', 1, 0)

test['Gender_le'] = np.where(test['Gender'] == 'M', 1, 0)

In [14]:
# City Category one hot encoding
ohe_city = pd.get_dummies(train['City_Category'], prefix= 'ohe_city')
# adding the onehot encoding columns to X_train dataset 
train = train.merge(ohe_city, left_index= True, right_index= True)

ohe_city = pd.get_dummies(test['City_Category'], prefix= 'ohe_city')
test = test.merge(ohe_city, left_index= True, right_index= True)

In [15]:
# product total, mean, median, min, max, std
total_purchase_amt = train['Purchase'].sum()
df_product = pd.DataFrame()
df_product['f_product_tot_sale_amt'] = train.groupby(['Product_ID_Rare'])['Purchase'].sum()
df_product['f_product_id_m'] = train.groupby(['Product_ID_Rare'])['Purchase'].mean()
df_product['f_product_id_median'] = train.groupby(['Product_ID_Rare'])['Purchase'].median()
df_product['f_product_id_min'] = train.groupby(['Product_ID_Rare'])['Purchase'].min()
df_product['f_product_id_max'] = train.groupby(['Product_ID_Rare'])['Purchase'].max()
df_product['f_product_id_std'] = train.groupby(['Product_ID_Rare'])['Purchase'].std()
df_product = df_product.reset_index()
print(df_product.shape)
# adding new columns
train = pd.merge(train, df_product, how = 'left', on= ['Product_ID_Rare'] )
train.shape


test = pd.merge(test, df_product, how = 'left', on= ['Product_ID_Rare'] )

(2006, 7)


In [16]:
# gender unique users count
df_gender_unique_users_cnt = pd.DataFrame(train.groupby(['Gender'])['User_ID'].nunique())
df_gender_unique_users_cnt = df_gender_unique_users_cnt.reset_index()
df_gender_unique_users_cnt = df_gender_unique_users_cnt.rename(columns = {'User_ID' : 'f_gender_unique_users_cnt'})
print(df_gender_unique_users_cnt.shape)
# add new column
train = pd.merge(train, df_gender_unique_users_cnt, how = 'left', on= ['Gender'] )
print(train.shape)


df_gender_unique_users_cnt_test = pd.DataFrame(test.groupby(['Gender'])['User_ID'].nunique())
df_gender_unique_users_cnt_test = df_gender_unique_users_cnt_test.reset_index()
df_gender_unique_users_cnt_test = df_gender_unique_users_cnt_test.rename(columns = {'User_ID' : 'f_gender_unique_users_cnt'})
test = pd.merge(test, df_gender_unique_users_cnt_test, how = 'left', on= ['Gender'] )

(2, 2)
(550068, 29)


In [17]:
# Age Mean 
df_age = pd.DataFrame(train.groupby(['Age'])['Purchase'].mean()) 
df_age['f_age_median'] = train.groupby(['Age'])['Purchase'].median()
df_age['f_age_std'] = train.groupby(['Age'])['Purchase'].std()
df_age = df_age.reset_index()
df_age = df_age.rename(columns = {'Purchase' : 'f_age_m'})
df_age.shape

(7, 4)

In [18]:
# Age Min & Max values
age_min = []
age_max = []
for i in range(df_age.shape[0]) :
    
    if df_age['Age'][i] == '0-17' :
        age_min.append(0)
        age_max.append(17)
        
    elif df_age['Age'][i] == '18-25' :
        age_min.append(18)
        age_max.append(25)
        
    elif df_age['Age'][i] == '26-35' :
        age_min.append(26)
        age_max.append(35)
    
    elif df_age['Age'][i] == '36-45' :
        age_min.append(36)
        age_max.append(45)
    
    elif df_age['Age'][i] == '46-50' :
        age_min.append(46)
        age_max.append(50)
        
    elif df_age['Age'][i] == '51-55' :
        age_min.append(51)
        age_max.append(55)
    
    else :
        age_min.append(56)
        age_max.append(100)

# add new columns
df_age['f_Age_Min'] = age_min
df_age['f_Age_Max'] = age_max
df_age.shape

(7, 6)

In [19]:
# adding new columns
train = pd.merge(train, df_age, how = 'left', on= ['Age'] )
train.shape

test = pd.merge(test, df_age, how = 'left', on= ['Age'] )

In [20]:
# product count 
df_city_category = pd.DataFrame() 
# product mean, median, min, max, std
df_city_category['f_city_m'] = train.groupby(['City_Category'])['Purchase'].mean()
df_city_category['f_city_median'] = train.groupby(['City_Category'])['Purchase'].median()
df_city_category['f_city_min'] = train.groupby(['City_Category'])['Purchase'].min()
df_city_category['f_city_max'] = train.groupby(['City_Category'])['Purchase'].max()
df_city_category['f_city_std'] = train.groupby(['City_Category'])['Purchase'].std()
df_city_category = df_city_category.reset_index()
print(df_city_category.shape)

# adding new columns
train = pd.merge(train, df_city_category, how = 'left', on= ['City_Category'] )
train.shape

test = pd.merge(test, df_city_category, how = 'left', on= ['City_Category'] )

(3, 6)


In [21]:
# Occupation mean 
df_occupation = pd.DataFrame(train.groupby(['Occupation'])['Purchase'].mean()) 
df_occupation['f_occupation_tot_amt'] = train.groupby(['Occupation'])['Purchase'].sum()
df_occupation = df_occupation.reset_index()
df_occupation = df_occupation.rename(columns = {'Purchase' : 'f_occupation_m'})
print(df_occupation.shape)
# add new column
train = pd.merge(train, df_occupation, how = 'left', on= ['Occupation'] )
train.shape

test = pd.merge(test, df_occupation, how = 'left', on= ['Occupation'] )

(21, 3)


In [22]:
# current city period mean 
df_city_period = pd.DataFrame(train.groupby(['Stay_In_Current_City_Years'])['Purchase'].mean()) 
df_city_period['f_current_city_median'] = train.groupby(['Stay_In_Current_City_Years'])['Purchase'].median()
df_city_period['f_current_city_min'] = train.groupby(['Stay_In_Current_City_Years'])['Purchase'].min()
df_city_period['f_current_city_max'] = train.groupby(['Stay_In_Current_City_Years'])['Purchase'].max()
df_city_period['f_current_city_std'] = train.groupby(['Stay_In_Current_City_Years'])['Purchase'].std()
df_city_period = df_city_period.reset_index()
df_city_period = df_city_period.rename(columns = {'Purchase' : 'f_current_city_m'})
print(df_city_period.shape)
# add new column
train = pd.merge(train, df_city_period, how = 'left', on= ['Stay_In_Current_City_Years'] )
train.shape

test = pd.merge(test, df_city_period, how = 'left', on= ['Stay_In_Current_City_Years'] )

(5, 6)


In [23]:
# Product Category 1 mean 
df_prod_category_1 = pd.DataFrame(train.groupby(['Product_Category_1_Rare'])['Purchase'].mean())
df_prod_category_1['f_prod_category_1_tot_amt'] = train.groupby(['Product_Category_1_Rare'])['Purchase'].sum()
df_prod_category_1['f_prod_category_1_median'] = train.groupby(['Product_Category_1_Rare'])['Purchase'].median()
df_prod_category_1['f_prod_category_1_min'] = train.groupby(['Product_Category_1_Rare'])['Purchase'].min()
df_prod_category_1['f_prod_category_1_max'] = train.groupby(['Product_Category_1_Rare'])['Purchase'].max()
df_prod_category_1['f_prod_category_1_std'] = train.groupby(['Product_Category_1_Rare'])['Purchase'].std()
df_prod_category_1 = df_prod_category_1.reset_index()
df_prod_category_1 = df_prod_category_1.rename(columns = {'Purchase' : 'f_prod_category_1_m'})
print(df_prod_category_1.shape)
# add new column
train = pd.merge(train, df_prod_category_1, how = 'left', on= ['Product_Category_1_Rare'] )
train.shape

test = pd.merge(test, df_prod_category_1, how = 'left', on= ['Product_Category_1_Rare'] )

(12, 7)


In [24]:
# Product Category 2 mean 
df_prod_category_2 = pd.DataFrame(train.groupby(['Product_Category_2_Rare'])['Purchase'].mean()) 
df_prod_category_2['f_prod_category_2_tot_amt'] = train.groupby(['Product_Category_2_Rare'])['Purchase'].sum()
df_prod_category_2['f_prod_category_2_median'] = train.groupby(['Product_Category_2_Rare'])['Purchase'].median()
df_prod_category_2['f_prod_category_2_min'] = train.groupby(['Product_Category_2_Rare'])['Purchase'].min()
df_prod_category_2['f_prod_category_2_max'] = train.groupby(['Product_Category_2_Rare'])['Purchase'].max()
df_prod_category_2['f_prod_category_2_std'] = train.groupby(['Product_Category_2_Rare'])['Purchase'].std()
df_prod_category_2 = df_prod_category_2.reset_index()
df_prod_category_2 = df_prod_category_2.rename(columns = {'Purchase' : 'f_prod_category_2_m'})
print(df_prod_category_2.shape)
# add new column
train = pd.merge(train, df_prod_category_2, how = 'left', on= ['Product_Category_2_Rare'] )
train.shape

test = pd.merge(test, df_prod_category_2, how = 'left', on= ['Product_Category_2_Rare'] )

(14, 7)


In [25]:
# Product Category 3 mean 
df_prod_category_3 = pd.DataFrame(train.groupby(['Product_Category_3_Rare'])['Purchase'].mean()) 
df_prod_category_3['f_prod_category_3_tot_amt'] = train.groupby(['Product_Category_3_Rare'])['Purchase'].sum()
df_prod_category_3['f_prod_category_3_median'] = train.groupby(['Product_Category_3_Rare'])['Purchase'].median()
df_prod_category_3['f_prod_category_3_min'] = train.groupby(['Product_Category_3_Rare'])['Purchase'].min()
df_prod_category_3['f_prod_category_3_max'] = train.groupby(['Product_Category_3_Rare'])['Purchase'].max()
df_prod_category_3['f_prod_category_3_std'] = train.groupby(['Product_Category_3_Rare'])['Purchase'].std()
df_prod_category_3 = df_prod_category_3.reset_index()
df_prod_category_3 = df_prod_category_3.rename(columns = {'Purchase' : 'f_prod_category_3_m'})
print(df_prod_category_3.shape)
# add new column
train = pd.merge(train, df_prod_category_3, how = 'left', on= ['Product_Category_3_Rare'] )
train.shape

test = pd.merge(test, df_prod_category_3, how = 'left', on= ['Product_Category_3_Rare'] )

(9, 7)


In [26]:
# Age & Occupation mean 
df_age_occupation_m = pd.DataFrame(train.groupby(['Age', 'Occupation'])['Purchase'].mean()) 
df_age_occupation_m = df_age_occupation_m.reset_index()
df_age_occupation_m = df_age_occupation_m.rename(columns = {'Purchase' : 'f_age_occupation_m'})
print(df_age_occupation_m.shape)
# add new column
train = pd.merge(train, df_age_occupation_m, how = 'left', on= ['Age', 'Occupation'] )
train.shape

test = pd.merge(test, df_age_occupation_m, how = 'left', on= ['Age', 'Occupation'] )

(134, 3)


In [27]:
# Age & city category mean 
df_age_city_m = pd.DataFrame(train.groupby(['Age', 'City_Category'])['Purchase'].mean()) 
df_age_city_m = df_age_city_m.reset_index()
df_age_city_m = df_age_city_m.rename(columns = {'Purchase' : 'f_age_city_m'})
print(df_age_city_m.shape)
# add new column
train = pd.merge(train, df_age_city_m, how = 'left', on= ['Age', 'City_Category'] )
train.shape

test = pd.merge(test, df_age_city_m, how = 'left', on= ['Age', 'City_Category'] )

(21, 3)


In [28]:
# Age &  Current City mean 
df_age_current_city_m = pd.DataFrame(train.groupby(['Age', 'Stay_In_Current_City_Years'])['Purchase'].mean()) 
df_age_current_city_m = df_age_current_city_m.reset_index()
df_age_current_city_m = df_age_current_city_m.rename(columns = {'Purchase' : 'f_age_current_city_m'})
print(df_age_current_city_m.shape)
# add new column
train = pd.merge(train, df_age_current_city_m, how = 'left', on= ['Age', 'Stay_In_Current_City_Years'] )
train.shape

test = pd.merge(test, df_age_current_city_m, how = 'left', on= ['Age', 'Stay_In_Current_City_Years'] )

(35, 3)


In [29]:
# Age &  Product Category 1 mean 
df_age_prod_category_1_m = pd.DataFrame(train.groupby(['Age', 'Product_Category_1_Rare'])['Purchase'].mean()) 
df_age_prod_category_1_m = df_age_prod_category_1_m.reset_index()
df_age_prod_category_1_m = df_age_prod_category_1_m.rename(columns = {'Purchase' : 'f_age_prod_category_1_m'})
print(df_age_prod_category_1_m.shape)
# add new column
train = pd.merge(train, df_age_prod_category_1_m, how = 'left', on= ['Age', 'Product_Category_1_Rare'] )
train.shape

test = pd.merge(test, df_age_prod_category_1_m, how = 'left', on= ['Age', 'Product_Category_1_Rare'] )

(84, 3)


In [30]:
# Age &  Product Category 2 mean 
df_age_prod_category_2_m = pd.DataFrame(train.groupby(['Age', 'Product_Category_2_Rare'])['Purchase'].mean()) 
df_age_prod_category_2_m = df_age_prod_category_2_m.reset_index()
df_age_prod_category_2_m = df_age_prod_category_2_m.rename(columns = {'Purchase' : 'f_age_prod_category_2_m'})
print(df_age_prod_category_2_m.shape)
# add new column
train = pd.merge(train, df_age_prod_category_2_m, how = 'left', on= ['Age', 'Product_Category_2_Rare'] )
train.shape

test = pd.merge(test, df_age_prod_category_2_m, how = 'left', on= ['Age', 'Product_Category_2_Rare'] )

(98, 3)


In [31]:
# Age &  Product Category 3 mean 
df_age_prod_category_3_m = pd.DataFrame(train.groupby(['Age', 'Product_Category_3_Rare'])['Purchase'].mean()) 
df_age_prod_category_3_m = df_age_prod_category_3_m.reset_index()
df_age_prod_category_3_m = df_age_prod_category_3_m.rename(columns = {'Purchase' : 'f_age_prod_category_3_m'})
print(df_age_prod_category_3_m.shape)
# add new column
train = pd.merge(train, df_age_prod_category_3_m, how = 'left', on= ['Age', 'Product_Category_3_Rare'] )
train.shape

test = pd.merge(test, df_age_prod_category_3_m, how = 'left', on= ['Age', 'Product_Category_3_Rare'] )

(63, 3)


In [32]:
# Age &  Product Category 1 mean 
df_city_stay_m = pd.DataFrame(train.groupby(['City_Category', 'Stay_In_Current_City_Years'])['Purchase'].mean()) 
df_city_stay_m = df_city_stay_m.reset_index()
df_city_stay_m = df_city_stay_m.rename(columns = {'Purchase' : 'f_city_current_stay_m'})
print(df_city_stay_m.shape)
# add new column
train = pd.merge(train, df_city_stay_m, how = 'left', on= ['City_Category', 'Stay_In_Current_City_Years'] )
train.shape

test = pd.merge(test, df_city_stay_m, how = 'left', on= ['City_Category', 'Stay_In_Current_City_Years'] )

(15, 3)


In [33]:
# Occupation & City mean 
df_occupation_city_m = pd.DataFrame(train.groupby(['Occupation', 'City_Category'])['Purchase'].mean()) 
df_occupation_city_m = df_occupation_city_m.reset_index()
df_occupation_city_m = df_occupation_city_m.rename(columns = {'Purchase' : 'f_occupation_city_m'})
print(df_occupation_city_m.shape)
# add new column
train = pd.merge(train, df_occupation_city_m, how = 'left', on= ['Occupation', 'City_Category'] )
train.shape

test = pd.merge(test, df_occupation_city_m, how = 'left', on= ['Occupation', 'City_Category'] )

(63, 3)


In [34]:
# Occupation & Current City mean 
df_occupation_current_city_m = pd.DataFrame(train.groupby(['Occupation', 'Stay_In_Current_City_Years'])['Purchase'].mean()) 
df_occupation_current_city_m = df_occupation_current_city_m.reset_index()
df_occupation_current_city_m = df_occupation_current_city_m.rename(columns = {'Purchase' : 'f_occupation_current_city_m'})
print(df_occupation_current_city_m.shape)
# add new column
train = pd.merge(train, df_occupation_current_city_m, how = 'left', on= ['Occupation', 'Stay_In_Current_City_Years'] )
train.shape

test = pd.merge(test, df_occupation_current_city_m, how = 'left', on= ['Occupation', 'Stay_In_Current_City_Years'] )

(104, 3)


In [35]:
# Product Category 1 & 2 mean 
df_product_category_1_2_m = pd.DataFrame(train.groupby(['Product_Category_1_Rare', 'Product_Category_2_Rare'])['Purchase'].mean()) 
df_product_category_1_2_m = df_product_category_1_2_m.reset_index()
df_product_category_1_2_m = df_product_category_1_2_m.rename(columns = {'Purchase' : 'f_product_category_1_2_m'})
print(df_product_category_1_2_m.shape)
# add new column
train = pd.merge(train, df_product_category_1_2_m, how = 'left', on= ['Product_Category_1_Rare', 'Product_Category_2_Rare'] )
train.shape

test = pd.merge(test, df_product_category_1_2_m, how = 'left', on= ['Product_Category_1_Rare', 'Product_Category_2_Rare'] )

(83, 3)


In [36]:
# Product Category 1 & 3 mean 
df_product_category_1_3_m = pd.DataFrame(train.groupby(['Product_Category_1_Rare', 'Product_Category_3_Rare'])['Purchase'].mean()) 
df_product_category_1_3_m = df_product_category_1_3_m.reset_index()
df_product_category_1_3_m = df_product_category_1_3_m.rename(columns = {'Purchase' : 'f_product_category_1_3_m'})
print(df_product_category_1_3_m.shape)
# add new column
train = pd.merge(train, df_product_category_1_3_m, how = 'left', on= ['Product_Category_1_Rare', 'Product_Category_3_Rare'] )
train.shape

test = pd.merge(test, df_product_category_1_3_m, how = 'left', on= ['Product_Category_1_Rare', 'Product_Category_3_Rare'] )

(55, 3)


In [37]:
# Product Category 1 & 3 mean 
df_product_category_2_3_m = pd.DataFrame(train.groupby(['Product_Category_2_Rare', 'Product_Category_3_Rare'])['Purchase'].mean()) 
df_product_category_2_3_m = df_product_category_2_3_m.reset_index()
df_product_category_2_3_m = df_product_category_2_3_m.rename(columns = {'Purchase' : 'f_product_category_2_3_m'})
print(df_product_category_2_3_m.shape)
# add new column
train = pd.merge(train, df_product_category_2_3_m, how = 'left', on= ['Product_Category_2_Rare', 'Product_Category_3_Rare'] )
train.shape

test = pd.merge(test, df_product_category_2_3_m, how = 'left', on= ['Product_Category_2_Rare', 'Product_Category_3_Rare'] )

(64, 3)


In [38]:
# Product Category 1 & 3 mean 
df_occ_gender_m = pd.DataFrame(train.groupby(['Occupation', 'Gender'])['Purchase'].mean()) 
df_occ_gender_m = df_occ_gender_m.reset_index()
df_occ_gender_m = df_occ_gender_m.rename(columns = {'Purchase' : 'f_occupation_gender_m'})
print(df_occ_gender_m.shape)
# add new column
train = pd.merge(train, df_occ_gender_m, how = 'left', on= ['Occupation', 'Gender'] )
train.shape

test = pd.merge(test, df_occ_gender_m, how = 'left', on= ['Occupation', 'Gender'] )

(42, 3)


In [39]:
# Product Category 1
df_city_product_category_1 = pd.DataFrame() 
df_city_product_category_1['f_city_product_category_1_m'] = train.groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase'].mean()
df_city_product_category_1['f_city_product_category_1_tot_amt'] = train.groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase'].sum()
df_city_product_category_1['f_city_product_category_1_median'] = train.groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase'].median()
df_city_product_category_1['f_city_product_category_1_min'] = train.groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase'].min()
df_city_product_category_1['f_city_product_category_1_max'] = train.groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase'].max()
df_city_product_category_1['f_city_product_category_1_std'] = train.groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase'].std()
df_city_product_category_1 = df_city_product_category_1.reset_index()
print(df_city_product_category_1.shape)
# add new columns
train = pd.merge(train, df_city_product_category_1, how = 'left', on= ['City_Category', 'Product_Category_1_Rare'] )
train.shape

test = pd.merge(test, df_city_product_category_1, how = 'left', on= ['City_Category', 'Product_Category_1_Rare'] )

(36, 8)


In [40]:
# Product Category 2
df_city_product_category_2 = pd.DataFrame() 
df_city_product_category_2['f_city_product_category_2_m'] = train.groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase'].mean()
df_city_product_category_2['f_city_product_category_2_tot_amt'] = train.groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase'].sum()
df_city_product_category_2['f_city_product_category_2_median'] = train.groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase'].median()
df_city_product_category_2['f_city_product_category_2_min'] = train.groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase'].min()
df_city_product_category_2['f_city_product_category_2_max'] = train.groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase'].max()
df_city_product_category_2['f_city_product_category_2_std'] = train.groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase'].std()
df_city_product_category_2 = df_city_product_category_2.reset_index()
print(df_city_product_category_2.shape)
# add new columns
train = pd.merge(train, df_city_product_category_2, how = 'left', on= ['City_Category', 'Product_Category_2_Rare'] )
train.shape

test = pd.merge(test, df_city_product_category_2, how = 'left', on= ['City_Category', 'Product_Category_2_Rare'] )

(42, 8)


In [41]:
# Product Category 3
df_city_product_category_3 = pd.DataFrame() 
df_city_product_category_3['f_city_product_category_3_m'] = train.groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase'].mean()
df_city_product_category_3['f_city_product_category_3_tot_amt'] = train.groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase'].sum()
df_city_product_category_3['f_city_product_category_3_median'] = train.groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase'].median()
df_city_product_category_3['f_city_product_category_3_min'] = train.groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase'].min()
df_city_product_category_3['f_city_product_category_3_max'] = train.groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase'].max()
df_city_product_category_3['f_city_product_category_3_std'] = train.groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase'].std()
df_city_product_category_3 = df_city_product_category_3.reset_index()
print(df_city_product_category_3.shape)
# add new columns
train = pd.merge(train, df_city_product_category_3, how = 'left', on= ['City_Category', 'Product_Category_3_Rare'] )
train.shape

test = pd.merge(test, df_city_product_category_3, how = 'left', on= ['City_Category', 'Product_Category_3_Rare'] )

(27, 8)


In [42]:
# Product Id and Age Categories
df_productid_age = pd.DataFrame() 
df_productid_age['f_productid_age_m'] = train.groupby(['Product_ID_Rare', 'Age'])['Purchase'].mean()
df_productid_age['f_productid_age_tot_amt'] = train.groupby(['Product_ID_Rare', 'Age'])['Purchase'].sum()
df_productid_age['f_productid_age_median'] = train.groupby(['Product_ID_Rare', 'Age'])['Purchase'].median()
df_productid_age['f_productid_age_min'] = train.groupby(['Product_ID_Rare', 'Age'])['Purchase'].min()
df_productid_age['f_productid_age_max'] = train.groupby(['Product_ID_Rare', 'Age'])['Purchase'].max()
# df_productid_age['f_productid_age_std'] = train.groupby(['Product_ID_Rare', 'Age'])['Purchase'].std()
df_productid_age = df_productid_age.reset_index()
print(df_productid_age.shape)
# add new columns
train = pd.merge(train, df_productid_age, how = 'left', on= ['Product_ID_Rare', 'Age'] )
print(train.shape)

# to handle NA values for f_productid_age_min
v_productid_age_min_min = df_productid_age['f_productid_age_min'].min()


test = pd.merge(test, df_productid_age, how = 'left', on= ['Product_ID_Rare', 'Age'] )
v_productid_age_min_min_test = df_productid_age['f_productid_age_min'].min()

(13657, 7)
(550068, 100)


In [43]:
# Product Id and Age Categories
df_productid_occupation = pd.DataFrame() 
df_productid_occupation['f_productid_occupation_m'] = train.groupby(['Product_ID_Rare', 'Occupation'])['Purchase'].mean()
df_productid_occupation['f_productid_occupation_tot_amt'] = train.groupby(['Product_ID_Rare', 'Occupation'])['Purchase'].sum()
df_productid_occupation['f_productid_occupation_median'] = train.groupby(['Product_ID_Rare', 'Occupation'])['Purchase'].median()
df_productid_occupation['f_productid_occupation_min'] = train.groupby(['Product_ID_Rare', 'Occupation'])['Purchase'].min()
df_productid_occupation['f_productid_occupation_max'] = train.groupby(['Product_ID_Rare', 'Occupation'])['Purchase'].max()
#df_productid_occupation['f_productid_occupation_std'] = train.groupby(['Product_ID_Rare', 'Occupation'])['Purchase'].std()
df_productid_occupation = df_productid_occupation.reset_index()
print(df_productid_occupation.shape)
# add new columns
train = pd.merge(train, df_productid_occupation, how = 'left', on= ['Product_ID_Rare', 'Occupation'] )
train.shape

# to handle NA values for f_productid_occupation_min
v_productid_occupation_min_min = df_productid_occupation['f_productid_occupation_min'].min()

test = pd.merge(test, df_productid_occupation, how = 'left', on= ['Product_ID_Rare', 'Occupation'] )
v_productid_occupation_min_min_test = df_productid_occupation['f_productid_occupation_min'].min()

(38665, 7)


In [44]:
# Product Id and City Category Categories
df_productid_city_category = pd.DataFrame() 
df_productid_city_category['f_productid_city_cat_m'] = train.groupby(['Product_ID_Rare', 'City_Category'])['Purchase'].mean()
df_productid_city_category['f_productid_city_cat_tot_amt'] = train.groupby(['Product_ID_Rare', 'City_Category'])['Purchase'].sum()
df_productid_city_category['f_productid_city_cat_median'] = train.groupby(['Product_ID_Rare', 'City_Category'])['Purchase'].median()
df_productid_city_category['f_productid_city_cat_min'] = train.groupby(['Product_ID_Rare', 'City_Category'])['Purchase'].min()
df_productid_city_category['f_productid_city_cat_max'] = train.groupby(['Product_ID_Rare', 'City_Category'])['Purchase'].max()
df_productid_city_category['f_productid_city_cat_std'] = train.groupby(['Product_ID_Rare', 'City_Category'])['Purchase'].std()
df_productid_city_category = df_productid_city_category.reset_index()
print(df_productid_city_category.shape)
# add new columns
train = pd.merge(train, df_productid_city_category, how = 'left', on= ['Product_ID_Rare', 'City_Category'] )
train.shape

test = pd.merge(test, df_productid_city_category, how = 'left', on= ['Product_ID_Rare', 'City_Category'] )

(6018, 8)


In [45]:
train.shape,test.shape

((550068, 111), (233599, 110))

In [46]:
list(train.columns)

['User_ID',
 'Product_ID',
 'Gender',
 'Age',
 'Occupation',
 'City_Category',
 'Stay_In_Current_City_Years',
 'Marital_Status',
 'Product_Category_1',
 'Product_Category_2',
 'Product_Category_3',
 'Purchase',
 'Product_ID_N',
 'Product_ID_S',
 'Product_ID_Rare',
 'Product_Category_1_Rare',
 'Product_Category_2_Rare',
 'Product_Category_3_Rare',
 'Gender_le',
 'ohe_city_A',
 'ohe_city_B',
 'ohe_city_C',
 'f_product_tot_sale_amt',
 'f_product_id_m',
 'f_product_id_median',
 'f_product_id_min',
 'f_product_id_max',
 'f_product_id_std',
 'f_gender_unique_users_cnt',
 'f_age_m',
 'f_age_median',
 'f_age_std',
 'f_Age_Min',
 'f_Age_Max',
 'f_city_m',
 'f_city_median',
 'f_city_min',
 'f_city_max',
 'f_city_std',
 'f_occupation_m',
 'f_occupation_tot_amt',
 'f_current_city_m',
 'f_current_city_median',
 'f_current_city_min',
 'f_current_city_max',
 'f_current_city_std',
 'f_prod_category_1_m',
 'f_prod_category_1_tot_amt',
 'f_prod_category_1_median',
 'f_prod_category_1_min',
 'f_prod_cate

In [47]:
train = train.drop(labels = ['User_ID', 'Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Product_Category_1',
                                        'Product_Category_2', 'Product_Category_3', 'Product_ID_N', 'Product_ID_S'], axis = 1)
print(train.shape) 

test = test.drop(labels = ['User_ID', 'Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Product_Category_1',
                                        'Product_Category_2', 'Product_Category_3', 'Product_ID_N', 'Product_ID_S'], axis = 1)


(550068, 100)


In [48]:
# Independent & Dependent 
X_train = train.drop(labels = ['Purchase'], axis = 1)
y_train = train['Purchase']
print(X_train.shape) ; print(y_train.shape)

(550068, 99)
(550068,)


In [49]:
# identify constant or Zero Variance features
quasi_constant = VarianceThreshold(threshold= 0)
quasi_constant.fit(X_train)

VarianceThreshold(threshold=0)

In [50]:
# identifying constant columns
quasi_constant_cols = []
for col in X_train.columns :
    if col not in X_train.columns[quasi_constant.get_support()] :
        quasi_constant_cols.append(col)
        
print(len(quasi_constant_cols))
print(quasi_constant_cols)

2
['f_city_min', 'f_current_city_min']


In [51]:
X_train = X_train.drop(labels= quasi_constant_cols, axis = 1)
print(X_train.shape)

(550068, 97)


In [52]:
def correlation (dataset, threshold) :
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)) :
        for j in range(i) :
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr 

In [53]:
# highly correlated independent features list
highly_correlated_cols_lst = list(correlation(X_train, 0.8))
len(highly_correlated_cols_lst)

65

In [54]:
# dropping constant and highly correlated features 
independent_features_lst = []

for col in X_train.columns :
    if col not in (highly_correlated_cols_lst+quasi_constant_cols) :
        independent_features_lst.append(col)
len(independent_features_lst)

32

In [55]:
# ANOVA Feature Selection Method
anova_features = SelectKBest(score_func= f_regression, k = 25)
anova_features = anova_features.fit(X_train[independent_features_lst], y_train)

In [56]:
anova_features_lst = list(X_train[independent_features_lst].columns[anova_features.get_support()])
len(anova_features_lst)

25

In [57]:
# Step Backward Selection
linear_regressor = LinearRegression()
backward_selection = SFS(estimator = linear_regressor, k_features = 25, forward = False, scoring = 'r2', cv = 3)
backward_selection = backward_selection.fit(X_train[independent_features_lst], y_train)
print("R Square of the Step Backward Selection :", backward_selection.k_score_)

R Square of the Step Backward Selection : 0.698127368008059


In [58]:
# Step Backward Feature Selection list
bfs_features_lst = list(backward_selection.k_feature_names_)
len(bfs_features_lst)

25

In [59]:
# Lasso feature selection implementation
lasso_select_features = SelectFromModel(Lasso(alpha = 0.3, random_state = 1))
lasso_select_features= lasso_select_features.fit(X_train[independent_features_lst], y_train)

In [60]:
# lasso features list
lasso_features_f_list = list(X_train[independent_features_lst].columns[lasso_select_features.get_support()])
len(lasso_features_f_list)

27

In [61]:
# Gradient Boosting Regressor
from sklearn.ensemble import RandomForestRegressor
rfr_features = RandomForestRegressor(random_state= 10, criterion= 'mse', max_depth= 8, max_features= 'sqrt', max_leaf_nodes= 20, min_samples_leaf= 100, min_samples_split= 200, n_estimators= 20 )
rfr_features = rfr_features.fit(X_train[independent_features_lst],y_train)

In [62]:
# ranking the features importance
plt.figure(figsize= (14,12))
df_rfr_features = pd.Series(rfr_features.feature_importances_, index = X_train[independent_features_lst].columns)
df_rfr_features = df_rfr_features.sort_values(ascending = False)
df_rfr_features.sort_values(ascending = True).plot(kind = 'barh', )
plt.show()

In [63]:
# rfr_features_lst
rfr_features_lst = list(df_rfr_features[0:25].index)
len(rfr_features_lst) 

25

In [64]:
# feature importance method
extratreesreg = ExtraTreesRegressor(n_estimators= 20)
extratreesreg = extratreesreg.fit(X_train[independent_features_lst], y_train)

In [65]:
# ranking the features importance
plt.figure(figsize= (14,12))
extratrees_ranked_features = pd.Series(extratreesreg.feature_importances_, index = X_train[independent_features_lst].columns)
extratrees_ranked_features = extratrees_ranked_features.sort_values(ascending = False)
extratrees_ranked_features.sort_values(ascending = True).plot(kind = 'barh')
plt.show()

In [66]:
# extratrees regressor feature importance list
extratrees_feature_lst = list(extratrees_ranked_features[0:25].index)
len(extratrees_feature_lst)

25

In [67]:
X_train, X_validation, y_train, y_validation = train_test_split(train.drop(labels = ['Purchase'], axis = 1), train['Purchase'], test_size = 0.2, random_state = 0 )

In [68]:
# Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor = regressor.fit(X_train[rfr_features_lst], y_train)
regressor_R_squared = regressor.score(X_train[rfr_features_lst], y_train)
print('R squared of Regressor model:', regressor_R_squared)

R squared of Regressor model: 0.6973208344372424


In [69]:
# Prediction of X_validation Dataset
y_validation_predict_lr = regressor.predict(X_validation[rfr_features_lst])

In [70]:
# XGBoost parameter optimizing
from sklearn.model_selection import GridSearchCV

# parameter grid
param_grid = {
              "learning_rate": [0.05, 0.1],
              "min_child_weight": [5, 8],
              "gamma" : [0],
              "reg_alpha": [0.01, 1],
              "subsample": [0.8],
              'n_estimators': [200],
              'max_depth' : [5, 8],
              "colsample_bytree" : [0.8],
              }

from xgboost import XGBRegressor
xgbr = XGBRegressor( random_state = 0 )

# Grid Search 
grid_search = GridSearchCV(estimator = xgbr, param_grid = param_grid, cv = 3, n_jobs = 3, verbose = 2)

# Fitting the grid_search to the model ####
grid_search.fit(X_train[extratrees_feature_lst], y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed: 81.7min
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed: 119.5min finished


GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=0, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    

In [71]:
# Grid Search Best Params
grid_search.best_params_

{'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_weight': 8,
 'n_estimators': 200,
 'reg_alpha': 1,
 'subsample': 0.8}

In [72]:
# XGBoost Regressor
from xgboost import XGBRegressor
xgbr = XGBRegressor(colsample_bytree= 0.8, gamma= 0, learning_rate= 0.1, max_depth= 8, min_child_weight= 5, n_estimators= 200,
reg_alpha= 0.01, subsample= 0.8, random_state = 0 )
xgbr = xgbr.fit(X_train[rfr_features_lst], y_train)
xgboost_R_squared = xgbr.score(X_train[rfr_features_lst], y_train)
print('R squared of xgboost model:', xgboost_R_squared)

R squared of xgboost model: 0.7593040646407299


In [73]:
# Prediction of X_validation Dataset  
y_validation_predict_xgb = xgbr.predict(X_validation[rfr_features_lst])

In [74]:
from sklearn.model_selection import GridSearchCV

# parameter grid
param_grid = {
              "iterations": [600],
              "learning_rate": [0.05, 0.1],
              "depth" : [6, 8],
              "l2_leaf_reg" : [0.1, 1],
              "bagging_temperature" : [0, 1],
              "random_strength" : [0, 1 ]
              }

from catboost import CatBoostRegressor
catboostr = CatBoostRegressor(loss_function='RMSE', random_state= 15)

# Grid Search 
grid_search = GridSearchCV(estimator = catboostr, param_grid = param_grid, cv = 3, n_jobs = 3, verbose = 1)

# Fitting the grid_search to the model ####
grid_search.fit(X_train[bfs_features_lst], y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 46.0min
[Parallel(n_jobs=3)]: Done  96 out of  96 | elapsed: 98.0min finished


0:	learn: 4687.9850227	total: 546ms	remaining: 5m 27s
1:	learn: 4396.3581510	total: 824ms	remaining: 4m 6s
2:	learn: 4142.1320625	total: 1.05s	remaining: 3m 29s
3:	learn: 3923.2083380	total: 1.32s	remaining: 3m 16s
4:	learn: 3737.7644165	total: 1.53s	remaining: 3m 1s
5:	learn: 3577.9145449	total: 1.73s	remaining: 2m 50s
6:	learn: 3440.4037679	total: 1.91s	remaining: 2m 41s
7:	learn: 3323.5233494	total: 2.32s	remaining: 2m 51s
8:	learn: 3224.7465065	total: 2.54s	remaining: 2m 46s
9:	learn: 3139.0681962	total: 2.75s	remaining: 2m 42s
10:	learn: 3067.3357176	total: 2.96s	remaining: 2m 38s
11:	learn: 3009.4271995	total: 3.15s	remaining: 2m 34s
12:	learn: 2959.5493852	total: 3.37s	remaining: 2m 32s
13:	learn: 2918.7873440	total: 3.54s	remaining: 2m 28s
14:	learn: 2881.1571573	total: 3.87s	remaining: 2m 30s
15:	learn: 2851.6904681	total: 4.04s	remaining: 2m 27s
16:	learn: 2826.8360062	total: 4.26s	remaining: 2m 26s
17:	learn: 2806.8200862	total: 4.45s	remaining: 2m 23s
18:	learn: 2789.490328

150:	learn: 2602.9041570	total: 30.3s	remaining: 1m 30s
151:	learn: 2602.4520412	total: 30.5s	remaining: 1m 29s
152:	learn: 2601.7696390	total: 30.7s	remaining: 1m 29s
153:	learn: 2601.1509482	total: 30.9s	remaining: 1m 29s
154:	learn: 2600.4442010	total: 31.1s	remaining: 1m 29s
155:	learn: 2600.1664766	total: 31.3s	remaining: 1m 29s
156:	learn: 2599.9384629	total: 31.5s	remaining: 1m 28s
157:	learn: 2599.7580917	total: 31.6s	remaining: 1m 28s
158:	learn: 2598.9202120	total: 31.8s	remaining: 1m 28s
159:	learn: 2598.4207388	total: 32s	remaining: 1m 28s
160:	learn: 2598.1646439	total: 32.2s	remaining: 1m 27s
161:	learn: 2597.6061808	total: 32.4s	remaining: 1m 27s
162:	learn: 2596.9598911	total: 32.6s	remaining: 1m 27s
163:	learn: 2596.6678275	total: 32.8s	remaining: 1m 27s
164:	learn: 2596.2558365	total: 33s	remaining: 1m 26s
165:	learn: 2595.9589159	total: 33.2s	remaining: 1m 26s
166:	learn: 2595.8128886	total: 33.4s	remaining: 1m 26s
167:	learn: 2595.0524977	total: 33.6s	remaining: 1m 

298:	learn: 2557.6502375	total: 1m	remaining: 1m
299:	learn: 2557.4335961	total: 1m	remaining: 1m
300:	learn: 2556.9447469	total: 1m	remaining: 1m
301:	learn: 2556.7866838	total: 1m	remaining: 1m
302:	learn: 2556.4909460	total: 1m 1s	remaining: 59.9s
303:	learn: 2555.8375085	total: 1m 1s	remaining: 59.7s
304:	learn: 2555.6665079	total: 1m 1s	remaining: 59.4s
305:	learn: 2555.2692866	total: 1m 1s	remaining: 59.2s
306:	learn: 2555.0637254	total: 1m 1s	remaining: 59s
307:	learn: 2554.7025029	total: 1m 2s	remaining: 58.8s
308:	learn: 2554.3397127	total: 1m 2s	remaining: 58.6s
309:	learn: 2554.1029342	total: 1m 2s	remaining: 58.4s
310:	learn: 2553.8996233	total: 1m 2s	remaining: 58.2s
311:	learn: 2553.5419352	total: 1m 2s	remaining: 58s
312:	learn: 2553.3578992	total: 1m 3s	remaining: 57.8s
313:	learn: 2553.1642577	total: 1m 3s	remaining: 57.6s
314:	learn: 2553.0273425	total: 1m 3s	remaining: 57.4s
315:	learn: 2552.7796539	total: 1m 3s	remaining: 57.2s
316:	learn: 2552.5294594	total: 1m 3s	

446:	learn: 2525.9458366	total: 1m 31s	remaining: 31.2s
447:	learn: 2525.6701621	total: 1m 31s	remaining: 31s
448:	learn: 2525.4957799	total: 1m 31s	remaining: 30.8s
449:	learn: 2525.4332310	total: 1m 31s	remaining: 30.6s
450:	learn: 2525.1804530	total: 1m 31s	remaining: 30.4s
451:	learn: 2524.8925966	total: 1m 32s	remaining: 30.2s
452:	learn: 2524.7378860	total: 1m 32s	remaining: 30s
453:	learn: 2524.5552426	total: 1m 32s	remaining: 29.8s
454:	learn: 2524.3010412	total: 1m 32s	remaining: 29.6s
455:	learn: 2524.1283682	total: 1m 32s	remaining: 29.3s
456:	learn: 2523.8405173	total: 1m 33s	remaining: 29.1s
457:	learn: 2523.5495649	total: 1m 33s	remaining: 28.9s
458:	learn: 2523.4247828	total: 1m 33s	remaining: 28.7s
459:	learn: 2523.2330188	total: 1m 33s	remaining: 28.5s
460:	learn: 2523.0991487	total: 1m 33s	remaining: 28.3s
461:	learn: 2522.9080527	total: 1m 34s	remaining: 28.1s
462:	learn: 2522.7411515	total: 1m 34s	remaining: 27.9s
463:	learn: 2522.4266366	total: 1m 34s	remaining: 27

593:	learn: 2499.4303324	total: 1m 59s	remaining: 1.21s
594:	learn: 2499.1869456	total: 2m	remaining: 1.01s
595:	learn: 2499.0117364	total: 2m	remaining: 808ms
596:	learn: 2498.7521278	total: 2m	remaining: 606ms
597:	learn: 2498.5866499	total: 2m	remaining: 404ms
598:	learn: 2498.5056255	total: 2m	remaining: 202ms
599:	learn: 2498.3942784	total: 2m 1s	remaining: 0us


GridSearchCV(cv=3,
             estimator=<catboost.core.CatBoostRegressor object at 0x0000017B93D46610>,
             n_jobs=3,
             param_grid={'bagging_temperature': [0, 1], 'depth': [6, 8],
                         'iterations': [600], 'l2_leaf_reg': [0.1, 1],
                         'learning_rate': [0.05, 0.1],
                         'random_strength': [0, 1]},
             verbose=1)

In [75]:
grid_search.best_params_

{'bagging_temperature': 0,
 'depth': 8,
 'iterations': 600,
 'l2_leaf_reg': 0.1,
 'learning_rate': 0.1,
 'random_strength': 1}

In [76]:
catboostr = CatBoostRegressor(loss_function='RMSE', random_state= 15, bagging_temperature = 0, depth= 8, iterations = 600, 
                              l2_leaf_reg = 0.1, learning_rate = 0.1, random_strength = 0 )
catboostr = catboostr.fit(X_train[rfr_features_lst], y_train)
catboost_R_squared = catboostr.score(X_train[rfr_features_lst], y_train)
print('R squared of Regressor model:', catboost_R_squared)

0:	learn: 4685.5031939	total: 226ms	remaining: 2m 15s
1:	learn: 4391.5806603	total: 449ms	remaining: 2m 14s
2:	learn: 4136.8908326	total: 680ms	remaining: 2m 15s
3:	learn: 3916.1824958	total: 896ms	remaining: 2m 13s
4:	learn: 3726.0134971	total: 1.13s	remaining: 2m 13s
5:	learn: 3564.7834341	total: 1.33s	remaining: 2m 12s
6:	learn: 3426.6186844	total: 1.55s	remaining: 2m 11s
7:	learn: 3308.6356045	total: 1.77s	remaining: 2m 11s
8:	learn: 3209.6405050	total: 1.98s	remaining: 2m 9s
9:	learn: 3125.9880841	total: 2.21s	remaining: 2m 10s
10:	learn: 3055.2319891	total: 2.4s	remaining: 2m 8s
11:	learn: 2995.8722631	total: 2.63s	remaining: 2m 8s
12:	learn: 2946.0017429	total: 2.84s	remaining: 2m 8s
13:	learn: 2902.1243899	total: 3.05s	remaining: 2m 7s
14:	learn: 2866.8866763	total: 3.26s	remaining: 2m 7s
15:	learn: 2837.2750557	total: 3.47s	remaining: 2m 6s
16:	learn: 2813.0123705	total: 3.67s	remaining: 2m 5s
17:	learn: 2792.3729655	total: 3.85s	remaining: 2m 4s
18:	learn: 2774.8811024	total:

151:	learn: 2600.3353364	total: 29.2s	remaining: 1m 26s
152:	learn: 2599.8254081	total: 29.4s	remaining: 1m 25s
153:	learn: 2599.5857685	total: 29.6s	remaining: 1m 25s
154:	learn: 2599.3402193	total: 29.8s	remaining: 1m 25s
155:	learn: 2599.1527715	total: 30s	remaining: 1m 25s
156:	learn: 2598.7077752	total: 30.1s	remaining: 1m 25s
157:	learn: 2598.3515230	total: 30.3s	remaining: 1m 24s
158:	learn: 2598.1169597	total: 30.5s	remaining: 1m 24s
159:	learn: 2597.7710170	total: 30.7s	remaining: 1m 24s
160:	learn: 2597.3392258	total: 30.9s	remaining: 1m 24s
161:	learn: 2596.9679025	total: 31.1s	remaining: 1m 24s
162:	learn: 2596.6868646	total: 31.3s	remaining: 1m 23s
163:	learn: 2596.3186949	total: 31.4s	remaining: 1m 23s
164:	learn: 2595.7889111	total: 31.6s	remaining: 1m 23s
165:	learn: 2595.3994953	total: 31.8s	remaining: 1m 23s
166:	learn: 2595.1418866	total: 32s	remaining: 1m 23s
167:	learn: 2594.8349091	total: 32.2s	remaining: 1m 22s
168:	learn: 2594.6014516	total: 32.4s	remaining: 1m 

300:	learn: 2561.9266024	total: 56.9s	remaining: 56.5s
301:	learn: 2561.8288651	total: 57.3s	remaining: 56.5s
302:	learn: 2561.7219749	total: 57.5s	remaining: 56.4s
303:	learn: 2561.3540593	total: 57.7s	remaining: 56.2s
304:	learn: 2561.1014423	total: 58s	remaining: 56.1s
305:	learn: 2560.9509203	total: 58.2s	remaining: 55.9s
306:	learn: 2560.6008462	total: 58.4s	remaining: 55.7s
307:	learn: 2560.4021788	total: 58.6s	remaining: 55.6s
308:	learn: 2560.0931443	total: 58.8s	remaining: 55.4s
309:	learn: 2559.8007837	total: 59s	remaining: 55.2s
310:	learn: 2559.6113825	total: 59.2s	remaining: 55s
311:	learn: 2559.4785246	total: 59.4s	remaining: 54.9s
312:	learn: 2559.2889527	total: 59.7s	remaining: 54.7s
313:	learn: 2559.1562575	total: 59.9s	remaining: 54.5s
314:	learn: 2559.0707796	total: 1m	remaining: 54.3s
315:	learn: 2558.9019895	total: 1m	remaining: 54.1s
316:	learn: 2558.7715814	total: 1m	remaining: 53.9s
317:	learn: 2558.5927050	total: 1m	remaining: 53.7s
318:	learn: 2558.4552112	tot

450:	learn: 2532.4181644	total: 1m 25s	remaining: 28.1s
451:	learn: 2532.2074911	total: 1m 25s	remaining: 27.9s
452:	learn: 2532.0189542	total: 1m 25s	remaining: 27.7s
453:	learn: 2531.7877638	total: 1m 25s	remaining: 27.5s
454:	learn: 2531.6165435	total: 1m 25s	remaining: 27.4s
455:	learn: 2531.4928296	total: 1m 26s	remaining: 27.2s
456:	learn: 2531.2677468	total: 1m 26s	remaining: 27s
457:	learn: 2531.0561456	total: 1m 26s	remaining: 26.8s
458:	learn: 2530.8808232	total: 1m 26s	remaining: 26.6s
459:	learn: 2530.7010569	total: 1m 26s	remaining: 26.4s
460:	learn: 2530.5600554	total: 1m 26s	remaining: 26.2s
461:	learn: 2530.4156288	total: 1m 27s	remaining: 26s
462:	learn: 2530.3509901	total: 1m 27s	remaining: 25.8s
463:	learn: 2530.1686246	total: 1m 27s	remaining: 25.6s
464:	learn: 2529.9988448	total: 1m 27s	remaining: 25.4s
465:	learn: 2529.8246449	total: 1m 27s	remaining: 25.2s
466:	learn: 2529.6919946	total: 1m 27s	remaining: 25.1s
467:	learn: 2529.5439466	total: 1m 28s	remaining: 24

597:	learn: 2508.7794528	total: 1m 53s	remaining: 378ms
598:	learn: 2508.5885912	total: 1m 53s	remaining: 189ms
599:	learn: 2508.4775208	total: 1m 53s	remaining: 0us
R squared of Regressor model: 0.7505556356144297


In [77]:
# Prediction of X_validation Dataset 7278670690446742
y_validation_predict_cbr = catboostr.predict(X_validation[rfr_features_lst])

In [78]:
import joblib
# Linear Regression
print(joblib.dump(regressor, "linear_regressor.pkl"))

# XGBoost Regressor
print(joblib.dump(xgbr, "xgbr.pkl"))

# catboost 
print(joblib.dump(catboostr, "catboost.pkl"))

['linear_regressor.pkl']
['xgbr.pkl']
['catboost.pkl']


In [81]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer()
d = imp.fit_transform(test)
test = pd.DataFrame(d, index = test.index, columns=test.columns)

In [83]:
pred_lr = regressor.predict(test[rfr_features_lst])

In [84]:
pred_xgb = xgbr.predict(test[rfr_features_lst])

In [85]:
pred_cbr = catboostr.predict(test[rfr_features_lst])

In [86]:
testcopy = pd.read_csv('test.csv')

In [87]:
sub=pd.concat([testcopy['User_ID'],testcopy['Product_ID'],pd.DataFrame(pred_lr,columns=['Purchase'])],1)
sub.to_csv('lr.csv',index=False)

In [88]:
sub=pd.concat([testcopy['User_ID'],testcopy['Product_ID'],pd.DataFrame(pred_xgb,columns=['Purchase'])],1)
sub.to_csv('xgb.csv',index=False)

In [89]:
sub=pd.concat([testcopy['User_ID'],testcopy['Product_ID'],pd.DataFrame(pred_cbr,columns=['Purchase'])],1)
sub.to_csv('cbr.csv',index=False)