# Recommendation-Based Purchase Prediction

Loading libraries

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 

Reading the data

In [41]:
df = pd.read_csv("C:/Users/USER/Downloads/online-shoppers-data.csv")
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


Summary Statistics

In [42]:
df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


Information on data

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

# Data Preprocessing

Checking for missing values

In [44]:
df.isnull().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

Removing unwanted features

In [45]:
corr_matrix = df.corr().abs()
threshold = 0.9 
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]
print(f"Features to drop: {to_drop}")
new_df = df.drop(columns=to_drop)

Features to drop: ['ExitRates']


Handling Categorical values

In [46]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
new_df["Month"] = label_encoder.fit_transform(new_df["Month"])  
new_df["VisitorType"] = label_encoder.fit_transform(new_df["VisitorType"])  
new_df["Weekend"] = new_df["Weekend"].astype(int)  
new_df["Revenue"] = new_df["Revenue"].astype(int)  

Standardize numerical columns

In [47]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_cols = ["Administrative_Duration", "Informational_Duration", "ProductRelated_Duration","BounceRates", "ExitRates", 
                  "PageValues", "SpecialDay"]
new_df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [48]:
new_df

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,ExitRates
0,0,-0.457191,0,-0.244931,1,-0.624348,3.667189,-0.317178,-0.308821,2,1,1,1,1,2,0,0,3.229316
1,0,-0.457191,0,-0.244931,2,-0.590903,-0.457683,-0.317178,-0.308821,2,2,2,1,2,2,0,0,1.171473
2,0,-0.457191,0,-0.244931,1,-0.624348,3.667189,-0.317178,-0.308821,2,4,1,9,3,2,0,0,3.229316
3,0,-0.457191,0,-0.244931,2,-0.622954,0.573535,-0.317178,-0.308821,2,3,2,2,4,2,0,0,1.994610
4,0,-0.457191,0,-0.244931,10,-0.296430,-0.045196,-0.317178,-0.308821,2,3,3,1,4,2,1,0,0.142551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0.363075,0,-0.244931,53,0.307822,-0.310366,0.342125,-0.308821,1,4,6,1,1,2,1,0,-0.288966
12326,0,-0.457191,0,-0.244931,5,-0.380957,-0.457683,-0.317178,-0.308821,7,3,2,1,8,2,1,0,-0.447364
12327,0,-0.457191,0,-0.244931,6,-0.528063,1.261014,-0.317178,-0.308821,7,3,2,1,13,2,1,0,0.897093
12328,4,-0.032916,0,-0.244931,15,-0.443536,-0.457683,-0.317178,-0.308821,7,2,2,3,11,2,0,0,-0.453140
