In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#from xgboost import XGBRegressor

sns.set(rc={'figure.figsize': [9, 9]}, font_scale=1.2)

In [2]:
df = pd.read_csv('black_friday.csv')
df

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [None]:
sns.pairplot(df)

In [6]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,0,0-17,10,1,2,0,3,,,8370
1,1000001,P00248942,0,0-17,10,1,2,0,1,6.0,14.0,15200
2,1000001,P00087842,0,0-17,10,1,2,0,12,,,1422
3,1000001,P00085442,0,0-17,10,1,2,0,12,14.0,,1057
4,1000002,P00285442,1,55+,16,3,4+,0,8,,,7969


In [5]:
df['Gender'] = df['Gender'].replace(['F','M'],['0','1'])
df['City_Category'] = df['City_Category'].replace(['A','B','C'],['1','2','3'])

In [10]:
x = df.drop(['Purchase','User_ID','Product_ID','Age','Marital_Status','Stay_In_Current_City_Years'], axis=1)
x

y = df['Purchase']
y

0          8370
1         15200
2          1422
3          1057
4          7969
          ...  
550063      368
550064      371
550065      137
550066      365
550067      490
Name: Purchase, Length: 550068, dtype: int64

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
#df.info()

In [None]:
#df.describe()

In [12]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,0,0-17,10,1,2,0,3,,,8370
1,1000001,P00248942,0,0-17,10,1,2,0,1,6.0,14.0,15200
2,1000001,P00087842,0,0-17,10,1,2,0,12,,,1422
3,1000001,P00085442,0,0-17,10,1,2,0,12,14.0,,1057
4,1000002,P00285442,1,55+,16,3,4+,0,8,,,7969


In [13]:
scaler= StandardScaler()

scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [14]:
models = {
    "LR": LinearRegression(),
    "KNNR" : KNeighborsRegressor(), 
    "SVR": SVR(),
    "DT": DecisionTreeRegressor(),
    "RF": RandomForestRegressor(),
##"XGBR": XGBRegressor()
}

In [15]:
for name, model in models.items():
    print(f'Using model: {name}')
    model.fit(x_train, y_train)
    print(f'Training Score: {model.score(x_train, y_train)}')
    print(f'Test Score: {model.score(x_test, y_test)}')  
    print('-'*10)

Using model: LR


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').