In [1]:
#Importing basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
pd.set_option('display.max_colwidth', None)

#Making pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score,mean_squared_error

from sklearn.model_selection import GridSearchCV

import pickle

In [2]:
# Load the transformed dataset from a path relative to the notebook's
# working directory, then preview the first two rows.
df=pd.read_csv('../data/transformed_data.csv')
df.head(2)

Unnamed: 0,Brand,Processor_Brand,Processor_Type,Storage_SSD,Storage_HDD,RAM_GB,RAM_Type,OS,Display_Size,Display_Type,Office,Warranty,Price
0,HP,Intel,Celeron Dual Core,256,0,8,DDR4,64 bit Windows 11,14.0,Normal,Not Available,1 Year Onsite Warranty,25990
1,ASUS,Intel,Core i3 - 10th Gen,512,0,8,DDR4,64 bit Windows 11,15.6,Normal,Office Home & Student 2021,1 Year Onsite Warranty,33990


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809 entries, 0 to 808
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Brand            809 non-null    object 
 1   Processor_Brand  809 non-null    object 
 2   Processor_Type   809 non-null    object 
 3   Storage_SSD      809 non-null    int64  
 4   Storage_HDD      809 non-null    int64  
 5   RAM_GB           809 non-null    int64  
 6   RAM_Type         809 non-null    object 
 7   OS               809 non-null    object 
 8   Display_Size     809 non-null    float64
 9   Display_Type     809 non-null    object 
 10  Office           809 non-null    object 
 11  Warranty         809 non-null    object 
 12  Price            809 non-null    int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 82.3+ KB


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Storage_SSD,809.0,564.212608,301.984417,128.0,512.0,512.0,512.0,2048.0
Storage_HDD,809.0,73.097651,261.492104,0.0,0.0,0.0,0.0,1024.0
RAM_GB,809.0,11.866502,5.743871,4.0,8.0,8.0,16.0,32.0
Display_Size,809.0,15.071446,0.968884,11.6,14.0,15.6,15.6,17.3
Price,809.0,88144.001236,59576.222912,19490.0,48990.0,69160.0,104990.0,395190.0


In [5]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical, every other column (the target 'Price' included at this
# point) as numerical. Column order follows df.columns.
cat_features = [column for column in df.columns if df[column].dtype == 'O']
num_features = [column for column in df.columns if not df[column].dtype == 'O']

In [6]:
# Separate the prediction target from the predictor columns.
target_column = 'Price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [7]:
X.head(5)

Unnamed: 0,Brand,Processor_Brand,Processor_Type,Storage_SSD,Storage_HDD,RAM_GB,RAM_Type,OS,Display_Size,Display_Type,Office,Warranty
0,HP,Intel,Celeron Dual Core,256,0,8,DDR4,64 bit Windows 11,14.0,Normal,Not Available,1 Year Onsite Warranty
1,ASUS,Intel,Core i3 - 10th Gen,512,0,8,DDR4,64 bit Windows 11,15.6,Normal,Office Home & Student 2021,1 Year Onsite Warranty
2,Lenovo,Intel,Core i3 - 11th Gen,512,0,8,DDR4,64 bit Windows 11,15.6,Normal,Office Home & Student 2021,2 Year Onsite Warranty
3,HP,Intel,Core i3 - 11th Gen,512,0,8,DDR4,64 bit Windows 11,14.0,Normal,Office Home & Student 2021,1 Year Onsite Warranty
4,Lenovo,Intel,Core i5 - 12th Gen,512,0,8,DDR4,64 bit Windows 11,15.6,Normal,Not Available,2 Year Onsite Warranty


In [8]:
# Hold out a quarter of the rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [9]:
# Report the shapes of the four pieces produced by the train/test split.
shape_report = ''.join([
    f'Training features shape :{x_train.shape}\n',
    f'Training output shape :{y_train.shape}\n',
    f'Testing features shape :{x_test.shape}\n',
    f'Testin output shape : {y_test.shape}',
])
print(shape_report)

Training features shape :(606, 12)
Training output shape :(606,)
Testing features shape :(203, 12)
Testin output shape : (203,)


In [10]:
# Remove the target from the list of numeric predictors.
# The removal is done by name rather than by position (`pop(-1)`): a
# positional pop deletes whatever happens to be last, so re-running this
# cell would silently discard real features one by one. Filtering by name
# is idempotent and does not depend on column order.
print(f'Numerical features : {num_features}')
num_features = [feature for feature in num_features if feature != 'Price']
print(f'Numerical features after removing Price: {num_features}')

Numerical features : ['Storage_SSD', 'Storage_HDD', 'RAM_GB', 'Display_Size', 'Price']
Numerical features after removing Price: ['Storage_SSD', 'Storage_HDD', 'RAM_GB', 'Display_Size']


In [11]:
# Preprocessing shared by every model:
#   * 'tnf1' one-hot encodes the categorical columns, dropping the first
#     level of each and ignoring categories unseen during fit;
#   * 'tnf2' standardises the numeric columns;
#   * any column not listed is passed through unchanged.
# NOTE(review): scikit-learn 1.2+ renames OneHotEncoder's `sparse` argument
# to `sparse_output`; confirm against the installed version before upgrading.
categorical_encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
numeric_scaler = StandardScaler()
column_trans = ColumnTransformer(
    transformers=[
        ('tnf1', categorical_encoder, cat_features),
        ('tnf2', numeric_scaler, num_features),
    ],
    remainder='passthrough',
)

In [12]:
column_trans.fit_transform(x_train)

array([[ 0.        ,  0.        ,  0.        , ..., -0.26481029,
         0.71551075, -1.08962957],
       [ 0.        ,  1.        ,  0.        , ..., -0.26481029,
        -0.66537943,  0.53556246],
       [ 0.        ,  0.        ,  0.        , ..., -0.26481029,
         0.71551075,  0.94186046],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.26481029,
         0.71551075, -1.08962957],
       [ 0.        ,  1.        ,  0.        , ..., -0.26481029,
         0.71551075,  0.53556246],
       [ 0.        ,  0.        ,  0.        , ..., -0.26481029,
        -0.66537943, -1.08962957]])

In [13]:
# One pipeline per candidate regressor, each made of the shared preprocessor
# followed by the estimator.
# The tree-based and ensemble regressors are randomised; without a seed the
# scores and the model that gets pickled later change on every run. They are
# seeded with the same value used for the train/test split so that
# Restart & Run All reproduces the results.
# NOTE(review): all five pipelines hold the *same* `column_trans` object, so
# each fit appears to refit that single instance. This is harmless while
# every pipeline is fitted on x_train, but confirm before fitting them on
# different data.
pipeline_lr = Pipeline(steps=[('preprocessor', column_trans), ('linear_regressor', LinearRegression())])
pipeline_dt = Pipeline(steps=[('preprocessor', column_trans), ('decision_tree_regressor', DecisionTreeRegressor(random_state=10))])
pipeline_knn = Pipeline(steps=[('preprocessor', column_trans), ('knn', KNeighborsRegressor(n_neighbors=7))])
pipeline_rf = Pipeline(steps=[('preprocessor', column_trans), ('random_forest', RandomForestRegressor(random_state=10))])
pipeline_gb = Pipeline(steps=[('preprocessor', column_trans), ('gradient_boosting', GradientBoostingRegressor(random_state=10))])

In [14]:
# Candidate pipelines in evaluation order. The positions in this list are
# what `pipe_dict` uses as keys, so the two must be kept in step.
pipelines = [
    pipeline_dt,
    pipeline_gb,
    pipeline_knn,
    pipeline_lr,
    pipeline_rf,
]

In [15]:
# Display names keyed by each pipeline's position in `pipelines`.
pipe_dict = dict(enumerate([
    'Decision Tree',
    'Gradient boosting',
    'KNN',
    'Linear Regression',
    'Random forest',
]))

In [16]:
# Fit every candidate pipeline (preprocessing + regressor) on the training split.
for pipe in pipelines:
    pipe.fit(x_train,y_train)

In [17]:
pipelines

[Pipeline(steps=[('preprocessor',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('tnf1',
                                                   OneHotEncoder(drop='first',
                                                                 handle_unknown='ignore',
                                                                 sparse=False),
                                                   ['Brand', 'Processor_Brand',
                                                    'Processor_Type', 'RAM_Type',
                                                    'OS', 'Display_Type',
                                                    'Office', 'Warranty']),
                                                  ('tnf2', StandardScaler(),
                                                   ['Storage_SSD', 'Storage_HDD',
                                                    'RAM_GB',
                                                    'Display_Siz

In [18]:
# Accumulators for the comparison table: model name, R^2 and RMSE per pipeline.
m = []
r2 = []
rmse = []

In [19]:
# Score every fitted pipeline on the held-out split.
# The accumulators are emptied first so that re-running this cell replaces
# the previous results instead of appending duplicate rows.
m.clear()
r2.clear()
rmse.clear()

for i, model in enumerate(pipelines):
    m.append(pipe_dict[i])
    r2.append(round(model.score(x_test, y_test), 2))
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases,
    # whereas this form works on every version.
    rmse.append(round(np.sqrt(mean_squared_error(y_test, model.predict(x_test))), 2))



In [20]:
# Collect the per-model metrics into one table and show it ordered from the
# lowest to the highest RMSE.
# NOTE(review): in the recorded output Linear Regression has an RMSE around
# 1e16 and a hugely negative R^2. That looks like a numerical or encoding
# problem rather than a genuine fit, so it is worth investigating before
# drawing conclusions from that row.
metrics_by_column = {'Model': m, 'RMSE': rmse, 'r2_score': r2}
models = pd.DataFrame(metrics_by_column)

models.sort_values('RMSE')

Unnamed: 0,Model,RMSE,r2_score
1,Gradient boosting,25521.75,0.8
4,Random forest,26716.81,0.78
2,KNN,29830.99,0.72
0,Decision Tree,34846.8,0.62
3,Linear Regression,1.040807e+16,-3.374714e+22


In [21]:
# Persist the fitted gradient-boosting pipeline (preprocessing + model).
# The `with` block guarantees the file is flushed and closed even if
# pickling fails; the bare `open(...)` used before left the handle open.
with open('../gb_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_gb, model_file)

In [22]:
pipeline_gb.score(x_test,y_test)



0.7970833183003826

In [23]:
y_pred = pipeline_gb.predict(x_test)
y_pred[0]



61178.005641660915

In [24]:
x_test

Unnamed: 0,Brand,Processor_Brand,Processor_Type,Storage_SSD,Storage_HDD,RAM_GB,RAM_Type,OS,Display_Size,Display_Type,Office,Warranty
482,DELL,Intel,Core i5 - 12th Gen,512,0,8,DDR4,64 bit Windows 11,16.00,Normal,Office Home & Student 2021,1 Year Onsite Warranty
387,Nokia,Intel,Core i5 - 10th Gen,512,0,8,DDR4,64 bit Windows 10,14.00,Normal,Not Available,1 Year Onsite Warranty
233,MSI,Intel,Core i7 - 11th Gen,1024,0,16,DDR4,64 bit Windows 10,15.60,Normal,Not Available,2 Year Onsite Warranty
405,Lenovo,Intel,Core i5 - 11th Gen,512,0,8,DDR4,64 bit Windows 10,15.60,Normal,Not Available,1 Year Warranty + 1 Year Premium Care + 1 Year Accidental Damage Protection
546,DELL,Intel,Core i3 - 10th Gen,256,0,8,DDR4,64 bit Windows 10,14.96,Normal,Office Home & Student 2019,1 Year Onsite Warranty
...,...,...,...,...,...,...,...,...,...,...,...,...
161,HP,AMD,Ryzen 5 Hexa Core - 5th Gen,512,0,8,DDR4,64 bit Windows 11,15.60,Normal,Not Available,1 Year Onsite Warranty
782,DELL,Intel,Core i3 - 11th Gen,256,1024,8,DDR4,Windows 11,15.00,Normal,Not Available,1 Year Onsite Warranty
223,Lenovo,AMD,Ryzen 5 Hexa Core,512,0,8,DDR4,64 bit Windows 10,15.60,Normal,Office Trial Only,1 Year Warranty + 1 Year Premium Care + 1 Year Accidental Damage Protection
568,acer,Intel,Core i7 - 10th Gen,2048,0,16,DDR4,64 bit Windows 10,15.60,Normal,Not Available,1 Year International Travelers Warranty


In [25]:
y_test

482     69160
387     50490
233    123490
405     66990
546     40090
        ...  
161     45500
782     46900
223     60490
568    159990
592     35399
Name: Price, Length: 203, dtype: int64

In [26]:
df.columns

Index(['Brand', 'Processor_Brand', 'Processor_Type', 'Storage_SSD',
       'Storage_HDD', 'RAM_GB', 'RAM_Type', 'OS', 'Display_Size',
       'Display_Type', 'Office', 'Warranty', 'Price'],
      dtype='object')

In [27]:
df['Brand'].value_counts().keys()

Index(['ASUS', 'DELL', 'HP', 'Lenovo', 'acer', 'MSI', 'APPLE', 'Infinix',
       'GIGABYTE', 'Vaio', 'ALIENWARE', 'Avita', 'Nokia', 'Mi', 'realme',
       'RedmiBook', 'MICROSOFT', 'Ultimus'],
      dtype='object')

In [28]:
df['Processor_Brand'].value_counts().keys()

Index(['Intel', 'AMD', 'Apple'], dtype='object')

In [29]:
df['Processor_Type'].value_counts().keys()

Index(['Core i5  - 11th Gen', 'Core i3  - 11th Gen', 'Ryzen 5 Hexa Core',
       'Ryzen 7 Octa Core', 'Core i7  - 12th Gen', 'Core i5  - 12th Gen',
       'Core i5  - 10th Gen', 'Core i7  - 10th Gen', 'Core i7  - 11th Gen',
       'Core i3  - 10th Gen', 'Ryzen 9 Octa Core', 'Ryzen 3 Dual Core',
       'Core i9  - 12th Gen', 'Ryzen 5 Quad Core', 'Celeron Dual Core',
       'Core i3  - 12th Gen', 'Core i7  - 8th Gen', 'Core i5  - 8th Gen',
       'Ryzen 3 Quad Core', 'Pentium Quad Core', 'M1', 'M2',
       'Athlon Dual Core', 'Core i5  - 9th Gen',
       'Ryzen 7 Quad Core  - 10th Gen', 'Pentium Silver',
       'Core i9  - 10th Gen', 'Core i9  - 11th Gen', 'Core i5  - 7th Gen',
       'Ryzen 5 Hexa Core  - 5th Gen', 'M1 Pro', 'Ryzen 7 Quad Core',
       'Celeron Dual Core  - 10th Gen', 'Core i7  - 7th Gen',
       'Ryzen 5 Hexa Core  - 10th Gen', 'Ryzen 3 Dual Core  - 3rd Gen',
       'M1 Max', 'Ryzen 7 Octa Core  - 5th Gen', 'Celeron Quad Core',
       'Ryzen 9 Octa Core  - 5th Gen', 'R

In [30]:
df['Storage_SSD'].value_counts().keys()

Int64Index([512, 256, 1024, 128, 2048], dtype='int64')

In [31]:
df['Storage_HDD'].value_counts().keys()

Int64Index([0, 1024, 512, 256], dtype='int64')

In [32]:
df.columns

Index(['Brand', 'Processor_Brand', 'Processor_Type', 'Storage_SSD',
       'Storage_HDD', 'RAM_GB', 'RAM_Type', 'OS', 'Display_Size',
       'Display_Type', 'Office', 'Warranty', 'Price'],
      dtype='object')

In [33]:
df['RAM_GB'].value_counts().keys()

Int64Index([8, 16, 4, 32], dtype='int64')

In [34]:
df['RAM_Type'].value_counts().keys()

Index(['DDR4', 'DDR5', 'LPDDR4X', 'LPDDR5', 'Unified Memory', 'LPDDR3', 'DDR3',
       'LPDDR4'],
      dtype='object')

In [35]:
df['OS'].value_counts().keys()

Index(['64 bit Windows 11', '64 bit Windows 10', 'Windows 11', 'Mac',
       'Windows 10', 'DOS', '32 bit Windows 11', '64 bit Mac', 'Chrome',
       '64 bit DOS', '64 bit Chrome'],
      dtype='object')

In [36]:
df.columns

Index(['Brand', 'Processor_Brand', 'Processor_Type', 'Storage_SSD',
       'Storage_HDD', 'RAM_GB', 'RAM_Type', 'OS', 'Display_Size',
       'Display_Type', 'Office', 'Warranty', 'Price'],
      dtype='object')

In [37]:
df['Display_Size'].value_counts().keys()

Float64Index([ 15.6,  14.0,  13.3,  16.0,  17.3,  16.1,  13.4,  15.0,  13.6,
               14.1, 14.96,  11.6,  16.2,  14.5,  13.5,  13.0,  16.6,  14.2,
               15.3,  12.0],
             dtype='float64')

In [38]:
df['Display_Size'].max()

17.3

In [39]:
df['Display_Size'].min()

11.6

In [40]:
df['Display_Type'].value_counts().keys()

Index(['Normal', 'Touchscreen'], dtype='object')

In [41]:
df['Office'].value_counts().keys()

Index(['Not Available', 'Office Home & Student 2019',
       'Office Home & Student 2021', 'Office Home & Student', 'Office 2021',
       'Office Home & Student 2016', 'Office 365', 'Office Trial Only',
       'Office 2019 & Office 365', 'Office Home 2019 & Office 365',
       'Office 2019', 'Office 2013 11 Trial', 'Office'],
      dtype='object')

In [43]:
df['Office'].value_counts()

Not Available                    538
Office Home & Student 2019       121
Office Home & Student 2021       112
Office Home & Student             15
Office 2021                        5
Office Home & Student 2016         5
Office 365                         4
Office Trial Only                  3
Office 2019 & Office 365           2
Office Home 2019 & Office 365      1
Office 2019                        1
Office 2013 11 Trial               1
Office                             1
Name: Office, dtype: int64

In [42]:
df['Warranty'].value_counts().keys()

Index(['1 Year Onsite Warranty', '1 Year International Travelers Warranty',
       '2 Year Onsite Warranty', '1 Year Warranty', '1 Year Carry In Warranty',
       '2 Year Carry In Warranty',
       '1 Year Onsite Warranty + 1 Year Premium Care + 1 Year Accidental Damage Protection',
       '1 Year Limited Warranty', '2 Year Warranty',
       '2 Year Onsite & Carry In Warranty',
       '1 Year Limited Hardware Warranty + In Home Service After Remote Diagnosis',
       '1 Year Warranty + 1 Year Premium Care + 1 Year Accidental Damage Protection',
       '1 Year Domestic Warranty', '1 Year Manufacturer Warranty',
       '1 Year Manufacturer Warranty On The Device + 0.5 Year Manufacturer Warranty On Included Accessories',
       '3 Year Manufacturer Warranty On The Device + 0.5 Year Manufacturer Warranty On Included Accessories',
       '3 Year Premier Support Warranty',
       '3 Year Warranty + 1 Year Premium Care + 1 Year Accidental Damage Protection',
       '3 Year Carry In Warranty',

In [50]:
# Alphabetical list of the distinct processor types sold under the AMD brand.
is_amd = df['Processor_Brand'] == 'AMD'
sorted(df.loc[is_amd, 'Processor_Type'].unique())

['Athlon Dual Core',
 'Athlon Dual Core  - 11th Gen',
 'Dual Core',
 'Ryzen 3 Dual Core',
 'Ryzen 3 Dual Core  - 3rd Gen',
 'Ryzen 3 Hexa Core  - 4th Gen',
 'Ryzen 3 Quad Core',
 'Ryzen 5 Dual Core',
 'Ryzen 5 Dual Core  - 10th Gen',
 'Ryzen 5 Hexa Core',
 'Ryzen 5 Hexa Core  - 10th Gen',
 'Ryzen 5 Hexa Core  - 5th Gen',
 'Ryzen 5 Quad Core',
 'Ryzen 7 Dual Core  - 7th Gen',
 'Ryzen 7 Hexa Core',
 'Ryzen 7 Octa Core',
 'Ryzen 7 Octa Core  - 10th Gen',
 'Ryzen 7 Octa Core  - 4th Gen',
 'Ryzen 7 Octa Core  - 5th Gen',
 'Ryzen 7 Quad Core',
 'Ryzen 7 Quad Core  - 10th Gen',
 'Ryzen 9 Octa Core',
 'Ryzen 9 Octa Core  - 10th Gen',
 'Ryzen 9 Octa Core  - 5th Gen',
 'Ryzen 9 Octa Core  - 9th Gen']

In [51]:
# Alphabetical list of the distinct processor types sold under the Apple brand.
is_apple = df['Processor_Brand'] == 'Apple'
sorted(df.loc[is_apple, 'Processor_Type'].unique())

['M1', 'M1 Max', 'M1 Pro', 'M2']

In [52]:
# Alphabetical list of the distinct processor types sold under the Intel brand.
is_intel = df['Processor_Brand'] == 'Intel'
sorted(df.loc[is_intel, 'Processor_Type'].unique())

['Celeron Dual Core',
 'Celeron Dual Core  - 10th Gen',
 'Celeron Dual Core  - 4th Gen',
 'Celeron Quad Core',
 'Core i3  - 10th Gen',
 'Core i3  - 11th Gen',
 'Core i3  - 12th Gen',
 'Core i5  - 10th Gen',
 'Core i5  - 11th Gen',
 'Core i5  - 12th Gen',
 'Core i5  - 5th Gen',
 'Core i5  - 7th Gen',
 'Core i5  - 8th Gen',
 'Core i5  - 9th Gen',
 'Core i7  - 10th Gen',
 'Core i7  - 11th Gen',
 'Core i7  - 12th Gen',
 'Core i7  - 6th Gen',
 'Core i7  - 7th Gen',
 'Core i7  - 8th Gen',
 'Core i7  - 9th Gen',
 'Core i9  - 10th Gen',
 'Core i9  - 11th Gen',
 'Core i9  - 12th Gen',
 'Core i9  - 8th Gen',
 'Hexa Core i5  - 10th Gen',
 'Pentium Gold',
 'Pentium Quad Core',
 'Pentium Quad Core  - 11th Gen',
 'Pentium Silver']