Table of Contents

    1 Introduction
    2 Data Analysis
    3 Data Preprocessing
    

# Introduction

In this notebook, we will try to predict the price of a laptop using different specifications about the laptop.

# Data Analysis

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score


In [2]:
#Load Dataset
mydata=pd.read_csv('Laptop.csv')
mydata.head()

Unnamed: 0.1,Unnamed: 0,Name,Processor,RAM,Operating System,Storage,Display,Warranty,Price,rating
0,0,Lenovo Ideapad S145 Core i5 10th Gen - (8 GB/1...,Intel Core i5 Processor (10th Gen),8 GB DDR4 RAM,64 bit Windows 10 Operating System,1 TB HDD,39.62 cm (15.6 inch) Display,1 Year Onsite Warranty,"₹43,990",3.9
1,1,Lenovo IdeaPad Core i3 11th Gen - (8 GB/256 GB...,Intel Core i3 Processor (11th Gen),8 GB DDR4 RAM,64 bit Windows 10 Operating System,256 GB SSD,35.56 cm (14 Inch) Display,1 Year Onsite Warranty,"₹43,990",4.2
2,2,HP Pentium Quad Core - (8 GB/256 GB SSD/Window...,Intel Pentium Quad Core Processor,8 GB DDR4 RAM,64 bit Windows 10 Operating System,256 GB SSD,35.56 cm (14 inch) Display,1 Year Onsite Warranty,"₹31,490",4.6
3,3,HP 14s Core i3 11th Gen - (8 GB/256 GB SSD/Win...,Intel Core i3 Processor (11th Gen),8 GB DDR4 RAM,64 bit Windows 10 Operating System,256 GB SSD,35.56 cm (14 inch) Display,1 Year Onsite Warranty,"₹40,990",4.1
4,4,HP 15s Athlon Dual Core - (4 GB/1 TB HDD/Windo...,AMD Athlon Dual Core Processor,4 GB DDR4 RAM,64 bit Windows 10 Operating System,1 TB HDD,39.62 cm (15.6 inch) Display,1 Year Onsite Warranty,"₹27,490",4.1


In [3]:
mydata.shape

(550, 10)

In [4]:
mydata.dtypes

Unnamed: 0            int64
Name                 object
Processor            object
RAM                  object
Operating System     object
Storage              object
Display              object
Warranty             object
Price                object
rating              float64
dtype: object

In [5]:
mydata.isna().sum()

Unnamed: 0          0
Name                0
Processor           0
RAM                 0
Operating System    0
Storage             0
Display             0
Warranty            0
Price               0
rating              0
dtype: int64

# Data Preprocessing

In [6]:
mydata.drop(['Unnamed: 0'],inplace=True,axis=1)
mydata.shape

(550, 9)

In [7]:
#Operating System
x=mydata[mydata['Operating System'].astype(str).str.contains("Windows")]['Operating System'].tolist()
y=mydata[mydata['Operating System'].astype(str).str.contains("Mac")]['Operating System'].tolist()
z=mydata[mydata['Operating System'].astype(str).str.contains("Chrome")]['Operating System'].tolist()

c=mydata['Operating System'].count()
#z.value_counts()
for items in range(c):
    if mydata['Operating System'][items] in x:
        mydata['Operating System'][items]='Windows'
    elif mydata['Operating System'][items] in y:
        mydata['Operating System'][items]='Mac'
    elif mydata['Operating System'][items] in z:
        mydata['Operating System'][items]='Chrome'
    else:
        mydata['Operating System'][items]='DOS'
        

mydata['Operating System'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mydata['Operating System'][items]='Windows'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mydata['Operating System'][items]='Mac'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mydata['Operating System'][items]='DOS'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mydata['Operating System'][items]='Chrome'


Windows    525
Mac         20
Chrome       3
DOS          2
Name: Operating System, dtype: int64

In [8]:
#RAM Capacity
mydata['RAMCapacity']=mydata.RAM.str.replace('\s.*','')
mydata.RAMCapacity.value_counts()

  mydata['RAMCapacity']=mydata.RAM.str.replace('\s.*','')


8             321
16            150
4              68
32              9
Upgradable      1
12              1
Name: RAMCapacity, dtype: int64

In [9]:
#RAM Type
mydata['RAM']=mydata['RAM'].astype(str)
mydata['RAMType']=mydata['RAM'].str.slice(5,20,1)
mydata.RAMType.unique()

array(['DDR4 RAM', ' DDR4 RAM', 'LPDDR4X RAM', 'LPDDR3 RAM',
       ' LPDDR4X RAM', ' LPDDR3 RAM', 'DDR3 RAM', ' DDR3 RAM',
       'dable SSD Upto '], dtype=object)

In [10]:
mydata.drop(['RAM'],inplace=True,axis=1)
mydata.columns

Index(['Name', 'Processor', 'Operating System', 'Storage', 'Display',
       'Warranty', 'Price', 'rating', 'RAMCapacity', 'RAMType'],
      dtype='object')

In [11]:
#Brand
mydata['Brand']=mydata['Name'].apply(lambda x: x.split(' ')[0])
mydata.Brand.unique()

array(['Lenovo', 'HP', 'acer', 'ASUS', 'MSI', 'Avita', 'APPLE', 'DELL',
       'LG', 'MICROSOFT', 'Nokia', 'Nexstgo', 'ALIENWARE', 'Vaio'],
      dtype=object)

In [12]:
mydata.drop(['Name'],inplace=True,axis=1)
mydata.columns

Index(['Processor', 'Operating System', 'Storage', 'Display', 'Warranty',
       'Price', 'rating', 'RAMCapacity', 'RAMType', 'Brand'],
      dtype='object')

In [13]:
#Price
mydata['Price']=mydata['Price'].apply(lambda x: x.split('₹')[1])
mydata['Price'] = mydata['Price'].str.replace(',', '')
mydata['Price'].value_counts()

43990     47
109990    26
116990    24
59990     15
60990     15
          ..
57440      1
148990     1
89149      1
71990      1
87990      1
Name: Price, Length: 235, dtype: int64

In [14]:
mydata["Price"] = mydata["Price"].astype(str).astype(int)

In [15]:
#Warranty
mydata['Warranty']=mydata.Warranty.str.replace('\s.*','')
mydata['Warranty']=mydata.Warranty.str.replace('One-year','1')
mydata['Warranty']=mydata.Warranty.str.replace('Onsite','1')
mydata["Warranty"] = mydata["Warranty"].astype(str).astype(int)
mydata.Warranty.value_counts()

  mydata['Warranty']=mydata.Warranty.str.replace('\s.*','')


1     461
2      57
3      21
18     10
24      1
Name: Warranty, dtype: int64

In [16]:
#Storage
mydata['Storage']=mydata.Storage.str.replace('1 TB','1024 GB')
mydata.Storage.value_counts()

512 GB SSD                                                 213
1024 GB HDD|256 GB SSD                                      98
1024 GB HDD                                                 86
256 GB SSD                                                  77
1024 GB SSD                                                 58
1024 GB HDD|128 GB SSD                                       8
1024 GB HDD|512 GB SSD                                       2
128 GB NVMe PCIe 3.0 x4 SSD                                  2
128 GB SSD                                                   2
512 GB HDD|512 GB SSD                                        1
512 GB SSD for Reduced Boot Up Time and in Game Loading      1
128 GB SSD for Reduced Boot Up Time and in Game Loading      1
M.2 Slot for SSD Upgrade                                     1
Name: Storage, dtype: int64

In [17]:
#Display
mydata['Display(inch)']=mydata['Display'].copy()
mydata['Display(inch)']

0                           39.62 cm (15.6 inch) Display
1                             35.56 cm (14 Inch) Display
2                             35.56 cm (14 inch) Display
3                             35.56 cm (14 inch) Display
4                           39.62 cm (15.6 inch) Display
                             ...                        
545    Matrix Display, Dragon Center, Cooler Boost 5,...
546               35.56 cm (14 inch) Touchscreen Display
547                         33.78 cm (13.3 inch) Display
548                         39.62 cm (15.6 inch) Display
549                           35.56 cm (14 inch) Display
Name: Display(inch), Length: 550, dtype: object

In [18]:
x=mydata[mydata['Display'].astype(str).str.contains(r"\((.*?)\)")]['Display'].tolist()
c=mydata['Display'].count()
#len(x)
for items in range(c):
    if mydata['Display'][items] in x:
        z=mydata['Display'][items]
        i1=z.index('(')
        i2=z.index(')')
        mydata['Display(inch)'][items]=z[i1+1:i2:1]
    
mydata['Display(inch)'].value_counts()

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mydata['Display(inch)'][items]=z[i1+1:i2:1]


15.6 inch                                                              242
14 inch                                                                150
13.3 inch                                                               54
Matrix Display, Dragon Center, Cooler Boost 5, Nahimic 3                22
14 Inch                                                                 22
Extend                                                                  10
144Hz, 45% NTSC Color Gamut                                             10
15.6 Inch                                                                5
13 inch                                                                  5
14 inches                                                                4
13.3 Inch                                                                4
16 inch                                                                  3
13.4 inch                                                                3
17.3 inch                

In [19]:
mydata['Display(inch)']=mydata['Display(inch)'].str.replace('Extend','0')
mydata['Display(inch)']=mydata['Display(inch)'].str.replace('144Hz, 45% NTSC Color Gamut','15.6 inch')
mydata['Display(inch)']=mydata['Display(inch)'].str.replace('60Hz, 45% NTSC Color Gamut','15.6 inch')
mydata['Display(inch)']=mydata['Display(inch)'].str.replace('Matrix Display, Dragon Center, Cooler Boost 5, Nahimic 3','0')
mydata['Display(inch)']=mydata['Display(inch)'].str.replace('Full HD LED Backlit Display','1')
mydata['Display(inch)']=mydata['Display(inch)'].str.replace('Full HD LED Backlit Anti-glare Display for Better Visual Experience','1')
mydata['Display(inch)'].value_counts()

15.6 inch    253
14 inch      150
13.3 inch     54
0             32
14 Inch       22
15.6 Inch      5
13 inch        5
14 inches      4
13.3 Inch      4
16 inch        3
13.4 inch      3
1              3
17.3 inch      3
12 inch        2
13.5 inch      1
11.6 inch      1
10.1 inch      1
10 inch        1
13.6 inch      1
15 inch        1
12.3 inch      1
Name: Display(inch), dtype: int64

In [20]:
mydata['Display(inch)']=mydata['Display(inch)'].astype(str).apply(lambda x: x.split('i')[0])
mydata['Display(inch)']=mydata['Display(inch)'].astype(str).apply(lambda x: x.split('I')[0])
mydata['Display(inch)'] = mydata['Display(inch)'].astype(str).astype(float)
mydata['Display(inch)'].value_counts()

15.6    258
14.0    176
13.3     58
0.0      32
13.0      5
13.4      3
17.3      3
16.0      3
1.0       3
12.0      2
11.6      1
15.0      1
13.6      1
10.0      1
13.5      1
12.3      1
10.1      1
Name: Display(inch), dtype: int64

In [21]:
mydata.drop(['Display'],axis=1,inplace=True)
mydata.shape

(550, 10)

In [22]:
#Processor
mydata['ProcessorCore']=mydata['Processor'].astype(str).apply(lambda x: x.split('(')[0])
mydata.drop(['Processor'],inplace=True,axis=1)
mydata.dtypes

Operating System     object
Storage              object
Warranty              int32
Price                 int32
rating              float64
RAMCapacity          object
RAMType              object
Brand                object
Display(inch)       float64
ProcessorCore        object
dtype: object

In [23]:
mydata.drop(mydata.loc[mydata['RAMType']=='dable SSD Upto '].index,inplace=True)
mydata['RAMCapacity'] = mydata['RAMCapacity'].astype(str).astype(int)
mydata.dtypes

Operating System     object
Storage              object
Warranty              int32
Price                 int32
rating              float64
RAMCapacity           int32
RAMType              object
Brand                object
Display(inch)       float64
ProcessorCore        object
dtype: object

# Data Visualization

In [24]:
mydata.head()

Unnamed: 0,Operating System,Storage,Warranty,Price,rating,RAMCapacity,RAMType,Brand,Display(inch),ProcessorCore
0,Windows,1024 GB HDD,1,43990,3.9,8,DDR4 RAM,Lenovo,15.6,Intel Core i5 Processor
1,Windows,256 GB SSD,1,43990,4.2,8,DDR4 RAM,Lenovo,14.0,Intel Core i3 Processor
2,Windows,256 GB SSD,1,31490,4.6,8,DDR4 RAM,HP,14.0,Intel Pentium Quad Core Processor
3,Windows,256 GB SSD,1,40990,4.1,8,DDR4 RAM,HP,14.0,Intel Core i3 Processor
4,Windows,1024 GB HDD,1,27490,4.1,4,DDR4 RAM,HP,15.6,AMD Athlon Dual Core Processor


# Dummy Variables

In [25]:
# Defining the function
def dummies(x,df):
    temp = pd.get_dummies(df[x], drop_first = True)
    df = pd.concat([df, temp], axis = 1)
    df.drop([x], axis = 1, inplace = True)
    return df
# Applying the function to mydata
mydata = dummies('Operating System',mydata)
mydata = dummies('Storage',mydata)
mydata = dummies('RAMType',mydata)
mydata = dummies('Brand',mydata)
mydata = dummies('ProcessorCore',mydata)

In [26]:
mydata.head()

Unnamed: 0,Warranty,Price,rating,RAMCapacity,Display(inch),DOS,Mac,Windows,1024 GB HDD|128 GB SSD,1024 GB HDD|256 GB SSD,...,AMD Ryzen 9 Octa Core Processor,Intel Celeron Dual Core Processor,Intel Core i3 Processor,Intel Core i5 Processor,Intel Core i7 Processor,Intel Core i9 Processor,Intel Evo platform feat 11th Gen Intel Core i5 processor,Intel Evo platform feat 11th Gen Intel Core i7 processor,Intel Pentium Quad Core Processor,Microsoft Core i5 Processor
0,1,43990,3.9,8,15.6,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,43990,4.2,8,14.0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,31490,4.6,8,14.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,40990,4.1,8,14.0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,27490,4.1,4,15.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#Train Test Splitting
np.random.seed(0)
mydata_train, mydata_test = train_test_split(mydata,test_size = 0.3, random_state = 100)

In [31]:
#Scaling
scaler = MinMaxScaler()
num_vars = ['Warranty', 'Price', 'rating', 'RAMCapacity', 'Display(inch)']
mydata_train[num_vars] = scaler.fit_transform(mydata_train[num_vars])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mydata_train[num_vars] = scaler.fit_transform(mydata_train[num_vars])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [32]:
mydata_train.head()

Unnamed: 0,Warranty,Price,rating,RAMCapacity,Display(inch),DOS,Mac,Windows,1024 GB HDD|128 GB SSD,1024 GB HDD|256 GB SSD,...,AMD Ryzen 9 Octa Core Processor,Intel Celeron Dual Core Processor,Intel Core i3 Processor,Intel Core i5 Processor,Intel Core i7 Processor,Intel Core i9 Processor,Intel Evo platform feat 11th Gen Intel Core i5 processor,Intel Evo platform feat 11th Gen Intel Core i7 processor,Intel Pentium Quad Core Processor,Microsoft Core i5 Processor
357,0.0,0.037383,0.875,0.0,0.901734,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
46,0.0,0.018692,0.875,0.0,0.901734,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
50,0.0,0.149533,0.825,0.428571,0.809249,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
363,0.0,0.080997,1.0,0.142857,0.809249,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
19,0.0,0.084112,0.85,0.142857,0.901734,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [34]:
mydata_train.describe()

Unnamed: 0,Warranty,Price,rating,RAMCapacity,Display(inch),DOS,Mac,Windows,1024 GB HDD|128 GB SSD,1024 GB HDD|256 GB SSD,...,AMD Ryzen 9 Octa Core Processor,Intel Celeron Dual Core Processor,Intel Core i3 Processor,Intel Core i5 Processor,Intel Core i7 Processor,Intel Core i9 Processor,Intel Evo platform feat 11th Gen Intel Core i5 processor,Intel Evo platform feat 11th Gen Intel Core i7 processor,Intel Pentium Quad Core Processor,Microsoft Core i5 Processor
count,384.0,384.0,384.0,384.0,384.0,384.0,384.0,384.0,384.0,384.0,...,384.0,384.0,384.0,384.0,384.0,384.0,384.0,384.0,384.0,384.0
mean,0.022305,0.199331,0.821484,0.219494,0.795249,0.002604,0.039062,0.950521,0.013021,0.1875,...,0.007812,0.018229,0.151042,0.388021,0.25,0.015625,0.002604,0.002604,0.002604,0.002604
std,0.105894,0.142171,0.182152,0.17481,0.211498,0.051031,0.193996,0.217149,0.113511,0.390822,...,0.088157,0.133954,0.358557,0.487935,0.433578,0.124181,0.051031,0.051031,0.051031,0.051031
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.087227,0.75,0.142857,0.809249,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.155763,0.875,0.142857,0.809249,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.271495,0.90625,0.428571,0.901734,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
#Correlation using heatmap
#plt.figure(figsize = (30, 25))
#sns.heatmap(mydata_train.corr(),annot=True)
#plt.show()


In [45]:
#Splitting to X and y
X_train=mydata_train.drop(['Price'],axis=1)
y_train=mydata_train['Price']

In [50]:
X_test=mydata_test.drop(['Price'],axis=1)
y_test=mydata_test['Price']

In [52]:
len(X_test)

165

In [74]:
lr_model = LinearRegression()
lr_model.fit(X_train,y_train)
lr_model.score(X_train,y_train)

0.5130798288004899

In [48]:
model=DecisionTreeRegressor()
model.fit(X_train,y_train)
model.score(X_train,y_train)

0.9940259107783836

In [61]:
y_pred=model.predict(X_test)

In [63]:
from sklearn import metrics
print('MAE:',metrics.mean_absolute_error(y_test, y_pred))
print('MSE:',metrics.mean_squared_error(y_test, y_pred))
print('RMSE:',np.sqrt(metrics.mean_absolute_error(y_test, y_pred)))


MAE: 79646.7582346833
MSE: 8264520299.2426405
RMSE: 282.2175725122078


In [59]:
from sklearn.ensemble import RandomForestRegressor
reg=RandomForestRegressor(n_estimators=100,random_state=8)
reg.fit(X_train,y_train)
print(reg.score(X_train,y_train))
pred1=reg.predict(X_test)

0.8862718001119886


In [64]:

print('MAE:',metrics.mean_absolute_error(y_test,pred1))
print('MSE:',metrics.mean_squared_error(y_test,pred1))
print('RMSE:',np.sqrt(metrics.mean_absolute_error(y_test,pred1)))


MAE: 79646.82134490437
MSE: 8264529211.538507
RMSE: 282.2176843234746
