In [289]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [290]:
data=pd.read_csv('laptop_data.csv')

In [291]:
data.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [292]:
data.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

In [293]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


In [294]:
#Data preprocessing

In [295]:
data.reset_index(inplace=True)

In [296]:
data.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

In [297]:
data.describe()

Unnamed: 0,index,Inches,Price
count,1303.0,1303.0,1303.0
mean,651.0,15.017191,59870.04291
std,376.28801,1.426304,37243.201786
min,0.0,10.1,9270.72
25%,325.5,14.0,31914.72
50%,651.0,15.6,52054.56
75%,976.5,15.6,79274.2464
max,1302.0,18.4,324954.72


In [298]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(feature_range=(0,1))

In [299]:
columns_to_scale=['Inches','Price']

In [300]:
data[columns_to_scale]=scaler.fit_transform(data[columns_to_scale])

In [301]:
data['Ram']=data['Ram'].str.replace('GB','')

In [302]:
data['Ram']=data['Ram'].astype('int32')

In [303]:
data['Weight']=data['Weight'].str.replace('kg','')

In [304]:
data['Weight']=data['Weight'].astype('float')

In [305]:
columns_to_scale=['Ram','Weight']
data[columns_to_scale]=scaler.fit_transform(data[columns_to_scale])

In [306]:
def processor(x):
  if 'Intel Core i7' in x:
    return 'Intel Core i7'
  elif  'Intel Core i5' in x:
    return 'Intel Core i5'
  elif  'Intel Core i3' in x:
    return 'Intel Core i3'
  else:
    return 'Other'

In [307]:
data['Processor']=data['Cpu'].apply(processor)

In [308]:
# Apply split to each element in the 'Gpu' column and extract the first word
data['Gpu'] = data['Gpu'].apply(lambda x: str(x).split()[0] if pd.notnull(x) else x)


In [309]:
data.head()

Unnamed: 0,index,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Processor
0,0,Apple,Ultrabook,0.385542,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,0.096774,128GB SSD,Intel,macOS,0.169576,0.196741,Intel Core i5
1,1,Apple,Ultrabook,0.385542,1440x900,Intel Core i5 1.8GHz,0.096774,128GB Flash Storage,Intel,macOS,0.162095,0.122353,Intel Core i5
2,2,HP,Notebook,0.662651,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,0.096774,256GB SSD,Intel,No OS,0.291771,0.067679,Intel Core i5
3,3,Apple,Ultrabook,0.638554,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,0.225806,512GB SSD,AMD,macOS,0.284289,0.398895,Intel Core i7
4,4,Apple,Ultrabook,0.385542,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,0.096774,256GB SSD,Intel,macOS,0.169576,0.275038,Intel Core i5


In [310]:
import re
# Extract only the SSD portion using regex with error handling
data['SSD'] = data['Memory'].apply(lambda x: re.search(r'(\d+GB) SSD', str(x)).group(1) if pd.notnull(x) and re.search(r'(\d+GB) SSD', str(x)) else None)


In [311]:
import re
# Extract only the HDD portion using regex with error handling
data['HDD'] = data['Memory'].apply(lambda x: re.search(r'(\d+GB) HDD', str(x)).group(1) if pd.notnull(x) and re.search(r'(\d+GB) HDD', str(x)) else None)


In [312]:
def ips(text):
  if 'IPS' in text:
    return 1
  else:
    return 0

In [313]:
def Screen(text):
  if 'Touchscreen' in text:
    return 1
  else:
    return 0

In [314]:
data['IPS']=data['ScreenResolution'].apply(ips)
data['Touchscreen']=data['ScreenResolution'].apply(Screen)

In [315]:
import re
# Assuming 'data' is your DataFrame
data['ScreenResolution'] = data['ScreenResolution'].apply(lambda x: re.search(r'\d+x\d+', str(x)).group() if pd.notnull(x) else x)

In [316]:
data.drop(columns=['Cpu','Memory','index'], inplace=True)

In [317]:
data=data.rename(columns={'Company':'Brand','TypeName':'Laptop Type','Inches':'Screen Size','OpSys':'OS','Hard Drive':'HDD','IPS':'Ips Display'})

In [318]:
data=data.rename(columns={'HDD':'Hard Drive'})

In [319]:
data

Unnamed: 0,Brand,Laptop Type,Screen Size,ScreenResolution,Ram,Gpu,OS,Weight,Price,Processor,SSD,Hard Drive,Ips Display,Touchscreen
0,Apple,Ultrabook,0.385542,2560x1600,0.096774,Intel,macOS,0.169576,0.196741,Intel Core i5,128GB,,1,0
1,Apple,Ultrabook,0.385542,1440x900,0.096774,Intel,macOS,0.162095,0.122353,Intel Core i5,,,0,0
2,HP,Notebook,0.662651,1920x1080,0.096774,Intel,No OS,0.291771,0.067679,Intel Core i5,256GB,,0,0
3,Apple,Ultrabook,0.638554,2880x1800,0.225806,AMD,macOS,0.284289,0.398895,Intel Core i7,512GB,,1,0
4,Apple,Ultrabook,0.385542,2560x1600,0.096774,Intel,macOS,0.169576,0.275038,Intel Core i5,256GB,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,0.469880,1920x1080,0.032258,Intel,Windows 10,0.276808,0.078312,Intel Core i7,128GB,,1,1
1299,Lenovo,2 in 1 Convertible,0.385542,3200x1800,0.225806,Intel,Windows 10,0.152120,0.223629,Intel Core i7,512GB,,1,1
1300,Lenovo,Notebook,0.469880,1366x768,0.000000,Intel,Windows 10,0.201995,0.009283,Other,,,0,0
1301,HP,Notebook,0.662651,1366x768,0.064516,AMD,Windows 10,0.374065,0.099578,Intel Core i7,,,0,0


In [320]:
data['Hard Drive']=data['Hard Drive'].str.replace('GB','')

In [321]:
data['SSD']=data['SSD'].str.replace('GB','')

In [322]:
columns_to_scale=['SSD','Hard Drive']
data[columns_to_scale]=scaler.fit_transform(data[columns_to_scale])

In [323]:
len(data)

1303

In [324]:
data['Hard Drive'].isnull().sum()

1167

In [325]:
data['SSD'].isnull().sum()

476

In [326]:
data['Hard Drive']=data['Hard Drive'].fillna(0)

In [327]:
data['SSD']=data['SSD'].fillna(0)

In [328]:
data['Hard Drive'].value_counts()

Unnamed: 0_level_0,count
Hard Drive,Unnamed: 1_level_1
0.0,1168
1.0,134
0.205128,1


In [329]:
data.columns

Index(['Brand', 'Laptop Type', 'Screen Size', 'ScreenResolution', 'Ram', 'Gpu',
       'OS', 'Weight', 'Price', 'Processor', 'SSD', 'Hard Drive',
       'Ips Display', 'Touchscreen'],
      dtype='object')

In [330]:
data['ScreenResolution'].value_counts()

Unnamed: 0_level_0,count
ScreenResolution,Unnamed: 1_level_1
1920x1080,841
1366x768,308
3840x2160,43
3200x1800,27
2560x1440,23
1600x900,23
2560x1600,6
2304x1440,6
2256x1504,6
1920x1200,5


In [331]:
data.head()

Unnamed: 0,Brand,Laptop Type,Screen Size,ScreenResolution,Ram,Gpu,OS,Weight,Price,Processor,SSD,Hard Drive,Ips Display,Touchscreen
0,Apple,Ultrabook,0.385542,2560x1600,0.096774,Intel,macOS,0.169576,0.196741,Intel Core i5,0.238095,0.0,1,0
1,Apple,Ultrabook,0.385542,1440x900,0.096774,Intel,macOS,0.162095,0.122353,Intel Core i5,0.0,0.0,0,0
2,HP,Notebook,0.662651,1920x1080,0.096774,Intel,No OS,0.291771,0.067679,Intel Core i5,0.492063,0.0,0,0
3,Apple,Ultrabook,0.638554,2880x1800,0.225806,AMD,macOS,0.284289,0.398895,Intel Core i7,1.0,0.0,1,0
4,Apple,Ultrabook,0.385542,2560x1600,0.096774,Intel,macOS,0.169576,0.275038,Intel Core i5,0.492063,0.0,1,0


In [332]:
data = data.sample(frac=1, random_state=42)

In [333]:
data['Gpu'].value_counts()

Unnamed: 0_level_0,count
Gpu,Unnamed: 1_level_1
Intel,722
Nvidia,400
AMD,180
ARM,1


In [334]:
data = pd.get_dummies(data, columns=['Brand', 'Laptop Type', 'ScreenResolution', 'Gpu', 'OS', 'Processor'])

In [335]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1303 entries, 479 to 1126
Data columns (total 65 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Screen Size                     1303 non-null   float64
 1   Ram                             1303 non-null   float64
 2   Weight                          1303 non-null   float64
 3   Price                           1303 non-null   float64
 4   SSD                             1303 non-null   float64
 5   Hard Drive                      1303 non-null   float64
 6   Ips Display                     1303 non-null   int64  
 7   Touchscreen                     1303 non-null   int64  
 8   Brand_Acer                      1303 non-null   bool   
 9   Brand_Apple                     1303 non-null   bool   
 10  Brand_Asus                      1303 non-null   bool   
 11  Brand_Chuwi                     1303 non-null   bool   
 12  Brand_Dell                      1303 

In [336]:
data.head()

Unnamed: 0,Screen Size,Ram,Weight,Price,SSD,Hard Drive,Ips Display,Touchscreen,Brand_Acer,Brand_Apple,...,OS_Mac OS X,OS_No OS,OS_Windows 10,OS_Windows 10 S,OS_Windows 7,OS_macOS,Processor_Intel Core i3,Processor_Intel Core i5,Processor_Intel Core i7,Processor_Other
479,0.385542,0.096774,0.089776,0.252827,0.492063,0.0,1,0,False,False,...,False,False,True,False,False,False,False,True,False,False
1022,0.385542,0.096774,0.104738,0.164557,0.492063,0.0,1,0,False,False,...,False,False,True,False,False,False,False,True,False,False
298,0.662651,0.064516,0.426434,0.054852,0.0,0.0,0,0,False,False,...,False,False,True,False,False,False,False,False,False,True
1265,0.662651,0.096774,0.476309,0.122363,0.0,0.0,1,0,False,False,...,False,False,True,False,False,False,False,False,True,False
774,0.289157,0.032258,0.142145,0.180591,0.492063,0.0,0,0,False,False,...,False,False,True,False,False,False,False,True,False,False


In [337]:
data.columns

Index(['Screen Size', 'Ram', 'Weight', 'Price', 'SSD', 'Hard Drive',
       'Ips Display', 'Touchscreen', 'Brand_Acer', 'Brand_Apple', 'Brand_Asus',
       'Brand_Chuwi', 'Brand_Dell', 'Brand_Fujitsu', 'Brand_Google',
       'Brand_HP', 'Brand_Huawei', 'Brand_LG', 'Brand_Lenovo', 'Brand_MSI',
       'Brand_Mediacom', 'Brand_Microsoft', 'Brand_Razer', 'Brand_Samsung',
       'Brand_Toshiba', 'Brand_Vero', 'Brand_Xiaomi',
       'Laptop Type_2 in 1 Convertible', 'Laptop Type_Gaming',
       'Laptop Type_Netbook', 'Laptop Type_Notebook', 'Laptop Type_Ultrabook',
       'Laptop Type_Workstation', 'ScreenResolution_1366x768',
       'ScreenResolution_1440x900', 'ScreenResolution_1600x900',
       'ScreenResolution_1920x1080', 'ScreenResolution_1920x1200',
       'ScreenResolution_2160x1440', 'ScreenResolution_2256x1504',
       'ScreenResolution_2304x1440', 'ScreenResolution_2400x1600',
       'ScreenResolution_2560x1440', 'ScreenResolution_2560x1600',
       'ScreenResolution_2736x1824', '

In [338]:
from sklearn.model_selection import train_test_split
x=data.drop(columns=['Price'])
y=data['Price']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

x_train shape: (1042, 64)
x_test shape: (261, 64)
y_train shape: (1042,)
y_test shape: (261,)


In [339]:
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)


In [340]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)


In [341]:
from sklearn.metrics import mean_squared_error, r2_score

# Calculate MSE and R²
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)


print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R²: {r2}')


MSE: 0.0019843533380916626
RMSE: 0.0445460810632278
R²: 0.8464690277072039
