In [124]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Loading CSV

In [19]:
laptop= pd.read_csv('data.csv')

#### checking if there's any missing values

In [44]:
laptop.isna().sum()

Unnamed: 0.1         0
Unnamed: 0           0
brand                0
name                 0
price                0
spec_rating          0
processor            0
CPU                  0
Ram                  0
Ram_type             0
ROM                  0
ROM_type             0
GPU                  0
display_size         0
resolution_width     0
resolution_height    0
OS                   0
warranty             0
dtype: int64

In [115]:
laptop

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,brand,name,price,spec_rating,processor,CPU,Ram,Ram_type,ROM,ROM_type,GPU,display_size,resolution_width,resolution_height,OS,warranty,ROM SIZE FORMAT
0,0,0,HP,Victus 15-fb0157AX Gaming Laptop,49900,73.0,5th Gen AMD Ryzen 5 5600H,"Hexa Core, 12 Threads",8,DDR4,512,SSD,4GB AMD Radeon RX 6500M,15.6,1920.0,1080.0,Windows 11 OS,1,GB
1,1,1,HP,15s-fq5007TU Laptop,39900,60.0,12th Gen Intel Core i3 1215U,"Hexa Core (2P + 4E), 8 Threads",8,DDR4,512,SSD,Intel UHD Graphics,15.6,1920.0,1080.0,Windows 11 OS,1,GB
2,2,2,Acer,One 14 Z8-415 Laptop,26990,69.0,11th Gen Intel Core i3 1115G4,"Dual Core, 4 Threads",8,DDR4,512,SSD,Intel Iris Xe Graphics,14.0,1920.0,1080.0,Windows 11 OS,1,GB
3,3,3,Lenovo,Yoga Slim 6 14IAP8 82WU0095IN Laptop,59729,66.0,12th Gen Intel Core i5 1240P,"12 Cores (4P + 8E), 16 Threads",16,LPDDR5,512,SSD,Intel Integrated Iris Xe,14.0,2240.0,1400.0,Windows 11 OS,1,GB
4,4,4,Apple,MacBook Air 2020 MGND3HN Laptop,69990,69.0,Apple M1,Octa Core (4P + 4E),8,DDR4,256,SSD,Apple M1 Integrated Graphics,13.3,2560.0,1600.0,Mac OS,1,GB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,926,1015,Asus,Vivobook 15X 2023 K3504VAB-NJ321WS Laptop,44990,69.0,13th Gen ‎Intel Core i3 1315U,"Hexa Core (2P + 4E), 8 Threads",8,DDR4,512,SSD,Integrated Intel UHD Graphics,15.6,1920.0,1080.0,Windows 11 OS,1,GB
889,927,1016,Asus,TUF A15 FA577RM-HQ032WS Laptop,110000,71.0,6th Gen AMD Ryzen 7 6800H,"Octa Core, 16 Threads",16,DDR,1000,SSD,6GB NVIDIA GeForce RTX 3060,15.6,2560.0,1440.0,Windows 11 OS,1,TB
890,928,1017,Asus,ROG Zephyrus G14 2023 GA402XV-N2034WS Gaming L...,189990,89.0,7th Gen AMD Ryzen 9 7940HS,"Octa Core, 16 Threads",32,DDR5,1000,SSD,8GB NVIDIA GeForce RTX 4060,14.0,2560.0,1600.0,Windows 11 OS,1,TB
891,929,1018,Asus,TUF Gaming F15 2023 FX507VU-LP083WS Gaming Laptop,129990,73.0,13th Gen Intel Core i7 13700H,"14 Cores (6P + 8E), 20 Threads",16,DDR4,512,SSD,6GB NVIDIA GeForce RTX 4050,15.6,1920.0,1080.0,Windows 11 OS,1,GB


##### Cleaning the dataset

In [47]:
laptop['spec_rating']=laptop['spec_rating'].round()

In [50]:
laptop['Ram']=laptop['Ram'].str.replace('[GB,TB]','',regex=True).astype('int64')

In [78]:
laptop['ROM SIZE FORMAT']=laptop['ROM'].str.extract(r'([A-Z a-z]+)')

In [79]:
laptop['ROM']=laptop['ROM'].str.extract(r'(\d+)').astype('int64')
#laptop['ROM']=laptop['ROM'].str.replace('GB|TB','',regex=True)


In [84]:
laptop.loc[laptop['ROM SIZE FORMAT'].str.contains('TB'),'ROM']*=1000

#### Assigning Train and Test data

In [101]:
X=laptop[['brand','spec_rating','Ram','Ram_type','ROM','ROM_type','display_size','OS','warranty','processor']]

In [116]:
Y=laptop['price']

### Encoding the categorical column to numerical using onehotencoder with pipline

In [108]:
category_transform=Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False))])

In [111]:
categories=['brand','Ram_type','ROM_type','OS']
transformer=ColumnTransformer(transformers=[('category encoding',category_transform,categories)])

In [114]:
np.random.seed(42)
model=Pipeline(steps=[('preprocess',transformer),('model',RandomForestRegressor(n_estimators=100))])

#### spliting training and testing set of data

In [117]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2)

#### training model

In [118]:
model.fit(x_train,y_train)

#### model prediction

In [125]:
y_pred=model.predict(x_test)

#### model estimation

In [126]:
model.score(x_test,y_test)
print(f'Mean Absolute Error:{mean_absolute_error(y_test,y_pred)}')

Mean Absolute Error:27992.007417155142


In [127]:
print(f'Mean Squared Error:{mean_squared_error(y_test,y_pred)}')

Mean Squared Error:1954231304.1126058


#### choosing best hyperparameters

In [132]:
pipe_grid = {
    'model__n_estimators': [100, 200, 1000],
    'model__max_depth': [None, 5],
    'model__max_features': ['sqrt', None],
    'model__min_samples_split': [2, 4]
}
gs_model=GridSearchCV(model, param_grid=pipe_grid, cv=5, verbose=2, n_jobs=1)

In [133]:
gs_model.fit(x_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=200; total time=   0.9s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=200; total time=   0.9s
[CV] END model__max_depth=No

In [135]:
gs_model.best_params

{'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 4,
 'model__n_estimators': 100}

#### best hyperparameter is not best enough :)
![](bonk.jpg)

In [136]:
gs_pred=gs_model.predict(x_test)
print(f'Mean Absolute Error:{mean_absolute_error(y_test,gs_pred)}')
print(f'Mean Squared Error:{mean_squared_error(y_test,gs_pred)}')


Mean Absolute Error:28030.287961841346
Mean Squared Error:1923562759.908359


In [139]:
y_pred[:5],y_test[:5]

(array([111930.02937921,  60520.38745989,  63403.08965165,  63403.08965165,
        147399.56204882]),
 710     83090
 440     57580
 525     58990
 721    142990
 39      74990
 Name: price, dtype: int64)