# Price Prediction using Machine Learning

# 1- Import data  :

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
import re

In [3]:
df=pd.read_csv("data_laptop_clean")
df.head(5)

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,IPS/Touchscreen,width,height,Memory_plusCapacite,Mermory_plusType,Mermory_Type
0,Apple,Ultrabook,13.3,Intel Core i5,8,128.0,Intel Iris,OS,1.37,71378.6832,IPS,2560,1600,0.0,Any,SSD
1,Apple,Ultrabook,13.3,Intel Core i5,8,128.0,Intel HD,OS,1.34,47895.5232,Touchscreen,1440,900,0.0,Any,Flash
2,HP,Notebook,15.6,Intel Core i5,8,256.0,Intel HD,No OS,1.86,30636.0,Touchscreen,1920,1080,0.0,Any,SSD
3,Apple,Ultrabook,15.4,Intel Core i7,16,512.0,AMD Radeon,OS,1.83,135195.336,IPS,2880,1800,0.0,Any,SSD
4,Apple,Ultrabook,13.3,Intel Core i5,8,256.0,Intel Iris,OS,1.37,96095.808,IPS,2560,1600,0.0,Any,SSD


# 2 Split data

Splitting the data into an **output**, which is the **price**, and **inputs**, which are the remaining columns.  
Additionally, the data will be divided into a **training set** and a **testing set**.

In [4]:
X = df[['Company', 'TypeName', 'Inches', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys',
       'Weight', 'Memory_plusCapacite', 'Mermory_plusType',
       'Mermory_Type', 'IPS/Touchscreen', 'width', 'height']]
y = df['Price']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

The columns will be divided into **those with numerical values** and **those with textual values**.

In [7]:
numeric_features = ['Inches', 'Ram', 'Memory','Weight', 'Memory_plusCapacite', 'width', 'height']
categorical_features = ['Company', 'TypeName', 'Cpu','Gpu', 'OpSys','Mermory_plusType','Mermory_Type', 'IPS/Touchscreen']

In [8]:
# Import package sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [9]:
# Create Pipeline 
num_pipeline=Pipeline(steps=[
    ("inpute",SimpleImputer(strategy="mean")),
    ("scale",StandardScaler())
])
cat_pipeline=Pipeline(steps=[
    ("inpute",SimpleImputer(strategy="most_frequent")),
    ("one-hot-encoder",OneHotEncoder(handle_unknown='ignore'))
])

In [10]:
# Create preprocessor

preprocessor=ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline,numeric_features),
    ('cat_pipeline',cat_pipeline,categorical_features)
 ])

In [11]:
# Import package Models regrssion 
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor ,GradientBoostingRegressor
from sklearn.linear_model import Ridge ,Lasso ,ElasticNet,LinearRegression


In [12]:
# Grouping Models 
models = {
    'lg': LinearRegression(),
    'rd': Ridge(),
    'ls': Lasso(max_iter=5000),
    'en': ElasticNet(max_iter=10000),
    'svr': SVR(),
    'dtg': DecisionTreeRegressor(),
    'rfr': RandomForestRegressor(),
    'gbr': GradientBoostingRegressor()
}
for key, value in models.items():
    print(f"{key} = {value.__class__.__name__}()")

lg = LinearRegression()
rd = Ridge()
ls = Lasso()
en = ElasticNet()
svr = SVR()
dtg = DecisionTreeRegressor()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()


In [13]:
# Hyperparameters Tuning
param_grids = {
    'lg': {},
    'rd': {
        'cls__alpha': [0.01, 0.1, 1, 10]},  # معلمات Ridge
    'ls': {
        'cls__alpha': [0.01, 0.1, 1, 10]},  # معلمات Lasso
    'en':{
        'cls__alpha': [0.01, 0.1, 1, 10, 100],
        'cls__l1_ratio': [0.1, 0.5, 0.7, 0.9, 1]},  # معلمات ElasticNet
    'svr': {
        'cls__C': [1, 2, 3],
        'cls__epsilon': [0.5, 1.0, 1.5]},  # معلمات SVR
    'dtg': {
        'cls__max_depth': [3, 5, 10, None],
        'cls__min_samples_split': [2, 5, 10],
       # 'cls__min_samples_leaf': [1, 2, 4],
       # 'cls__max_features': [None, 'sqrt', 'log2'],
        #'cls__criterion': ['squared_error', 'friedman_mse', 'absolute_error']
    }, 
    'rfr': {
        'cls__n_estimators': [50, 100, 200],
        'cls__max_depth': [None, 10, 20, 30],
        'cls__min_samples_split': [2, 5, 10],
        #'cls__min_samples_leaf': [1, 2, 4],
        #'cls__max_features': ['sqrt', 'log2'],
        #'cls__bootstrap': [True, False],
        #'cls__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
        },  
    'gbr': {
        'cls__n_estimators': [50, 100, 200],
        'cls__learning_rate': [0.01, 0.1, 0.2],
        'cls__max_depth': [3, 5, 10],
        #'cls__min_samples_split': [2, 5, 10],
        #'cls__min_samples_leaf': [1, 2, 4],
        #'cls__subsample': [0.8, 1.0],
        #'cls__loss': ['squared_error', 'absolute_error'],
        #'cls__max_features': ['sqrt', 'log2'],
        #'cls__warm_start': [True, False]
    }  
}


# Model Re-Building

**Model LinearRegression()  'lg'**

In [14]:
#Create Pipepline with model linear regression 
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['lg'])])

In [15]:
pipeline.fit(X_train, y_train)

In [16]:
param_grid = param_grids['lg']

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [19]:
grid.fit(X_train, y_train)

In [20]:
y_pred = grid.predict(X_test)

In [21]:
# Evalution model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [22]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [23]:
eval_linearRegression=[mse , rmse ,r2 , mae]

**Model Ridge() 'rd'**

In [24]:
#Create Pipepline with model ridge
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['rd'])])

In [25]:
pipeline.fit(X_train, y_train)

In [26]:
param_grid = param_grids['rd']

In [27]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [28]:
grid.fit(X_train, y_train)

In [29]:
best_model_rd = grid.best_estimator_


In [30]:
best_model_rd

In [31]:
y_pred = best_model_rd.predict(X_test)

In [32]:
# Evalution Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)


In [33]:
eval_ridge=[mse , rmse ,r2 , mae]       

**Model ElasticNet() 'en'**

In [34]:
#Create Pipepline with model ElasticNet
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['en'])])

In [35]:
pipeline.fit(X_train, y_train)

In [36]:
param_grid = param_grids['en']

In [37]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [38]:
grid.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [39]:
grid.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [40]:
best_model = grid.best_estimator_

In [41]:
y_pred = best_model.predict(X_test)

In [42]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)


In [43]:
eval_ElasticNet=[mse , rmse ,r2 , mae]     

**Model Lasso() 'ls'**

In [44]:
#Create Pipepline with model lasso
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['rd'])])

In [45]:
pipeline.fit(X_train, y_train)

In [46]:
param_grid = param_grids['rd']

In [47]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [48]:
grid.fit(X_train, y_train)

In [49]:
best_model = grid.best_estimator_

In [50]:
y_pred = best_model.predict(X_test)

In [51]:
# Evalution Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [52]:
eval_Lasso=[mse , rmse ,r2 , mae]         

**Model SVR() 'svr'**

In [53]:
#Create Pipepline with model SVR
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['svr'])])

In [54]:
pipeline.fit(X_train, y_train)

In [55]:
param_grid = param_grids['svr']

In [56]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [57]:
grid.fit(X_train, y_train)

In [58]:
best_model = grid.best_estimator_

In [59]:
y_pred = best_model.predict(X_test)

In [60]:
# Evalution Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [61]:
eval_SVR=[mse , rmse ,r2 , mae]                   


**Model DecisionTreeRegressor() 'dtg'**

In [62]:
#Create Pipepline with model DecisionTreeRegressor()
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['dtg'])])

In [63]:
pipeline.fit(X_train, y_train)

In [64]:
param_grid = param_grids['dtg']

In [65]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [66]:
grid.fit(X_train, y_train)

In [67]:
best_model = grid.best_estimator_

In [68]:
y_pred = best_model.predict(X_test)

In [69]:
# Evalution Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [70]:
eval_DecisionTreeRegressor=[mse , rmse ,r2 , mae]       

**Model RandomForestRegressor() 'rfr'**

In [71]:
#Create Pipepline with model RandomForestRegressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['rfr'])])

In [72]:
pipeline.fit(X_train, y_train)

In [73]:
param_grid = param_grids['rfr']

In [74]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [75]:
grid.fit(X_train, y_train)

In [76]:
best_model = grid.best_estimator_

In [77]:
y_pred = best_model.predict(X_test)

In [78]:
# Evalution Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [79]:
eval_RandomForestRegressor=[mse , rmse ,r2 , mae]            

**Model GradientBoostingRegressor() 'gbr'**

In [80]:
#Create Pipepline with model GradientBoostingRegressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    
    ('cls', models['gbr'])])

In [81]:
pipeline.fit(X_train, y_train)

In [82]:
param_grid = param_grids['gbr']

In [83]:
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

In [84]:
grid.fit(X_train, y_train)

In [85]:
best_model = grid.best_estimator_

In [86]:
y_pred = best_model.predict(X_test)

In [87]:
# Evalution Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [88]:
eval_GradientBoostingRegressor=[mse , rmse ,r2 , mae]

In [89]:
evalution_table = pd.DataFrame(
    [eval_linearRegression, eval_ridge, eval_ElasticNet,
     eval_Lasso, eval_SVR, eval_DecisionTreeRegressor,
     eval_RandomForestRegressor, eval_GradientBoostingRegressor], 
    index=['eval_linearRegression', 'eval_ridge', 'eval_ElasticNet',
           'eval_Lasso', 'eval_SVR', 'eval_DecisionTreeRegressor',
           'eval_RandomForestRegressor', 'eval_GradientBoostingRegressor'],
    columns=['MSE', 'RMSE', 'R²', 'MAE']
)


In [90]:
pd.options.display.float_format = '{:.2f}'.format

In [91]:
evalution_table

Unnamed: 0,MSE,RMSE,R²,MAE
eval_linearRegression,348920196.99,18679.41,0.77,13174.24
eval_ridge,371755603.87,19280.96,0.76,13478.15
eval_ElasticNet,363857859.93,19075.06,0.76,13369.09
eval_Lasso,371755603.87,19280.96,0.76,13478.15
eval_SVR,1605626681.55,40070.27,-0.06,27922.14
eval_DecisionTreeRegressor,516223482.71,22720.55,0.66,13848.25
eval_RandomForestRegressor,296242606.43,17211.7,0.81,10875.05
eval_GradientBoostingRegressor,228230634.81,15107.3,0.85,10102.83


Let's evaluate the models based on the provided metrics:

Metrics Explanation:
MSE (Mean Squared Error): Measures the average squared difference between actual and predicted values. Lower is better.
RMSE (Root Mean Squared Error): Square root of MSE; interpretable in the same units as the target variable. Lower is better.
R² (Coefficient of Determination): Measures how well the model explains the variance in the target variable. Values closer to 1 are better, and negative values indicate poor performance.
MAE (Mean Absolute Error): Measures the average absolute difference between actual and predicted values. Lower is better.

# Recommended Model:
Gradient Boosting Regressor is the clear winner:
Best R² (0.85), indicating it explains 85% of the variance in the target variable.
Lowest MSE, RMSE, and MAE among all models.
If computational efficiency and interpretability are priorities, Random Forest Regressor is also a strong candidate, performing slightly worse but still reliable.

# Saving the model for creating an application.

In [95]:
import pickle
with open('model_GradientBoostingRegressor.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# In the end, this equation combines the previous steps into a single code implementation.

In [93]:
def train_and_evaluate(models, param_grids, X_train, y_train, X_test, y_test):
    best_models = {}
    results = []

    for model_name in models:
        model = models[model_name]
        param_grid = param_grids.get(model_name, {})
        
        
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('cls', model)])
        
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, error_score='raise')
        try:
            grid_search.fit(X_train, y_train)
        except ValueError as e:
            print(f"Error in model {model_name}: {e}")
            continue
        
    
        best_params = grid_search.best_params_
        
        best_model = grid_search.best_estimator_

        y_pred = best_model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        results.append({
            'Model': model_name,
            'Best Params': best_params,
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2,
            'MAE': mae
        })
        
        best_models[model_name] = best_model

    return results, best_models

In [94]:
results, best_models = train_and_evaluate(models, param_grids, X_train, y_train, X_test, y_test)

for result in results:
    print(f"Model: {result['Model']}")
    print(f"Best Params: {result['Best Params']}")
    print(f"MSE: {result['MSE']}")
    print(f"RMSE: {result['RMSE']}")
    print(f"R2: {result['R2']}")
    print(f"MAE: {result['MAE']}")
    print("="*50)

Model: lg
Best Params: {}
MSE: 348920196.9923889
RMSE: 18679.405691627046
R2: 0.7704620362395747
MAE: 13174.2416143293
Model: rd
Best Params: {'cls__alpha': 10}
MSE: 371755603.87114125
RMSE: 19280.96480654278
R2: 0.7554397106712328
MAE: 13478.145351958252
Model: ls
Best Params: {'cls__alpha': 10}
MSE: 351181764.11233866
RMSE: 18739.844292638576
R2: 0.7689742590455474
MAE: 13171.621830321343
Model: en
Best Params: {'cls__alpha': 0.01, 'cls__l1_ratio': 0.5}
MSE: 363857859.9255298
RMSE: 19075.058582492737
R2: 0.7606352599091477
MAE: 13369.091153991369
Model: svr
Best Params: {'cls__C': 3, 'cls__epsilon': 0.5}
MSE: 1605626681.552376
RMSE: 40070.27179284383
R2: -0.05626524981865866
MAE: 27922.137332805112
Model: dtg
Best Params: {'cls__max_depth': None, 'cls__min_samples_split': 10}
MSE: 512587386.03091484
RMSE: 22640.39279762864
R2: 0.6627931949683561
MAE: 13459.993118348557
Model: rfr
Best Params: {'cls__max_depth': 30, 'cls__min_samples_split': 2, 'cls__n_estimators': 50}
MSE: 300862719.