In [44]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [45]:
columns = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_of_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_of_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price"
]

df=pd.read_csv('imports-85.csv',names=columns)

In [46]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


## Data Cleaning

In [47]:
df['normalized_losses'].value_counts()

normalized_losses
?      41
161    11
91      8
150     7
134     6
128     6
104     6
85      5
94      5
65      5
102     5
74      5
168     5
103     5
95      5
106     4
93      4
118     4
148     4
122     4
83      3
125     3
154     3
115     3
137     3
101     3
119     2
87      2
89      2
192     2
197     2
158     2
81      2
188     2
194     2
153     2
129     2
108     2
110     2
164     2
145     2
113     2
256     1
107     1
90      1
231     1
142     1
121     1
78      1
98      1
186     1
77      1
Name: count, dtype: int64

In [48]:
df.replace('?', np.nan, inplace=True)

In [49]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [50]:
df.isnull().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_of_doors          2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [51]:
numeric_cols = [
    "symboling", "normalized_losses", "wheel_base", "length", "width",
    "height", "curb_weight", "engine_size", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
    "highway_mpg", "price"
]

In [52]:
# Convert to numeric
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [53]:
# Fill missing numeric values
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

In [54]:
# Fill missing categorical
df['num_of_doors'].fillna(df['num_of_doors'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['num_of_doors'].fillna(df['num_of_doors'].mode()[0], inplace=True)


In [55]:
# Drop rows where target is missing (if any)
df.dropna(subset=['price'], inplace=True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    float64
 2   make               205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num_of_doors       205 non-null    object 
 6   body_style         205 non-null    object 
 7   drive_wheels       205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  num_of_cylinders   205 non-null    object 
 16  engine_size        205 non

In [59]:
X=df.drop(columns='price',axis=1)
y=df['price']

numeric_features = X.select_dtypes(exclude=['object']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [60]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

In [61]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in categorical_features:
    X[col] = le.fit_transform(X[col])

In [62]:
X.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,1.74347,-0.176503,0,1,0,1,0,2,0,-1.690772,...,2,0.074449,5,0.520489,-1.841345,-0.288349,0.173309,-0.264983,-0.646553,-0.546059
1,1.74347,-0.176503,0,1,0,1,0,2,0,-1.690772,...,2,0.074449,5,0.520489,-1.841345,-0.288349,0.173309,-0.264983,-0.646553,-0.546059
2,0.133509,-0.176503,0,1,0,1,2,2,0,-0.708596,...,3,0.604046,5,-2.403313,0.683683,-0.288349,1.263761,-0.264983,-0.953012,-0.691627
3,0.93849,1.367901,1,1,0,0,3,1,0,0.173698,...,2,-0.431076,5,-0.515795,0.459947,-0.035973,-0.054925,0.785723,-0.186865,-0.109354
4,0.93849,1.367901,1,1,0,0,3,0,0,0.10711,...,1,0.218885,5,-0.515795,0.459947,-0.540725,0.274747,0.785723,-1.106241,-1.2739


In [63]:
X.corr()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
symboling,1.0,0.457484,-0.118794,0.194311,-0.059866,0.663595,-0.596135,-0.041671,0.212471,-0.531954,...,0.197762,-0.10579,0.091163,-0.132563,-0.004928,-0.178515,0.071064,0.273851,-0.035823,0.034606
normalized_losses,0.457484,1.0,-0.231427,0.104668,-0.011273,0.34885,-0.244639,0.287316,-0.02151,-0.073709,...,0.171325,0.072536,0.197349,-0.050428,0.046739,-0.114772,0.166964,0.241134,-0.18953,-0.149357
make,-0.118794,-0.231427,1.0,-0.113191,0.054265,-0.151516,0.089494,-0.004317,0.054608,0.078505,...,-0.039937,-0.070918,0.146624,0.252095,-0.202465,0.138828,-0.055032,-0.217386,0.053642,0.050022
fuel_type,0.194311,0.104668,-0.113191,1.0,-0.401397,0.188496,-0.147853,-0.132257,0.04007,-0.308346,...,0.110617,-0.069594,0.041529,-0.054923,-0.241083,-0.984356,0.164393,0.477507,-0.255963,-0.191392
aspiration,-0.059866,-0.011273,0.054265,-0.401397,1.0,-0.052803,0.063028,0.066465,-0.057191,0.257611,...,-0.133119,0.108217,0.288086,0.213281,0.221932,0.295541,0.241193,-0.184326,-0.202362,-0.254416
num_of_doors,0.663595,0.34885,-0.151516,0.188496,-0.052803,1.0,-0.68564,0.104877,0.139129,-0.439635,...,0.155968,-0.013919,0.00698,-0.110133,0.009353,-0.171797,0.127846,0.240473,0.014271,0.037452
body_style,-0.596135,-0.244639,0.089494,-0.147853,0.063028,-0.68564,1.0,-0.155745,-0.277009,0.401362,...,-0.048408,-0.073352,-0.065079,0.011585,-0.016889,0.136243,-0.153428,-0.108709,0.031697,-0.00717
drive_wheels,-0.041671,0.287316,-0.004317,-0.132257,0.066465,0.104877,-0.155745,1.0,0.147865,0.459745,...,0.223238,0.524307,0.424686,0.48005,0.074231,0.127479,0.518147,-0.04062,-0.449581,-0.45222
engine_location,0.212471,-0.02151,0.054608,0.04007,-0.057191,0.139129,-0.277009,0.147865,1.0,-0.18779,...,0.135541,0.196826,0.105971,0.185208,-0.138705,-0.019762,0.317805,0.19819,-0.153487,-0.102026
wheel_base,-0.531954,-0.073709,0.078505,-0.308346,0.257611,-0.439635,0.401362,0.459745,-0.18779,1.0,...,-0.184596,0.569329,0.384601,0.489556,0.159684,0.249786,0.352876,-0.361338,-0.470414,-0.544082


In [64]:
y.head()

0    13495.0
1    16500.0
2    16500.0
3    13950.0
4    17450.0
Name: price, dtype: float64

In [68]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [79]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [80]:
models={
    "Linear Regression":LinearRegression(),
    "Ada boost":AdaBoostRegressor(),
    "Gradient boost":GradientBoostingRegressor(),
    "Random Forest":RandomForestRegressor(),
    "SVR":SVR(),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    "Lasso": Lasso(),
    "Ridge": Ridge()
}
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2
for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate performance
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)
    
    # Print results
    print(f"Model: {name}")
    print("Training set performance:")
    print(f"  MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}")
    print("Test set performance:")
    print(f"  MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")
    print("="*40)




Model: Linear Regression
Training set performance:
  MAE: 1868.0093, RMSE: 2482.6659, R2: 0.8930
Test set performance:
  MAE: 2405.8555, RMSE: 3756.3838, R2: 0.8201




Model: Ada boost
Training set performance:
  MAE: 1511.9080, RMSE: 1734.2731, R2: 0.9478
Test set performance:
  MAE: 2289.6320, RMSE: 3054.2940, R2: 0.8811




Model: Gradient boost
Training set performance:
  MAE: 470.3119, RMSE: 617.9982, R2: 0.9934
Test set performance:
  MAE: 1810.9595, RMSE: 2707.3746, R2: 0.9066




Model: Random Forest
Training set performance:
  MAE: 644.4857, RMSE: 1205.3459, R2: 0.9748
Test set performance:
  MAE: 1486.5212, RMSE: 2390.4505, R2: 0.9272
Model: SVR
Training set performance:
  MAE: 5305.9897, RMSE: 8027.1334, R2: -0.1187
Test set performance:
  MAE: 5473.0029, RMSE: 9303.7784, R2: -0.1035
Model: XGBoost
Training set performance:
  MAE: 47.8756, RMSE: 217.0533, R2: 0.9992
Test set performance:
  MAE: 1845.7546, RMSE: 2810.2908, R2: 0.8993
Model: Lasso
Training set performance:
  MAE: 1871.2840, RMSE: 2483.1872, R2: 0.8929
Test set performance:
  MAE: 2398.1411, RMSE: 3737.6177, R2: 0.8219
Model: Ridge
Training set performance:
  MAE: 1925.9081, RMSE: 2549.9665, R2: 0.8871
Test set performance:
  MAE: 2397.5966, RMSE: 3675.5840, R2: 0.8278




In [81]:
rf_param = {
    "max_depth": [5, 10, None],
    "max_features": ['sqrt', 'auto'],
    "min_samples_split": [2, 10],
    "n_estimators": [100, 300]
}

gradient_param = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "max_features": ['sqrt', 'log2']
}

xg_param = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "colsample_bytree": [0.8, 1.0]
}

svr_param = {
    "kernel": ['linear', 'rbf'],
    "C": [1, 10],
    "gamma": ['scale', 0.1]
}

In [83]:
randomcv_models = {
    "Random Forest": (RandomForestRegressor(random_state=42), rf_param),
    "Gradient Boosting": (GradientBoostingRegressor(random_state=42), gradient_param),
    "XGBoost": (xgb.XGBRegressor(objective='reg:squarederror', random_state=42), xg_param),
    "SVR": (SVR(), svr_param)
}

In [86]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}

for name, (model, params) in randomcv_models.items():  # <-- unpack tuple from dict values
    print(f"Tuning {name}...")
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=20,  # reduced for faster execution
        cv=3,
        scoring='r2',
        verbose=2,
        n_jobs=-1,
        random_state=42
    )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

# Print best parameters
for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Tuning Random Forest...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Public\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Public\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Public\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Public\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

Tuning Gradient Boosting...
Fitting 3 folds for each of 16 candidates, totalling 48 fits




Tuning XGBoost...
Fitting 3 folds for each of 16 candidates, totalling 48 fits




Tuning SVR...
Fitting 3 folds for each of 8 candidates, totalling 24 fits




---------------- Best Params for Random Forest -------------------
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 10}
---------------- Best Params for Gradient Boosting -------------------
{'n_estimators': 200, 'max_features': 'log2', 'max_depth': 3, 'learning_rate': 0.05}
---------------- Best Params for XGBoost -------------------
{'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
---------------- Best Params for SVR -------------------
{'kernel': 'linear', 'gamma': 'scale', 'C': 10}


In [87]:
    models = {
        "Random Forest": RandomForestRegressor(
            n_estimators=100,min_samples_split=2,max_features='sqrt',max_depth=10,random_state=42
        ),
        "Gradient Boosting": GradientBoostingRegressor(
            n_estimators=200, max_features='log2',max_depth=3, learning_rate=0.05, random_state=42
        ),
        "XGBoost": xgb.XGBRegressor( n_estimators=200, max_depth=3, learning_rate=0.05, colsample_bytree=0.8, objective='reg:squarederror',
            random_state=42 ),
        "SVR": SVR(kernel='linear',gamma='scale',C=10
        )
    }
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model
    
        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    
        model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    
        model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
        
        print(list(models.keys())[i])
        
        print('Model performance for Training set')
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))
    
        print('----------------------------------')
        
        print('Model performance for Test set')
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        
        print('='*35)
        print('\n')



Random Forest
Model performance for Training set
- Root Mean Squared Error: 1144.8404
- Mean Absolute Error: 667.0035
- R2 Score: 0.9772
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2979.2323
- Mean Absolute Error: 1732.9332
- R2 Score: 0.8869






Gradient Boosting
Model performance for Training set
- Root Mean Squared Error: 721.4149
- Mean Absolute Error: 539.9133
- R2 Score: 0.9910
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 3076.7771
- Mean Absolute Error: 1846.0074
- R2 Score: 0.8793


XGBoost
Model performance for Training set
- Root Mean Squared Error: 758.0175
- Mean Absolute Error: 575.0254
- R2 Score: 0.9900
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2893.2822
- Mean Absolute Error: 1907.4002
- R2 Score: 0.8933


SVR
Model performance for Training set
- Root Mean Squared Error: 4848.6326
- Mean Absolute Error: 2591.0960
- R2 Score: 0.5919
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5770.5257
- Mean Absolute Error: 3345.0184
- R2 Score: 0.5755




