In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
import joblib

In [19]:
df = pd.read_csv("D:\Project\Car-Evaluation\Dataset\Final\Remove-noise-not-outlier-and-fill-null.csv")

In [20]:
# Save'price_in_billion' individually
price_column = df['price_in_billion']
df = df.drop(['price_in_billion','ad_id','brand','url','grade','condition','price'], axis=1)

# Define columns that need encoded and scaled
columns_for_encoding = ['origin','car_model','exterior_color','interior_color','engine','transmission','drive_type','car_name']
columns_for_scaling = ['num_of_doors','seating_capacity','engine_capacity','fuel_consumption','mileage','year_of_manufacture']

# Define the preprocessing steps for numerical features
numerical_transformer = StandardScaler()

# Define the preprocessing steps for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False)),
    ('svd', TruncatedSVD(n_components= 500, random_state=42))
])

# Combine preprocessing steps into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, columns_for_scaling),
        ('cat', categorical_transformer, columns_for_encoding)
    ]
)

# Combine the preprocessor into a Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
df_processed = pipeline.fit_transform(df)

joblib.dump(pipeline, 'process_data.pkl')

X_train, X_test, y_train, y_test = train_test_split(df_processed, price_column, test_size=0.3, random_state=42)



In [14]:
X_train

array([[ 8.09025596e-01, -3.51095751e-01, -7.06355567e-01, ...,
        -2.09771398e-03, -5.05992512e-03,  1.32458295e-03],
       [-7.91068130e-01, -3.51095751e-01,  1.11940457e+00, ...,
        -4.49008166e-04,  8.31075719e-04, -2.03080889e-03],
       [ 8.09025596e-01, -9.89777471e-01, -9.77688554e-02, ...,
         2.86636129e-04,  2.96889876e-03, -4.70932945e-03],
       ...,
       [-7.91068130e-01, -3.51095751e-01, -9.77688554e-02, ...,
         3.30587562e-02,  1.64308749e-01, -1.63740733e-02],
       [ 8.09025596e-01, -3.51095751e-01,  1.72799128e+00, ...,
        -2.95592005e-02,  8.30604562e-02, -3.90200806e-02],
       [-7.91068130e-01, -3.51095751e-01,  1.11940457e+00, ...,
        -3.97200860e-03,  1.69350960e-03, -8.27288435e-04]])

In [15]:
#Define pipeline
pipeline = Pipeline(steps=[
    ('svr', SVR())
])


In [16]:
#Define hyperparameters of pipeline that need tuning and the range values of them
param_grid = {'svr__C': [10,20,50,100,200,300,400,500],  
              'svr__gamma': [0.001,0.01,0.1,1.0], 
              'svr__kernel': ['poly','rbf'],
              'svr__epsilon':[0.01,0.1,0.25,1.0,4.0,16.0,64.0,256.0],
             }
  

In [17]:
# Define the halving grid search with cross-validation
halving_grid_search = HalvingGridSearchCV(pipeline, param_grid, scoring="r2", cv=5, verbose=6, factor=3,error_score='raise')
halving_grid_search.fit(X_train, y_train)


n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 88
max_resources_: 21456
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 512
n_resources: 88
Fitting 5 folds for each of 512 candidates, totalling 2560 fits
[CV 1/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.096, test=-0.049) total time=   0.0s
[CV 2/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.114, test=-0.121) total time=   0.0s
[CV 3/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.072, test=-0.048) total time=   0.0s
[CV 4/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.101, test=-0.186) total time=   0.0s
[CV 5/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.097, test=-0.081) total time=   0.0s
[CV 1/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kerne

KeyboardInterrupt: 

In [9]:
best_params = halving_grid_search.best_params_
best_model = halving_grid_search.best_estimator_
best_score = halving_grid_search.best_score_

In [10]:
# Save the best model
joblib_file = "best_model.joblib"
joblib.dump(best_model, joblib_file)
print(f"Model saved to {joblib_file}")

# Load the model from the file
loaded_model = joblib.load(joblib_file)

Model saved to best_model.joblib


In [11]:
y_pred = loaded_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Best parameters: {best_params}")
print(f"Best scores: {best_score}")
print(f"Test MSE: {mse}")
print(f"Model score: {best_model.score(X_test, y_test)}")


Best parameters: {'svr__C': 20, 'svr__epsilon': 0.01, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
Best scores: 0.8639279857761399
Test MSE: 0.6157878085505033
Model score: 0.851871086940417
