In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Load the datasets
df = pd.read_csv('data/train_cleaned_feature_engineering.csv')

In [6]:


# Split the data into features and target
X = df.drop('price_range', axis=1)
y = df['price_range']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.94      0.96      0.95       105
           1       0.87      0.85      0.86        91
           2       0.78      0.85      0.81        92
           3       0.94      0.87      0.90       112

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.89      0.88      0.88       400

[[101   4   0   0]
 [  7  77   7   0]
 [  0   8  78   6]
 [  0   0  15  97]]


In [7]:
from sklearn.svm import SVC

# Train a Support Vector Classifier
svc_model = SVC(kernel='linear', random_state=42)
svc_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svc_model.predict(X_val)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       105
           1       0.89      0.93      0.91        91
           2       0.88      0.88      0.88        92
           3       0.95      0.93      0.94       112

    accuracy                           0.93       400
   macro avg       0.92      0.92      0.92       400
weighted avg       0.93      0.93      0.93       400

[[100   5   0   0]
 [  3  85   3   0]
 [  0   5  81   6]
 [  0   0   8 104]]


## It opvious that the SVC classifier give a high accuracy compared to the one of random forest, so we will continue to optimize SVC model by tunning hyperparameters

In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVC
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# Initialize GridSearchCV with SVC
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5, scoring='accuracy')

# Fit the model to the data
grid.fit(X_train, y_train)

# Print the best parameters and estimator
print("Best Parameters: ", grid.best_params_)
print("Best Estimator: ", grid.best_estimator_)

# Evaluate the optimized model
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.1s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.1s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.1s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.1s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.1s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.2s
[CV] END .....................C=0.1, gamma=1, k

In [10]:
grid.best_score_

np.float64(0.909375)

In [12]:
import joblib

# Save the best model
joblib.dump(best_model, 'models/SVC_model.pkl')

['models/SVC_model.pkl']