In [23]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [24]:
# Load the preprocessed dataset from disk.
raw_df = pd.read_csv('../data/preprocessed_data.csv')

# Drop the leftover 'Unnamed: 0' column (presumably a row index written out by
# an earlier to_csv call -- confirm). errors='ignore' keeps this cell runnable
# if the CSV is ever regenerated without that column.
features_raw = raw_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Force every column to a numeric dtype; values that cannot be parsed become NaN.
df = features_raw.apply(pd.to_numeric, errors='coerce')

# errors='coerce' is silent, so fail loudly if it created missing values that
# were not already missing in the file, instead of letting NaNs reach the models.
newly_missing = (df.isna() & features_raw.notna()).sum()
newly_missing = newly_missing[newly_missing > 0]
if not newly_missing.empty:
    raise ValueError(
        f"Non-numeric values were coerced to NaN in columns: {newly_missing.to_dict()}"
    )

df.head()

Unnamed: 0,Make,Model,Year,Price,Cylinders,Transmission,Fuel Type,Rank,Coupe,Crossover,...,Grey,Orange,Other Color,Purple,Red,Silver,Tan,Teal,White,Yellow
0,3.375284,0.721931,1.899647,47819,4,1.0,2.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.659975,3.098285,1.381562,61250,4,1.0,2.0,4.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.434631,0.940014,3.108513,31861,4,1.0,2.0,5.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.600628,0.451207,1.899647,110322,4,1.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,3.375284,2.03043,2.590428,139994,4,0.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 38 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Make                  10000 non-null  float64
 1   Model                 10000 non-null  float64
 2   Year                  10000 non-null  float64
 3   Price                 10000 non-null  int64  
 4   Cylinders             10000 non-null  int64  
 5   Transmission          10000 non-null  float64
 6   Fuel Type             10000 non-null  float64
 7   Rank                  10000 non-null  float64
 8   Coupe                 10000 non-null  float64
 9   Crossover             10000 non-null  float64
 10  Hard Top Convertible  10000 non-null  float64
 11  Hatchback             10000 non-null  float64
 12  Other                 10000 non-null  float64
 13  Pick Up Truck         10000 non-null  float64
 14  SUV                   10000 non-null  float64
 15  Sedan               

In [26]:
# Separate the regression target from the predictors.
y = df['Price']                 # label vector: the Price column
X = df.drop(columns=['Price'])  # feature matrix: every column except Price

In [27]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [28]:
# Base models: Linear Regression, Support Vector Regressor and Decision Tree
# Regressor, all left at their scikit-learn default hyperparameters.
lin_model = LinearRegression()

# NOTE(review): SVR is not scale-invariant and the target here is an unscaled
# price, which may explain the negative R2 reported further down -- consider
# scaling features/target before tuning this model.
svr_model = SVR()

# Seed the tree: scikit-learn randomly permutes the candidate features at each
# split, so an unseeded tree (and its score / saved pickle) can change between
# runs. 42 matches the seed already used for the train/test split.
decision_tree_model = DecisionTreeRegressor(random_state=42)

#### <b>Fitting each model to the training data</b>

In [29]:
# Linear Regression fit: train the base linear model on the training split
lin_model.fit(X_train, y_train)

In [30]:
# Support Vector Regressor fit (svr_model is an SVR, i.e. regression, not classification)
svr_model.fit(X_train, y_train)

In [31]:
# Decision Tree Regressor fit: train the base tree on the training split
decision_tree_model.fit(X_train, y_train)

In [32]:
# Predict on the held-out test features with each fitted model, in the order
# linear regression, SVR, decision tree.
# The SVR output keeps the name `svc_preds` because the scoring cell reads it.
lin_preds, svc_preds, decision_tree_preds = (
    fitted_model.predict(X_test)
    for fitted_model in (lin_model, svr_model, decision_tree_model)
)

In [33]:
# Display the linear regression predictions for the test set
lin_preds

array([372861.71118607, 203318.89332165, 205812.67536896, ...,
       367869.56945446,  38376.67314228, 340151.07804368], shape=(3000,))

In [34]:
# Display the true Price values of the test set for comparison with the predictions
y_test

6252     294670
4684      21786
1731      98031
4742     257844
4521    1220193
         ...   
8014      80785
1074      53261
3063      68197
6487      36329
4705      42737
Name: Price, Length: 3000, dtype: int64

In [35]:
# R2 of the base linear regression on the test set, reported to two decimals
lin_r2 = r2_score(y_test, lin_preds)
print(f"Linear Regression Base Score: {lin_r2:.2f}")

Linear Regression Base Score: 0.24


In [36]:
# R2 of the base Support Vector Regressor on the test set, reported to two decimals
svr_r2 = r2_score(y_test, svc_preds)
print(f"Support Vector Regressor Base Score: {svr_r2:.2f}")

Support Vector Regressor Base Score: -0.09


In [37]:
# R2 of the base Decision Tree Regressor on the test set, reported to two decimals
decision_tree_r2 = r2_score(y_test, decision_tree_preds)
print(f"Decision Tree Regressor Base Score: {decision_tree_r2:.2f}")

Decision Tree Regressor Base Score: 0.62


In [38]:
# joblib.dump does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
Path('../models/base').mkdir(parents=True, exist_ok=True)

# preserving training and test sets so later work can reuse the exact same split
joblib.dump(X_train, '../models/base/X_train.pkl')
joblib.dump(X_test, '../models/base/X_test.pkl')
joblib.dump(y_train, '../models/base/y_train.pkl')
joblib.dump(y_test, '../models/base/y_test.pkl')

# saving the fitted base models
joblib.dump(lin_model, '../models/base/linear_regression.pkl')
joblib.dump(svr_model, '../models/base/support_vector_regressor.pkl')
joblib.dump(decision_tree_model, '../models/base/decision_tree_regressor.pkl')

['../models/base/decision_tree_regressor.pkl']