In [6]:
from libs.model_administrator import store_model, extract_model, model_ready
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# read dataset
dtypes = {
    'Year': 'int', 
    'Mileage': 'int', 
    'City': 'category', 
    'State': 'category', 
    'Vin': 'category', 
    'Make': 'category', 
    'Model': 'category', 
    'Price': 'int'
}
df = pd.read_csv('libs/data/true_car_listings.csv', dtype=dtypes)
print(df.columns)
df.head()

Index(['Price', 'Year', 'Mileage', 'City', 'State', 'Vin', 'Make', 'Model'], dtype='object')


Unnamed: 0,Price,Year,Mileage,City,State,Vin,Make,Model
0,8995,2014,35725,El Paso,TX,19VDE2E53EE000083,Acura,ILX6-Speed
1,10888,2013,19606,Long Island City,NY,19VDE1F52DE012636,Acura,ILX5-Speed
2,8995,2013,48851,El Paso,TX,19VDE2E52DE000025,Acura,ILX6-Speed
3,10999,2014,39922,Windsor,CO,19VDE1F71EE003817,Acura,ILX5-Speed
4,14799,2016,22142,Lindon,UT,19UDE2F32GA001284,Acura,ILXAutomatic


In [9]:
# split dataset
X = df[[
    'Year', 
    # 'Mileage', 
        # 'City', 'State', 'Make', 
        'Model']]
y = df.Price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [10]:
# show train
pd.concat([X_train, y_train], axis=1, sort=False).head()

Unnamed: 0,Year,Model,Price
88659,2017,MalibuLT,17933
659865,2017,RogueS,19799
101427,2014,EquinoxAWD,16599
403674,2016,Accord,25900
134166,2012,TraverseFWD,16195


In [11]:
# show test
pd.concat([X_test, y_test], axis=1, sort=False).head()

Unnamed: 0,Year,Model,Price
760001,2014,CamryL,14991
325901,2015,F-1504WD,28995
566347,2013,GS,27995
150780,2016,ColoradoCrew,30970
204096,2008,Caliber4dr,4499


In [12]:
pd.concat([X_train, y_train], axis=1, sort=False).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 681697 entries, 88659 to 404826
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   Year    681697 non-null  int64   
 1   Model   681697 non-null  category
 2   Price   681697 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 17.0 MB


In [13]:
t = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'), [
        #'City', 'State', 'Make', 
        'Model'
    ]),
    ('scale', StandardScaler(), [
        'Year', 
        # 'Mileage'
    ])
], remainder='passthrough')

# Transform the features
features = t.fit_transform(X_train)

In [14]:
# Train the linear regression model
reg = LinearRegression()
model = reg.fit(features, y_train)
print(reg.score(features, y_train))
print(reg.coef_)
print(reg.intercept_)

0.8431953141237413
[-10360.93242657 -12557.81229696  -7664.69365229 ... -20509.32560444
 -20776.98153706   7274.79244596]
30950.37464607249


In [15]:
# Generate a prediction
test_features = t.transform(X_test)
prediction = model.predict(test_features)
print(reg.score(test_features, y_test))
print(r2_score(prediction, y_test))
print(mean_squared_error(prediction, y_test))

0.8403699849449666
0.8122193477559747
29768213.071919564


In [16]:
lr = LinearRegression()
lr.coef_ = reg.coef_
lr.intercept_ = reg.intercept_
test_features = t.transform(X_test)
prediction = lr.predict(test_features)
print(reg.score(test_features, y_test))
print(r2_score(prediction, y_test))
print(mean_squared_error(prediction, y_test))

0.8403699849449666
0.8122193477559747
29768213.071919564


In [17]:
dict_model = {
    "coef_": reg.coef_,
    "intercept_": reg.intercept_
}
store_model(dict_model)

In [18]:
a = extract_model()
print(a.coef_)

[-10360.93242657 -12557.81229696  -7664.69365229 ... -20509.32560444
 -20776.98153706   7274.79244596]


In [19]:
prediction = a.predict(test_features)
print(reg.score(test_features, y_test))
print(r2_score(prediction, y_test))
print(mean_squared_error(prediction, y_test))

0.8403699849449666
0.8122193477559747
29768213.071919564
