In [3]:
# from libs.model_administrator import store_model, extract_model, model_ready
import pandas as pd
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
!pip freeze

appnope==0.1.0
attrs==19.3.0
backcall==0.1.0
bleach==3.1.5
boto3==1.13.6
botocore==1.16.6
cycler==0.10.0
decorator==4.4.2
defusedxml==0.6.0
docutils==0.15.2
entrypoints==0.3
future==0.18.2
importlib-metadata==1.6.0
ipykernel==5.2.1
ipython==7.14.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.0
Jinja2==2.11.2
jmespath==0.9.5
joblib==0.14.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.3
jupyter-console==6.1.0
jupyter-core==4.6.3
kiwisolver==1.2.0
lightgbm==2.3.1
MarkupSafe==1.1.1
matplotlib==3.2.1
missingno==0.4.2
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.6
notebook==6.0.3
numpy==1.18.4
packaging==20.3
pandas==1.0.3
pandocfilters==1.4.2
parso==0.7.0
patsy==0.5.1
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.7.1
prompt-toolkit==3.0.5
ptyprocess==0.6.0
PyAthena==1.10.6
Pygments==2.6.1
pyparsing==2.4.7
pyrsistent==0.16.0
python-dateutil==2.8.1
pytz==2020.1
pyzmq==19.0.1
qtconsole==4.7.3
QtPy==1.9.0
s3transfe

In [4]:
# read dataset
dtypes = {
    'Year': 'int', 
    'Mileage': 'int', 
    'City': 'category', 
    'State': 'category', 
    'Vin': 'category', 
    'Make': 'category', 
    'Model': 'category', 
    'Price': 'int'
}
df = pd.read_csv('libs/data/true_car_listings.csv', dtype=dtypes)
print(df.columns)
df.head()

Index(['Price', 'Year', 'Mileage', 'City', 'State', 'Vin', 'Make', 'Model'], dtype='object')


Unnamed: 0,Price,Year,Mileage,City,State,Vin,Make,Model
0,8995,2014,35725,El Paso,TX,19VDE2E53EE000083,Acura,ILX6-Speed
1,10888,2013,19606,Long Island City,NY,19VDE1F52DE012636,Acura,ILX5-Speed
2,8995,2013,48851,El Paso,TX,19VDE2E52DE000025,Acura,ILX6-Speed
3,10999,2014,39922,Windsor,CO,19VDE1F71EE003817,Acura,ILX5-Speed
4,14799,2016,22142,Lindon,UT,19UDE2F32GA001284,Acura,ILXAutomatic


In [5]:
# split dataset
X = df[[
    # 'Year', 
    'Mileage', 
        # 'City', 'State', 'Make', 
        'Model']]
y = df.Price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [4]:
# show train
pd.concat([X_train, y_train], axis=1, sort=False).head()

Unnamed: 0,Mileage,Model,Price
630757,117201,Galant4dr,4275
500648,44107,CherokeeLatitude,17000
370869,70422,Sierra,17917
95073,19638,CruzeSedan,14900
700985,61942,CayenneAWD,42988


In [5]:
# show test
pd.concat([X_test, y_test], axis=1, sort=False).head()

Unnamed: 0,Mileage,Model,Price
531772,105834,Rio4dr,6450
139721,37812,Silverado,26477
630434,27500,OutlanderSE,21273
221981,8394,ChallengerR/T,25891
33737,42288,3,26500


In [6]:
pd.concat([X_train, y_train], axis=1, sort=False).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 681697 entries, 630757 to 706915
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype   
---  ------   --------------   -----   
 0   Mileage  681697 non-null  int64   
 1   Model    681697 non-null  category
 2   Price    681697 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 17.0 MB


In [7]:
t = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'), [
        #'City', 'State', 'Make', 
        'Model'
    ]),
    ('scale', StandardScaler(), [
        # 'Year', 
        'Mileage'
    ])
], remainder='passthrough')
reg = LinearRegression()

model = Pipeline([('transform', t), ('model', reg)])
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.steps[1][1].coef_)
print(model.steps[1][1].intercept_)

0.8247611048510722
[-11075.93955469  -7863.21011261  -5526.50138401 ... -18589.91455109
 -18622.77170332  -5476.71931515]
28003.997663778307


In [8]:
# Generate a prediction
prediction = model.predict(X_test)
print(model.score(X_test, y_test))
print(r2_score(prediction, y_test))
print(mean_squared_error(prediction, y_test))

0.8247576159166241
0.7865446426217411
32882731.433600593


In [9]:
features = X_train.columns.to_list()
result = (features, model, model.get_params())
result

(['Mileage', 'Model'],
 Pipeline(memory=None,
          steps=[('transform',
                  ColumnTransformer(n_jobs=None, remainder='passthrough',
                                    sparse_threshold=0.3,
                                    transformer_weights=None,
                                    transformers=[('onehot',
                                                   OneHotEncoder(categories='auto',
                                                                 drop=None,
                                                                 dtype=<class 'numpy.float64'>,
                                                                 handle_unknown='ignore',
                                                                 sparse=True),
                                                   ['Model']),
                                                  ('scale',
                                                   StandardScaler(copy=True,
                                           

In [1]:
from libs.machine_learning import Params

In [11]:
# Params('test_pipeline').set_model(model)

In [6]:
model_v2 = Params('test_pipeline').get_model()
# Generate a prediction

prediction = model_v2.predict(X_test)
print(model_v2.score(X_test, y_test))
print(r2_score(prediction, y_test))
print(mean_squared_error(prediction, y_test))

0.828199096049607
0.7909053127492035
31940421.941635564
