<a href="https://colab.research.google.com/github/evan-grinalds/dash-template/blob/master/notebooks/Tesla_Friday_pkl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/evan-grinalds/Unit-2-Build/master/data/'
    !pip install category_encoders==2.*

In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline

# Load model_s.csv from the shared data location into `df`.
df = pd.read_csv(f'{DATA_PATH}model_s.csv')

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read training.csv (the rows used for fitting and validation)
train = pd.read_csv(DATA_PATH+'training.csv')

# Read test.csv (the rows used for the final test score)
test = pd.read_csv(DATA_PATH+'test.csv')

# Split train into train & val (80/20). The seed is fixed so that
# Restart & Run All reproduces the same split, and therefore the same
# encoder columns and scores, on every run.
train, val = train_test_split(train, train_size=0.80, test_size=0.20, random_state=42)

In [4]:
def clean_mileage(frame):
    """Return a copy of `frame` whose 'mileage' column is an integer.

    The literal unit suffix 'mi.' is removed from each value before the
    integer conversion.

    Parameters
    ----------
    frame : pandas.DataFrame
        Any frame that has a 'mileage' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified, which avoids
        SettingWithCopyWarning on the slices made by train_test_split.
    """
    mileage = (
        frame['mileage']
        # astype(str) makes the cell safe to re-run: values that are already
        # integers pass through instead of breaking the .str accessor.
        .astype(str)
        # regex=False: 'mi.' is meant literally. As a regex the dot would
        # match any character, and the default differs between pandas versions.
        .str.replace('mi.', '', regex=False)
        .astype(int)
    )
    return frame.assign(mileage=mileage)


# Remove symbols, convert to integer — same cleaning for every frame
train = clean_mileage(train)
val = clean_mileage(val)
test = clean_mileage(test)
df = clean_mileage(df)

In [29]:
# Column the model learns to predict, and the columns it predicts from
target = 'price'
features = ['year', 'battery', 'ludacris_mode', 'all_wheel_drive', 'mileage']

# Slice every split into a feature frame (X_*) and a target series (y_*)
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [30]:
import category_encoders as ce

# Fit the one-hot encoder on the training split only, then reuse that same
# fitted encoder for the validation and test splits.
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val, X_test = (encoder.transform(split) for split in (X_val, X_test))

In [32]:
# Define the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Preprocessing + random forest, tuned as a single estimator.
# NOTE(review): X_train has already been one-hot encoded in an earlier cell,
# so TargetEncoder presumably has no categorical columns left to encode —
# confirm, and drop the step (and its search parameter) if so.
pipeline = make_pipeline(
    ce.TargetEncoder(),
    SimpleImputer(),
    StandardScaler(),
    RandomForestRegressor(random_state=42)
)

# Keys follow make_pipeline's naming: <lowercased class name>__<parameter>.
param_distributions = {
    'targetencoder__min_samples_leaf': randint(1, 1000),
    'simpleimputer__strategy': ['mean', 'median'],
    'randomforestregressor__n_estimators': randint(50, 500),
    'randomforestregressor__max_depth': [5, 10, 15, 20, None],
    'randomforestregressor__max_features': uniform(0, 1),
}

# If you're on Colab, decrease n_iter & cv parameters
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=100,
    cv=10,
    # scikit-learn maximizes scores, so MAE is reported negated.
    scoring='neg_mean_absolute_error',
    verbose=10,
    return_train_score=True,
    n_jobs=-1,
    # Fixed seed: without it the 100 sampled candidates, and therefore the
    # chosen model and its reported score, change on every run.
    random_state=42
)

search.fit(X_train, y_train);

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   

In [33]:
# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'); flip the
# sign so the printed figure is a positive dollar amount.
best_cv_mae = -search.best_score_
print('Random Forest MAE: $', best_cv_mae)

Random Forest MAE: $ 3884.929870732295


In [34]:
import pickle

In [35]:
# Persist the fitted search object to a file in the current working directory
Pkl_Filename = "Tesla_Final.pkl"

with open(Pkl_Filename, mode='wb') as pkl_out:
    pickle.dump(search, pkl_out)

In [36]:
# Read the pickled object back from disk. Despite the "LR" in its name, the
# result is the RandomizedSearchCV object that was dumped to this file.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open(Pkl_Filename, mode='rb') as pkl_in:
    Pickled_LR_Model = pickle.load(pkl_in)

Pickled_LR_Model

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('targetencoder',
                                              TargetEncoder(cols=None,
                                                            drop_invariant=False,
                                                            handle_missing='value',
                                                            handle_unknown='value',
                                                            min_samples_leaf=1,
                                                            return_df=True,
                                                            smoothing=1.0,
                                                            verbose=0)),
                                             ('simpleimputer',
                                              SimpleImputer(add_indicator=False,
                                                            copy=True,
   

In [37]:
# The search was configured with scoring='neg_mean_absolute_error', so
# score() returns a negated MAE; negate it again to print a positive figure.
score = Pickled_LR_Model.score(X_test, y_test)
print("Test score: ${0:.2f} ".format(-score))

Test score: $2766.86 


In [38]:
# Predict a price for every row of X_test with the reloaded search object;
# the bare last line displays the resulting array.
y_pred = Pickled_LR_Model.predict(X_test) 
y_pred

array([45700.64887153, 34535.30353776, 47159.66970486, 40596.16308904,
       33696.82421875, 54798.25260417, 59235.69568452, 66117.38802083,
       32907.01649306, 35823.74220963, 46462.74414062, 33348.06640625,
       42033.6422309 , 50035.59933036, 40190.75390625, 49516.2390873 ,
       38093.33072917, 61334.12326389, 57919.53472222, 53960.62847222,
       42348.23177083, 34264.65625   , 81227.6796875 , 35243.79858321,
       33969.81684028, 34804.31272735, 45802.60394965, 51163.953125  ,
       28622.12543403, 37887.91437295, 32318.51909722, 54819.55989583,
       37158.32146991, 52040.67013889, 45843.01818576, 35767.84898047,
       36338.18003472, 49843.18229167, 40585.83333333, 43078.63151042,
       36945.67708333, 37134.04548448, 49129.4296875 , 50298.49581473,
       44586.65277778, 51983.44357639, 52169.77690972, 41736.47430556,
       41009.12239583, 46913.84570312, 72060.66536458, 36571.38541667,
       26621.67447917, 36653.72352431, 36713.64438657, 64044.32421875,
      

In [39]:
import joblib
# Save the fitted search object with joblib. This writes to the same
# 'Tesla_Final.pkl' path the earlier pickle.dump cell used, so that file
# is overwritten.
joblib.dump(search, 'Tesla_Final.pkl')

['Tesla_Final.pkl']

In [40]:
# Reload the saved search object as `model`, the name the prediction cells
# below call .predict() on.
model = joblib.load('Tesla_Final.pkl')

In [41]:
# 5. Apply the model to new data
year = 2015
battery = 60
ludacris_mode_Yes = 1
ludacris_mode_No = 0
all_wheel_drive_Yes = 1
all_wheel_drive_No = 0
mileage = 20000

# Describe the car as a one-row frame with named columns instead of a bare
# positional list, so each value is tied to the column it belongs to.
new_car = pd.DataFrame([{
    'year': year,
    'battery': battery,
    'ludacris_mode_Yes': ludacris_mode_Yes,
    'ludacris_mode_No': ludacris_mode_No,
    'all_wheel_drive_Yes': all_wheel_drive_Yes,
    'all_wheel_drive_No': all_wheel_drive_No,
    'mileage': mileage,
}])

# Reorder to the exact column order the model was fitted on. The order of the
# one-hot columns depends on the training data, so a hard-coded order can
# silently feed values into the wrong features. Selecting by name raises a
# KeyError if the encoder produced different column names than assumed here.
X_new = new_car[list(X_train.columns)]

# Fresh names: X_test / y_pred keep holding the real test set and its
# predictions, so the earlier scoring cells can still be re-run.
y_new = model.predict(X_new)
y_new

array([42854.22395833])

In [43]:
def predict(year, battery, ludacris_mode_Yes, ludacris_mode_No, all_wheel_drive_Yes, all_wheel_drive_No, mileage):
    """Return a one-line price estimate for a single Tesla Model S.

    The seven arguments are packed, in signature order, into one row and
    passed to the notebook-level `model`. The first predicted value is
    formatted as whole dollars with thousands separators.

    NOTE(review): the values are positional, so the signature order is
    assumed to match the column order the model was fitted on — confirm
    against the encoded training columns.

    Returns
    -------
    str
        For example '2015 Tesla Model S: $42,854'.
    """
    row = [year, battery, ludacris_mode_Yes, ludacris_mode_No,
           all_wheel_drive_Yes, all_wheel_drive_No, mileage]
    price = model.predict([row])[0]
    return f'{year:.0f} Tesla Model S: ${price:,.0f}'

# Example call; arguments follow predict()'s signature order:
# year, battery, ludacris Yes/No flags, all-wheel-drive Yes/No flags, mileage.
print(predict(2015, 60, 1, 0, 1, 0, 20000))

2015 Tesla Model S: $42,854
