In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import neural_network
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import dill as pickle

In [3]:
df = pd.read_csv('../data/aggregated/processed_data/data-4-7-18-full.csv')

class PreprocessCars:
    """Code to preprocess the dataframe in preparation for training"""
    
    def get_features_encoded(self, orig_df, pandas_obj):
        # Replace missing continuous values
        df = orig_df.copy(deep=True)
        df['mileage'] = df['mileage'].fillna((df['mileage'].mean()))
        df['year'] = df['year'].fillna((df['year'].mean()))
        
        # Replace missing categorical values
        df['make'] = df['make'].fillna(df['make'].value_counts().index[0])
        df['model'] = df['model'].fillna(df['model'].value_counts().index[0])
        df['state'] = df['state'].fillna(df['state'].value_counts().index[0])
        df['transmission'] = df['transmission'].fillna(df['transmission'].value_counts().index[0])
        
        # Normalize the continuous values
        df['mileage'] = (df['mileage']-df['mileage'].mean())/df['mileage'].std() # This approach puts between 0 and 1, is that an issue?
        df['year'] = (df['year']-df['year'].mean())/df['year'].std()
        
        # Only select a subset of features
        features = df[['make', 'model', 'mileage', 'state', 'transmission', 'year']]
        labels = df[['price']]
        
        # TODO - encode categorical features
        features_encoded = pandas_obj.get_dummies(features, columns=['make', 'model', 'state', 'transmission'])
        return features_encoded, labels
    
    def get_train_test(self, orig_df, pandas_obj):
        features_encoded, labels = self.get_features_encoded(orig_df, pandas_obj)
        X_train, X_test, Y_train, Y_test = train_test_split(features_encoded, labels, test_size=0.2, train_size=0.8)
        return X_train, X_test, Y_train, Y_test


In [4]:
preprocessor = PreprocessCars()
X_train, X_test, Y_train, Y_test = preprocessor.get_train_test(df, pd)

# Train MLP Model

In [16]:
mlp_model = neural_network.MLPRegressor(hidden_layer_sizes=(100,100))
mlp_model.fit(X_train, Y_train)
mlp_model.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.88138514464489481

# Train Gradient Boosting Model

In [21]:
from sklearn import ensemble
gb_model = ensemble.GradientBoostingRegressor(learning_rate=.6, n_estimators=200, max_depth=4)
gb_model.fit(X_train, Y_train)
gb_model.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.86665746630277041

# Train Random Forest Model

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=20, n_jobs=-1)
rf_model.fit(X_train, Y_train)
rf_model.score(X_test, Y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


0.85940818970574373

# Save Models
Dump the preprocessor class and trained models:

In [12]:
with open('../webapp/feature-preprocessor.pkl', 'wb') as preprocessorFile:
    pickle.dump(preprocessor, preprocessorFile)
with open('../webapp/mlp-model.pkl', 'wb') as modelFile:
    pickle.dump(mlp_model, modelFile)
with open('../webapp/gb-model.pkl', 'wb') as modelFile:
    pickle.dump(gb_model, modelFile)
with open('../webapp/rf-model.pkl', 'wb') as modelFile:
    pickle.dump(rf_model, modelFile)