In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import env
from sqlalchemy import text, create_engine


In [4]:
url = env.get_db_url('used_cars')

In [5]:
# defining function to read a sql query
def read_sql_query(query, db):
    """
    This function will 
    - accept two strings: an sql query, and the database name
    - read the query from the database into a dataframe
    - return the dataframe
    """
    # using "new" (May 2023) version of reading sql queries with pandas

    # define the database url
    url = f'mysql+pymysql://{env.user}:{env.password}@{env.host}/{db}'
    # create the connection
    engine = create_engine(url)
    connection = engine.connect()
    # create the query using text() and the string that has the sql query
    query_t = text(query)

    df = pd.read_sql(query_t, connection)

    return df

In [8]:
from pydataset import data
df = data('mpg')
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [9]:
df.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  234 non-null    object 
 1   model         234 non-null    object 
 2   displ         234 non-null    float64
 3   year          234 non-null    int64  
 4   cyl           234 non-null    int64  
 5   trans         234 non-null    object 
 6   drv           234 non-null    object 
 7   cty           234 non-null    int64  
 8   hwy           234 non-null    int64  
 9   fl            234 non-null    object 
 10  class         234 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB


In [14]:
# fl is the type of fuel used regular gas, premium, ethanol, diesel, compressed natural gas
df.fl.value_counts()

r    168
p     52
e      8
d      5
c      1
Name: fl, dtype: int64

In [15]:
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [17]:
# I'm going to drop manufacturer and model, but I'm going to encode drv, fl, and class
# in an attempt to predict trans

# first make trans either auto or manual
df.trans = np.where(df.trans.str.startswith('auto'), 'auto', 'manual')

In [21]:
df = df.drop(columns = ['manufacturer', 'model'])
df.head()

Unnamed: 0,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,1.8,1999,4,auto,f,18,29,p,compact
2,1.8,1999,4,manual,f,21,29,p,compact
3,2.0,2008,4,manual,f,20,31,p,compact
4,2.0,2008,4,auto,f,21,30,p,compact
5,2.8,1999,6,auto,f,16,26,p,compact


In [23]:
df = df.rename(columns = {'class': 'cls'})

In [25]:
df.head()

Unnamed: 0,displ,year,cyl,trans,drv,cty,hwy,fl,cls
1,1.8,1999,4,auto,f,18,29,p,compact
2,1.8,1999,4,manual,f,21,29,p,compact
3,2.0,2008,4,manual,f,20,31,p,compact
4,2.0,2008,4,auto,f,21,30,p,compact
5,2.8,1999,6,auto,f,16,26,p,compact


In [26]:
dummies = pd.get_dummies(df, columns=['drv', 'fl', 'cls'], drop_first=True)

In [27]:
dummies.head()

Unnamed: 0,displ,year,cyl,trans,cty,hwy,drv_f,drv_r,fl_d,fl_e,fl_p,fl_r,cls_compact,cls_midsize,cls_minivan,cls_pickup,cls_subcompact,cls_suv
1,1.8,1999,4,auto,18,29,1,0,0,0,1,0,1,0,0,0,0,0
2,1.8,1999,4,manual,21,29,1,0,0,0,1,0,1,0,0,0,0,0
3,2.0,2008,4,manual,20,31,1,0,0,0,1,0,1,0,0,0,0,0
4,2.0,2008,4,auto,21,30,1,0,0,0,1,0,1,0,0,0,0,0
5,2.8,1999,6,auto,16,26,1,0,0,0,1,0,1,0,0,0,0,0


In [33]:
# Now we'll jump into modeling; first split into X and y
X = dummies.drop('trans', axis=1)
X.head()

Unnamed: 0,displ,year,cyl,cty,hwy,drv_f,drv_r,fl_d,fl_e,fl_p,fl_r,cls_compact,cls_midsize,cls_minivan,cls_pickup,cls_subcompact,cls_suv
1,1.8,1999,4,18,29,1,0,0,0,1,0,1,0,0,0,0,0
2,1.8,1999,4,21,29,1,0,0,0,1,0,1,0,0,0,0,0
3,2.0,2008,4,20,31,1,0,0,0,1,0,1,0,0,0,0,0
4,2.0,2008,4,21,30,1,0,0,0,1,0,1,0,0,0,0,0
5,2.8,1999,6,16,26,1,0,0,0,1,0,1,0,0,0,0,0


In [34]:
y = dummies.trans

In [35]:
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

In [37]:
X_train.shape, X_test.shape

((187, 17), (47, 17))

In [36]:
# typically we'd split train into train and validation, however
# here we're using cross validation, so we won't

In [39]:
# ACTUALLY, I just saw a different way to encode, so let's try that
from sklearn.preprocessing import LabelEncoder

for col in ['drv', 'fl', 'cls']:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

df.head()

Unnamed: 0,displ,year,cyl,trans,drv,cty,hwy,fl,cls
1,1.8,1999,4,auto,1,18,29,3,1
2,1.8,1999,4,manual,1,21,29,3,1
3,2.0,2008,4,manual,1,20,31,3,1
4,2.0,2008,4,auto,1,21,30,3,1
5,2.8,1999,6,auto,1,16,26,3,1


In [40]:
df.drv.value_counts()

1    106
0    103
2     25
Name: drv, dtype: int64

In [41]:
# So, I'm going to try this, but I'm not sure about it 
# since there isn't an inherent value of the drv/fl/cls cols
# I can always use the dummies df later

In [42]:
# Redo the X,y and splits
X = df.drop('trans', axis=1)
y = df.trans
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42)

In [43]:
X_train.head()

Unnamed: 0,displ,year,cyl,drv,cty,hwy,fl,cls
118,2.0,2008,4,1,20,28,4,5
156,3.8,1999,6,1,16,26,3,2
149,3.0,1999,6,1,19,25,4,2
159,5.3,2008,8,1,16,25,3,2
232,2.8,1999,6,1,16,26,3,2


In [44]:
X_train.shape, X_test.shape

((187, 8), (47, 8))

# Cross validation

In [46]:
# import cross_val_score to run a simple first example
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

In [47]:
tree = DecisionTreeClassifier(max_depth=3)

In [49]:
# can use cross validation to see different scores on different train/val splits
cross_val_score(tree, X_train, y_train, cv=5)

array([0.65789474, 0.63157895, 0.59459459, 0.72972973, 0.54054054])

In [77]:
# the mean of those values is what the overall score will be later in GridSearchCV
# can use cross validation to see different scores on different train/val splits
cross_val_score(tree, X_train, y_train, cv=5).mean()

0.6953058321479375

In [52]:
# can also use a different scoring metric (default above is accuracy)
# i think precision should have worked, but chatGPT said precision wasn't valid
# for the classification I was doing and it recommended 'precision_macro' which spit this out
cross_val_score(tree, X_train, y_train, cv=5, scoring='precision_macro')

  _warn_prf(average, modifier, msg_start, len(result))


array([0.32894737, 0.44166667, 0.52797203, 0.70982143, 0.52647059])

In [54]:
# what about recall?
cross_val_score(tree, X_train, y_train, cv=5, scoring='recall_macro')

array([0.5       , 0.45692308, 0.52666667, 0.79666667, 0.61      ])

# Grid Search Cross Validation

In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
# Decision Tree first
# set range of parameters to send in to Decision Tree model
params = {'max_depth' : range(1,11),
          'max_features': [None, 1, 3]}

# make the Decision tree with no hyperparameters set
tree = DecisionTreeClassifier()

# make the grid object by passing in the tree, our range of parameters, and the number of folds (cv)
grid = GridSearchCV(tree, params, cv=5)

# fit the grid model with our train data
grid.fit(X_train, y_train)

In [58]:
# Our GridSearchCV object (grid) now has severable attributes we can look at
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [60]:
# lots of properties, but let's focus on two: mean_test_score, and params
test_scores = results['mean_test_score']
test_scores

array([0.62019915, 0.65803698, 0.62019915, 0.62560455, 0.65206259,
       0.62019915, 0.66330014, 0.6311522 , 0.62532006, 0.65220484,
       0.56628734, 0.59345661, 0.61450925, 0.60953058, 0.6257468 ,
       0.6571835 , 0.60369844, 0.67411095, 0.65234708, 0.61536273,
       0.62588905, 0.68961593, 0.62588905, 0.63627312, 0.7113798 ,
       0.59331437, 0.62076814, 0.69530583, 0.60910384, 0.6311522 ])

In [61]:
params = results['params']
params

[{'max_depth': 1, 'max_features': None},
 {'max_depth': 1, 'max_features': 1},
 {'max_depth': 1, 'max_features': 3},
 {'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 3},
 {'max_depth': 5, 'max_features': None},
 {'max_depth': 5, 'max_features': 1},
 {'max_depth': 5, 'max_features': 3},
 {'max_depth': 6, 'max_features': None},
 {'max_depth': 6, 'max_features': 1},
 {'max_depth': 6, 'max_features': 3},
 {'max_depth': 7, 'max_features': None},
 {'max_depth': 7, 'max_features': 1},
 {'max_depth': 7, 'max_features': 3},
 {'max_depth': 8, 'max_features': None},
 {'max_depth': 8, 'max_features': 1},
 {'max_depth': 8, 'max_features': 3},
 {'max_depth': 9, 'max_features': None},
 {'max_depth': 9, 'max_

In [64]:
# we can combine these into a df
for p, s in zip(params, test_scores):
    p['score'] = s

param_score_df = pd.DataFrame(params)
param_score_df.sort_values(by='score', ascending=False).head()

Unnamed: 0,max_depth,max_features,score
24,9,,0.71138
27,10,,0.695306
21,8,,0.689616
17,6,3.0,0.674111
6,3,,0.6633


In [76]:
# Use grid.best_estimator to save best hyperparameters
grid.best_params_

{'max_depth': 10, 'max_features': None, 'score': 0.6954480796586059}

In [67]:
dt_model = grid.best_estimator_
dt_model

In [68]:
# run best model on test with this code (but not yet because I want to look at other models
# dt_model.score(X_test, y_test)

In [69]:
# I want to make a series of functions to do this for each of the models:
# tree, KNN, Random Forest, Logistic Regression, and maybe some others

In [96]:
# Making a function to run through cross validation for DecisionTreeClassifier
# need to set a random seed somewhere
def cross_val_dtree(X_train, y_train):
    """
    This function will accept
    - X_train (numeric, i.e. encoded/scaled as required)
    - y_train (categorical)
    prints best model hyperparameters and accuracy score
    returns
    - GridSearchCV model of DecisionTreeClassifier with params hardcoded below
    """
    # Set random seed to make this repeatable
    np.random.seed(42)
    # set range of parameters to send in to Decision Tree model
    params = {'max_depth' : range(1,11),
              'max_features': [None, 1, 3]}

    # make the Decision tree with no hyperparameters set
    tree = DecisionTreeClassifier()

    # make the grid object by passing in the tree, our range of parameters, and the number of folds (cv)
    # Note default score is accuracy; other values 'precision'/'precision_macro' or 'accuracy
    grid = GridSearchCV(tree, params, cv=5)

    # fit the grid model with our train data
    grid.fit(X_train, y_train)

    # print out best estimator parameters/score
    print(f'Best Model Parameters: {grid.best_params_}')
    print(f'Best Model Score: {grid.best_score_}')
    
    return grid

In [113]:
grid = cross_val_dtree(X_train, y_train)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=['params']).sort_values(by='rank_test_score').head()

Best Model Parameters: {'max_depth': 8, 'max_features': None}
Best Model Score: 0.7002844950213372


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
21,0.000621,1.1e-05,0.000376,5e-06,8,,0.815789,0.631579,0.675676,0.648649,0.72973,0.700284,0.066634,1
27,0.00063,9e-06,0.000379,1.1e-05,10,,0.789474,0.552632,0.702703,0.675676,0.756757,0.695448,0.08178,2
18,0.000708,6.1e-05,0.00045,5.1e-05,7,,0.736842,0.605263,0.675676,0.675676,0.72973,0.684637,0.04737,3
15,0.000604,2e-06,0.000374,3e-06,6,,0.710526,0.684211,0.594595,0.702703,0.702703,0.678947,0.043056,4
9,0.000646,1e-05,0.000417,7e-06,4,,0.763158,0.578947,0.648649,0.756757,0.621622,0.673826,0.073782,5


In [100]:
grid.best_score_

0.7002844950213372

In [101]:
# Making a function to run through cross validation for DecisionTreeClassifier
# need to set a random seed somewhere
def cross_val_knn(X_train, y_train):
    """
    This function will accept
    - X_train (numeric, i.e. encoded/scaled as required)
    - y_train (categorical)
    prints best model hyperparameters and accuracy score
    returns
    - GridSearchCV model of KNeighbors with params hardcoded below
    """
    # Set random seed to make this repeatable
    np.random.seed(42)
    # set range of parameters to send in to Decision Tree model
    params = {'n_neighbors' : range(1,21),
              'weights' : ['uniform', 'distance']}

    # make the Decision tree with no hyperparameters set
    knn = KNeighborsClassifier()

    # make the grid object by passing in the tree, our range of parameters, and the number of folds (cv)
    # Note default score is accuracy; other values 'precision'/'precision_macro' or 'accuracy
    grid = GridSearchCV(knn, params, cv=5)

    # fit the grid model with our train data
    grid.fit(X_train, y_train)

    # print out best estimator parameters/score
    print(f'Best Model Parameters: {grid.best_params_}')
    print(f'Best Model Score: {grid.best_score_}')
    
    return grid

In [103]:
import warnings
warnings.filterwarnings('ignore')

In [112]:
grid = cross_val_knn(X_train, y_train)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=['params']).sort_values(by='rank_test_score').head()

Best Model Parameters: {'n_neighbors': 9, 'weights': 'distance'}
Best Model Score: 0.6953058321479374


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_weights,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
17,0.000537,7e-06,0.000578,4e-06,9,distance,0.736842,0.631579,0.756757,0.648649,0.702703,0.695306,0.048568,1
15,0.000579,3.9e-05,0.000622,6.2e-05,8,distance,0.736842,0.657895,0.72973,0.621622,0.702703,0.689758,0.043909,2
13,0.000527,1e-06,0.000564,4e-06,7,distance,0.789474,0.631579,0.756757,0.621622,0.648649,0.689616,0.069497,3
21,0.00053,3e-06,0.000579,1.3e-05,11,distance,0.684211,0.605263,0.756757,0.648649,0.702703,0.679516,0.050975,4
5,0.000757,8.3e-05,0.000774,4.4e-05,3,distance,0.710526,0.605263,0.72973,0.594595,0.756757,0.679374,0.066595,5


In [107]:
# Making a function to run through cross validation for DecisionTreeClassifier
# need to set a random seed somewhere
def cross_val_logit(X_train, y_train):
    """
    This function will accept
    - X_train (numeric, i.e. encoded/scaled as required)
    - y_train (categorical)
    prints best model hyperparameters and accuracy score
    returns
    - GridSearchCV model of LogisticRegression with params hardcoded below
    """
    # Set random seed to make this repeatable
    np.random.seed(42)
    # set range of parameters to send in to Decision Tree model
    params = {'C' : [.01, .1, 1.0, 10.0, 100.0],
              'class_weight' : ['None', 'balanced']}

    # make the Decision tree with no hyperparameters set
    logit = LogisticRegression()

    # make the grid object by passing in the tree, our range of parameters, and the number of folds (cv)
    # Note default score is accuracy; other values 'precision'/'precision_macro' or 'accuracy
    grid = GridSearchCV(logit, params, cv=5)

    # fit the grid model with our train data
    grid.fit(X_train, y_train)

    # print out best estimator parameters/score
    print(f'Best Model Parameters: {grid.best_params_}')
    print(f'Best Model Score: {grid.best_score_}')
    
    return grid

In [108]:
grid = cross_val_logit(X_train, y_train)

Best Model Parameters: {'C': 1.0, 'class_weight': 'balanced'}
Best Model Score: 0.6627311522048365


In [111]:
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=['params']).sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,0.003527,0.000601,0.000435,6.7e-05,1.0,balanced,0.842105,0.552632,0.648649,0.675676,0.594595,0.662731,0.099303,1
9,0.003245,0.000949,0.000389,5e-06,100.0,balanced,0.842105,0.552632,0.675676,0.675676,0.567568,0.662731,0.103623,1
7,0.003802,0.000212,0.000394,2e-06,10.0,balanced,0.842105,0.5,0.621622,0.675676,0.594595,0.646799,0.113038,3
0,0.006273,0.006603,0.000742,0.0002,0.01,,0.710526,0.657895,0.594595,0.621622,0.648649,0.646657,0.038851,4
3,0.002657,0.000408,0.000421,1.9e-05,0.1,balanced,0.842105,0.552632,0.648649,0.675676,0.513514,0.646515,0.114555,5


In [115]:
# Making a function to run through cross validation for DecisionTreeClassifier
# need to set a random seed somewhere
def cross_val_rf(X_train, y_train):
    """
    This function will accept
    - X_train (numeric, i.e. encoded/scaled as required)
    - y_train (categorical)
    prints best model hyperparameters and accuracy score
    returns
    - GridSearchCV model of RandomForest with params hardcoded below
    """
    # Set random seed to make this repeatable
    np.random.seed(42)
    # set range of parameters to send in to model
    params = {
        'max_depth':range(1,21),
        'min_samples_leaf':range(1,11),
        'criterion': ["gini", "entropy", "log_loss"]
    }
    # make the Decision tree with no hyperparameters set
    rf = RandomForestClassifier()

    # make the grid object by passing in the tree, our range of parameters, and the number of folds (cv)
    # Note default score is accuracy; other values 'precision'/'precision_macro' or 'accuracy
    grid = GridSearchCV(rf, params, cv=5)

    # fit the grid model with our train data
    grid.fit(X_train, y_train)

    # print out best estimator parameters/score
    print(f'Best Model Parameters: {grid.best_params_}')
    print(f'Best Model Score: {grid.best_score_}')
    
    return grid

In [None]:
# running code below to call cross_val_rf takes 1-2 minutes

In [116]:
grid = cross_val_rf(X_train, y_train)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=['params']).sort_values(by='rank_test_score').head()

Best Model Parameters: {'criterion': 'entropy', 'max_depth': 17, 'min_samples_leaf': 2}
Best Model Score: 0.6849217638691323


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
361,0.078651,0.077633,0.003034,8.8e-05,entropy,17,2,0.710526,0.578947,0.594595,0.756757,0.783784,0.684922,0.083641,1
190,0.040077,0.000303,0.003014,9e-06,gini,20,1,0.763158,0.552632,0.675676,0.675676,0.756757,0.68478,0.076096,2
487,0.03866,0.000477,0.00298,0.000109,log_loss,9,8,0.631579,0.605263,0.675676,0.783784,0.702703,0.679801,0.062024,3
320,0.040653,9.2e-05,0.003018,1e-05,entropy,13,1,0.710526,0.552632,0.675676,0.702703,0.756757,0.679659,0.068673,4
283,0.039202,0.000379,0.002986,7.8e-05,entropy,9,4,0.631579,0.578947,0.702703,0.783784,0.675676,0.674538,0.068865,5


In [None]:
# make functions for NaiveBayes classifiers and xgboost next