In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.utils import resample
import time
from alchemy_conn import alchemy_engine

#inherently multiclass:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#multiclass as One-vs-One:
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier #set multi_class="one_vs_one"

#multiclass as One-Vs-All:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron

import pickle as pickle

np.random.seed(1000) #to get consistent results every time--used to test hyperparameter

In [2]:
from my_libraries import *
capstone_folder, images_folder = folders()


In [3]:
#read data into df

#use imported alchemy_conn program to generate sqlalchemy connection to summitsdb
engine = alchemy_engine()

#load summits table into pandas df
df = pd.read_sql_query('''SELECT * FROM summits WHERE type_str IN ('mount', 'mountain', 'peak') ORDER BY summit_id;''', con=engine)
    # df = pd.read_csv('~/dsi/Capstone/summits.csv')

In [4]:
mountain_mask = df.type_str == 'mountain'
mount_mask = df.type_str == 'mount'
peak_mask = df.type_str == 'peak'

df_mountain = df[mountain_mask]
df_mount = df[mount_mask]
df_peak = df[peak_mask]

In [5]:
numrows_in_each_class = 5000

# print("Before resampling: #mount={}, #mountains={}, #peaks={}"
#       .format(df_mount.summit_id.count(), df_mountain.summit_id.count(), df_peak.summit_id.count()))

num_mount_samples = numrows_in_each_class - df_mount.summit_id.count()
num_mountain_samples = numrows_in_each_class - df_mountain.summit_id.count()
num_peak_samples = numrows_in_each_class - df_peak.summit_id.count()

df_mount_upsample = resample(df_mount, replace=True, n_samples=num_mount_samples)
df_mountain_upsample = resample(df_mountain, replace=True, n_samples=num_mountain_samples)
df_peak_upsample = resample(df_peak, replace=True, n_samples=num_peak_samples)

df3 = pd.concat([df_mountain, df_mountain_upsample, df_mount, df_mount_upsample, df_peak, df_peak_upsample]) 

X3 = df3[['elevation','isolation', 'prominence']]
y3 = df3['type']

#normalize features data to range 0 - 1
X3_min = X3.min(axis=0) #axis=0 -> up and down columns
X3_max = X3.max(axis=0)
X3 = (X3 - X3_min) / (X3_max - X3_min)
print("len(X3={}, len(y3)={})".format(len(X3), len(y3)))

len(X3=15000, len(y3)=15000)


In [6]:
numrows_in_each_class = 8000

# print("Before resampling: #mountains={}, #peaks={}"
#       .format(df_mountain.summit_id.count(),  df_peak.summit_id.count()))

num_mountain_samples = numrows_in_each_class - df_mountain.summit_id.count()
num_peak_samples = numrows_in_each_class - df_peak.summit_id.count()

df_mountain_upsample = resample(df_mountain, replace=True, n_samples=num_mountain_samples)
df_peak_upsample = resample(df_peak, replace=True, n_samples=num_peak_samples)

df2 = pd.concat([df_mountain, df_mountain_upsample, df_peak, df_peak_upsample]) 

X2 = df2[['elevation','isolation', 'prominence']]
y2 = df2['type']

#normalize features data to range 0 - 1
X2_min = X2.min(axis=0) #axis=0 -> up and down columns
X2_max = X2.max(axis=0)
X2 = (X2 - X2_min) / (X2_max - X2_min)

print("len(X2={}, len(y2)={})".format(len(X2), len(y2)))

len(X2=16000, len(y2)=16000)


In [7]:
# # #split into train, test
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.20)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.20)


In [8]:
def pick_best_classifier(X_train, X_test, y_train, y_test):
    '''
    INPUT: four DataFrames
    OUTPUT: fitted model (the one with the best f1 score), DataFrame showing score results for each classifier

    Loops through multiple classifiers, performing GridSearch with cross validation on training data, and prints train and test results (f1, precision, recall, and accuracy scores) for each. The classifier (fitted on the training data) with the best f1 score is returned.
    '''

    # these are the classifiers we are testing:
    names = ['GradientBoostingClassifier', 'LogisticRegression', 'LogisticRegression--liblinear/ovr', 'DecisionTreeClassifier', 'GaussianNB',
             'LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis',
             'LinearSVC', 'MLPClassifier', 'RandomForestClassifier', 'SVC',
             'SGDClassifier', 'Perceptron']

    classifiers = [
        GradientBoostingClassifier(),
        LogisticRegression(random_state=1, max_iter=1000),
        LogisticRegression(random_state=1, max_iter=1000, multi_class='ovr', solver='liblinear'),
        DecisionTreeClassifier(random_state=1),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LinearSVC(random_state=1),
        MLPClassifier(),
        RandomForestClassifier(),
        SVC(),
        SGDClassifier(),
        Perceptron()
    ]

    #params for GridSearchCV
    params=[
        {'loss': ['deviance'], 'n_estimators': [50,100,200], 'max_depth': [2,3,5,7],
         'criterion': ['friedman_mse'], 'max_features': [None, 'auto', 'sqrt', 'log2']}, #GradientBoostingClassifier
        {'multi_class': ['ovr', 'multinomial'],
             'solver': ['lbfgs', 'sag', 'saga', 'newton-cg'],
             'class_weight': [None, 'balanced']}, #LogisticRegression

        {'class_weight': [None, 'balanced']}, #LogisticRegression: liblinear/ovr
        {}, #DecisionTreeClassifier
        {}, #GaussianNB
        {'solver': ['svd', 'lsqr', 'eigen']}, #LinearDiscriminantAnalysis
        {}, #QuadraticDiscriminantAnalysis
        {'multi_class': ['ovr', 'crammer_singer'], 'class_weight': [None, 'balanced']}, #LinearSVC
        {}, #MLPClassifier
    #     {}, #RadiusNeighborsClassifier
        {}, #RandomForestClassifier
        {}, #SVC
        {}, #SGDClassifier
        {}  #Perceptron
            ]

    #starting default values
    best_test_score = -999.9
    worst_test_score = 999.9
    best_estimator = ''
    longest_time = -1
    scores = dict()

    #loop through each classifier
    # for i in range(1): #for testing
    print()
    i=0
    for i in range(len(params)):
        print("=============================== {}. {} ==================================".format(i+1, names[i]))
        score_type = 'accuracy'
        start_time = time.time()

        #GridSearchCV below uses 3 fold cross validation, and searches through parameters in param_grid above for each classifier
        gs = GridSearchCV(estimator=classifiers[i], param_grid=params[i], cv=8, n_jobs=-1, scoring=score_type)
        gs = gs.fit(X_train, y_train)
        seconds = time.time() - start_time
        if seconds > longest_time:
            longest_time = seconds
            longest_time_estimator = names[i]

        #predict results (y_pred) with best_estimator (one with best parameters from GridSearchCV)
        model = gs.best_estimator_
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        #calculate and print scores
        f1 = f1_score(y_test, y_pred, average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        test_score = f1 #f1 used to rank classifiers

        #print results
        print("TEST {} score: {}".format(score_type, test_score))
        print("TRAIN best {} score={}".format(score_type, gs.best_score_))
        if params[i] == {}:
            cur_params = "params: default used"
        else:
            cur_params = "best params: {}".format(gs.best_params_)
        print(cur_params)
        print("#seconds for GridSearchCV for this classifier={}\n".format(seconds))
        print("TEST scores:\nf1: {}\nprecision: {}\nrecall: {}\naccuracy: {}\n".format(f1, precision, recall, accuracy))
        print("\nconfusion matrix:\n    TN    FP\n    FN    TP\n{}\n".format(confusion_matrix(y_test, y_pred)))

        #store scores for printing summary later
        scores[names[i]] = (f1, precision, recall, accuracy)

        #record best and worst results from all classifiers
        if test_score > best_test_score:
            second_best_estimator = best_estimator
            second_best_score = best_test_score
            best_test_score = test_score
            best_estimator = names[i]
            best_params = cur_params
            best_estimator_seconds = seconds
            best_model = model
        if test_score < worst_test_score:
            worst_test_score = test_score
            worst_estimator = names[i]
            worst_params = cur_params
            worst_estimator_seconds = seconds

    #after running each classifer, print summary results
    print()
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("=============================== SUMMARY ==================================")
    print("best_estimator={}\nparams={}\nf1 test score={}\n#seconds={}".format(best_estimator, best_params, best_test_score, best_estimator_seconds))
    print("\nsecond_best_estimator: {}, f1 score: {}".format(second_best_estimator, second_best_score))
    print("\nworst_estimator={}, #seconds={}, params={}, f1 test score={}".format(worst_estimator, worst_estimator_seconds, worst_params, worst_test_score))
    print("\nestimator that took most time: {}, seconds: {}".format(longest_time_estimator, longest_time))

    #print scores summary: rows are each classifer, columns f1, precision, recall, accuracy
    scores = pd.DataFrame(scores).T
    scores.columns = ['f1', 'precision', 'recall', 'accuracy']
    scores = scores.sort_values('f1', ascending=False)
    print("\nSummary of results:\n{}".format(scores))

    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    return best_model, scores #returns classifer with best f1 score and best parameters from GridSearchCV


In [None]:
best_model, scores = pick_best_classifier(X2_train, X2_test, y2_train, y2_test)



TEST accuracy score: 0.8321897124204389
TRAIN best accuracy score=0.816171875
best params: {'criterion': 'friedman_mse', 'loss': 'deviance', 'max_depth': 7, 'max_features': None, 'n_estimators': 200}
#seconds for GridSearchCV for this classifier=29.648138284683228

TEST scores:
f1: 0.8321897124204389
precision: 0.8322050902718017
recall: 0.8321875
accuracy: 0.8321875


confusion matrix:
    TN    FP
    FN    TP
[[1324  264]
 [ 273 1339]]

TEST accuracy score: 0.5883757084829228
TRAIN best accuracy score=0.58921875
best params: {'class_weight': 'balanced', 'multi_class': 'ovr', 'solver': 'lbfgs'}
#seconds for GridSearchCV for this classifier=8.156395196914673

TEST scores:
f1: 0.5883757084829228
precision: 0.5884105886521579
recall: 0.5884375
accuracy: 0.5884375


confusion matrix:
    TN    FP
    FN    TP
[[915 673]
 [644 968]]



In [None]:
best_model, scores = pick_best_classifier(X3_train, X3_test, y3_train, y3_test)


In [None]:
model3 = GradientBoostingClassifier(criterion='friedman_mse', loss='deviance', max_depth=7, max_features='sqrt', n_estimators=200)
model2 = GradientBoostingClassifier(criterion='friedman_mse', loss='deviance', max_depth=7, max_features='sqrt', n_estimators=200)

In [None]:
model3.fit(X3_train, y3_train)
model2.fit(X2_train, y2_train)

In [None]:
result3 = model3.predict(X3)
result2 = model2.predict(X2)

In [None]:
result3L = []
for r in result3:
    if r == 0:
        result3L.append('mount')
    if r == 1:
        result3L.append('mountain')
    if r == 2:
        result3L.append('peak')
result3L = np.array(result3L)
result3L[:10], result3[:10]

In [None]:
result2L = []
for r in result2:
    if r == 0:
        result2L.append('mount')
    if r == 1:
        result2L.append('mountain')
    if r == 2:
        result2L.append('peak')
result2L = np.array(result2L)
result2L[:10], result2[:10]

In [None]:
result2

In [None]:
with open(capstone_folder + "pickled_images_labels/labels_type_GBC3.pkl", 'wb') as f:
    pickle.dump(result3L, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open(capstone_folder + "pickled_images_labels/labels_type_GBC2.pkl", 'wb') as f:
    pickle.dump(result2L, f, protocol=pickle.HIGHEST_PROTOCOL)