# NYSE ML Project Machine Learning Analysis
In this notebook I will training and testing three different machine learning models with and without PCA.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

%matplotlib inline

In [2]:
# Read in the data
cleaned_df = pd.read_csv('./data/cleaned/final-model.csv')

## Split Into Train and Test

In [3]:
from sklearn.model_selection import train_test_split

top_performing_raw = cleaned_df['80th Percentile']
features_raw = cleaned_df.drop(['80th Percentile'], axis=1)
symbols_raw = features_raw['Symbol']
features_raw = features_raw.drop(['Symbol'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(features_raw,
                                                    top_performing_raw,
                                                    test_size=0.2,
                                                    random_state=42)

print(type(x_train))
print(type(x_test))
print(type(y_train))
print(type(y_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


### Standardize Values

In [4]:
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()
x_train = scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)

In [5]:
# Export scalar
from joblib import dump, load
dump(scalar, './models/scalar.joblib')

['./models/scalar.joblib']

### Naive Predictor Performance

In [6]:
true_positives = np.sum(top_performing_raw)
false_positives = top_performing_raw.count() - true_positives
true_negatives = 0
false_negatives = 0

accuracy = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
precision = true_positives / (true_positives + false_positives)
fscore = (1 + pow(0.5,2)) * ((precision * recall) / ((pow(0.5,2) * precision) + recall))

# Setup time tracking
time_df = pd.DataFrame(columns=['Model', 'Time to Train', 'Time to Predict', 'F-score', 'Precision', 'Recall', 'Accuracy'])
new_row = {
    'Model': 'Naive Predictor',
    'Time to Train': 0,
    'Time to Predict': 0,
    'F-score': fscore,
    'Precision': precision,
    'Recall': recall,
    'Accuracy': accuracy
}

time_df.loc[len(time_df.index)] = new_row
time_df.head()

Unnamed: 0,Model,Time to Train,Time to Predict,F-score,Precision,Recall,Accuracy
0,Naive Predictor,0,0,0.238678,0.200514,1.0,0.200514


### Train Models

In [7]:
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from pandas.core.frame import DataFrame
from pandas.core.series import Series
import timeit

# Function to train predict, and populate scores in the results dictionary
def train_predict(model:str,
                  grid_obj:GridSearchCV ,
                  x_train: DataFrame,
                  y_train: DataFrame,
                  x_test: Series,
                  y_test: Series) -> dict: 
    '''
    inputs:
        - model: the name of the model
        - grid_obj: the grid search object to train and predict
        - x_train: features training set
        - y_train: income training set
        - x_test: features testing set
        - y_test: income testing set
    '''
    
    results = {}
    results['Model'] = model
    
    start = timeit.default_timer()
    grid_fit = grid_obj.fit(x_train, y_train)
    end = timeit.default_timer()
    learner = grid_fit.best_estimator_
    
    results['Time to Train'] = end - start
        
    start = timeit.default_timer()
    predictions_test = learner.predict(x_test)
    end = timeit.default_timer()
    
    results['Time to Predict'] = end - start
            
    results['F-score'] = fbeta_score(y_test, predictions_test, beta=0.5, average=None)[0]
        
    results['Precision'] = precision_score(y_test, predictions_test, average=None)[0]
    
    results['Recall'] = recall_score(y_test, predictions_test, average=None)[0]
        
    results['Accuracy'] = accuracy_score(y_test, predictions_test)
       
    return results, learner


#### Without PCA
##### SVC

In [8]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

svc_no_pca = SVC(random_state=42)

scorer = make_scorer(fbeta_score, beta=0.5, average='micro')
parameters = {'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
              'shrinking': [True, False],
              'probability': [True, False],
              'decision_function_shape': ['ovo', 'ovr']}

grid_obj = GridSearchCV(svc_no_pca, parameters, scoring=scorer)

results, svc_no_pca = train_predict('SVC Without PCA', grid_obj, x_train, y_train, x_test, y_test)
time_df.loc[len(time_df.index)] = results
display(time_df.head())

print(svc_no_pca.get_params())
dump(svc_no_pca, './models/svc_no_pca.joblib')

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,Time to Train,Time to Predict,F-score,Precision,Recall,Accuracy
0,Naive Predictor,0.0,0.0,0.238678,0.200514,1.0,0.200514
1,SVC Without PCA,1.497846,0.001406,0.817694,0.782051,1.0,0.782051


{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovo', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}


['./models/svc_no_pca.joblib']

##### K Nearest Neighbors

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn_no_pca = KNeighborsClassifier()

parameters = {'n_neighbors': [3, 5, 7, 9, 11],
              'weights': ['uniform', 'distance'],
              'algorithm': ['ball_tree', 'kd_tree', 'brute'],
              'leaf_size': [10, 20, 30, 40, 50],
              'n_jobs': [-1]}
grid_obj = GridSearchCV(knn_no_pca, parameters, scoring=scorer)

results, knn_no_pca = train_predict('KNN Without PCA', grid_obj, x_train, y_train, x_test, y_test)
time_df.loc[len(time_df.index)] = results
display(time_df.head())
print(knn_no_pca.get_params())
dump(knn_no_pca, './models/knn_no_pca.joblib')

Unnamed: 0,Model,Time to Train,Time to Predict,F-score,Precision,Recall,Accuracy
0,Naive Predictor,0.0,0.0,0.238678,0.200514,1.0,0.200514
1,SVC Without PCA,1.497846,0.001406,0.817694,0.782051,1.0,0.782051
2,KNN Without PCA,16.416489,0.00865,0.813008,0.779221,0.983607,0.769231


{'algorithm': 'ball_tree', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': -1, 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}


['./models/knn_no_pca.joblib']

##### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

rfc_no_pca = RandomForestClassifier(random_state=42)

parameters = {'bootstrap': [True, False],
              'n_estimators': [10, 25, 50, 75, 100], 
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_features': ['sqrt', 'log2'],
              'n_jobs': [-1]}

grid_obj = GridSearchCV(rfc_no_pca, parameters, scoring=scorer)

results, rfc_no_pca = train_predict('RFC Without PCA', grid_obj, x_train, y_train, x_test, y_test)
time_df.loc[len(time_df.index)] = results
display(time_df.head())
print(rfc_no_pca.get_params())
dump(rfc_no_pca, './models/rfc_no_pca.joblib')

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,Time to Train,Time to Predict,F-score,Precision,Recall,Accuracy
0,Naive Predictor,0.0,0.0,0.238678,0.200514,1.0,0.200514
1,SVC Without PCA,1.497846,0.001406,0.817694,0.782051,1.0,0.782051
2,KNN Without PCA,16.416489,0.00865,0.813008,0.779221,0.983607,0.769231
3,RFC Without PCA,15.303534,0.00908,0.817694,0.782051,1.0,0.782051


{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


['./models/rfc_no_pca.joblib']

In [11]:
# save results to csv
time_df.to_csv('./data/cleaned/model-performance.csv')
