# NYSE ML Project Machine Learning Analysis
In this notebook I will clean and wrangle the data from the data files I will be using for this project. As an output, there will be a single file to use for the machine learning model analysis.

In [1]:
from time import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the prepared dataset from disk into a DataFrame
cleaned_df = pd.read_csv(filepath_or_buffer='./data/cleaned/final-model.csv')

## Split Into Train and Test

In [3]:
from sklearn.model_selection import train_test_split

# Pull the label column and the 'Symbol' column out of the frame; everything
# that remains is used as model input.
top_performing_raw = cleaned_df['95th Percentile']
symbols_raw = cleaned_df['Symbol']
features_raw = cleaned_df.drop(columns=['95th Percentile', 'Symbol'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on the label -- if the positive
# class is rare, consider passing stratify=top_performing_raw. Confirm first,
# since that would change every downstream result.
x_train, x_test, y_train, y_test = train_test_split(
    features_raw,
    top_performing_raw,
    test_size=0.2,
    random_state=42,
)

### Scale Values (Min-Max Normalization)

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# mapping to both splits so the test data never influences the fit.
scalar = MinMaxScaler().fit(x_train)
x_train = scalar.transform(x_train)
x_test = scalar.transform(x_test)

In [5]:
# Persist the fitted scaler to disk so the identical transformation can be
# reloaded later without refitting.
from joblib import dump, load
dump(value=scalar, filename='./models/scalar.joblib')

['./models/scalar.joblib']

### Create Base Case

In [7]:
# Naive baseline: score a predictor that labels every row as positive.
# NOTE(review): this is computed over the whole label column
# (top_performing_raw), not over y_test -- confirm that is the intended
# comparison set for the trained models.
# assumes the label column is 0/1 so that its sum is the number of
# positive rows -- TODO confirm
true_positives = np.sum(top_performing_raw)
false_positives = top_performing_raw.count() - true_positives
# An "always positive" predictor never emits a negative prediction.
true_negatives = 0
false_negatives = 0

# Weight used in the F-beta score; beta < 1 emphasises precision over recall.
baseline_beta = 0.5
baseline_total = true_positives + false_positives + true_negatives + false_negatives

# Accuracy uses the general definition (correct predictions / all predictions)
# rather than reusing the precision formula, so it stays right if the
# negative counts above are ever changed.
accuracy = (true_positives + true_negatives) / baseline_total
recall = true_positives / (true_positives + false_negatives)
precision = true_positives / (true_positives + false_positives)
fscore = (1 + baseline_beta**2) * ((precision * recall) / ((baseline_beta**2 * precision) + recall))

print("Naive Predictor: [Accuracy score: {:.2f}%, F-score: {:.2f}%]".format(accuracy*100, fscore*100))

Naive Predictor: [Accuracy score: 5.40%, F-score: 6.66%]
