In [38]:
import numpy as np
import pandas as pd
import pickle
import warnings

# sql import
from sqlalchemy import create_engine
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, auc, f1_score
from sklearn.metrics import confusion_matrix, classification_report, fbeta_score, roc_curve

In [39]:
# Lift pandas' display limits so every row and column can be shown on request.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [74]:
# Open the SQLite stocks database.
engineStocks = create_engine("sqlite:///stocks.db")
# Read the whole stocks table into a pandas DataFrame.
stocks_df = pd.read_sql(sql='SELECT * FROM "stocks_table"', con=engineStocks)
# Disabled single-month restriction, kept for reference:
# stocks_df = stocks_df[(stocks_df['month'] == '2021-09')]
# Keep only rows whose target is 'Success', 'Failure' or 'StillWorking'.
stocks_df = stocks_df[stocks_df['target'].isin(['Success', 'Failure', 'StillWorking'])]

In [75]:
# Open the SQLite spy database.
engineSpy = create_engine("sqlite:///spy.db")
# Read the whole spy table into a pandas DataFrame.
spy_df = pd.read_sql(sql='SELECT * FROM "spy_table"', con=engineSpy)
# Disabled month overrides, kept for reference:
# spy_df = spy_df[(spy_df['month'] == '2021-10')]
# spy_df = spy_df.iloc[[8]]
# spy_df['month'] = '2021-09'

In [80]:
# Left-join spy_df onto stocks_df: every stock row is kept, and its
# dateBreachedF1 value is matched against spy_df's full_date.
df = stocks_df.merge(
    spy_df,
    how='left',
    left_on=['dateBreachedF1'],
    right_on=['full_date'],
)

In [81]:
## Prepare the merged frame for modeling
# store the year as a float
df['year'] = df['year'].astype(float)
# 'month' is split on '-' and its second piece is kept as an int
df['monthDigit'] = df['month'].map(lambda year_month: int(year_month.split('-')[1]))
# the join key is no longer needed
df = df.drop(columns='dateBreachedF1')
# binarise the target: 'Success' and 'StillWorking' become 1, everything else 0
positive_outcomes = ('Success', 'StillWorking')
df['target'] = df['target'].map(lambda outcome: int(outcome in positive_outcomes))
# relocate the target column from the middle of the frame to position 0
df.insert(0, 'target', df.pop('target'))
# keep only rows whose risk lies within 0-100 % (both bounds inclusive)
df = df[df['perRisk'].between(0, 100)]

In [82]:
# Build the model input from the September 2021 rows only, so the saved model
# can be checked against that month:
#   1. select September 2021,
#   2. drop the unneeded and highly correlated columns,
#   3. one-hot encode with get_dummies,
#   4. turn +/-inf into NaN and drop every row that contains a NaN,
#      so the data can be modeled properly.
df_model = (
    df.loc[(df['year'] == 2021) & (df['monthDigit'] == 9)]
    .drop(
        columns=[
            'threeMoChng', 'perCapture', 'perRisk', 'ticker', 'full_date',
            'revStratUp', 'month', 'longYrStuck', 'longCompndYrStuck',
            'longQtrStuck', 'longCompndQtrStuck',
        ]
    )
    .pipe(pd.get_dummies)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
)

In [83]:
# (rows, columns) of the model-ready frame, target column included
df_model.shape

(547, 40)

In [84]:
# The model needs a 'combo_111' column, but get_dummies never creates it here
# because that scenario did not occur in September; add it as all zeros.
df_model = df_model.assign(combo_111=0)

In [85]:
# Load the previously saved model from rf_model.pkl.
# The context manager closes the file handle as soon as loading finishes
# (a bare open() inside pickle.load leaves it open until garbage collection).
# SECURITY NOTE: unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open('rf_model.pkl', 'rb') as model_file:
    rf = pickle.load(model_file)

In [86]:
# Split the frame by position: column 0 is the label (y),
# every remaining column is a feature (X).
y, X = df_model.iloc[:, 0], df_model.iloc[:, 1:]

In [87]:
# Standardise the feature matrix.
# NOTE(review): the scaler is fitted on the rows being scored. If the pickled
# model was trained on features scaled by a different fitted scaler, these
# values will not be on the same scale -- confirm, and if so persist and reuse
# the training scaler instead of refitting here.
std_scale = StandardScaler()
X_scaled = std_scale.fit(X).transform(X)

In [88]:
# Predicted class labels for the scaled features, scored against the true labels.
# Note: a later cell rebinds y_predict to the predict_proba output.
y_predict = rf.predict(X_scaled)
accuracy = accuracy_score(y, y_predict)
print("Accuracy Score:", accuracy)

Accuracy Score: 0.8080438756855576


In [89]:
# Predictions: per-class probabilities from the model.
# This rebinds y_predict, which previously held the predicted class labels.
y_predict = rf.predict_proba(X_scaled)

In [90]:
# (rows, feature columns) of the feature matrix
X.shape

(547, 40)

In [94]:
# column 1 of the predict_proba output for the first 10 rows
y_predict[:10,1]

array([0.944, 0.996, 0.856, 0.804, 0.956, 0.936, 0.412, 0.888, 0.356,
       0.824])

### Create a dataframe to analyze the predictions

In [110]:
# Build the comparison frame from the '2021-09' rows:
#   1. select the month,
#   2. drop the unneeded and highly correlated columns,
#   3. turn +/-inf into NaN and drop every row that contains a NaN.
df_compare = (
    df.loc[df['month'] == '2021-09']
    .drop(
        columns=[
            'threeMoChng', 'revStratUp', 'month', 'full_date',
            'longYrStuck', 'longCompndYrStuck', 'longQtrStuck', 'longCompndQtrStuck',
        ]
    )
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
)

In [111]:
# Attach column 1 of the predict_proba output to df_compare as 'predict'.
# The probabilities come back in the row order of X, so label them with X's
# index and align on that index rather than by position. df_compare is
# filtered and NaN-dropped over a different set of columns than the model
# frame, so a positional assignment raises on a length mismatch and would
# mislabel rows if the lengths matched but the rows did not. Rows of
# df_compare that were not scored get NaN.
predicted_proba = pd.Series(y_predict[:, 1], index=X.index)
df_compare['predict'] = predicted_proba.reindex(df_compare.index)

In [113]:
# Keep only rows whose perCapture is strictly greater than 5.
# Alternative filter kept for reference:
# df_compare = df_compare[(df_compare['perCapture'] > 2.5) & (df_compare['perRisk'] < 10)]
df_compare = df_compare.loc[df_compare['perCapture'].gt(5)]

In [114]:
# Narrow the frame to the report columns, order rows from the highest to the
# lowest predicted probability, and preview the top five.
report_columns = ['ticker', 'combo', 'predict', 'perCapture', 'perRisk', 'riskReward', 'target']
df_compare = df_compare.loc[:, report_columns].sort_values(by='predict', ascending=False)
df_compare.head()

Unnamed: 0,ticker,combo,predict,perCapture,perRisk,riskReward,target
53475,TGLS,1,0.996,5.1,14.0,0.36,1
59234,HLIT,1,0.984,6.35,10.3,0.62,1
49446,OCGN,1,0.976,6.9,23.2,0.3,1
45273,TDUP,1,0.944,7.0,11.1,0.63,1
19962,RUN,1,0.924,12.7,17.7,0.72,1


In [115]:
# Write the ranked comparison table to out.csv without the index column.
df_compare.to_csv(path_or_buf='out.csv', index=False)