In [93]:
# Bring in all dependencies
import pandas as pd
import requests
import numpy as np
import scipy as sp
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Read in data file

In [31]:
stock_history_file = pd.read_csv('model_data.csv',index_col = False)
stock_history_file

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Symbol,Name,Sector
0,0,2021-04-19,9.900000,9.900000,9.900000,9.900000,100.0,0.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services
1,1,2021-04-20,9.945000,10.047000,9.945000,10.040000,1800.0,0.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services
2,2,2021-04-21,9.910000,10.100000,9.900000,9.932000,13200.0,0.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services
3,3,2021-04-22,9.930000,9.930000,9.930000,9.930000,2100.0,0.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services
4,4,2021-04-23,9.930000,9.980000,9.900000,9.930000,43700.0,0.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services
...,...,...,...,...,...,...,...,...,...,...,...,...
90856,76,2021-05-21,24.350000,24.350000,24.350000,24.350000,0.0,0.0,0.0,PCPC,Periphas Capital Partnering Corporation Class ...,Financial Services
90857,77,2021-05-24,24.350000,24.350000,24.350000,24.350000,0.0,0.0,0.0,PCPC,Periphas Capital Partnering Corporation Class ...,Financial Services
90858,78,2021-05-25,24.400000,24.400000,24.389999,24.389999,1600.0,0.0,0.0,PCPC,Periphas Capital Partnering Corporation Class ...,Financial Services
90859,79,2021-05-26,24.750000,24.750000,24.389999,24.389999,10100.0,0.0,0.0,PCPC,Periphas Capital Partnering Corporation Class ...,Financial Services


# Clean Data

In [32]:
# Drop unnecessary columns 
stock_history_file = stock_history_file.drop("Unnamed: 0",axis=1)
stock_history_file = stock_history_file.drop(['Stock Splits'], axis=1)

In [33]:
# creating bool series True for NaN values 
is_null = stock_history_file.isnull().values

# replacing nan values in pricing columns with zero
stock_history_file["Open"].fillna(0.00, inplace = True)
stock_history_file["Close"].fillna(0.00, inplace = True)
stock_history_file["High"].fillna(0.00, inplace = True)
stock_history_file["Low"].fillna(0.00, inplace = True)

# replacing nan values in volume with zero
stock_history_file["Volume"].fillna(0, inplace = True)

# replacing nan values in volume with zero
stock_history_file["Sector"].fillna("Undefined", inplace = True)

    
# filtering data to display any remaining NaN values
stock_history_file[is_null]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Symbol,Name,Sector
15864,2018-12-05,0.000000,0.000000,0.000000,0.000000,0.0,0.24,FNV,Franco-Nevada Corporation,Basic Materials
15864,2018-12-05,0.000000,0.000000,0.000000,0.000000,0.0,0.24,FNV,Franco-Nevada Corporation,Basic Materials
15864,2018-12-05,0.000000,0.000000,0.000000,0.000000,0.0,0.24,FNV,Franco-Nevada Corporation,Basic Materials
15864,2018-12-05,0.000000,0.000000,0.000000,0.000000,0.0,0.24,FNV,Franco-Nevada Corporation,Basic Materials
15864,2018-12-05,0.000000,0.000000,0.000000,0.000000,0.0,0.24,FNV,Franco-Nevada Corporation,Basic Materials
...,...,...,...,...,...,...,...,...,...,...
89516,2021-05-24,18.240000,18.299999,18.139999,18.139999,65800.0,0.00,FINS,Angel Oak Financial Strategies Income Term Tru...,Undefined
89517,2021-05-25,18.208000,18.299999,18.208000,18.260000,28900.0,0.00,FINS,Angel Oak Financial Strategies Income Term Tru...,Undefined
89518,2021-05-26,18.290001,18.299999,18.250000,18.250000,36500.0,0.00,FINS,Angel Oak Financial Strategies Income Term Tru...,Undefined
89519,2021-05-27,18.299999,18.350000,18.250000,18.309999,47000.0,0.00,FINS,Angel Oak Financial Strategies Income Term Tru...,Undefined


# Add calculated values

In [34]:
# Calculate daily price change
stock_history_file['Price Change'] = stock_history_file['Open'] - stock_history_file['Close']
# loop over df columns for change column
for i in range(1,len(stock_history_file)) :
    # Calculate volume changes from one day to next
    stock_history_file['Volume Change'] = stock_history_file.loc[(i-1), "Volume"] - stock_history_file.loc[i, "Volume"]
    # Calculate dividend changes from one day to next
    stock_history_file['Dividend Change'] = stock_history_file.loc[(i-1), "Dividends"] - stock_history_file.loc[i, "Dividends"]

# Check new Columns
stock_history_file.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Symbol,Name,Sector,Price Change,Volume Change,Dividend Change
0,2021-04-19,9.9,9.9,9.9,9.9,100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0
1,2021-04-20,9.945,10.047,9.945,10.04,1800.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.095,-20000.0,0.0
2,2021-04-21,9.91,10.1,9.9,9.932,13200.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.022,-20000.0,0.0
3,2021-04-22,9.93,9.93,9.93,9.93,2100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0
4,2021-04-23,9.93,9.98,9.9,9.93,43700.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0


# Add Volitility Scores

* Price

In [35]:
# Use describe to get the Price Volititliy Score parameters
price_min = stock_history_file['Price Change'].min()
price_25_qrt = stock_history_file['Price Change'].describe()['25%']
price_50_qrt = stock_history_file['Price Change'].describe()['50%']
price_75_qrt = stock_history_file['Price Change'].describe()['75%']
price_max = stock_history_file['Price Change'].max()

# Use parameters to assign volitility scores

# Empty list to hold scores
price_volitility_list = []

# Assign Volitility score based on .describe values
for i in range(len(stock_history_file)) :
    if stock_history_file['Price Change'][i] <= price_25_qrt:
        price_volitility_list.append(1)
    elif stock_history_file['Price Change'][i] <= price_50_qrt:
        price_volitility_list.append(2)
    elif stock_history_file['Price Change'][i] <= price_75_qrt:
        price_volitility_list.append(3)
    else:
        price_volitility_list.append(4)
        
# Add list to DF
stock_history_file['Price Volitility'] = price_volitility_list

# Check new Columns
stock_history_file.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Symbol,Name,Sector,Price Change,Volume Change,Dividend Change,Price Volitility
0,2021-04-19,9.9,9.9,9.9,9.9,100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2
1,2021-04-20,9.945,10.047,9.945,10.04,1800.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.095,-20000.0,0.0,2
2,2021-04-21,9.91,10.1,9.9,9.932,13200.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.022,-20000.0,0.0,2
3,2021-04-22,9.93,9.93,9.93,9.93,2100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2
4,2021-04-23,9.93,9.98,9.9,9.93,43700.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2


* Volume

In [36]:
# Use describe to get the Volume Volititliy Score parameters
volume_min = stock_history_file['Volume Change'].min()
volume_25_qrt = stock_history_file['Volume Change'].describe()['25%']
volume_50_qrt = stock_history_file['Volume Change'].describe()['50%']
volume_75_qrt = stock_history_file['Volume Change'].describe()['75%']
volume_max = stock_history_file['Volume Change'].max()

# Use parameters to assign volitility scores

# Empty list to hold scores
volume_volitility_list = []

# Assign Volitility score based on .describe values
for i in range(len(stock_history_file)) :
    if stock_history_file['Volume Change'][i] <= volume_25_qrt:
        volume_volitility_list.append(1)
    elif stock_history_file['Volume Change'][i] <= volume_50_qrt:
        volume_volitility_list.append(2)
    elif stock_history_file['Volume Change'][i] <= volume_75_qrt:
        volume_volitility_list.append(3)
    else:
        volume_volitility_list.append(4)
        
# Add list to DF
stock_history_file['Volume Volitility'] = volume_volitility_list

# Check new Columns
stock_history_file.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Symbol,Name,Sector,Price Change,Volume Change,Dividend Change,Price Volitility,Volume Volitility
0,2021-04-19,9.9,9.9,9.9,9.9,100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2,1
1,2021-04-20,9.945,10.047,9.945,10.04,1800.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.095,-20000.0,0.0,2,1
2,2021-04-21,9.91,10.1,9.9,9.932,13200.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.022,-20000.0,0.0,2,1
3,2021-04-22,9.93,9.93,9.93,9.93,2100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2,1
4,2021-04-23,9.93,9.98,9.9,9.93,43700.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2,1


* Dividends

In [37]:
# Use describe to get the Dividend Volititliy Score parameters
divd_min = stock_history_file['Dividend Change'].min()
divd_25_qrt = stock_history_file['Dividend Change'].describe()['25%']
divd_50_qrt = stock_history_file['Dividend Change'].describe()['50%']
divd_75_qrt = stock_history_file['Dividend Change'].describe()['75%']
divd_max = stock_history_file['Dividend Change'].max()

# Use parameters to assign volitility scores

# Empty list to hold scores
divd_volitility_list = []

# Assign Volitility score based on .describe values
for i in range(len(stock_history_file)) :
    if stock_history_file['Dividend Change'][i] <= divd_25_qrt:
        divd_volitility_list.append(1)
    elif stock_history_file['Dividend Change'][i] <= divd_50_qrt:
        divd_volitility_list.append(2)
    elif stock_history_file['Dividend Change'][i] <= divd_75_qrt:
        divd_volitility_list.append(3)
    else:
        divd_volitility_list.append(4)
        
# Add list to DF
stock_history_file['Dividend Volitility'] = divd_volitility_list

# Check new Columns
stock_history_file.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Symbol,Name,Sector,Price Change,Volume Change,Dividend Change,Price Volitility,Volume Volitility,Dividend Volitility
0,2021-04-19,9.9,9.9,9.9,9.9,100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2,1,1
1,2021-04-20,9.945,10.047,9.945,10.04,1800.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.095,-20000.0,0.0,2,1,1
2,2021-04-21,9.91,10.1,9.9,9.932,13200.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,-0.022,-20000.0,0.0,2,1,1
3,2021-04-22,9.93,9.93,9.93,9.93,2100.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2,1,1
4,2021-04-23,9.93,9.98,9.9,9.93,43700.0,0.0,AUS,Austerlitz Acquisition Corporation I Class A O...,Financial Services,0.0,-20000.0,0.0,2,1,1


* Overall Stock Volitility Score

In [38]:
stock_history_file['Stock Volitility Score'] = stock_history_file['Price Volitility'] + stock_history_file['Dividend Volitility'] + stock_history_file['Volume Volitility']

* Rearrange DF Columns

In [39]:
col_name_lst = list(stock_history_file)
col_name_lst

['Date',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'Dividends',
 'Symbol',
 'Name',
 'Sector',
 'Price Change',
 'Volume Change',
 'Dividend Change',
 'Price Volitility',
 'Volume Volitility',
 'Dividend Volitility',
 'Stock Volitility Score']

In [83]:
stock_history_file = stock_history_file[['Symbol', 'Name', 'Sector', 'Date', 'Open', 'Close', 'High', 'Low', 'Price Change',
                                         'Price Volitility', 'Volume', 'Volume Change','Volume Volitility', 'Dividends', 'Dividend Change', 'Dividend Volitility', 'Stock Volitility Score']]

# Manipulate DF for model accepted formats

* Pre-Process Data for Model

In [98]:
# make copy of history df
model_data = stock_history_file

# define agg methods for columns
agg_dict = {
    'Sector': 'max',
    'Price Change': np.mean,
    'Dividend Change': np.mean,
    'Volume Change': np.mean,
    'Price Volitility': mode,
    'Dividend Volitility': mode,
    'Volume Volitility': mode,
    'Stock Volitility Score': mode
}

# Summarize tkr data into one row e/a with agg using dict
model_data = model_data.groupby(['Symbol']).agg(agg_dict)

# reset the index to retain the ticker
# model_data.reset_index()

# Convert sector into dummy columns
model_data = pd.get_dummies(model_data, prefix=['Sector'])

* Split Data for training

In [99]:
model_data

Unnamed: 0_level_0,Price Change,Dividend Change,Volume Change,Price Volitility,Dividend Volitility,Volume Volitility,Stock Volitility Score,Sector_Basic Materials,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Undefined,Sector_Utilities
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ACA,-0.041705,0.0,-20000.0,1,1,1,3,0,0,0,0,0,0,0,1,0,0,0,0
ADCT,0.088237,0.0,-20000.0,1,1,1,3,0,0,0,0,0,0,1,0,0,0,0,0
AFB,0.000708,0.0,-20000.0,2,1,1,4,0,0,0,0,0,1,0,0,0,0,0,0
AGCO,0.000571,0.0,-20000.0,1,1,1,3,0,0,0,0,0,0,0,1,0,0,0,0
AIRC,-0.048588,0.0,-20000.0,1,1,1,3,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VNT,0.036614,0.0,-20000.0,4,1,1,6,0,0,0,0,0,0,0,0,0,1,0,0
WIA,0.000068,0.0,-20000.0,2,1,1,4,0,0,0,0,0,1,0,0,0,0,0,0
WRI,0.008776,0.0,-20000.0,1,1,1,3,0,0,0,0,0,0,0,0,1,0,0,0
ZBH,0.025922,0.0,-20000.0,4,1,1,6,0,0,0,0,0,0,1,0,0,0,0,0


In [100]:
train_data, test_data = train_test_split(model_data, random_state=42, shuffle=True)

* Scale Test & Train data sets

In [101]:
train_scaler = StandardScaler().fit(train_data) 
train_scaler.transform(train_data)

test_scaler = StandardScaler().fit(test_data) 
test_scaler.transform(test_data)

array([[ 0.11261801,  0.        ,  0.        , -0.4330127 ,  0.        ,
         0.        , -0.4330127 , -0.20851441, -0.20851441, -0.37796447,
        -0.20851441, -0.20851441,  1.29099445, -0.30151134, -0.30151134,
        -0.30151134, -0.30151134,  0.        ,  0.        ],
       [-3.79084017,  0.        ,  0.        ,  1.29903811,  0.        ,
         0.        ,  1.29903811, -0.20851441,  4.79583152, -0.37796447,
        -0.20851441, -0.20851441, -0.77459667, -0.30151134, -0.30151134,
        -0.30151134, -0.30151134,  0.        ,  0.        ],
       [ 0.09213892,  0.        ,  0.        , -0.4330127 ,  0.        ,
         0.        , -0.4330127 , -0.20851441, -0.20851441, -0.37796447,
        -0.20851441, -0.20851441,  1.29099445, -0.30151134, -0.30151134,
        -0.30151134, -0.30151134,  0.        ,  0.        ],
       [-0.77552019,  0.        ,  0.        , -1.29903811,  0.        ,
         0.        , -1.29903811, -0.20851441, -0.20851441, -0.37796447,
         4.795