# Continuation of Quantitative Analyst Project

### Disclaimer: This notebook should not be considered financial advice of any kind. It exists only for the purpose of practicing modeling and making predictions.

In [1]:
import yfinance as yf
from statsmodels.regression.rolling import RollingOLS
import matplotlib.pyplot as plt
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import pandas_ta
import statsmodels.api as sm
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 25)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/stocks_for_modeling_clusters.csv')

In [3]:
df.shape

(13796, 23)

In [6]:
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,Mkt-RF,SMB,HML,RMW,CMA
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-11-30,AAL,39.429935,-0.000966,40.880488,38.219863,39.883038,41.546214,43.209389,44.872565,1.118944,-0.656804,-0.105388,0.031926,0.019861,-0.00367,-0.015729,-0.012729,1.322942,0.910139,0.804086,1.222983,-0.061192
2015-11-30,AAPL,26.960346,-0.003027,53.592898,25.708802,26.310024,26.911246,27.512468,28.11369,0.565622,0.161838,-0.005804,0.037844,0.017564,-0.014506,-0.007686,0.000966,1.084475,-0.136031,-0.39762,0.718503,-0.795369


In [5]:
df.set_index(["Date", "Ticker"], inplace=True)

In [8]:
# This shows the 25th, 50th, and 75th percentiles of RSI for every date in our data; .quantile() accepts a list of quantiles
df.groupby("Date")['RSI'].quantile([0.25, 0.5, 0.75])

Date            
2015-11-30  0.25    48.029423
            0.50    53.325293
            0.75    57.615866
2015-12-31  0.25    45.217133
            0.50    48.891191
                      ...    
2023-09-30  0.50    44.088706
            0.75    48.560172
2023-10-31  0.25    35.133633
            0.50    41.029025
            0.75    48.229785
Name: RSI, Length: 288, dtype: float64

In [9]:
dates = df.index.get_level_values(0).unique()

I will now add a feature for which quartile of RSI a given stock falls in during a specific monthly period, with 4 being the top quartile and 1 being the bottom quartile. The reason that we use RSI is that it is a momentum indicator, and therefore should contain signal about a stock's short-term outlook.

### Good function for creating quartile buckets, using the .cut() method

In [10]:
# Define a function to calculate quartiles

def calculate_quartiles(group):
    """Return a copy of ``group`` with an ``RSI_Quartile`` column (1 = lowest, 4 = highest).

    The 25th, 50th and 75th percentiles of ``group['RSI']`` are used as
    right-closed bin edges: (-inf, q25], (q25, q50], (q50, q75], (q75, inf).

    Parameters
    ----------
    group : pandas.DataFrame
        One group's rows; must contain an ``'RSI'`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus ``RSI_Quartile``, an ordered
        categorical with categories ``[1, 2, 3, 4]``. Rows whose RSI is missing
        get a missing quartile.
    """
    rsi = group['RSI']
    edges = rsi.quantile([0.25, 0.5, 0.75]).to_numpy()

    # Code -1 is how a Categorical marks a missing value; rows with a missing RSI keep it.
    codes = np.full(len(rsi), -1, dtype=np.int8)
    has_rsi = rsi.notna().to_numpy()

    # Counting the edges strictly below each value gives the right-closed bin number
    # (0..3). Unlike pd.cut, this still works when several quartile edges coincide
    # (single-row or heavily tied groups).
    codes[has_rsi] = np.searchsorted(edges, rsi.to_numpy()[has_rsi], side="left")

    quartile = pd.Categorical.from_codes(codes, categories=[1, 2, 3, 4], ordered=True)

    # assign() builds a new frame, so the group handed in by groupby.apply is left untouched.
    return group.assign(RSI_Quartile=quartile)

In [13]:
# Group by date and apply the quartile calculation function
quartile_df = df.groupby(level='Date', group_keys=False).apply(calculate_quartiles)

### New DataFrame with quartile buckets, which will likely be one-hot encoding (OHE) candidates as well

In [14]:
quartile_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,Mkt-RF,SMB,HML,RMW,CMA,RSI_Quartile
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2015-11-30,AAL,39.429935,-0.000966,40.880488,38.219863,39.883038,41.546214,43.209389,44.872565,1.118944,-0.656804,-0.105388,0.031926,0.019861,-0.003670,-0.015729,-0.012729,1.322942,0.910139,0.804086,1.222983,-0.061192,1
2015-11-30,AAPL,26.960346,-0.003027,53.592898,25.708802,26.310024,26.911246,27.512468,28.113690,0.565622,0.161838,-0.005804,0.037844,0.017564,-0.014506,-0.007686,0.000966,1.084475,-0.136031,-0.397620,0.718503,-0.795369,3
2015-11-30,ABBV,41.160294,-0.053947,46.995675,41.269060,42.342785,43.416509,44.490233,45.563957,1.753494,0.411671,-0.023510,0.038557,-0.020296,-0.019629,-0.001641,-0.011735,0.851106,0.279184,0.021886,-0.269020,-0.043745,1
2015-11-30,ABT,38.669395,-0.009962,52.539149,38.075918,38.510178,38.944438,39.378697,39.812957,0.739885,0.405522,0.002679,0.059879,-0.000804,-0.011291,-0.004146,0.002517,1.056769,-0.140064,-0.638172,0.113359,0.560507,2
2015-11-30,ACN,94.345917,-0.006636,57.567387,91.367418,92.512621,93.657824,94.803026,95.948229,1.288679,0.812790,0.000186,0.050196,0.047565,0.020338,0.022024,0.020034,1.143698,-0.203828,-0.181753,0.085237,-0.074808,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-31,VRTX,363.040009,0.000200,53.833235,344.018442,352.603220,361.187999,369.772778,378.357556,6.590981,4.149759,0.043998,0.020882,0.010022,0.010627,0.013034,0.012706,1.142086,-0.222391,-1.246586,-0.743384,0.918423,4
2023-10-31,VZ,33.689999,0.000221,58.946533,29.828625,30.701467,31.574308,32.447150,33.319992,0.808540,0.107031,0.061682,-0.008194,0.003209,-0.017041,-0.017064,-0.002729,0.669774,-0.389182,-0.140889,0.456590,1.195363,4
2023-10-31,WFC,39.029999,0.000131,39.236461,38.205341,39.160670,40.116000,41.071329,42.026659,0.928127,-0.411437,-0.044787,-0.027753,-0.051934,-0.000449,-0.017741,-0.011247,1.180862,-0.162028,1.280360,-0.628291,-0.298237,2
2023-10-31,WMT,162.759995,0.000062,55.397237,156.054002,158.010000,159.965999,161.921997,163.877996,2.215188,0.245920,0.017695,0.000461,0.007201,0.013851,0.015090,0.012521,0.634972,-0.425805,-0.220643,0.608535,0.533930,4


In [19]:
# As you can see, the respective quartile changes month to month

quartile_df.xs("NVDA", level=1)["RSI_Quartile"]

Date
2017-05-31    4
2017-06-30    3
2017-07-31    3
2017-08-31    3
2017-09-30    2
             ..
2023-06-30    4
2023-07-31    3
2023-08-31    4
2023-09-30    3
2023-10-31    3
Name: RSI_Quartile, Length: 78, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]

In [337]:
quartile_df.index[0], quartile_df.index[-1]

(('2015-11-30', 'AAL'), ('2023-10-31', 'XOM'))

# Sector SPDR ETFs - processing for binary targets

In [51]:
spiders = pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/SPDRs_w_technical_indicators.csv')

In [52]:
spiders.head()

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
0,1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228
1,1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892
2,1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.649
3,1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.009
4,1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0,-0.032286,,,,,,,,,7.2851


In [53]:
# Everything is a float except for the Date and Ticker columns, which will both be made indexes
# RSI - Bollinger Bands - ATR - MACD all contain null values, likely will be dropped just to have complete data

spiders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59633 entries, 0 to 59632
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   59633 non-null  object 
 1   Ticker                 59633 non-null  object 
 2   Adj Close              59633 non-null  float64
 3   Close                  59633 non-null  float64
 4   High                   59633 non-null  float64
 5   Low                    59633 non-null  float64
 6   Open                   59633 non-null  float64
 7   Volume                 59633 non-null  float64
 8   garman_klass_vol       59633 non-null  float64
 9   RSI                    59413 non-null  float64
 10  lowest_bollinger_band  59424 non-null  float64
 11  lower_bollinger_band   59424 non-null  float64
 12  20_day_SMA             59424 non-null  float64
 13  one_up_bollinger_band  59424 non-null  float64
 14  upper_bollinger_band   59424 non-null  float64
 15  AT

In [54]:
# Convert Date to date time

spiders["Date"] = pd.to_datetime(spiders["Date"])

In [55]:
spiders.set_index(["Date", "Ticker"], inplace=True)

In [56]:
spiders.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892
1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.649
1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.009
1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0,-0.032286,,,,,,,,,7.2851


In [57]:
spiders.groupby("Date")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb46009a0b0>

In [None]:
def 20_days_later(group):
    for row in group:
        current_index = 

In [66]:
samp = spiders.sample(1)
# samp.index.get_loc(key="Date")

In [67]:
# A failed attempt to create binary columns

# sampled_row = spiders.sample(1)
# value_to_find = 'some_value'  # Replace with the value you want to find

# try:
#     index_location = sampled_row.index.get_loc(value_to_find)
#     print(f"The index location for '{value_to_find}' in the sampled row is: {index_location}")
# except KeyError:
#     print(f"'{value_to_find}' not found in the index of the sampled row.")

'some_value' not found in the index of the sampled row.


In [None]:
# # Failed Experiments


# def create_return_binary_columns(df, intervals=[1, 3, 6, 12]):
#     for interval in intervals:
#         # Calculate the future date for the specified interval
#         future_date = df.index + pd.DateOffset(months=interval)
        
#         # Create a new column for future prices based on the calculated date
#         df[f'Price_{interval}M_Future'] = df.apply(lambda row: df.at[future_date, row.name[1]], axis=1)
        
#         # Create binary column: 1 if future price is higher, 0 otherwise
#         df[f'Price_{interval}M_Return'] = (df[f'Price_{interval}M_Future'] > df['Price']).astype(int)
        
#     return df

# # Example usage:
# # Assuming your DataFrame is indexed by date (level 0) and stock tickers (level 1) and has a 'Price' column.
# # df = ...

# # Create binary return columns for specified intervals
# binary_df = create_return_binary_columns(spiders, intervals=[1, 3, 6, 12])

In [84]:
spiders.head(2)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892


### Trying to develop an approach to creating new feature columns

In [68]:
# Start with a specific date
start_date = pd.to_datetime('2023-01-15')  # For example, January 15, 2023

# Calculate the date one business month later
one_month_later = start_date + pd.tseries.offsets.BMonthEnd(1)

print("Start Date:", start_date)
print("One Month Later (Business Days):", one_month_later)

Start Date: 2023-01-15 00:00:00
One Month Later (Business Days): 2023-01-31 00:00:00


In [91]:
spiders.reset_index(inplace=True)

In [93]:
spiders['20_trading_days_later'] = spiders["Date"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59633 entries, 0 to 59632
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   59633 non-null  datetime64[ns]
 1   Ticker                 59633 non-null  object        
 2   Adj Close              59633 non-null  float64       
 3   Close                  59633 non-null  float64       
 4   High                   59633 non-null  float64       
 5   Low                    59633 non-null  float64       
 6   Open                   59633 non-null  float64       
 7   Volume                 59633 non-null  float64       
 8   garman_klass_vol       59633 non-null  float64       
 9   RSI                    59413 non-null  float64       
 10  lowest_bollinger_band  59424 non-null  float64       
 11  lower_bollinger_band   59424 non-null  float64       
 12  20_day_SMA             59424 non-null  float64       
 13  o

### Multi-Index Creation

In [86]:
spiders.set_index(["Date", "Ticker"], inplace=True)

In [90]:
for i in spiders.xs("XLB", level=1):
    print(type(i.index))

<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>
<class 'builtin_function_or_method'>


In [120]:
spiders

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
0,1998-12-22,XLB,12.011767,20.828125,20.828125,20.750000,20.781250,1900.0,-0.116068,,,,,,,,,0.0228
1,1998-12-22,XLE,12.448157,23.265625,23.390625,23.187500,23.312500,15200.0,-0.152028,,,,,,,,,0.1892
2,1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.6490
3,1998-12-22,XLI,14.971390,23.281250,23.281250,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.0090
4,1998-12-22,XLK,24.243126,32.046875,32.500000,31.781250,32.406250,300500.0,-0.032286,,,,,,,,,7.2851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59628,2023-10-25,XLP,67.940002,67.940002,68.239998,67.419998,67.599998,11351100.0,0.000063,43.174669,66.122483,66.838991,67.555500,68.272008,68.988516,0.820658,-0.697945,771.1938
59629,2023-10-25,XLRE,32.189999,32.189999,32.650002,32.099998,32.540001,8454400.0,0.000099,33.847969,32.120178,32.866839,33.613500,34.360161,35.106822,0.626503,-0.653811,272.1471
59630,2023-10-25,XLU,59.349998,59.349998,59.470001,58.680000,58.919998,20654700.0,0.000069,46.305928,56.095167,57.214334,58.333500,59.452666,60.571832,1.211284,-0.672943,1225.8564
59631,2023-10-25,XLV,126.510002,126.510002,127.459999,125.570000,127.070000,11427000.0,0.000104,37.148926,126.420510,127.778756,129.137002,130.495248,131.853494,1.483988,-0.873242,1445.6298


In [134]:
spiders.reset_index(inplace=True)

### Creating a list of the unique dates in the stock data

In [140]:
dates = spiders["Date"].unique()
dates

<DatetimeArray>
['1998-12-22 00:00:00', '1998-12-23 00:00:00', '1998-12-24 00:00:00',
 '1998-12-28 00:00:00', '1998-12-29 00:00:00', '1998-12-30 00:00:00',
 '1998-12-31 00:00:00', '1999-01-04 00:00:00', '1999-01-05 00:00:00',
 '1999-01-06 00:00:00',
 ...
 '2023-10-12 00:00:00', '2023-10-13 00:00:00', '2023-10-16 00:00:00',
 '2023-10-17 00:00:00', '2023-10-18 00:00:00', '2023-10-19 00:00:00',
 '2023-10-20 00:00:00', '2023-10-23 00:00:00', '2023-10-24 00:00:00',
 '2023-10-25 00:00:00']
Length: 6251, dtype: datetime64[ns]

In [151]:
zero = spiders.loc[0]
zero["Date"]

Timestamp('1998-12-22 00:00:00')

In [160]:
# Draft, doesn't quite work

# def one_month_later(row):
#     dates = spiders["Date"].unique()
#     # Start with a specific date
#     start_date = row["Date"] 
    
#     # Calculate the date one month later, adjusting for weekends
#     one_month_later = start_date + pd.DateOffset(months=1)

#     # Check if the calculated date is a business day (Monday to Friday)
#     while one_month_later not in dates:  # 5 and 6 represent Saturday and Sunday
#         if one_month_later > dates[-1]:
#             row["one_month_later"] = np.nan
#         else:
#             one_month_later += pd.DateOffset(days=1)

#     row["one_month_later"] = one_month_later
#     return row

In [176]:
dates[-1] 

Timestamp('2023-10-25 00:00:00')

In [177]:
(dates[-1] + pd.DateOffset(days=1))

Timestamp('2023-10-26 00:00:00')

In [174]:
spiders.loc[42, 'Date'] not in dates

False

### Functions to create new features

In [193]:
# The four public functions differ only in how many calendar months they look
# ahead, so the date arithmetic lives in one shared private helper.

def _first_observed_date_after_months(date, months):
    """Return the first date in ``spiders["Date"]`` on or after ``date`` + ``months`` months.

    Parameters
    ----------
    date : pandas.Timestamp
        Starting date.
    months : int
        Number of calendar months to add before looking for an observed date.

    Returns
    -------
    pandas.Timestamp or float
        The earliest date present in ``spiders["Date"]`` that is not before the
        shifted date, or ``np.nan`` when ``date`` is missing or the shifted date
        lies beyond the last observed date.
    """
    if pd.isna(date):
        return np.nan

    # Sort explicitly so "last observed date" does not depend on the row order of `spiders`.
    observed_dates = pd.DatetimeIndex(spiders["Date"].unique()).dropna().sort_values()
    target = date + pd.DateOffset(months=months)

    # Binary search for the first observed date >= target; this replaces stepping
    # forward one calendar day at a time and re-scanning the whole date list.
    position = observed_dates.searchsorted(target, side="left")
    if position >= len(observed_dates):
        return np.nan
    return observed_dates[position]


def one_month_later(date):
    """Return the first observed date at least one calendar month after ``date`` (``np.nan`` if none)."""
    return _first_observed_date_after_months(date, months=1)


def three_months_later(date):
    """Return the first observed date at least three calendar months after ``date`` (``np.nan`` if none)."""
    return _first_observed_date_after_months(date, months=3)


def six_months_later(date):
    """Return the first observed date at least six calendar months after ``date`` (``np.nan`` if none)."""
    return _first_observed_date_after_months(date, months=6)


def twelve_months_later(date):
    """Return the first observed date at least twelve calendar months after ``date`` (``np.nan`` if none)."""
    return _first_observed_date_after_months(date, months=12)

In [194]:
# Initialize these columns

spiders["one_month_later"] = 0
spiders["three_months_later"] = 0
spiders["six_months_later"] = 0
spiders["twelve_months_later"] = 0

In [195]:
spiders.head(2)

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_month_later,three_months_later,six_months_later,twelve_months_later
0,1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228,0,0,0,0
1,1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892,0,0,0,0


In [196]:
# Checking data/type

spiders["Date"][0]

Timestamp('1998-12-22 00:00:00')

In [197]:
# showing that the function does work, January 22 1999 is one month later and is a date of observations

one_month_later(spiders["Date"][0])

Timestamp('1999-01-22 00:00:00')

### Creating new features for the future dates to be targeted

In [198]:
# Rows for different tickers repeat the same dates (59,633 rows but only 6,251
# distinct dates), so each look-ahead date is resolved once per distinct date
# and then mapped back onto the rows instead of being recomputed for every row.

unique_dates = pd.Series(spiders["Date"].unique())

horizon_lookups = {
    "one_month_later": one_month_later,
    "three_months_later": three_months_later,
    "six_months_later": six_months_later,
    "twelve_months_later": twelve_months_later,
}

for column, look_ahead in horizon_lookups.items():
    future_dates = unique_dates.apply(look_ahead)
    # Index the results by the originating date so Series.map can look them up.
    future_dates.index = pd.Index(unique_dates)
    spiders[column] = spiders["Date"].map(future_dates)

In [199]:
spiders.head()

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_month_later,three_months_later,six_months_later,twelve_months_later
0,1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228,1999-01-22,1999-03-22,1999-06-22,1999-12-22
1,1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892,1999-01-22,1999-03-22,1999-06-22,1999-12-22
2,1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.649,1999-01-22,1999-03-22,1999-06-22,1999-12-22
3,1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.009,1999-01-22,1999-03-22,1999-06-22,1999-12-22
4,1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0,-0.032286,,,,,,,,,7.2851,1999-01-22,1999-03-22,1999-06-22,1999-12-22


In [200]:
# Future Dates are date time objects

spiders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59633 entries, 0 to 59632
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   59633 non-null  datetime64[ns]
 1   Ticker                 59633 non-null  object        
 2   Adj Close              59633 non-null  float64       
 3   Close                  59633 non-null  float64       
 4   High                   59633 non-null  float64       
 5   Low                    59633 non-null  float64       
 6   Open                   59633 non-null  float64       
 7   Volume                 59633 non-null  float64       
 8   garman_klass_vol       59633 non-null  float64       
 9   RSI                    59413 non-null  float64       
 10  lowest_bollinger_band  59424 non-null  float64       
 11  lower_bollinger_band   59424 non-null  float64       
 12  20_day_SMA             59424 non-null  float64       
 13  o

# Re-establish the date as the index, try to create binary columns

In [201]:
spiders.set_index("Date", inplace=True)

In [202]:
spiders.head(2)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_month_later,three_months_later,six_months_later,twelve_months_later
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228,1999-01-22,1999-03-22,1999-06-22,1999-12-22
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892,1999-01-22,1999-03-22,1999-06-22,1999-12-22


In [205]:
spiders.groupby("Ticker", group_keys=False)["Adj Close"].agg('mean')

Ticker
XLB     34.733393
XLC     57.960035
XLE     40.074415
XLF     17.709913
XLI     41.127297
XLK     45.314988
XLP     31.292390
XLRE    32.039864
XLU     29.751282
XLV     49.507054
XLY     60.183651
Name: Adj Close, dtype: float64

In [251]:
spiders.loc[dates[0], :]

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_month_later,three_months_later,six_months_later,twelve_months_later,1-Month-Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.649,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.009,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0,-0.032286,,,,,,,,,7.2851,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLP,14.967611,26.5,26.53125,25.875,25.875,150300.0,-0.115433,,,,,,,,,2.2496,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLU,12.582784,29.828125,30.25,29.828125,30.25,7900.0,-0.297125,,,,,,,,,0.0994,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLV,17.505878,25.03125,25.0625,24.8125,24.8125,5700.0,-0.04695,,,,,,,,,0.0998,1999-01-22,1999-03-22,1999-06-22,1999-12-22,
1998-12-22,XLY,19.151007,25.46875,25.46875,25.3125,25.3125,3700.0,-0.030038,,,,,,,,,0.0709,1999-01-22,1999-03-22,1999-06-22,1999-12-22,


In [302]:
spiders.rename(columns={"one_month_later":"one_months_later"}, inplace=True)

In [256]:
# First Draft, did not work in the end

# def one_month_gainer(df):
#     for ticker in list(df["Ticker"].unique()):
#         ticker_df = df[df["Ticker"] == ticker]
#         while ticker_df.loc[ticker_df["one_months_later"]]:
#             try:
#                 ticker_df["1_Month_Positive"] = (ticker_df["Adj Close"] < ticker_df.loc[ticker_df["one_month_later"], "Adj Close"])
#             except KeyError:
#                 pass

#         df.loc[df["Ticker"] == ticker, "1-Month-Positive"] = ticker_df["1_Month_Positive"]
    
#     return df


In [307]:
# This function works!!!!

def one_month_gainer(df, num="One"):
    """Add a binary column flagging whether each ticker's price is higher at the look-ahead date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by date, with ``"Ticker"`` and ``"Adj Close"`` columns and a
        ``f"{num.lower()}_months_later"`` column holding each row's look-ahead date.
        The frame is modified in place.
        NOTE(review): assumes each ticker has at most one row per index date - confirm.
    num : str, default "One"
        Horizon name. Its lower-case form selects the look-ahead date column and
        the value as given names the output column ``f"{num}_Month_Positive"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. The new column is 1.0 when the ticker's
        ``"Adj Close"`` on the look-ahead date is greater than the row's own,
        0.0 when it is not, and NaN when the look-ahead date is missing, the
        ticker has no row on that date, or the price on that date is NaN.

    Raises
    ------
    KeyError
        If the look-ahead date column is not in ``df``. Previously this was
        swallowed and produced a column of NaN.
    """
    future_date_col = f"{num.lower()}_months_later"
    label_col = f"{num}_Month_Positive"

    if future_date_col not in df.columns:
        raise KeyError(
            f"column '{future_date_col}' is required to build '{label_col}' but is not in the DataFrame"
        )

    labels = np.full(len(df), np.nan)

    # `indices` maps each ticker to the integer row positions it occupies, which lets
    # results be written back by position even though the date index repeats across tickers.
    for positions in df.groupby("Ticker", sort=False).indices.values():
        ticker_df = df.iloc[positions]
        prices = ticker_df["Adj Close"]

        # Look up this ticker's price on every look-ahead date in one step; dates
        # that are missing (NaT) or have no row for the ticker come back as NaN.
        future_dates = pd.DatetimeIndex(pd.to_datetime(ticker_df[future_date_col]))
        future_prices = prices.reindex(future_dates).to_numpy(dtype=float)

        is_higher = (prices.to_numpy(dtype=float) < future_prices).astype(float)
        labels[positions] = np.where(np.isnan(future_prices), np.nan, is_higher)

    df[label_col] = labels
    return df


###  Based on the above function

In [308]:
def three_month_gainer(df, num="Three"):
    return one_month_gainer(df, num)


def six_month_gainer(df, num="Six"):
    return one_month_gainer(df, num)


def twelve_month_gainer(df, num="Twelve"):
    """Thin wrapper around ``one_month_gainer`` whose ``num`` defaults to "Twelve".

    Forwards ``df`` and ``num`` unchanged and returns whatever
    ``one_month_gainer`` returns.
    """
    labelled = one_month_gainer(df, num)
    return labelled

In [309]:
# Add the One_Month_Positive target column.
# one_month_gainer writes the column onto the frame it is given and returns that
# same frame, so bi_spiders and spiders refer to the same object after this call.


bi_spiders = one_month_gainer(spiders)

In [310]:
bi_spiders.head(2)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228,1999-01-22,1999-03-22,1999-06-22,1999-12-22,1.0
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892,1999-01-22,1999-03-22,1999-06-22,1999-12-22,0.0


In [321]:
# Adding the 3-, 6- and 12-month binary columns.
# Each call modifies bi_spiders in place; the frame returned by the last call is
# what gets displayed as this cell's output.

three_month_gainer(bi_spiders)
six_month_gainer(bi_spiders)
twelve_month_gainer(bi_spiders)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.750000,20.781250,1900.0,-0.116068,,,,,,,,,0.0228,1999-01-22,1999-03-22,1999-06-22,1999-12-22,1.0,1.0,1.0,1.0
1998-12-22,XLE,12.448157,23.265625,23.390625,23.187500,23.312500,15200.0,-0.152028,,,,,,,,,0.1892,1999-01-22,1999-03-22,1999-06-22,1999-12-22,0.0,1.0,1.0,1.0
1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.6490,1999-01-22,1999-03-22,1999-06-22,1999-12-22,0.0,1.0,1.0,1.0
1998-12-22,XLI,14.971390,23.281250,23.281250,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.0090,1999-01-22,1999-03-22,1999-06-22,1999-12-22,1.0,1.0,1.0,1.0
1998-12-22,XLK,24.243126,32.046875,32.500000,31.781250,32.406250,300500.0,-0.032286,,,,,,,,,7.2851,1999-01-22,1999-03-22,1999-06-22,1999-12-22,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-25,XLP,67.940002,67.940002,68.239998,67.419998,67.599998,11351100.0,0.000063,43.174669,66.122483,66.838991,67.555500,68.272008,68.988516,0.820658,-0.697945,771.1938,NaT,NaT,NaT,NaT,,,,
2023-10-25,XLRE,32.189999,32.189999,32.650002,32.099998,32.540001,8454400.0,0.000099,33.847969,32.120178,32.866839,33.613500,34.360161,35.106822,0.626503,-0.653811,272.1471,NaT,NaT,NaT,NaT,,,,
2023-10-25,XLU,59.349998,59.349998,59.470001,58.680000,58.919998,20654700.0,0.000069,46.305928,56.095167,57.214334,58.333500,59.452666,60.571832,1.211284,-0.672943,1225.8564,NaT,NaT,NaT,NaT,,,,
2023-10-25,XLV,126.510002,126.510002,127.459999,125.570000,127.070000,11427000.0,0.000104,37.148926,126.420510,127.778756,129.137002,130.495248,131.853494,1.483988,-0.873242,1445.6298,NaT,NaT,NaT,NaT,,,,


In [322]:
bi_spiders.head(2)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228,1999-01-22,1999-03-22,1999-06-22,1999-12-22,1.0,1.0,1.0,1.0
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892,1999-01-22,1999-03-22,1999-06-22,1999-12-22,0.0,1.0,1.0,1.0


In [306]:
# bi_spiders.drop(columns=["1-Month-Positive", "One-Month-Positive"], inplace=True)

### Visually inspecting some random dates

In [320]:
spiders.loc["2010-09-03", :][:3]

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Positive,Three_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2010-09-03,XLB,24.445906,32.73,32.75,32.450001,32.599998,7809600.0,-0.031965,58.475407,22.523344,23.042463,23.561582,24.080701,24.599821,0.665356,0.104667,190.9127,2010-10-04,2010-12-03,2011-03-03,2011-09-06,1.0,1.0
2010-09-03,XLE,34.812813,54.200001,54.439999,53.75,54.029999,13247100.0,-0.074554,54.335066,32.259655,33.139128,34.018601,34.898074,35.777547,1.108917,-0.159517,461.1688,2010-10-04,2010-12-03,2011-03-03,2011-09-06,1.0,1.0
2010-09-03,XLF,9.235112,11.795288,11.795288,11.657189,11.70593,108976122.0,-0.021644,54.527284,8.39304,8.646164,8.899289,9.152413,9.405537,0.240945,-0.072484,1006.4067,2010-10-04,2010-12-03,2011-03-03,2011-09-06,0.0,1.0


In [319]:
bi_spiders.loc["2010-08-03", :][:3]

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Positive,Three_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2010-08-03,XLB,24.102339,32.27,32.560001,32.200001,32.549999,9554200.0,-0.034813,58.020647,21.676848,22.430437,23.184027,23.937616,24.691206,0.750625,0.460585,230.2786,2010-09-03,2010-11-03,2011-02-03,2011-08-03,1.0,1.0
2010-08-03,XLE,35.77626,55.700001,56.060001,55.240002,55.490002,14254500.0,-0.074311,59.232019,32.829966,33.536847,34.243729,34.95061,35.657492,1.23776,0.389115,509.9727,2010-09-03,2010-11-03,2011-02-03,2011-08-03,0.0,1.0
2010-08-03,XLF,9.48316,12.112104,12.217709,12.095857,12.185215,81845251.0,-0.02423,54.170104,8.891043,9.075639,9.260234,9.44483,9.629425,0.28978,0.072944,776.1516,2010-09-03,2010-11-03,2011-02-03,2011-08-03,0.0,0.0


In [273]:
spiders[spiders["Ticker"] == "XLK"].loc[['1998-12-22', '1999-1-22'], "Adj Close"]

Date
1998-12-22    24.243126
1999-01-22    26.288010
Name: Adj Close, dtype: float64

In [269]:
spiders["Ticker"]

Date
1998-12-22     XLB
1998-12-22     XLE
1998-12-22     XLF
1998-12-22     XLI
1998-12-22     XLK
              ... 
2023-10-25     XLP
2023-10-25    XLRE
2023-10-25     XLU
2023-10-25     XLV
2023-10-25     XLY
Name: Ticker, Length: 59633, dtype: object

### Checking out the value counts of the target variables

In [325]:
# Pretty good balance, which should help with training

bi_spiders["One_Month_Positive"].value_counts()

One_Month_Positive
1.0    35707
0.0    23684
Name: count, dtype: int64

In [327]:
# Still reasonably balanced (about 1.9 positives per negative), which should help with training

bi_spiders["Three_Month_Positive"].value_counts()

Three_Month_Positive
1.0    38492
0.0    20426
Name: count, dtype: int64

In [328]:
# Slightly more imbalanced

bi_spiders["Six_Month_Positive"].value_counts()

Six_Month_Positive
1.0    39697
0.0    18539
Name: count, dtype: int64

In [330]:
# The most imbalanced, indicative of the long term uptrends in markets I suppose

bi_spiders["Twelve_Month_Positive"].value_counts()

Twelve_Month_Positive
1.0    41526
0.0    15346
Name: count, dtype: int64

In [335]:
bi_spiders.index[0], bi_spiders.index[-1]

(Timestamp('1998-12-22 00:00:00'), Timestamp('2023-10-25 00:00:00'))

# Future to-dos

In [118]:
pd.to_datetime("2023-03-10 00:00:00") - pd.to_datetime('2023-01-29')

Timedelta('40 days 00:00:00')

In [107]:
spiders.head(1)

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
0,1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228


In [112]:
# spiders.apply(one_month_later)

In [73]:
spiders.groupby("Ticker")["Adj Close"].apply(

'Date        Ticker\n1998-12-22  XLB       12.011767\n1998-12-23  XLB       12.137930\n1998-12-24  XLB       12.417266\n1998-12-28  XLB       12.309141\n1998-12-29  XLB       12.534411\n                        ...    \n2023-10-19  XLB       76.220001\n2023-10-20  XLB       75.300003\n2023-10-23  XLB       74.459999\n2023-10-24  XLB       75.339996\n2023-10-25  XLB       74.459999\nName: XLB, Length: 6251, dtype: float64'

# This was all intended to be for clustering and unsupervised learning, but my kernel died every time I attempted to run the code

In [9]:
first_month = pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/first_month_for_clusters.csv').set_index("Ticker")

In [10]:
first_month.describe()

Unnamed: 0,Adj Close,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,Mkt-RF,SMB,HML,RMW,CMA
count,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0
mean,80.912312,-0.016675,52.983209,77.146678,79.358676,81.570674,83.782672,85.99467,2.054617,0.072713,0.005862,0.044641,0.015797,-0.00419,-0.001865,0.000671,1.074641,0.060857,0.109225,-0.087549,0.204366
std,127.120915,0.021141,7.348326,119.630251,126.567087,133.570306,140.629991,147.738049,3.498124,3.688756,0.065844,0.04538,0.03386,0.029332,0.02266,0.020469,0.355909,0.468053,0.63235,0.75359,0.977439
min,7.628457,-0.141608,34.720317,6.098736,7.453916,8.271838,8.53426,8.796681,0.254239,-30.420512,-0.305013,-0.102977,-0.095143,-0.133946,-0.100787,-0.092039,0.363615,-0.757726,-1.413453,-2.987445,-2.137904
25%,34.801295,-0.021135,48.029423,33.670969,34.185573,34.693648,35.263859,35.988396,0.838569,-0.023736,-0.02065,0.025159,-0.000159,-0.018923,-0.011457,-0.00687,0.884639,-0.309634,-0.295009,-0.469367,-0.455296
50%,56.151548,-0.0116,53.325297,53.874267,55.044674,56.671385,57.669419,58.639578,1.336663,0.305626,0.0076,0.045585,0.017693,-0.00117,0.000944,0.000947,1.038932,0.050978,0.016935,-0.075542,0.12084
75%,81.304329,-0.002884,57.615901,78.60598,79.78621,80.96644,81.928434,82.624958,2.014971,0.803474,0.034713,0.072823,0.035928,0.010741,0.010404,0.012554,1.21826,0.369028,0.494978,0.313979,0.710429
max,1248.849976,0.001324,74.285022,1158.433266,1241.832887,1325.232507,1408.632128,1492.031749,31.678713,6.815738,0.265952,0.139612,0.090324,0.07564,0.06866,0.079019,2.985178,1.447644,1.837404,2.325522,3.780059


In [None]:
# Attempting to plot cluster metrics: fit KMeans once per candidate number of
# clusters, record the silhouette score and inertia of each fit, then plot both
# curves side by side.
cluster_counts = range(3, 5)  # candidate numbers of clusters tried below
silhouttes = []
inertias = []

for i in cluster_counts:
    clusters = KMeans(n_clusters=i, random_state=42, init='random')
    clusters.fit(first_month)
    silhouttes.append(silhouette_score(first_month, clusters.predict(first_month)))
    inertias.append(clusters.inertia_)

fig, ax = plt.subplots(ncols=2, figsize=(16,6))
ax[0].plot(cluster_counts, silhouttes)
ax[0].set_title("Silhoutte Scores for Various # of Clusters")
ax[0].set_xlabel("Number of clusters")

ax[1].plot(cluster_counts, inertias)
ax[1].set_title("Inertia Plots")
ax[1].set_xlabel("Number of clusters")

In [1]:
# pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/stocks_for_modeling_clusters').set_index(["Date", "Ticker"])

In [11]:
# Start with empty accumulators for the silhouette scores and inertias
silhouttes = []
inertias = []

In [None]:
# One-off run with two clusters: fit, then record its silhouette score and inertia.
clusters = KMeans(init='random', random_state=42, n_clusters=2).fit(first_month)
silhouttes.append(
    silhouette_score(first_month, clusters.predict(first_month))
)
inertias.append(clusters.inertia_)