# Pre-processing Data

In [38]:
import pandas as pd
import numpy as np
import os
import os.path as op
import yfinance as yf

In [29]:
### Directories
# root_dir is the parent of the current working directory.
root_dir = os.path.dirname(os.getcwd())
# Join with os.path so the separator matches the host OS; a hard-coded '\\'
# would produce a folder literally named "<parent>\data" on Linux/macOS.
dest_dir = op.join(root_dir, 'data')

#### Create Preprocessed Data Directory

In [30]:
def create_directory(logdir):
    '''
        Create `logdir`, including any missing parent folders.

        Calling it again for a directory that already exists is a no-op.
        If the path exists but is NOT a directory (e.g. a regular file),
        FileExistsError is raised instead of being silently ignored, because
        every later save into that folder would fail anyway.
    '''
    os.makedirs(logdir, exist_ok=True)

In [31]:
# Make sure the output folder exists before any arrays are saved into it.
create_directory(dest_dir)    

### Processing Data

#### Get S&P 500 Constituents List

In [None]:
# Read the CSV file containing the constituents information
constituents = pd.read_csv("https://datahub.io/core/s-and-p-500-companies/r/constituents.csv")

# Get the ticker symbol for each constituent.
# Index listings write share classes with a dot (e.g. BRK.B) while Yahoo Finance
# expects a dash (BRK-B); without this conversion those downloads come back empty
# and the firms are dropped later together with the other all-NaN columns.
tickers = constituents['Symbol'].str.replace('.', '-', regex=False).tolist()

# Print the list of constituents
print(tickers)

#### Download Returns for all tickers From YFinance

In [71]:
def _download_daily_returns(ticker):
    '''
        Download the price history of one ticker (from 1990-01-01 onwards) and
        return a single-column frame of day-over-day changes of "Close",
        with the column named after the ticker.
    '''
    prices = yf.download(tickers=[ticker], start="1990-01-01")
    # assign() works on a copy, so the downloaded frame itself is not modified.
    with_return = prices.assign(Return=prices["Close"].pct_change())
    return with_return[["Return"]].rename(columns={"Return": f"{ticker}"})


# One frame per firm, placed side by side on the union of their dates.
returns = [_download_daily_returns(firm) for firm in tickers]
df = pd.concat(returns, axis=1, sort=False)
# Firms for which nothing usable came back are entirely NaN; drop those columns.
df = df.dropna(how='all', axis=1)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [72]:
# Display the combined returns frame: one row per date, one column per ticker.
df

Unnamed: 0_level_0,MMM,AOS,ABT,ABBV,ABMD,ACN,ATVI,ADM,ADBE,AAP,...,WHR,WMB,WYNN,XEL,XYL,YUM,ZBRA,ZBH,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,,,,,,,,,,,...,,,,,,,,,,
1990-01-03,0.010870,-0.009615,0.003610,,-0.010638,,,0.000000,0.049383,,...,0.003876,0.022013,,-0.003086,,,,,0.017857,
1990-01-04,0.007680,0.009708,-0.001799,,-0.032258,,,-0.005435,0.047059,,...,0.011583,-0.030769,,-0.024768,,,,,0.000000,
1990-01-05,-0.016768,-0.009615,-0.010811,,0.022222,,,-0.043716,0.022472,,...,-0.030534,0.003174,,-0.009524,,,,,0.000000,
1990-01-08,0.026357,0.009708,0.000000,,0.000000,,,0.011429,0.010989,,...,0.015748,0.012658,,-0.009615,,,,,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-07,0.003433,-0.012743,0.011196,0.000827,,0.011741,0.056161,0.001211,0.022893,-0.001319,...,-0.002400,0.012934,0.005829,0.001163,0.035752,0.002170,0.019864,0.018830,0.012754,0.003881
2023-02-08,-0.014115,-0.006303,-0.015984,-0.003652,,-0.010805,-0.035847,-0.012824,-0.014225,-0.017495,...,-0.037829,-0.009343,0.000773,-0.014954,-0.014197,0.016082,-0.016941,-0.001375,-0.026465,-0.021139
2023-02-09,-0.020130,-0.006041,-0.019328,0.028283,,0.001581,0.032515,-0.012255,-0.006740,0.020763,...,-0.019728,-0.015404,0.047964,-0.007811,-0.001694,-0.001978,-0.027585,-0.018890,-0.013123,-0.007774
2023-02-10,0.008412,0.004559,0.000833,0.022529,,-0.004208,0.002525,0.018486,-0.012826,-0.000724,...,0.002835,0.015964,0.000460,0.017231,0.001037,0.001144,-0.009793,-0.002962,-0.002470,-0.015422


In [73]:
def slice_test_dataset(df_X, df_target, dest_dir, sp, lookback=240):
    '''
        Build the test samples of one study period and save them as .npy files.

        For every row i >= lookback and every column, the `lookback` values
        preceding row i form one input sequence and df_target at row i is its
        label. A (row, column) pair is skipped when the sequence contains a NaN
        or when df_X itself is NaN at row i.

        Parameters
            df_X      : feature frame (rows = dates, columns = firms).
            df_target : label frame with the same rows, containing df_X's columns.
            dest_dir  : output folder.
            sp        : study-period identifier used in file and folder names.
            lookback  : sequence length (defaults to the original 240).

        Files written
            <dest_dir>/sp<sp>/columns.npy      column labels of df_X
            <dest_dir>/sp<sp>/dates.npy        index value of every row i >= lookback
            <dest_dir>/sp<sp>/index_array.npy  [i - lookback, column position] per kept sample
            <dest_dir>/study_period_X_<sp>_test.npy   shape (n_samples, lookback, 1)
            <dest_dir>/study_period_Y_<sp>_test.npy   shape (n_samples, 1)
    '''
    cols = df_X.columns
    # Plain arrays make every lookup positional; integer lookups on a
    # date-indexed Series (series[i]) are deprecated in pandas and slow.
    x_values = df_X.to_numpy(dtype=float)
    y_values = df_target[cols].to_numpy(dtype=float)

    # One date per candidate row, whether or not any sample is kept for it.
    dates = list(df_X.index[lookback:])
    index_list = []
    X_list = []
    Y_list = []
    for i in range(lookback, len(df_X)):
        window = x_values[i-lookback:i]
        # The filter looks at the feature frame at row i, not at the label.
        usable = ~(np.isnan(window).any(axis=0) | np.isnan(x_values[i]))
        for j in np.flatnonzero(usable):
            index_list.append([i-lookback, int(j)])
            X_list.append(window[:, j])
            Y_list.append(y_values[i, j])

    columns = np.array(df_X.columns)
    dates_array = np.array(dates)
    index_array = np.array(index_list)
    inference_dir = op.join(dest_dir, 'sp'+str(sp))
    X_test = np.array(X_list).reshape(-1,lookback,1)
    Y_test = np.array(Y_list).reshape(-1,1)
    create_directory(inference_dir)
    np.save(op.join(inference_dir, 'columns.npy'), columns)
    np.save(op.join(inference_dir, 'dates.npy'), dates_array)
    np.save(op.join(inference_dir, 'index_array.npy'), index_array)
    np.save(op.join(dest_dir, 'study_period_X_'+str(sp)+'_test.npy'), X_test)
    np.save(op.join(dest_dir, 'study_period_Y_'+str(sp)+'_test.npy'), Y_test)


In [74]:
def slice_dataset(df_X, df_target, cut_=None, sp=None, out_dir=None, lookback=240):
    '''
        Build the training samples of one study period and save them as .npy files.

        For every start row i in range(cut_) and every column, rows
        i .. i+lookback-1 of df_X form one input sequence and df_target at row
        i+lookback is its label. Pairs whose sequence or label contains a NaN
        are skipped.

        Parameters
            df_X      : feature frame (rows = dates, columns = firms).
            df_target : label frame with the same rows, containing df_X's columns.
            cut_      : number of start rows to use; None means every start row
                        that still has a label (len(df_X) - lookback).
            sp        : study-period identifier used in the file names.
            out_dir   : output folder; None falls back to the notebook-level
                        `dest_dir`, which is what the function always used before.
            lookback  : sequence length (defaults to the original 240).

        Files written
            <out_dir>/study_period_X_<sp>_train.npy   shape (n_samples, lookback, 1)
            <out_dir>/study_period_Y_<sp>_train.npy   shape (n_samples, 1)
    '''
    if cut_ is None:
        # range(None) would raise; default to every complete window.
        cut_ = max(len(df_X) - lookback, 0)
    target_dir = dest_dir if out_dir is None else out_dir

    cols = df_X.columns
    # Plain arrays make every lookup positional; integer lookups on a
    # date-indexed Series (series[i]) are deprecated in pandas and slow.
    x_values = df_X.to_numpy(dtype=float)
    y_values = df_target[cols].to_numpy(dtype=float)

    X_list = []
    Y_list = []
    for i in range(cut_):
        window = x_values[i:i+lookback]
        labels = y_values[i+lookback]
        keep = ~(np.isnan(window).any(axis=0) | np.isnan(labels))
        # Sample order is unchanged: start row first, then column order.
        X_list.extend(window[:, keep].T)
        Y_list.extend(labels[keep])

    X_train = np.array(X_list).reshape(-1,lookback,1)
    Y_train = np.array(Y_list).reshape(-1,1)
    np.save(op.join(target_dir, 'study_period_X_'+str(sp)+'_train.npy'), X_train)
    np.save(op.join(target_dir, 'study_period_Y_'+str(sp)+'_train.npy'), Y_train)

In [75]:
def normalize_df(df):
    '''
        Standardise the whole frame with a single mean and standard deviation.

        Both statistics are taken over all values in the first 750 rows
        (NaNs ignored, all columns pooled together) and then applied to
        every row of the frame.
    '''
    fit_block = df.values[:750]
    center = np.nanmean(fit_block)
    spread = np.nanstd(fit_block)
    shifted = df - center
    return shifted / spread

In [76]:
def prepare_target_df(df, lookback=240):
    ''' 
        Clean dataframe to create targets. 
        Remove any returns that don't have enough history so they don't count towards the labeling.

        A value at row i (for i >= lookback) is replaced by NaN when the same
        column of the ORIGINAL frame has at least one missing value in the
        `lookback` rows before it. The first `lookback` rows are never touched.
        Returns a new frame; `df` itself is not modified.
    '''
    n_rows = len(df)
    if n_rows <= lookback:
        return df.copy()

    # Prefix sums of the null indicator: prefix[k] = number of nulls in rows [0, k).
    is_null = df.isnull().to_numpy()
    prefix = np.vstack([
        np.zeros((1, is_null.shape[1]), dtype=np.int64),
        np.cumsum(is_null, axis=0, dtype=np.int64),
    ])
    # Nulls inside rows [i - lookback, i) for every i in [lookback, n_rows).
    nulls_in_history = prefix[lookback:n_rows] - prefix[:n_rows - lookback]

    drop = np.zeros(is_null.shape, dtype=bool)
    drop[lookback:] = nulls_in_history > 0
    # mask() writes the NaNs in one step. The previous `frame.iloc[i][col] = nan`
    # was chained assignment: it writes to a temporary row, which pandas does
    # not guarantee to propagate (never under Copy-on-Write).
    return df.mask(drop)

In [77]:
def calculate_target_df(df):
    ''' 
        Turn returns into binary class labels, one cross-section (row) at a time.

        After the frame is cleaned by prepare_target_df, each row's median is
        subtracted from it; entries at or above the median become 1, entries
        below it become 0, and missing entries stay NaN.
        Returns a dataframe with the classification labels.
    '''
    cleaned = prepare_target_df(df)
    row_median = cleaned.median(axis=1)
    excess = cleaned.subtract(row_median, axis=0)
    # The two conditions never overlap, so both can be read from `excess`.
    at_or_above = excess >= 0
    below = excess < 0
    return excess.mask(at_or_above, 1).mask(below, 0)

In [78]:
def create_dataset(df_, sp, dest_dir): 
    '''
        Build and save the train and test arrays for one study period.

        Only the columns that have a value at positional row 750 of the split
        (0-based) are used. Labels come from calculate_target_df and features
        from normalize_df, both computed on that column subset.

        Rows [0, 750) feed the training slices. Rows [510, end) feed the test
        slices, so the first test sample has a full 240-row history.
        Note that dest_dir is forwarded to slice_test_dataset only; the
        slice_dataset call receives no output folder from here.
    '''
    train_rows = 750
    history = 240

    present = df_.iloc[train_rows].dropna().index.values
    features = df_[present]
    labels = calculate_target_df(features)
    scaled = normalize_df(features)

    slice_dataset(scaled[:train_rows], labels[:train_rows],
                  cut_=train_rows - history, sp=sp)
    test_start = train_rows - history
    slice_test_dataset(scaled[test_start:], labels[test_start:], dest_dir, sp)

In [None]:
def process_dataset(dest_dir, df):
    '''
        Cut `df` into overlapping study periods and build a dataset for each one.

        Every period is 1000 consecutive rows; successive periods start 250
        rows apart. Period number j (0-based) is handed to create_dataset
        together with dest_dir, while the progress line prints j+1.

        A period is processed whenever all of its 1000 rows exist, including
        the one that ends exactly on the last row of `df` (the previous strict
        `<` comparison skipped that final window).
    '''
    window = 1000
    step = 250
    # Start rows 0, 250, 500, ... for which start + window <= len(df).
    starts = range(0, len(df) - window + 1, step)
    for j, count in enumerate(starts):
        print("Split :"+str(j+1))
        df_ = df.iloc[count:count+window]
        create_dataset(df_, j, dest_dir)

In [80]:
process_dataset(dest_dir, df)

Split :0
Split :1
Split :2
Split :3
Split :4
Split :5
Split :6
Split :7
Split :8
Split :9
Split :10
Split :11
Split :12
Split :13
Split :14
Split :15
Split :16
Split :17
Split :18
Split :19
Split :20
Split :21
Split :22
Split :23
Split :24
Split :25
Split :26
Split :27
Split :28
Split :29
