# Pre-processing Data

In [14]:
import pandas as pd
import numpy as np
import os
import os.path as op
import yfinance as yf

In [15]:
### Directories
# The parent of the current working directory is treated as the project root.
root_dir = os.path.dirname(os.getcwd())
# Build the data path with os.path.join so the separator matches the host OS
# (a literal '\\' only forms a sub-directory on Windows).
dest_dir = os.path.join(root_dir, 'data')

#### Create Preprocessed Data Directory

In [16]:
def create_directory(logdir):
    '''
        Create the directory `logdir`, including any missing parent directories.

        Calling this on a directory that already exists is a no-op.
        If `logdir` exists but is not a directory, FileExistsError is raised
        instead of being silently ignored, so the problem surfaces here rather
        than at the first attempt to write a file inside it.
    '''
    os.makedirs(logdir, exist_ok=True)

In [17]:
# Make sure the output directory exists before any file is written into it.
create_directory(dest_dir)    

### Downloading Data
#### Get S&P 500 Constituents List

In [18]:
# Read the CSV file containing the constituents information
constituents = pd.read_csv("https://datahub.io/core/s-and-p-500-companies/r/constituents.csv")

# Get the ticker symbol for each constituent.
# The constituents file writes share classes with a dot (BRK.B, BF.B) while
# Yahoo Finance uses a dash (BRK-B, BF-B); convert so these symbols can be
# downloaded instead of coming back empty.
tickers = [symbol.replace('.', '-') for symbol in constituents['Symbol'].tolist()]

# Print the list of constituents
print(tickers)

['MMM', 'AOS', 'ABT', 'ABBV', 'ABMD', 'ACN', 'ATVI', 'ADM', 'ADBE', 'AAP', 'AMD', 'AES', 'AFL', 'A', 'APD', 'AKAM', 'ALK', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'ANTM', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'BKR', 'BLL', 'BAC', 'BBWI', 'BAX', 'BDX', 'BRK.B', 'BBY', 'BIO', 'TECH', 'BIIB', 'BLK', 'BK', 'BA', 'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF.B', 'CHRW', 'CDNS', 'CZR', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CTLT', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'CNC', 'CNP', 'CDAY', 'CERN', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CTXS', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CAG', 'COP', 'ED', 'STZ', 'CPRT', 'GLW', 'CTVA', 'COST', 'CTRA', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', '

#### Download Returns for all tickers From YFinance

In [None]:
# Collect one single-column frame of daily returns per ticker, then join them.
returns = []
for firm in tickers:
    # Price history for this ticker from 1990-01-01 onwards.
    history = yf.download(tickers=[firm], start="1990-01-01")
    # Day-over-day percentage change of the closing price.
    history["Return"] = history["Close"].pct_change()
    # Keep only the return column, labelled with the ticker symbol.
    returns.append(history[["Return"]].rename(columns={"Return": f"{firm}"}))

# Place the per-ticker columns side by side and drop tickers with no data at all.
df = pd.concat(returns, axis=1, sort=False).dropna(how='all', axis=1)

In [199]:
# Keep a separate copy of the downloaded returns and persist it as CSV.
archive_df = df.copy()
# op.join picks the right separator for the host OS, matching how the other
# output files in this notebook are placed under dest_dir.
archive_df.to_csv(op.join(dest_dir, "raw_data.csv"))

### Functions to Process Data

#### Create Dataset function

This function creates the initial dataset for financial time series analysis. It takes a dataframe containing stock market data, a split (bucket) number sp, and a destination directory dest_dir.

The first step in the function is to select only the companies that existed at the beginning of the testing period. To do this, the function checks which companies were present on the 750th day of the split which is the first day for the testing period. It then selects only the columns of the dataframe corresponding to these companies.

Next, the function calculates the target dataframe and after that we normalize the data.

Then, we further create slices for test and training data using the below mentioned functions.

In [None]:
def create_dataset(df_, sp, dest_dir):
    '''
        Build and save the training and test arrays for one split.

        df_      : dataframe of returns for this split (one column per company)
        sp       : split number, used to label the saved files
        dest_dir : output directory, forwarded to slice_test_dataset

        Only companies with a non-missing return on row 750 of the split
        (the first row of the testing period) are used.
        Rows [0, 750) feed the training slices; the test slices start 240 rows
        earlier than row 750 so the first test row has a full lookback window.
    '''
    # Companies that have a value on row 750 of the split.
    first_test_row = df_.iloc[750]
    cols = first_test_row.dropna().index.values
    df_X = df_[cols]

    # Labels are computed from the raw returns, features from the normalized ones.
    target_df = calculate_target_df(df_X)
    normalized_df = normalize_df(df_X)

    slice_train_dataset(normalized_df[:750], target_df[:750], sp=sp)

    test_start = 750 - 240
    slice_test_dataset(normalized_df[test_start:], target_df[test_start:], dest_dir, sp)

#### Calculate Target_Df (Y) function (Labeling)

This function is used to calculate the target dataframe, which is used for training the financial time series classification model. The function takes in a dataframe df containing stock market data.

Next, the function calculates the median return for each row (i.e., each day) in the prepared target dataframe. This median value is used as the threshold for classifying each stock's return as above or below the daily median.

Finally, the function creates a target dataframe with the same dimensions as the prepared target dataframe, where stock returns that are above the daily median are labeled as one, and zero otherwise.

The resulting target dataframe is used in the financial time series classification model to predict whether a stock's return will be at or above the daily median.

In [None]:
def calculate_target_df(df):
    '''
        Build the 0/1 classification labels from a dataframe of returns.

        The input is first cleaned with prepare_target_df. For every row, a
        cell is labeled 1 when its value is greater than or equal to that
        row's median, and 0 otherwise (missing values therefore stay 0).
        Returns a dataframe of labels with the same index and columns.
    '''
    cleaned = prepare_target_df(df)

    # Row-wise median across all columns.
    row_median = cleaned.median(axis=1)

    # True where the value is at or above its row's median.
    at_or_above = cleaned.sub(row_median, axis=0).ge(0)

    # Start from all zeros and switch the selected cells to one.
    labels = pd.DataFrame(0, index=cleaned.index, columns=cleaned.columns)
    labels[at_or_above] = 1

    return labels

#### Prepare Target_Df (Y) function

This function is used to prepare the target dataframe by cleaning the input dataframe df.

The function takes the input dataframe df, which contains stock market data, and creates a copy of it called copy_of_df. The reason for creating a copy is to avoid modifying the original dataframe.

The function then iterates over the columns in the input dataframe and over the rows from the 240th row until the end of the dataframe. For each row, the function checks whether the previous 240 rows (i.e., the previous 240 days) for that column contain any missing values (NaN) in original dataframe. If there are missing values, it means that there isn't enough history to calculate the returns, so the function replaces the return value for that row and column with NaN in the copy_of_df.

The function returns the copy_of_df dataframe with NaN values where there isn't enough history to calculate returns. This cleaned dataframe is then used to calculate the target dataframe in the calculate_target_df function.

In [None]:
def prepare_target_df(df, lookback=240):
    '''
        Clean dataframe to create targets.
        Remove any returns that don't have enough history so they don't count
        towards the labeling.

        df       : dataframe of returns (rows are time steps, columns are companies)
        lookback : number of preceding rows that must all be present (default 240)

        Returns a new dataframe in which every cell on row i >= lookback is
        replaced by NaN when the same column has at least one missing value in
        rows [i - lookback, i). Rows before `lookback` are left unchanged, and
        the input dataframe is not modified.
    '''
    # 1.0 where a value is missing, 0.0 otherwise, so a rolling sum counts gaps.
    missing = df.isna().astype(float)

    # The rolling sum at row i-1 covers rows [i-lookback, i-1]; shifting by one
    # row lines that count up with row i. Rows without a full window are NaN
    # and compare False, so they are never masked.
    gaps_before = missing.rolling(window=lookback, min_periods=lookback).sum().shift(1)
    not_enough_history = gaps_before.gt(0.5)

    # mask() builds a new frame, so the result never depends on whether a
    # chained `.iloc[i][col] = ...` assignment writes through to the parent.
    return df.mask(not_enough_history)

#### Normalising original dataframe (To be used as X in training data)

The normalize_df function takes a pandas DataFrame df as an input and returns a normalized version of it.

The normalization process involves subtracting the mean of the first 750 rows of the DataFrame (df.iloc[:750].mean()) from the entire DataFrame df, and then dividing the result by the standard deviation of the first 750 rows of the DataFrame (df.iloc[:750].std()). This effectively standardizes the data in the DataFrame, scaling it so that it has a mean of 0 and a standard deviation of 1.

The function then returns the normalized DataFrame.

#### Why do this?
Normalizing stock returns data using mean and standard deviation is a common technique in finance and statistics to remove the scale and capture the relative variation of the data.

Stock returns data often has large variations over time, and different stocks can have different levels of volatility. Normalizing the data using the mean and standard deviation allows us to standardize the returns data so that it can be compared across different stocks and time periods.

By subtracting the mean from each data point, we center the data around 0, which removes any inherent bias in the data. Then, by dividing by the standard deviation, we scale the data to have a standard deviation of 1. This is useful because it allows us to compare the volatility of different stocks and time periods on a relative basis.

Normalizing stock returns data can be particularly useful in financial modeling, such as in portfolio optimization or risk management, where we may want to analyze the risk and return characteristics of a portfolio of stocks or securities.

In [None]:
def normalize_df(df):
    '''
        Standardize every column of `df` using statistics of its first 750 rows.

        The per-column mean and standard deviation are computed on
        df.iloc[:750] only, then applied to all rows of `df`.
        Returns the standardized dataframe.
    '''
    reference = df.iloc[:750]
    centered = df.sub(reference.mean(), axis="columns")
    return centered.div(reference.std(), axis="columns")

#### Slicing for training dataset

The slice_train_dataset function takes two pandas DataFrames, df_X and df_target, as inputs, and a few optional parameters.

The function slices the input data into shorter windows of length 240, and creates a training dataset by concatenating these windows. For each window, the function takes a subset of the columns in df_X and the corresponding target values in df_target.

'sp' is a label that is used to identify the dataset in the saved files.

Here's a step-by-step breakdown of what the function does:

1. Create a list of column names cols from the columns in df_X.
2. Initialize empty lists X_list and Y_list.
3. Loop through each row in df_X, up to len(df_X)-240.
4. For each row and column, extract a window of length 240 from df_X starting at the current row and append it to X_list.
5. For the same row and column, extract the corresponding target value from df_target and append it to Y_list.
6. If any values in the window or target are NaN, skip this row and column.
7. After looping through all rows and columns, convert X_list and Y_list to NumPy arrays with the correct shape for training: X_train is a 3D array with shape (n_samples, window_length, n_features=1) and Y_train is a 2D array with shape (n_samples, 1).
8. Save the resulting X_train and Y_train arrays as NumPy binary files with labels based on the sp parameter.

In summary, this function prepares a dataset for training a machine learning model on a time series problem, where the goal is to predict future values of a target variable based on a window of past values. The function creates a training set by sliding a window of length 240 along the time series, and extracting a subset of columns from df_X and their corresponding target values from df_target. The function also saves the resulting dataset to disk for future use.

In [158]:
def slice_train_dataset(df_X, df_target, sp=None):
    '''
        Slice the training data into windows of 240 rows and save them.

        df_X      : dataframe of features (one column per company)
        df_target : dataframe of labels with the same rows and column names
        sp        : label used in the names of the saved files

        For every start row i and every column, the 240 values on rows
        [i, i + 240) form one sample and the label on row i + 240 is its
        target. A sample is skipped when the window contains a NaN, when the
        feature value on the target row is NaN (same rule as
        slice_test_dataset), or when the label itself is NaN.

        Saves X with shape (n_samples, 240, 1) and Y with shape (n_samples, 1)
        as .npy files in the module-level `dest_dir`.
    '''
    lookback = 240
    cols = df_X.columns

    # Work on plain arrays with positional indices: an integer key on a
    # date-indexed Series is not guaranteed to be treated as a position.
    features = df_X.to_numpy()
    labels = df_target[cols].to_numpy()

    X_list = []
    Y_list = []
    for i in range(len(df_X) - lookback):
        target_row = i + lookback
        for j in range(len(cols)):
            X = features[i:target_row, j]
            Y = labels[target_row, j]
            # The labels are 0/1 integers, so a missing target-day value can
            # only be detected on the feature frame.
            if np.isnan(X).any() or np.isnan(features[target_row, j]) or np.isnan(Y):
                continue
            X_list.append(X)
            Y_list.append(Y)

    X_train = np.array(X_list).reshape(-1, lookback, 1)
    Y_train = np.array(Y_list).reshape(-1, 1)
    np.save(op.join(dest_dir, 'study_period_X_'+str(sp)+'_train.npy'), X_train)
    np.save(op.join(dest_dir, 'study_period_Y_'+str(sp)+'_train.npy'), Y_train)

#### Slicing for test dataset

The slice_test_dataset function takes two pandas DataFrames, df_X and df_target, and two additional parameters dest_dir and sp, as inputs. The function is similar to slice_train_dataset but prepares a dataset for testing a machine learning model on a time series problem.

The function first creates a list of column names cols from the columns in df_X. It then initializes empty lists index_list, dates, X_list, and Y_list. index_list keeps track of the row and column index for each sample in the test dataset, and dates keeps track of the date corresponding to each row.

The function loops through each row in df_X, starting at index lookback, which is set to 240. For each row and column, the function extracts a window of length 240 from df_X ending at the current row, and appends it to X_list. It also extracts the corresponding target value from df_target and appends it to Y_list. The function also adds the row and column index to index_list and the date to dates.

If any values in the window or target are NaN, the function skips this row and column. After looping through all rows and columns, the function converts X_list and Y_list to NumPy arrays with the correct shape for testing: X_test is a 3D array with shape (n_samples, window_length, n_features=1) and Y_test is a 2D array with shape (n_samples, 1).

The function also saves the resulting X_test and Y_test arrays directly in the directory specified by dest_dir, and saves the columns, dates, and index_array in a subdirectory named sp{sp} under dest_dir.

In summary, this function prepares a dataset for testing a machine learning model on a time series problem, where the goal is to evaluate the model's performance on a hold-out set of data. The function creates a testing set by sliding a window of length 240 along the time series, and extracting a subset of columns from df_X and their corresponding target values from df_target. The function also saves the resulting dataset and related metadata to disk for future use.

In [202]:
def slice_test_dataset(df_X, df_target, dest_dir, sp):
    '''
        Slice the test data into windows of 240 rows and save them with metadata.

        df_X      : dataframe of features (one column per company)
        df_target : dataframe of labels with the same rows and column names
        dest_dir  : directory that receives the X / Y arrays
        sp        : label used in the file names and in the 'sp<sp>' sub-directory

        For every row i >= 240 and every column, the 240 values on rows
        [i - 240, i) form one sample and the label on row i is its target.
        A sample is skipped when the window contains a NaN or when the feature
        value on row i is NaN.

        Saved files:
          dest_dir/study_period_X_<sp>_test.npy : shape (n_samples, 240, 1)
          dest_dir/study_period_Y_<sp>_test.npy : shape (n_samples, 1)
          dest_dir/sp<sp>/columns.npy           : column names of df_X
          dest_dir/sp<sp>/dates.npy             : index value of every row i visited
          dest_dir/sp<sp>/index_array.npy       : [i - 240, column position] per kept sample
    '''
    lookback = 240
    cols = df_X.columns

    # Work on plain arrays with positional indices: an integer key on a
    # date-indexed Series is not guaranteed to be treated as a position.
    features = df_X.to_numpy()
    labels = df_target[cols].to_numpy()

    index_list, dates = [], []
    X_list = []
    Y_list = []
    for i in range(lookback, len(df_X)):
        # One date per visited row, whether or not any sample is kept for it.
        dates.append(df_X.index[i])
        for j in range(len(cols)):
            X = features[i - lookback:i, j]
            if np.isnan(X).any() or np.isnan(features[i, j]):
                continue
            index_list.append([i - lookback, j])
            X_list.append(X)
            Y_list.append(labels[i, j])

    columns = np.array(df_X.columns)
    dates_array = np.array(dates)
    index_array = np.array(index_list)
    X_test = np.array(X_list).reshape(-1, lookback, 1)
    Y_test = np.array(Y_list).reshape(-1, 1)

    inference_dir = op.join(dest_dir, 'sp'+str(sp))
    create_directory(inference_dir)
    np.save(op.join(inference_dir, 'columns.npy'), columns)
    np.save(op.join(inference_dir, 'dates.npy'), dates_array)
    np.save(op.join(inference_dir, 'index_array.npy'), index_array)
    np.save(op.join(dest_dir, 'study_period_X_'+str(sp)+'_test.npy'), X_test)
    np.save(op.join(dest_dir, 'study_period_Y_'+str(sp)+'_test.npy'), Y_test)

### Splitting in small buckets
Here we run the pre-processing code. We divide the stock data into small buckets of 1000 days and apply different data augmentation methods to each bucket.

#### Why do this?: Data Sparsity problems
In her paper titled "Evaluating Data Augmentation for Financial Time Series Classification" (Fons, 2020), Elizabeth Fons proposed a novel approach to address the data sparsity problem for individual stocks in financial time series data by dividing the S&P 500 stocks into small buckets of 1000 days and applying different data augmentation methods to each bucket.

The data sparsity problem arises due to the varying lengths of historical data available for each individual stock. To address this issue, we group stocks with similar amounts of historical data together into small buckets of 1000 days. By doing so, we ensure that each bucket contains a comparable amount of data for each stock, which makes it possible to apply data augmentation methods to each bucket separately.

This approach allows us to augment the data within each bucket using a variety of methods, such as random rotation, cropping, flipping, and scaling. By doing so, we can generate more training data for each stock and improve the performance of machine learning models that are trained on this data.

Overall, this method represents a practical approach to dealing with data sparsity in individual stocks and applying data augmentation methods effectively. By dividing the data into small buckets and applying different data augmentation methods to each bucket, we are able to mitigate the effects of data sparsity and generate more robust training data for machine learning models.

In [200]:
def process_dataset(dest_dir, df):
    '''
        Run create_dataset on every 1000-row split of `df`.

        dest_dir : output directory, forwarded to create_dataset
        df       : dataframe of returns covering the whole period

        Splits start every 250 rows and each spans 1000 rows, so consecutive
        splits overlap. Only full 1000-row splits are processed, including the
        one that ends exactly on the last row of `df`.
        The printed split number is 1-based; the label passed to
        create_dataset is 0-based.
    '''
    window = 1000
    step = 250
    # The last admissible start is len(df) - window (inclusive).
    for j, start in enumerate(range(0, len(df) - window + 1, step)):
        print("Split :"+str(j+1))
        create_dataset(df.iloc[start:start + window], j, dest_dir)

### Bringing everything together
Running the pre-processing code. It will take a lot of time. Total size on the disk ~ 15 GB.

In [80]:
# Run the whole pre-processing over the downloaded returns; output goes to dest_dir.
process_dataset(dest_dir, df)

Split :0
Split :1
Split :2
Split :3
Split :4
Split :5
Split :6
Split :7
Split :8
Split :9
Split :10
Split :11
Split :12
Split :13
Split :14
Split :15
Split :16
Split :17
Split :18
Split :19
Split :20
Split :21
Split :22
Split :23
Split :24
Split :25
Split :26
Split :27
Split :28
Split :29
