# Preprocessing

## Import Packages

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

## Define functions to create dataframes
Pull only the data to be used by the model with appropriate naming conventions. 

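Includes the engineered feature *differential*: $C_d - C_{d-1}$  (*adjusted daily close - previous day's adjusted close*), computed here for every series except Microsoft, which is differenced after the Fourier transforms are built.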

In [3]:
def parser(x):
    """Parse a 'YYYY-MM-DD' date string into a datetime object."""
    iso_day_format = '%Y-%m-%d'
    return datetime.strptime(x, iso_day_format)

In [4]:
def get_company_data(name, filename, data_dir = 'raw data'):
    """
    function to pull only the data to be used by the model from the flat files
    
    name is a string used to add the company name to the columns
    
    filename is a string which must match the name of the flat file (without the
    '.csv' extension) in the data folder
    
    data_dir is the folder holding the flat files; defaults to 'raw data'
    
    returns a dataframe indexed by 'date'. For 'msft' it holds the raw adjusted
    close and the volume; for any other name it holds only the day-over-day change
    in adjusted close, with rows containing NaN (such as the first, undefined
    difference) removed
    """
    
    df = pd.read_csv('{}/{}.csv'.format(data_dir, filename))
    
    #the first column of the flat file holds the trading day as YYYY-MM-DD;
    #parse it explicitly instead of going through read_csv's deprecated date_parser hook
    df = df.rename(columns = {df.columns[0]: 'date'})
    df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
    
    df = df.drop(['1. open', '2. high', '3. low', '4. close', '7. dividend amount', 
                  '8. split coefficient'], axis = 1)
    
    df = df.rename(columns = {'5. adjusted close': name + ' adj close',
                              '6. volume': name + ' vol'})
    
    #volume is kept only for msft; the other stocks contribute just their price change
    #we will stationarize (diff) the msft data after creating the fourier transforms
    if name != 'msft':
        df = df.drop(name + ' vol', axis = 1)
        df[name + ' adj close'] = df[name + ' adj close'].diff()
        df = df.dropna()
    
    return df.set_index('date')

In [5]:
def get_index_data(name, filename, data_dir = 'raw data'):
    """
    function to pull only the data to be used by the model from the flat files
    
    name is a string used to add the index name to the columns
    
    filename is a string which must match the name of the flat file (without the
    '.csv' extension) in the data folder
    
    data_dir is the folder holding the flat files; defaults to 'raw data'
    
    returns a dataframe indexed by 'date' whose single column holds the day-over-day
    change in adjusted close, with rows containing NaN (such as the first, undefined
    difference) removed
    """
    
    df = pd.read_csv('{}/{}.csv'.format(data_dir, filename))
    
    #the first column of the flat file holds the trading day as YYYY-MM-DD;
    #parse it explicitly instead of going through read_csv's deprecated date_parser hook
    df = df.rename(columns = {df.columns[0]: 'date'})
    df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
    
    df = df.drop(['High', 'Low', 'Open', 'Close', 'Volume'], axis = 1)
    
    df = df.rename(columns = {'Adj Close': name + ' adj close'})
    
    #stationarize: replace the price level with its change from the previous row
    df[name + ' adj close'] = df[name + ' adj close'].diff()
    
    df = df.dropna()
    
    return df.set_index('date')

## Extract data from flat files

In [6]:
#historical stock data
amzn = get_company_data('amzn', 'amzn')
aapl = get_company_data('aapl', 'apple')
googl = get_company_data('googl', 'google')
msft = get_company_data('msft', 'msft')

#technical indicators
def get_indicator_data(filename):
    """
    function to load one of the technical indicator flat files unchanged
    
    filename is a string which must match the name of the flat file in the raw data folder
    
    returns the file's contents as a dataframe indexed by 'date' (DatetimeIndex)
    """
    df = pd.read_csv('raw data/' + filename + '.csv')
    #the first column holds the trading day as YYYY-MM-DD; parse it explicitly
    #instead of going through read_csv's deprecated date_parser hook
    df = df.rename(columns = {df.columns[0]: 'date'})
    df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
    return df.set_index('date')

#NOTE(review): the displayed predictor columns list 'SMA' twice, presumably one per
#moving-average file - consider distinct names, but confirm what downstream code expects
ma7 = get_indicator_data('msft_ma7')
ma21 = get_indicator_data('msft_ma21')
bbands = get_indicator_data('msft_bbands')
macd = get_indicator_data('msft_macd')

#indices
nasdaq = get_index_data('nasdaq', '^IXIC')
nyse = get_index_data('nyse', '^NYA')
sp500 = get_index_data('sp500', '^GSPC')
tb13 = get_index_data('tb13', '^IRX')

## Add Fourier Transforms
Fourier transforms are used to denoise the data and add long- and short-term trends.

The 3-component transform captures long-term trends; the 9-component transform captures short-term trends.

In [7]:
#add fourier transforms with 3, 6, and 9 components
close_fft = np.fft.fft(np.asarray(msft['msft adj close'].tolist()))

#for each component count, zero every frequency bin except the first and last
#`n_keep` bins, then transform back; the inverse transform returns complex values
reconstructions = {}
for n_keep in [3, 6, 9]:
    truncated = np.copy(close_fft)
    truncated[n_keep:-n_keep] = 0
    reconstructions['fft {}'.format(n_keep)] = np.fft.ifft(truncated)

#one column per component count, aligned to the msft dates
fft_df = pd.DataFrame(reconstructions).set_index(msft.index)

## Difference Microsoft Data

In [8]:
#replace the msft adjusted close with its change from the previous row, then
#drop the rows left with missing values (the first difference is undefined)
msft = msft.assign(**{'msft adj close': msft['msft adj close'].diff()}).dropna()

## Create predictors and target dataframes

In [9]:
df_to_merge = [msft, ma7, ma21, bbands, macd, fft_df, amzn, aapl, googl,  
               nasdaq, nyse, sp500, tb13]  # list of data frames to merge

#merge data into 1 dataframe, aligning rows on the union of all dates
predictors = pd.concat(df_to_merge, join = 'outer', axis = 1, sort = True) 

#fill gaps by carrying the last known value forward; the result must be assigned
#back because ffill returns a new dataframe (the previous call misspelled the
#method name as 'ffil' and discarded the result)
#NOTE(review): rows before a column's first observation have nothing to carry
#forward and stay NaN - confirm whether they should be dropped
predictors = predictors.ffill()

target = predictors['msft adj close'].copy()  # create target series

#drop target and other unnecessary data from predictors dataframe
predictors = predictors.drop(['msft adj close', 'Real Middle Band', 'MACD', 'MACD_Signal'], 
                             axis = 1)

In [10]:
#display the column labels currently held by the predictors dataframe
predictors.columns

Index(['msft vol', 'SMA', 'SMA', 'Real Lower Band', 'Real Upper Band',
       'MACD_Hist', 'fft 3', 'fft 6', 'fft 9', 'amzn adj close',
       'aapl adj close', 'googl adj close', 'nasdaq adj close',
       'nyse adj close', 'sp500 adj close', 'tb13 adj close'],
      dtype='object')

## Split into Testing & Training Sets

In [9]:
# hold out the final 501 rows for testing and train on every row before them
# (stated intent: testing data is 2016 - 2018, training data is 2010 - 2015;
# NOTE(review): confirm that 501 rows actually spans that test period)
n_test_rows = 501
predictors_train, predictors_test = predictors.iloc[:-n_test_rows], predictors.iloc[-n_test_rows:]
target_train, target_test = target.iloc[:-n_test_rows], target.iloc[-n_test_rows:]

## Push to flat files

In [10]:
#write each split to its own flat file in the processed data folder
outputs = {
    'processed data/predictors_train.csv': predictors_train,
    'processed data/predictors_test.csv': predictors_test,
    'processed data/target_train.csv': target_train,
    'processed data/target_test.csv': target_test,
}
for path, frame in outputs.items():
    frame.to_csv(path)