# Preprocessing

## Import Packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

## Define functions to create dataframes
Pull only the data to be used by the model with appropriate naming conventions. 

Includes the engineered feature *differential*: $C_d - O_d$  (*daily close - daily open*)

In [2]:
def parser(x):
    """
    function to convert a date string into a datetime object

    x is a string laid out as YYYY-MM-DD; any other layout raises ValueError
    """
    iso_day_format = '%Y-%m-%d'
    return datetime.strptime(x, iso_day_format)

In [3]:
def get_company_data(name, filename):
    """
    function to pull only the data to be used by the model from the flat files

    name is a string used to add the company name to the columns

    filename is a string which must match the name of the flat file in the raw data folder

    returns a dataframe indexed by 'date' in which the open column has been replaced by
    '<name> differential' (close - open), the adjusted close and volume columns are
    renamed to '<name> adj close' and '<name> vol', and the high, low, close,
    dividend amount and split coefficient columns are dropped
    """

    df = pd.read_csv('raw data/' + filename + '.csv')

    # Parse the first column as the date explicitly instead of going through read_csv's
    # date_parser / dict-style parse_dates arguments, which are deprecated in pandas 2.x.
    df = df.rename(columns = {df.columns[0]: 'date'})
    df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')

    # create differential values in the open column location so the column order is kept
    df['1. open'] = df['4. close'] - df['1. open']

    df = df.drop(['2. high', '3. low', '4. close',
                  '7. dividend amount', '8. split coefficient'], axis = 1)
    df = df.rename(columns = {'1. open': name + ' differential',
                              '5. adjusted close': name + ' adj close',
                              '6. volume': name + ' vol'})

    return df.set_index('date')

In [4]:
def get_index_data(name, filename):
    """
    function to pull only the data to be used by the model from the flat files

    name is a string used to add the index name to the columns

    filename is a string which must match the name of the flat file in the raw data folder

    returns a dataframe indexed by 'date' in which the Open column has been replaced by
    '<name> differential' (Close - Open), the Adj Close and Volume columns are renamed
    to '<name> adj close' and '<name> vol', and the High, Low and Close columns are dropped
    """

    df = pd.read_csv('raw data/' + filename + '.csv')

    # Parse the first column as the date explicitly instead of going through read_csv's
    # date_parser / dict-style parse_dates arguments, which are deprecated in pandas 2.x.
    df = df.rename(columns = {df.columns[0]: 'date'})
    df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')

    # create differential values in the open column location so the column order is kept
    df['Open'] = df['Close'] - df['Open']

    df = df.drop(['High', 'Low', 'Close'], axis = 1)
    df = df.rename(columns = {'Open': name + ' differential',
                              'Adj Close': name + ' adj close',
                              'Volume': name + ' vol'})

    return df.set_index('date')

## Load data from flat files

In [5]:
def load_indicator(filename):
    """
    function to read one technical indicator flat file from the raw data folder

    filename is a string which must match the name of the flat file (without '.csv')

    returns a dataframe whose first column has been parsed as YYYY-MM-DD dates and
    set as the 'date' index; all other columns are left untouched
    """
    frame = pd.read_csv('raw data/' + filename + '.csv')
    # Parse the date column explicitly; read_csv's date_parser and dict-style
    # parse_dates arguments are deprecated in pandas 2.x.
    frame = frame.rename(columns = {frame.columns[0]: 'date'})
    frame['date'] = pd.to_datetime(frame['date'], format = '%Y-%m-%d')
    return frame.set_index('date')  # set DateTimeIndex


#historical stock data
amzn = get_company_data('amzn', 'amzn')
aapl = get_company_data('aapl', 'apple')
googl = get_company_data('googl', 'google')
msft = get_company_data('msft', 'msft')

#technical indicators (read from the msft_* flat files)
ma7 = load_indicator('msft_ma7')
ma21 = load_indicator('msft_ma21')
bbands = load_indicator('msft_bbands')
macd = load_indicator('msft_macd')

#indices
nasdaq = get_index_data('nasdaq', '^IXIC')
nyse = get_index_data('nyse', '^NYA')
sp500 = get_index_data('sp500', '^GSPC')
tb13 = get_index_data('tb13', '^IRX')

## Add Fourier Transforms
Fourier transforms are used to denoise the data and to add long- and short-term trends.

The 3-component transform captures the long-term trend, while the 9-component transform captures the short-term trend.

In [6]:
#add fourier transforms with 3, 6, and 9 components
close_fft = np.fft.fft(msft['msft adj close'].values)

fft_columns = {}
for num_ in [3, 6, 9]:
    # keep only the first num_ and last num_ frequency components and zero the rest,
    # then transform back to get a smoothed version of the series
    truncated_fft = np.copy(close_fft)
    truncated_fft[num_:-num_] = 0
    # NOTE: np.fft.ifft returns complex values, so these columns are complex-typed
    fft_columns['fft {}'.format(num_)] = np.fft.ifft(truncated_fft)

# The transform is computed from the msft series, so it is indexed by msft's own dates
# (indexing by another company's dates only works if both frames line up exactly).
fft_df = pd.DataFrame(fft_columns, index = msft.index)

## Create predictors and target dataframes

In [7]:
df_to_merge = [amzn, aapl, googl, msft, ma7, ma21, bbands, macd,  # list of data frames to merge
               nasdaq, nyse, sp500, tb13, fft_df]

#merge data into 1 dataframe (outer join on the date index, so no date is discarded)
predictors = pd.concat(df_to_merge, join = 'outer', axis = 1, sort = True)

# NOTE: rows containing NaN values produced by the outer join are kept (no dropna here)

# create target series, re-indexed onto the predictors' index so that target and
# predictors share the same rows in the same order; later positional (iloc) splits
# would otherwise pair rows from different dates whenever the merged index differs
# from msft's own index. When the two indexes already match this changes nothing.
target = msft['msft adj close'].reindex(predictors.index)

#drop target and other unnecessary data from predictors dataframe
predictors = predictors.drop(['msft adj close', 'Real Middle Band', 'MACD_Hist', 'MACD_Signal'],
                             axis = 1)

## Push predictors & target sets to flat files

In [8]:
# write the full (unsplit) predictors and target to the processed data folder
for frame, destination in [(predictors, 'processed data/predictors.csv'),
                           (target, 'processed data/target.csv')]:
    frame.to_csv(destination)

## Split into Testing & Training Sets

In [9]:
# testing data is 2016 - 2018, training data is 2010 - 2015
# NOTE(review): the split is positional - the last n_test_rows rows are held out - so it
# only matches the date ranges above if the rows are in ascending date order and 501 rows
# span that window (501 looks closer to two years of trading days) - TODO confirm
n_test_rows = 501
predictors_train, predictors_test = (predictors.iloc[:-n_test_rows],
                                     predictors.iloc[-n_test_rows:])
target_train, target_test = (target.iloc[:-n_test_rows],
                             target.iloc[-n_test_rows:])

In [10]:
# write each half of the train / test split to its own flat file
split_outputs = [
    (predictors_train, 'processed data/predictors_train.csv'),
    (predictors_test, 'processed data/predictors_test.csv'),
    (target_train, 'processed data/target_train.csv'),
    (target_test, 'processed data/target_test.csv'),
]
for frame, destination in split_outputs:
    frame.to_csv(destination)