# Preprocessing

## Import Packages

In [1]:
import pandas as pd
import numpy as np

## Define functions to create dataframes
Pull only the data to be used by the model with appropriate naming conventions. 

Includes the engineered feature *differential*: $C_d - O_d$ (*daily close minus daily open*).

In [2]:
def get_company_data(name, filename):
    """
    Load one company's daily price file and keep only the columns the model uses.

    The file 'raw data/<filename>.csv' is read with its first column parsed as
    dates and used as the index. The returned frame has three columns, in this
    order:

        '<name> differential'  daily close minus daily open
        '<name> adj close'     taken from '5. adjusted close'
        '<name> vol'           taken from '6. volume'

    The index is named 'date' and holds each parsed timestamp rendered with
    str(), so the labels are strings rather than Timestamp objects.

    name is a string used to add the company name to the columns

    filename is a string which must match the name of the flat file in the raw data folder

    Raises KeyError if any of the expected numbered columns is missing.
    """

    # Parse column 0 as dates and make it the index. The positional list form
    # is used instead of the dict form of parse_dates, which recent pandas
    # releases deprecate.
    df = pd.read_csv('raw data/' + filename + '.csv', parse_dates=[0], index_col=0)

    # The dict form named the resulting index 'date'; keep that name regardless
    # of what the first column is called in the file.
    df.index.name = 'date'

    # Compute the engineered feature before the close column is discarded.
    differential = df['4. close'] - df['1. open']

    df = df.drop(['2. high', '3. low', '4. close',
                  '7. dividend amount', '8. split coefficient'], axis=1)

    # Store the differential in the open column's position so the column order
    # stays: differential, adjusted close, volume.
    df['1. open'] = differential

    # index=str converts every index label to its string form; the columns
    # mapping prefixes each kept column with the company name.
    df = df.rename(index=str, columns={'1. open': name + ' differential',
                                       '5. adjusted close': name + ' adj close',
                                       '6. volume': name + ' vol'})
    return df

In [3]:
def get_index_data(name, filename):
    """
    Load one market index's daily file and keep only the columns the model uses.

    The file 'raw data/<filename>.csv' is read with its first column parsed as
    dates and used as the index. The returned frame has three columns, in this
    order:

        '<name> differential'  daily close minus daily open
        '<name> adj close'     taken from 'Adj Close'
        '<name> vol'           taken from 'Volume'

    The index is named 'date' and holds each parsed timestamp rendered with
    str(), so the labels are strings rather than Timestamp objects.

    name is a string used to add the index name to the columns

    filename is a string which must match the name of the flat file in the raw data folder

    Raises KeyError if any of the expected columns is missing.
    """

    # Parse column 0 as dates and make it the index. The positional list form
    # is used instead of the dict form of parse_dates, which recent pandas
    # releases deprecate.
    df = pd.read_csv('raw data/' + filename + '.csv', parse_dates=[0], index_col=0)

    # The dict form named the resulting index 'date'; keep that name regardless
    # of what the first column is called in the file.
    df.index.name = 'date'

    # Compute the engineered feature before the close column is discarded.
    differential = df['Close'] - df['Open']

    df = df.drop(['High', 'Low', 'Close'], axis=1)

    # Store the differential in the open column's position so the column order
    # stays: differential, adjusted close, volume.
    df['Open'] = differential

    # index=str converts every index label to its string form; the columns
    # mapping prefixes each kept column with the index name.
    df = df.rename(index=str, columns={'Open': name + ' differential',
                                       'Adj Close': name + ' adj close',
                                       'Volume': name + ' vol'})
    return df

## Load data from flat files

In [4]:
# Historical stock data: one frame per company, columns prefixed with `name`.
amzn = get_company_data(name='amzn', filename='amzn')
aapl = get_company_data(name='aapl', filename='apple')
googl = get_company_data(name='googl', filename='google')
msft = get_company_data(name='msft', filename='msft')

# Technical indicators for msft, read as-is with the first column as the index.
# NOTE(review): these files are read without date parsing, so their index keeps
# the raw text from the file, while the helper functions return stringified
# parsed timestamps. Confirm the two label formats match, since the frames are
# later merged on the index.
ma7 = pd.read_csv('raw data/msft_ma7.csv', index_col=0)
ma21 = pd.read_csv('raw data/msft_ma21.csv', index_col=0)
bbands = pd.read_csv('raw data/msft_bbands.csv', index_col=0)
macd = pd.read_csv('raw data/msft_macd.csv', index_col=0)

# Indices: one frame per index file, columns prefixed with `name`.
nasdaq = get_index_data(name='nasdaq', filename='^IXIC')
nyse = get_index_data(name='nyse', filename='^NYA')
sp500 = get_index_data(name='sp500', filename='^GSPC')
tb13 = get_index_data(name='tb13', filename='^IRX')

## Create predictors and target dataframes

In [5]:
# Frames to merge, listed in the left-to-right column order of the result.
df_to_merge = [amzn, aapl, googl, msft, ma7, ma21, bbands, macd,
               nasdaq, nyse, sp500, tb13]

# Merge everything into one dataframe, side by side (axis=1). The outer join
# keeps every index label that appears in any frame; sort=True asks pandas to
# sort the row index of the result.
predictors = pd.concat(df_to_merge, join='outer', axis=1, sort=True)

# Drop the target column and the indicator columns that are not used as
# predictors.
predictors.drop(['msft adj close', 'Real Middle Band', 'MACD_Hist', 'MACD_Signal'],
                axis=1, inplace=True)

# The target is the msft adjusted close, copied so it is independent of `msft`.
# Selecting a single column yields a Series, not a DataFrame.
target = msft['msft adj close'].copy()

## Push predictors & target sets to flat files

In [6]:
# Write the predictors first, then the target, into the 'processed data' folder.
for frame, destination in ((predictors, 'processed data/predictors.csv'),
                           (target, 'processed data/target.csv')):
    frame.to_csv(destination)