# Preprocessing

## Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read data from flat files

In [2]:
def _read_indexed(path):
    """Load a CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# historical stock data
amzn = _read_indexed('raw data/amzn.csv')
aapl = _read_indexed('raw data/apple.csv')
goog = _read_indexed('raw data/google.csv')
msft = _read_indexed('raw data/msft.csv')

# technical indicators (all computed for msft, going by the file names)
ma7 = _read_indexed('raw data/msft_ma7.csv')
ma21 = _read_indexed('raw data/msft_ma21.csv')
bbands = _read_indexed('raw data/msft_bbands.csv')
macd = _read_indexed('raw data/msft_macd.csv')

# indices
nasdaq = _read_indexed('raw data/^IXIC.csv')
nyse = _read_indexed('raw data/^NYA.csv')
sp500 = _read_indexed('raw data/^GSPC.csv')
tb13 = _read_indexed('raw data/^IRX.csv')

## Merge data into one dataframe

In [3]:
# list of data frames to merge; the order here must match the order of the
# labels in `columns` below, because the merged columns are renamed by position
df_to_merge = [
    amzn, aapl, goog, msft,        # historical stock data
    ma7, ma21, bbands, macd,       # msft technical indicators
    nasdaq, nyse, sp500, tb13,     # indices and 13 week treasury bill rate
]

columns = [ #to be used to rename columns in merged dataframe for clarity
    #amazon
    'amzn open', 'amzn high', 'amzn low', 'amzn close', 'amzn adj close',
    'amzn volume', 'amzn dividend amt', 'amzn split coef',
    #apple
    'aapl open', 'aapl high', 'aapl low', 'aapl close', 'aapl adj close',
    'aapl volume', 'aapl dividend amt', 'aapl split coef',
    #google
    'googl open', 'googl high', 'googl low', 'googl close', 'googl adj close',
    'googl volume', 'googl dividend amt', 'googl split coef',
    #microsoft
    'msft open', 'msft high', 'msft low', 'msft close', 'msft adj close',
    'msft volume', 'msft dividend amt', 'msft split coef',
    #microsoft 7 day moving average
    'msft ma7',
    #microsoft 21 day moving average
    'msft ma21',
    #microsoft bollinger bands
    'msft Real Lower Band', 'msft Real Middle Band', 'msft Real Upper Band',
    #microsoft moving average convergence divergence
    'msft MACD', 'msft MACD_Hist', 'msft MACD_Signal',
    #nasdaq index
    'nasdaq Open', 'nasdaq High', 'nasdaq Low', 'nasdaq Close',
    'nasdaq Adj Close', 'nasdaq Volume',
    #nyse index
    'nyse Open', 'nyse High', 'nyse Low', 'nyse Close',
    'nyse Adj Close', 'nyse Volume',
    #s&p 500
    'sp500 Open', 'sp500 High', 'sp500 Low', 'sp500 Close',
    'sp500 Adj Close', 'sp500 Volume',
    #13 week treasury bill interest rate
    'tb13 Open', 'tb13 High', 'tb13 Low', 'tb13 Close',
    'tb13 Adj Close', 'tb13 Volume'
]

# merge data into one data frame; the outer join keeps every index label that
# appears in any of the source frames (no placeholder frame is needed first,
# since pd.concat builds the result from scratch)
predictors = pd.concat(df_to_merge, join='outer', axis=1, sort=True)

# the rename below is purely positional, so fail early with a readable message
# if the raw files do not provide exactly the columns listed above
if predictors.shape[1] != len(columns):
    raise ValueError(
        'merged data has %d columns but %d column names were provided; '
        'check the raw data files against the `columns` list'
        % (predictors.shape[1], len(columns))
    )
predictors.columns = columns #rename columns

## Create predictors and target dataframes

In [4]:
# drop target from predictors dataframe (reassign rather than mutate in place)
predictors = predictors.drop(columns=['msft adj close'])

# create target series from the raw msft frame, reindexed onto the predictors
# index: predictors was built with an outer join and sort=True, so its rows may
# differ from msft's in number and order. Index labels that are missing from
# msft become NaN in target.
target = msft['5. adjusted close'].reindex(predictors.index)

In [5]:
# display the predictor column labels now that the target column has been dropped
predictors.columns

Index(['amzn open', 'amzn high', 'amzn low', 'amzn close', 'amzn adj close',
       'amzn volume', 'amzn dividend amt', 'amzn split coef', 'aapl open',
       'aapl high', 'aapl low', 'aapl close', 'aapl adj close', 'aapl volume',
       'aapl dividend amt', 'aapl split coef', 'googl open', 'googl high',
       'googl low', 'googl close', 'googl adj close', 'googl volume',
       'googl dividend amt', 'googl split coef', 'msft open', 'msft high',
       'msft low', 'msft close', 'msft volume', 'msft dividend amt',
       'msft split coef', 'msft ma7', 'msft ma21', 'msft Real Lower Band',
       'msft Real Middle Band', 'msft Real Upper Band', 'msft MACD',
       'msft MACD_Hist', 'msft MACD_Signal', 'nasdaq Open', 'nasdaq High',
       'nasdaq Low', 'nasdaq Close', 'nasdaq Adj Close', 'nasdaq Volume',
       'nyse Open', 'nyse High', 'nyse Low', 'nyse Close', 'nyse Adj Close',
       'nyse Volume', 'sp500 Open', 'sp500 High', 'sp500 Low', 'sp500 Close',
       'sp500 Adj Close', 'sp500

## Feature Engineering
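Add a *differential* feature for each stock and index: the daily close minus the daily open. The differential takes the place of the open column, and the close column is then removed from the predictors dataset.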

In [6]:
#create function to make differential column & drop open/close columns
def create_diff(df, opn, cls, stock):
    """
    Replace a stock's open and close columns with a single differential column.

    The differential is the closing price minus the opening price. The
    dataframe is modified in place: the differential is written where the
    open column was (so column order is kept), that column is renamed to
    '<stock> differential', and the close column is dropped. The index labels
    are also passed through str by the rename call.

    Parameters
    ----------
    df : dataframe to modify
    opn : name of the opening price column in df
    cls : name of the closing price column in df
    stock : name of the stock (string), used as the prefix of the new column

    Returns
    -------
    The same dataframe object that was passed in.
    """
    differential_label = stock + ' differential'
    daily_change = df[cls] - df[opn]

    # overwrite the open column so the new feature keeps its position
    df[opn] = daily_change
    df.rename(index=str, columns={opn: differential_label}, inplace=True)

    # the close column is no longer needed once the differential exists
    df.drop(labels=cls, axis='columns', inplace=True)
    return df

In [7]:
#create iterable list of open/close/name tuples to loop over
# stock columns were named with lowercase 'open'/'close';
# index columns were named with capitalised 'Open'/'Close'
open_close_name = (
    [(ticker + ' open', ticker + ' close', ticker)
     for ticker in ('amzn', 'aapl', 'googl', 'msft')]
    + [(index_name + ' Open', index_name + ' Close', index_name)
       for index_name in ('nasdaq', 'nyse', 'sp500', 'tb13')]
)

In [8]:
# replace each open/close pair in predictors with its differential column;
# create_diff modifies predictors in place, so its return value is not needed
for opn, cls, name in open_close_name:
    create_diff(df=predictors, opn=opn, cls=cls, stock=name)

In [9]:
# display the column labels after the open/close pairs were replaced by differentials
predictors.columns

Index(['amzn differential', 'amzn high', 'amzn low', 'amzn adj close',
       'amzn volume', 'amzn dividend amt', 'amzn split coef',
       'aapl differential', 'aapl high', 'aapl low', 'aapl adj close',
       'aapl volume', 'aapl dividend amt', 'aapl split coef',
       'googl differential', 'googl high', 'googl low', 'googl adj close',
       'googl volume', 'googl dividend amt', 'googl split coef',
       'msft differential', 'msft high', 'msft low', 'msft volume',
       'msft dividend amt', 'msft split coef', 'msft ma7', 'msft ma21',
       'msft Real Lower Band', 'msft Real Middle Band', 'msft Real Upper Band',
       'msft MACD', 'msft MACD_Hist', 'msft MACD_Signal',
       'nasdaq differential', 'nasdaq High', 'nasdaq Low', 'nasdaq Adj Close',
       'nasdaq Volume', 'nyse differential', 'nyse High', 'nyse Low',
       'nyse Adj Close', 'nyse Volume', 'sp500 differential', 'sp500 High',
       'sp500 Low', 'sp500 Adj Close', 'sp500 Volume', 'tb13 differential',
       'tb13 Hi