# Requirements

In [1]:
import pandas as pd
import numpy as np

import datetime
import pytz
import requests
from pandas.io.json import json_normalize

import pickle

import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.linear_model import LinearRegression

# Dataset

In [None]:
# IEX chart endpoint; the two placeholders are filled with the ticker symbol
# and the history range, in that order.
IEX_API_URL_TEMPLATE = 'https://api.iextrading.com/1.0/stock/{}/chart/{}'

# History ranges accepted as the second placeholder of the URL template.
HIST_5Y, HIST_1Y, HIST_1M = '5y', '1y', '1m'

# Ticker symbols downloaded by the DJIA helpers, kept in download order.
DJIA_TICKERS = '''
    BA   PFE MCD WMT KO   MRK  HD   V   JNJ  VZ
    CSCO AXP TRV DIS MSFT UNH  DWDP CAT AAPL UTX
    MMM  JPM IBM GS  XOM  INTC NKE  CVX PG   WBA
'''.split()

In [None]:
def expand_date_in_dataframe(df):
    """Add calendar feature columns derived from ``df.date`` to ``df`` in place.

    Columns written: ``year``, ``month``, ``day``, ``week`` (ISO week number),
    ``dayofweek``, ``dayofyear`` and ``timestamp`` (the raw int64 value of the
    datetime64 column, i.e. nanoseconds since the epoch for ns-resolution data).

    Args:
        df: DataFrame with a datetime-like ``date`` column.

    Returns:
        None; ``df`` is modified in place.
    """
    dates = df.date.dt

    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day

    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0.
    # ``isocalendar().week`` is its replacement; it yields a nullable UInt32
    # column, so cast back to the plain int64 the old accessor produced.
    # The fallback keeps the function working on pandas < 1.1.
    if hasattr(dates, 'isocalendar'):
        df['week'] = dates.isocalendar().week.astype(np.int64)
    else:
        df['week'] = dates.week

    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['timestamp'] = df.date.values.astype(np.int64)

def format_dataframe(df):
    """Return a copy of ``df`` without the unused price/volume columns and
    with the calendar feature columns added.

    The input frame is left untouched; a ``KeyError`` is raised by pandas if
    any of the columns to drop is missing.
    """
    unused_columns = [
        'label',
        'change', 'changeOverTime', 'changePercent',
        'high', 'low', 'open',
        'unadjustedVolume', 'volume', 'vwap',
    ]
    trimmed = df.drop(columns=unused_columns)

    # Adds year/month/day/... columns to the trimmed copy in place.
    expand_date_in_dataframe(trimmed)
    return trimmed

def get_dataframe_for_ticker(ticker_symbol, hist_period=HIST_5Y, timeout=30):
    """Download and format the IEX chart history of a single ticker.

    Args:
        ticker_symbol: Ticker symbol; it is lower-cased for the URL and stored
            unchanged in the ``symbol`` column.
        hist_period: History range placed in the URL (one of the ``HIST_*``
            constants).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The DataFrame produced by ``format_dataframe`` for the downloaded
        rows, with ``symbol`` inserted as the first column beforehand.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = IEX_API_URL_TEMPLATE.format(ticker_symbol.lower(), hist_period)

    # Without a timeout a stalled connection would hang the notebook forever.
    r = requests.get(url=url, timeout=timeout)
    # Fail here with the HTTP status instead of later on a missing column.
    r.raise_for_status()
    df = json_normalize(r.json())

    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df.insert(loc=0, column='symbol', value=ticker_symbol)

    return format_dataframe(df)


def get_djia_dataframe(hist_period=HIST_5Y):
    """Download the history of every ticker in ``DJIA_TICKERS`` as one frame.

    Args:
        hist_period: History range forwarded to ``get_dataframe_for_ticker``.

    Returns:
        The per-ticker frames stacked in ticker order (each keeps its own
        index), or ``None`` when ``DJIA_TICKERS`` is empty.
    """
    frames = [get_dataframe_for_ticker(ticker_symbol, hist_period=hist_period)
              for ticker_symbol in DJIA_TICKERS]

    if not frames:
        return None

    # A single concat replaces the repeated ``DataFrame.append`` calls, which
    # copied the growing frame on every iteration and no longer exist in
    # pandas 2.0.
    return pd.concat(frames)


def update_djia_dataframe(df):
    """Bring a DJIA frame up to date by downloading the missing history.

    The number of whole days between the latest ``date`` in ``df`` and now
    selects the smallest history range that covers the gap.

    Args:
        df: Frame with ``symbol`` and ``date`` columns.

    Returns:
        ``df`` itself when less than one day is missing; otherwise a new frame
        made of ``df`` plus the freshly downloaded rows, where the downloaded
        row wins for a duplicated (symbol, date) pair.
    """
    last_recorded_day = max(df.date)
    days_to_update = (datetime.datetime.now() - last_recorded_day).days

    # Nothing to fetch: return before touching the network.
    if days_to_update < 1:
        return df

    if days_to_update < 28:
        hist_period = HIST_1M
    elif days_to_update < 365:
        hist_period = HIST_1Y
    else:
        hist_period = HIST_5Y

    last_df = get_djia_dataframe(hist_period)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    merged = pd.concat([df, last_df])
    return merged.drop_duplicates(['symbol', 'date'], keep='last')

In [2]:
class Dataset:
    """Holds the DJIA price history downloaded from the IEX chart endpoint.

    The data lives in ``self.dataframe`` (``None`` until it is loaded from a
    file or created from the API): the rows returned by the endpoint minus the
    dropped price/volume columns, plus a ``symbol`` column and the calendar
    feature columns derived from ``date``.
    """

    # Ticker symbols downloaded by createData()/updateData(), in download order.
    DJIA_TICKERS = [
        'BA',   'PFE', 'MCD', 'WMT', 'KO',   'MRK',  'HD',   'V',   'JNJ',  'VZ',
        'CSCO', 'AXP', 'TRV', 'DIS', 'MSFT', 'UNH',  'DWDP', 'CAT', 'AAPL', 'UTX',
        'MMM',  'JPM', 'IBM', 'GS',  'XOM',  'INTC', 'NKE',  'CVX', 'PG',   'WBA' ]

    # History ranges accepted as the second placeholder of the URL template.
    HIST_5Y = '5y'
    HIST_1Y = '1y'
    HIST_1M = '1m'

    # Placeholders: ticker symbol, then history range.
    __IEX_API_URL_TEMPLATE = 'https://api.iextrading.com/1.0/stock/{}/chart/{}'


    def __init__(self):
        self.dataframe = None

    @staticmethod
    def __expand_date_in_dataframe(df):
        """Add calendar feature columns derived from ``df.date`` to ``df`` in place."""
        dates = df.date.dt

        df['year'] = dates.year
        df['month'] = dates.month
        df['day'] = dates.day

        # ``Series.dt.week`` was removed in pandas 2.0; ``isocalendar().week``
        # replaces it but yields nullable UInt32, so cast back to int64.
        # The fallback keeps pandas < 1.1 working.
        if hasattr(dates, 'isocalendar'):
            df['week'] = dates.isocalendar().week.astype(np.int64)
        else:
            df['week'] = dates.week

        df['dayofweek'] = dates.dayofweek
        df['dayofyear'] = dates.dayofyear
        df['timestamp'] = df.date.values.astype(np.int64)


    @staticmethod
    def __format_dataframe(df):
        """Return a copy of ``df`` without the unused columns, with calendar features added."""
        formated_df = df.drop(['label',
                               'change', 'changeOverTime', 'changePercent',
                               'high', 'low', 'open',
                               'unadjustedVolume', 'volume', 'vwap'],
                              axis=1)

        Dataset.__expand_date_in_dataframe(formated_df)
        return formated_df


    @staticmethod
    def __get_dataframe_for_ticker(ticker_symbol, hist_period=HIST_5Y, timeout=30):
        """Download and format the chart history of a single ticker.

        Raises ``requests.HTTPError`` on an error status and
        ``requests.Timeout`` when the request exceeds ``timeout`` seconds.
        """
        url = Dataset.__IEX_API_URL_TEMPLATE.format(ticker_symbol.lower(), hist_period)

        # Without a timeout a stalled connection would hang the notebook forever.
        r = requests.get(url=url, timeout=timeout)
        # Fail here with the HTTP status instead of later on a missing column.
        r.raise_for_status()
        df = json_normalize(r.json())

        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        df.insert(loc=0, column='symbol', value=ticker_symbol)

        return Dataset.__format_dataframe(df)


    @staticmethod
    def __get_djia_dataframe(hist_period=HIST_5Y):
        """Download every ticker in ``DJIA_TICKERS`` and stack the frames.

        Returns ``None`` when ``DJIA_TICKERS`` is empty.
        """
        frames = [Dataset.__get_dataframe_for_ticker(ticker_symbol, hist_period=hist_period)
                  for ticker_symbol in Dataset.DJIA_TICKERS]

        if not frames:
            return None

        # One concat instead of repeated DataFrame.append (quadratic, and
        # removed in pandas 2.0).
        return pd.concat(frames)


    @staticmethod
    def __update_djia_dataframe(df):
        """Return ``df`` extended with the rows published since its latest date.

        ``df`` itself is returned when less than one whole day is missing.
        For a duplicated (symbol, date) pair the downloaded row wins.
        """
        last_recorded_day = max(df.date)
        days_to_update = (datetime.datetime.now() - last_recorded_day).days

        # Nothing to fetch: return before touching the network.
        if days_to_update < 1:
            return df

        # Pick the smallest history range that covers the gap.
        if days_to_update < 28:
            hist_period = Dataset.HIST_1M
        elif days_to_update < 365:
            hist_period = Dataset.HIST_1Y
        else:
            hist_period = Dataset.HIST_5Y

        last_df = Dataset.__get_djia_dataframe(hist_period)

        merged = pd.concat([df, last_df])
        return merged.drop_duplicates(['symbol', 'date'], keep='last')


    def loadDataFromFile(self, file_name):
        """Replace ``self.dataframe`` with the object pickled in ``file_name``.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only load
        files you created yourself.
        """
        with open(file_name, 'rb') as fp:
            self.dataframe = pickle.load(fp)

    # Original (misspelled) name, kept so existing callers keep working.
    loaDataFromFile = loadDataFromFile


    def saveDataToFile(self, file_name=None):
        """Pickle ``self.dataframe`` to ``file_name``.

        ``file_name`` must be given; the ``None`` default is not a usable path.
        """
        with open(file_name, 'wb') as fp:
            pickle.dump(self.dataframe, fp)


    def createData(self):
        """Download the history of every DJIA ticker into ``self.dataframe``."""
        # Use the class's own downloader; the previous call to a module-level
        # ``get_djia_dataframe`` failed whenever that function was not defined.
        self.dataframe = Dataset.__get_djia_dataframe()


    def updateData(self):
        """Extend ``self.dataframe`` with the rows published since its latest date."""
        self.dataframe = Dataset.__update_djia_dataframe(self.dataframe)


    def getDataframe(self, ticker_symbol=None, from_date=None, to_date=None):
        """Return the stored frame, optionally filtered.

        Args:
            ticker_symbol: Keep only rows whose ``symbol`` equals this value.
            from_date: Keep only rows with ``date`` on or after this value.
            to_date: Keep only rows with ``date`` on or before this value.

        Returns:
            ``self.dataframe`` itself when no filter is given, otherwise the
            filtered result of ``DataFrame.query``.
        """
        df = self.dataframe

        if ticker_symbol is not None:
            df = df.query("symbol == '{}'".format(ticker_symbol))
        if from_date is not None:
            df = df.query("date >= '{}'".format(from_date))
        if to_date is not None:
            df = df.query("date <= '{}'".format(to_date))

        return df

In [3]:
# Load the pickled snapshot into a fresh Dataset.
ds = Dataset()
ds.loaDataFromFile('data/djia_20140303-20190315.pkl')

# Show the date range covered by the stored snapshot.
df = ds.getDataframe()
print('{} --> {}'.format(min(df.date), max(df.date)))

# updateData() may download newer rows (network access); print the range
# again to see whether the snapshot was extended.
ds.updateData()
df = ds.getDataframe()
print('{} --> {}'.format(min(df.date), max(df.date)))



2014-03-03 00:00:00 --> 2019-03-15 00:00:00
2014-03-03 00:00:00 --> 2019-03-19 00:00:00


# Trading days

In [None]:
# Market holidays as datetimes; filled in place by load_market_holidays().
market_holidays = []

def load_market_holidays(market_holidays_file='market_holidays.txt'):
    """Reload ``market_holidays`` from a text file with one date per line.

    Args:
        market_holidays_file: Path of a file whose non-blank lines are dates
            in ``YYYY-MM-DD`` format.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not a ``YYYY-MM-DD`` date.

    The global list is only replaced after the whole file has been parsed, so
    a missing or malformed file leaves the previous holidays untouched.
    """
    with open(market_holidays_file) as f:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing.
        parsed = [datetime.datetime.strptime(line.strip(), '%Y-%m-%d')
                  for line in f if line.strip()]

    # Slice assignment mutates the existing list, so other references to
    # ``market_holidays`` keep seeing the current data.
    market_holidays[:] = parsed

# Populate market_holidays from the default 'market_holidays.txt' file when this cell runs.
load_market_holidays()

In [None]:
def is_trading_day(day):
    """Tell whether ``day`` is neither a weekend day nor a listed market holiday.

    ``day`` must provide ``weekday()`` (Monday is 0) and is compared for
    equality against the entries of ``market_holidays``.
    """
    # weekday() is 5 for Saturday and 6 for Sunday.
    if day.weekday() >= 5:
        return False
    return day not in market_holidays


def get_trading_days_in_range(start_date, end_date):
    """List the trading days from ``start_date`` to ``end_date``, both included.

    Days are visited in one-day steps starting at ``start_date``; an empty
    list is returned when ``start_date`` is after ``end_date``.
    """
    one_day = datetime.timedelta(days=1)

    def calendar_days():
        # Walk the calendar one day at a time until the end is passed.
        cursor = start_date
        while cursor <= end_date:
            yield cursor
            cursor = cursor + one_day

    return [candidate for candidate in calendar_days() if is_trading_day(candidate)]

In [None]:
def str_to_datetime(str_date):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    Raises ``ValueError`` when the string does not match that format.
    """
    parse = datetime.datetime.strptime
    return parse(str_date, '%Y-%m-%d')

def datetime_array_to_dataframe(days):
    """Wrap a sequence of days in a DataFrame with a single ``date`` column."""
    columns = dict(date=days)
    return pd.DataFrame(data=columns)

# Models

In [None]:
class StockForecasterModel:
    """Base class for per-ticker forecasters.

    Stores the ticker and the dataset; ``train`` and ``predict`` are
    placeholders that subclasses must override.
    """

    def __init__(self, ticker, dataset):
        self.ticker, self.dataset = ticker, dataset

    def getDataset(self):
        """Return the dataset currently attached to the model."""
        return self.dataset

    def setDataset(self, dataset):
        """Attach another dataset to the model."""
        self.dataset = dataset

    def train(self, start_date, end_date):
        """Fit the model on a date range; not implemented in the base class."""
        raise NotImplementedError("Please Implement this method")

    def predict(self, from_date, to_date):
        """Forecast a date range; not implemented in the base class."""
        raise NotImplementedError("Please Implement this method")

In [None]:
class LinearStockForecaster(StockForecasterModel):
    """Forecasts the ``close`` column with a linear regression.

    The features are every dataset column except ``symbol``, ``date`` and
    ``close``; ``predict`` rebuilds them for future days from the date alone.
    """

    def __init__(self, ticker, dataset):
        StockForecasterModel.__init__(self, ticker, dataset)

    def train(self, start_date, end_date):
        """Fit the regression on this ticker's rows dated within
        ``[start_date, end_date]`` (both included)."""
        training_set = self.dataset.query(
            "symbol == '{}' and date >= '{}' and date <= '{}'"
            .format(self.ticker, start_date, end_date))

        x = training_set.drop(['symbol', 'date', 'close'], axis=1)
        y = training_set.close

        # Remember the feature layout so predict() feeds the columns in
        # exactly the order the model was fitted on.
        self.feature_columns = list(x.columns)

        self.model = LinearRegression()
        self.model.fit(x, y)

    def predict(self, from_date, to_date):
        """Return one predicted close per trading day in
        ``[from_date, to_date]``; ``train`` must have been called first."""
        days_to_predict = get_trading_days_in_range(from_date, to_date)
        x = datetime_array_to_dataframe(days_to_predict)
        # The helper is named expand_date_in_dataframe; the previous call to
        # ``expand_date`` referred to a function that does not exist.
        expand_date_in_dataframe(x)
        x = x[self.feature_columns]
        return self.model.predict(x)

In [None]:
# Load the pickled snapshot through the Dataset class (the notebook defines
# no load_dataset_from_file helper) and keep the plain DataFrame, which is
# what the forecaster queries.
djia_dataset = Dataset()
djia_dataset.loaDataFromFile('data/djia_20140303-20190315.pkl')
djia_ds = djia_dataset.getDataframe()

# Train on three years of AAPL data, then forecast the following quarter.
m = LinearStockForecaster('AAPL', djia_ds)
m.train(str_to_datetime('2015-04-01'), str_to_datetime('2018-03-31'))
y = m.predict(str_to_datetime('2018-04-01'), str_to_datetime('2018-06-30'))

print(y)

# Analysis

In [None]:
def rmse(predictions, targets):
    """Root-mean-square error between two equally shaped value sequences.

    Args:
        predictions: Array-like of predicted values (list, ndarray, Series).
        targets: Array-like of observed values.

    Returns:
        ``sqrt(mean((predictions - targets) ** 2))`` as a float.
    """
    # Convert to plain float arrays so that lists are accepted and pandas
    # inputs are compared position by position instead of being aligned on
    # their index labels.
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(targets, dtype=float)
    return np.sqrt(((predicted - observed) ** 2).mean())

In [None]:
def perform_test(dataset, ticker_symbol, model_constructor,
                 training_start, training_end,
                 validation_days=(1, 7, 15, 30, 60, 90, 180)):
    """Train a model on one ticker and report its RMSE over several horizons.

    Args:
        dataset: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        ticker_symbol: Ticker whose rows are used.
        model_constructor: Callable ``(ticker_symbol, training_set)`` that
            returns an object with ``train`` and ``predict`` methods.
        training_start: First day of the training range (included).
        training_end: Last day of the training range (included).
        validation_days: Horizons, in days after the first validation day, to
            evaluate; they are processed in the given order.

    For each horizon the RMSE between the model's predictions and the actual
    closes is printed. The training data and the last evaluated horizon are
    then plotted. Returns None.

    NOTE(review): the RMSE assumes the model returns exactly one prediction
    per row of the validation set - confirm for the model in use.
    """
    query_template = "symbol == '{}' and date >= '{}' and date <= '{}'"

    training_set = dataset.query(
        query_template.format(ticker_symbol, training_start, training_end))
    model = model_constructor(ticker_symbol, training_set)
    model.train(training_start, training_end)

    # Validation always starts the day after training ends, for every horizon.
    validation_start = training_end + datetime.timedelta(days=1)

    validation_set = None
    preds = None
    for v in validation_days:
        validation_end = validation_start + datetime.timedelta(days=v)
        validation_set = dataset.query(
            query_template.format(ticker_symbol, validation_start, validation_end))
        preds = model.predict(validation_start, validation_end)
        print('Validation {} day{}, RMSE = {}'.format(v, 's' if v != 1 else '', rmse(preds, validation_set.close)))

    plt.plot(training_set.date, training_set.close, label='Training')
    # With no horizon there is nothing to validate; plot the training data only.
    if validation_set is not None:
        plt.plot(validation_set.date, validation_set.close, label='Validation')
        plt.plot(validation_set.date, preds, label='Prediction')
    # Without a legend the labels given above are never displayed.
    plt.legend()

In [None]:
# Evaluate the linear forecaster on AAPL: train on 2015-04-01..2018-03-31 and
# validate on the default horizons that follow the training range.
perform_test(djia_ds, 'AAPL', LinearStockForecaster,
             str_to_datetime('2015-04-01'), str_to_datetime('2018-03-31'))