# Sample project: stock prediction and trading

_Goal: to predict the price of a stock in the near future._



## Setup

Configure logging, import config, and connect to database.

In [2]:
%matplotlib inline

import os
import time
import datetime
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import config

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from data.database import Database
from data.api_manager import API_Manager

db = Database(config.database)
api = API_Manager(config.api_key)


## Fetch trades from API

Fetch all trades for ticker within date range and store them in the database.

In [3]:
ticker = 'MSFT'
date_from = '2015-01-01'
date_to = '2015-01-02' #'2020-12-31'
bar_frequency = '1S' # 1 second

ticker_details = db.get_ticker_details(ticker)
if ticker_details is None:
    ticker_details = db.store_ticker_details(api.get_ticker_details(ticker))

dates_with_trades = db.get_open_dates(ticker_details['exchange'], date_from, date_to)
dates_already_stored = db.get_stored_dates('trades', ticker)
dates_to_fetch = [d for d in dates_with_trades if d not in dates_already_stored]

logging.info(f'Fetching {len(dates_to_fetch)} days of {ticker} trades.')
for date in dates_to_fetch:
    
    # Download raw trades.
    time_before_fetch = time.time()
    trades = api.get_daily_trades(ticker, date)
    
    # Process date to calculate bars from raw trades.
    time_before_process = time.time()
    trades = pd.DataFrame(trades)[['t', 'p', 's']]
    trades = trades.rename(columns={'t': 'timestamp', 'p': 'price', 's': 'volume'})
    
    trades['time'] = pd.to_datetime(trades['timestamp']) - pd.DateOffset(hours=5) # America/New Yow
    trades_grouped = trades.groupby(
        pd.Grouper(key='time', freq=bar_frequency, sort=True)
    )
    def get_weighted_bars(x):
        cols = ['mean', 'median', 'std']
        if not any(x['volume']):
            return pd.Series([np.nan, np.nan, np.nan], index=cols)
        weighted = np.concatenate([[r.price] * r.volume for r in x.itertuples()])
        return pd.Series(
            [np.mean(weighted), np.median(weighted), np.std(weighted)], 
            index=cols
        )
    price_bars = trades_grouped['price'].agg(['count', 'mean', 'median', 'min', 'max', 'std'])
    price_bars_weighted = trades_grouped.apply(get_weighted_bars)
    volume_vars = trades_grouped['volume'].agg(['sum', 'mean', 'median', 'min', 'max', 'std'])
    
    # Store raw trades and bars in database.
    time_before_store = time.time()
    db.store_trades(ticker, date, trades)
    
    time_to_fetch = int(round(time_before_process - time_before_fetch))
    time_to_process = int(round(time_before_store - time_before_process))
    time_to_store = int(round(time.time() - time_before_store))
    logging.info(
        f'{ticker} {date} - '
        f'fetch time: {time_to_fetch}s, '
        f'process time: {time_to_process}s, '
        f'store time: {time_to_store}s'
    )

01:37:08 Fetching 1 days of MSFT trades.
01:37:11 MSFT 2015-01-02 - fetch time: 1s, store time: 2s


In [92]:
def get_weighted_bars(x):
    cols = ['mean', 'median', 'std']
    if not any(x['volume']):
        return pd.Series([np.nan, np.nan, np.nan], index=cols)
    weighted = np.concatenate([[r.price] * r.volume for r in x.itertuples()])
    return pd.Series(
        [np.mean(weighted), np.median(weighted), np.std(weighted)], 
        index=cols
    )
a = time.time()
price_bars_weighted = trades_grouped.apply(get_weighted_bars)
time.time()-a

27.26710557937622

In [93]:
# Try for mean https://stackoverflow.com/questions/26205922/calculate-weighted-average-using-a-pandas-dataframe
# Try for median https://stackoverflow.com/questions/29678166/pandas-weighted-median-of-grouped-observations
# For std, make own function

26.798645734786987

## Feature engineering

For every `n` seconds during open hours, calculate target price and features. For now, `n` is set to `1`. Features are stored back in the database as they can take a while to calculate. 

Feature ideas:

- Summary stats of previous prices and volumes (https://alphascientist.com/feature_engineering.html).
- Trading markers (https://blog.roboforex.com/blog/2020/01/10/creating-a-trading-strategies-based-on-the-mean-reversion-and-momentum/).
- Stats on quotes.
- Relevant news articles.
- Mentions on social networks.


In [None]:
def iuhn(frequency)

### Target variable
The future price to predict. The variable classified into either `buy`, `don't sell`, or `sell` depending on the relative increase from the current price to the price in `m` seconds. For now, `m` is set to `5`. 

In [3]:
ticker = 'MSFT'
exchange = db.get_ticker_details(ticker)['exchange']

frequency = '1S' # 1 second

stored_dates = sorted(db.get_stored_dates('trades', ticker))
holidays = dict(db.get_holidays(exchange))

trades

for date in stored_dates[::-1]:
    
    # Determine open hours (by Robinhood/Alpaca)
    halfday = (date in holidays and holidays[date] == 'half')
    start_time = datetime.time(9, 0) 
    close_time = datetime.time(15 if halfday else 18, 0)
    
    trades = pd.DataFrame(
        db.get_trades(ticker, date), 
        columns=['timestamp', 'price', 'volume']
    )
    trades['time'] = pd.to_datetime(trades['timestamp']) - pd.DateOffset(hours=5) # America/New Yow
    
    price_index = pd.date_range(
        datetime.datetime.combine(date, start_time),
        datetime.datetime.combine(date, close_time),
        freq=frequency,
        closed='left'
    )
    
    prices = pd.DataFrame(index=price_index).merge(
        trades.groupby(pd.Grouper(key='time', freq=frequency))['price'].median(),
        how='left',
        left_index=True,
        right_index=True
    )
    
    # Propegate last valid observation forward.
    prices = prices.fillna(method='ffill')

    break
    
prices

description = 'Price'
db.store_feature(ticker, 'target', prices, description)

### Check features for outliers

### Check feature importance

## Hyperparameter tuning

- Feature time window
- Target variable future time