# Week 2

Because of my lack of expertise when it comes to the stock market, I thought machine learning might be a better approach than manually trying to interpret complex technical indicators, as I did last week. This week I began refreshing my TensorFlow knowledge and, based on many articles that I read (such as this one: https://cs230.stanford.edu/projects_fall_2019/reports/26254244.pdf), I decided to work on creating a Long Short-Term Memory (LSTM) recurrent neural network, as this architecture is commonly used for time-series forecasting. I began by getting data using yahoo_fin and working on preprocessing it. By next week I hope to have built and trained my model so that I can start implementing my new technique. For my trades this week I reused my technical-indicator approach from last week.  

In [1]:
pip install yahoo_fin

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [4]:
from yahoo_fin import stock_info as si

In [4]:
# List the ticker symbols that yahoo_fin reports for NASDAQ (output shown below).
si.tickers_nasdaq()

['AACG',
 'AACI',
 'AACIU',
 'AACIW',
 'AADI',
 'AADR',
 'AAL',
 'AAME',
 'AAOI',
 'AAON',
 'AAPL',
 'AATC',
 'AAWW',
 'AAXJ',
 'ABCB',
 'ABCL',
 'ABCM',
 'ABEO',
 'ABGI',
 'ABIO',
 'ABMD',
 'ABNB',
 'ABOS',
 'ABSI',
 'ABST',
 'ABTX',
 'ABUS',
 'ABVC',
 'ACAD',
 'ACAH',
 'ACAHU',
 'ACAHW',
 'ACB',
 'ACBA',
 'ACBAU',
 'ACBAW',
 'ACBI',
 'ACCD',
 'ACER',
 'ACET',
 'ACEV',
 'ACEVU',
 'ACEVW',
 'ACGL',
 'ACGLN',
 'ACGLO',
 'ACHC',
 'ACHL',
 'ACHV',
 'ACIU',
 'ACIW',
 'ACKIT',
 'ACKIU',
 'ACKIW',
 'ACLS',
 'ACMR',
 'ACNB',
 'ACOR',
 'ACQR',
 'ACQRU',
 'ACQRW',
 'ACRS',
 'ACRX',
 'ACST',
 'ACT',
 'ACTD',
 'ACTDU',
 'ACTDW',
 'ACTG',
 'ACVA',
 'ACWI',
 'ACWX',
 'ACXP',
 'ADAG',
 'ADALU',
 'ADAP',
 'ADBE',
 'ADER',
 'ADERU',
 'ADERW',
 'ADES',
 'ADGI',
 'ADI',
 'ADIL',
 'ADILW',
 'ADMA',
 'ADMP',
 'ADN',
 'ADNWW',
 'ADOC',
 'ADOCR',
 'ADOCW',
 'ADP',
 'ADPT',
 'ADRE',
 'ADSK',
 'ADTN',
 'ADTX',
 'ADUS',
 'ADV',
 'ADVM',
 'ADVWW',
 'ADXN',
 'ADXS',
 'AEAC',
 'AEACU',
 'AEACW',
 'AEAEU',
 'AEHA'

In [5]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [10]:
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
import numpy as np
from collections import deque 

In [7]:
# Columns of the downloaded price DataFrame used as model inputs; get_data()
# reads this module-level list when scaling and when slicing window sequences.
features = ['adjclose', 'volume', 'open', 'high', 'low']

In [11]:
# Loads data from Yahoo Finance for a ticker string, ensures all required columns are in the
# DataFrame, adds a date column if it doesn't exist (using the index values), scales the data,
# then shuffles and splits the data into training and testing sets.
def get_data(ticker, window=50, predict=1):
    """Download price history for ``ticker`` and build windowed train/test arrays.

    The columns listed in the module-level ``features`` are min-max scaled,
    the target is ``adjclose`` shifted ``predict`` rows into the future, and
    every run of ``window`` consecutive rows becomes one input sequence. The
    sequences are then split into training and testing sets with
    ``train_test_split``.

    Args:
        ticker: Ticker symbol passed to ``si.get_data``.
        window: Number of consecutive rows in each input sequence.
        predict: How many rows ahead the target ``adjclose`` value lies.

    Returns:
        A dict with these keys:

        * ``'original df'``: unscaled copy of the downloaded DataFrame.
        * ``'scaler'``: the fitted ``MinMaxScaler`` for the ``features``
          columns, needed to map scaled values back to prices.
        * ``'last values'``: float32 array holding the final window of scaled
          feature rows followed by the last ``predict`` scaled rows.
        * ``'x train'`` / ``'x test'``: float32 arrays of shape
          ``(n, window, len(features))``.
        * ``'y train'`` / ``'y test'``: scaled target values for each sequence.
        * ``'test df'``: de-duplicated index of the dates on which the test
          sequences end.

    Raises:
        KeyError: If a column named in ``features`` is missing from the
            downloaded data.
    """
    df = si.get_data(ticker)
    output = {}
    # Keep an untouched copy: ``df`` is scaled and trimmed in place below.
    output['original df'] = df.copy()
    print(df.index)

    # Validate against the downloaded frame. Testing membership in ``features``
    # itself could never fail. KeyError is what ``df[features]`` would raise for
    # a missing column anyway, so the exception type is unchanged for callers.
    for col in features:
        if col not in df.columns:
            raise KeyError(f'Missing {col}')

    # Carry the date alongside the features so that, after shuffling, each
    # test sequence can still be traced back to the row it ends on.
    if 'date' not in df.columns:
        df['date'] = df.index

    scaler = preprocessing.MinMaxScaler()
    df[features] = scaler.fit_transform(df[features])
    # Without the fitted scaler, scaled predictions cannot be inverted.
    output['scaler'] = scaler

    n_features = len(features)

    df['predicted'] = df['adjclose'].shift(-predict)
    # Grab the trailing rows before dropna() removes them: their target is NaN
    # because it lies beyond the end of the data.
    last_values = np.array(df[features].tail(predict))
    df.dropna(inplace=True)

    predicted_val = df['predicted'].values
    features_dates = df[features + ['date']].values

    # A bounded deque acts as the sliding window: once it holds ``window``
    # rows, each append drops the oldest row and yields the next sequence.
    d = deque(maxlen=window)
    x, y = [], []
    for row, target in zip(features_dates, predicted_val):
        d.append(row)
        if len(d) == window:
            x.append(np.array(d))
            y.append(target)

    # Final feature values left in the window after sequencing, plus the rows
    # saved above: len(last_values) == window + predict.
    last_values = [row[:n_features] for row in d] + list(last_values)
    output['last values'] = np.array(last_values).astype(np.float32)

    x = np.array(x)
    y = np.array(y)

    output['x train'], output['x test'], output['y train'], output['y test'] = train_test_split(x, y, test_size=.2)

    # The last column of the last timestep is the date each test sequence ends on.
    dates = output['x test'][:, -1, -1]
    output['test df'] = output['original df'].loc[dates]
    # NOTE(review): this replaces the DataFrame with its de-duplicated index,
    # so 'test df' is an Index, not a DataFrame. Probably
    # ``test_df[~test_df.index.duplicated(keep='first')]`` was intended;
    # left unchanged in case later cells rely on the Index. Confirm and fix.
    output['test df'] = output['test df'].index.drop_duplicates()

    # Strip the date column and convert to float32 for the model.
    output['x train'] = output['x train'][:, :, :n_features].astype(np.float32)
    output['x test'] = output['x test'][:, :, :n_features].astype(np.float32)

    return output

In [12]:
# Run the preprocessing on AAPL with the default window and predict values;
# the notebook shows the printed index followed by the returned dict.
get_data('AAPL')

DatetimeIndex(['1980-12-12', '1980-12-15', '1980-12-16', '1980-12-17',
               '1980-12-18', '1980-12-19', '1980-12-22', '1980-12-23',
               '1980-12-24', '1980-12-26',
               ...
               '2021-11-16', '2021-11-17', '2021-11-18', '2021-11-19',
               '2021-11-22', '2021-11-23', '2021-11-24', '2021-11-26',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', length=10330, freq=None)


{'original df':                   open        high         low       close    adjclose  \
 1980-12-12    0.128348    0.128906    0.128348    0.128348    0.100453   
 1980-12-15    0.122210    0.122210    0.121652    0.121652    0.095213   
 1980-12-16    0.113281    0.113281    0.112723    0.112723    0.088224   
 1980-12-17    0.115513    0.116071    0.115513    0.115513    0.090408   
 1980-12-18    0.118862    0.119420    0.118862    0.118862    0.093029   
 ...                ...         ...         ...         ...         ...   
 2021-11-23  161.119995  161.800003  159.059998  161.410004  161.410004   
 2021-11-24  160.750000  162.139999  159.639999  161.940002  161.940002   
 2021-11-26  159.570007  160.449997  156.360001  156.809998  156.809998   
 2021-11-29  159.369995  161.190002  158.789993  160.240005  160.240005   
 2021-11-30  159.990005  165.520004  159.919998  165.300003  165.300003   
 
                volume ticker  
 1980-12-12  469033600   AAPL  
 1980-12-15  175884