# Week 2

Because of my lack of expertise in the stock market, I thought machine learning might be a better approach than manually interpreting complex technical indicators, as I did last week. This week I began refreshing my TensorFlow knowledge and, based on many articles I read (such as this one: https://cs230.stanford.edu/projects_fall_2019/reports/26254244.pdf), I decided to build a Long Short-Term Memory (LSTM) recurrent neural network, an architecture commonly used for time-series forecasting. I started by downloading data with yahoo_fin and working on preprocessing it. By next week I hope to have built and trained my model so that I can start applying this new technique. For my trades this week I reused my technical-indicator approach from last week.

In [1]:
pip install yahoo_fin

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
from yahoo_fin import stock_info as si

In [4]:
# Ask yahoo_fin for the NASDAQ ticker symbols; the cell output below shows the returned list.
si.tickers_nasdaq()

['AACG',
 'AACI',
 'AACIU',
 'AACIW',
 'AADI',
 'AADR',
 'AAL',
 'AAME',
 'AAOI',
 'AAON',
 'AAPL',
 'AATC',
 'AAWW',
 'AAXJ',
 'ABCB',
 'ABCL',
 'ABCM',
 'ABEO',
 'ABGI',
 'ABIO',
 'ABMD',
 'ABNB',
 'ABOS',
 'ABSI',
 'ABST',
 'ABTX',
 'ABUS',
 'ABVC',
 'ACAD',
 'ACAH',
 'ACAHU',
 'ACAHW',
 'ACB',
 'ACBA',
 'ACBAU',
 'ACBAW',
 'ACBI',
 'ACCD',
 'ACER',
 'ACET',
 'ACEV',
 'ACEVU',
 'ACEVW',
 'ACGL',
 'ACGLN',
 'ACGLO',
 'ACHC',
 'ACHL',
 'ACHV',
 'ACIU',
 'ACIW',
 'ACKIT',
 'ACKIU',
 'ACKIW',
 'ACLS',
 'ACMR',
 'ACNB',
 'ACOR',
 'ACQR',
 'ACQRU',
 'ACQRW',
 'ACRS',
 'ACRX',
 'ACST',
 'ACT',
 'ACTD',
 'ACTDU',
 'ACTDW',
 'ACTG',
 'ACVA',
 'ACWI',
 'ACWX',
 'ACXP',
 'ADAG',
 'ADALU',
 'ADAP',
 'ADBE',
 'ADER',
 'ADERU',
 'ADERW',
 'ADES',
 'ADGI',
 'ADI',
 'ADIL',
 'ADILW',
 'ADMA',
 'ADMP',
 'ADN',
 'ADNWW',
 'ADOC',
 'ADOCR',
 'ADOCW',
 'ADP',
 'ADPT',
 'ADRE',
 'ADSK',
 'ADTN',
 'ADTX',
 'ADUS',
 'ADV',
 'ADVM',
 'ADVWW',
 'ADXN',
 'ADXS',
 'AEAC',
 'AEACU',
 'AEACW',
 'AEAEU',
 'AEHA'

In [5]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn import preprocessing, utils 
import numpy as np
from collections import deque 

In [7]:
# Price/volume columns that get_data requires in the downloaded frame and
# uses as the per-day inputs of every sequence it builds.
features = [
    'adjclose',
    'volume',
    'open',
    'high',
    'low',
]

In [8]:
def get_data(ticker, window=50, predict=1):
    """Download price history for *ticker* and turn it into training sequences.

    The frame returned by ``si.get_data`` must contain every column listed in
    the module-level ``features``. A ``date`` column is added from the index
    when missing, a ``future`` target column is created by shifting
    ``adjclose`` back by ``predict`` rows, and a sliding window of ``window``
    consecutive rows is paired with the ``future`` value of its last row.
    The pairs are split 80/20 in chronological order and each part is then
    shuffled.

    Args:
        ticker: Ticker symbol passed straight to ``si.get_data``.
        window: Number of consecutive rows in each input sequence.
        predict: Number of rows ahead that the target price lies.

    Returns:
        ``None`` (after printing a message) when the download raises
        ``TypeError`` or a required column is missing. Otherwise a dict with:

        * ``'DataFrame'``: copy of the frame as downloaded.
        * ``'window_seq'``: float32 array of the most recent feature rows.
        * ``'train_x'`` / ``'test_x'``: float32 arrays of feature sequences.
        * ``'train_y'`` / ``'test_y'``: the matching ``future`` targets.
        * ``'DataFrame_test'``: rows of ``'DataFrame'`` for the last date of
          each test sequence, with repeated dates removed.
    """
    try:
        df = si.get_data(ticker)
    except TypeError:
        # Nothing can be built without a frame; returning here avoids the
        # unbound-variable crash that followed the message before.
        print('ticker not string type')
        return None

    output = {'DataFrame': df.copy()}

    for col in features:
        if col not in df.columns:
            print(f'{col} missing')
            return None

    if 'date' not in df.columns:
        df['date'] = df.index

    # TODO: scale each feature column (e.g. preprocessing.MinMaxScaler);
    # not implemented yet.

    df['future'] = df['adjclose'].shift(-predict)
    # Captured before dropna so the newest rows (whose 'future' is NaN) are kept.
    # NOTE(review): this takes `window` rows, so it overlaps the deque rows
    # prepended below -- confirm whether `tail(predict)` was intended.
    window_seq = np.array(df[features].tail(window))
    df.dropna(inplace=True)

    feat_date = deque(maxlen=window)  # sliding window over the most recent rows
    feat_date_future = []
    for row, target in zip(df[features + ['date']].values, df['future'].values):
        feat_date.append(row)
        if len(feat_date) == window:
            feat_date_future.append([np.array(feat_date), target])

    # Strip the trailing date from the final window, then append the newest rows.
    window_seq = [row[:len(features)] for row in feat_date] + list(window_seq)
    output['window_seq'] = np.array(window_seq).astype(np.float32)

    x = np.array([seq for seq, _ in feat_date_future])
    y = np.array([target for _, target in feat_date_future])

    # Chronological split first, so the test part only holds the latest windows.
    train_size = int(.8 * len(x))
    # utils.shuffle returns shuffled copies (it does not work in place), so the
    # results have to be kept; x and y are shuffled with the same permutation.
    train_x, train_y = utils.shuffle(x[:train_size], y[:train_size])
    test_x, test_y = utils.shuffle(x[train_size:], y[train_size:])
    output['train_y'] = train_y
    output['test_y'] = test_y

    # The date is the last column of every row; take it from each window's last row.
    dates = test_x[:, -1, -1]
    test_df = output['DataFrame'].loc[dates]
    # Rows were selected by date label, so drop repeated dates and store the
    # result (the previous drop_duplicates call discarded its return value).
    output['DataFrame_test'] = test_df[~test_df.index.duplicated(keep='first')]

    # Remove the date column so the model inputs are purely numeric.
    output['train_x'] = train_x[:, :, :len(features)].astype(np.float32)
    output['test_x'] = test_x[:, :, :len(features)].astype(np.float32)
    return output

In [9]:
# Try the loader on the AAPL ticker; the cell output below is the returned dict.
get_data('AAPL')

{'DataFrame':                   open        high         low       close    adjclose  \
 1980-12-12    0.128348    0.128906    0.128348    0.128348    0.100453   
 1980-12-15    0.122210    0.122210    0.121652    0.121652    0.095213   
 1980-12-16    0.113281    0.113281    0.112723    0.112723    0.088224   
 1980-12-17    0.115513    0.116071    0.115513    0.115513    0.090408   
 1980-12-18    0.118862    0.119420    0.118862    0.118862    0.093029   
 ...                ...         ...         ...         ...         ...   
 2021-11-19  157.649994  161.020004  156.529999  160.550003  160.550003   
 2021-11-22  161.679993  165.699997  161.000000  161.020004  161.020004   
 2021-11-23  161.119995  161.800003  159.059998  161.410004  161.410004   
 2021-11-24  160.750000  162.139999  159.639999  161.940002  161.940002   
 2021-11-26  159.570007  160.449997  156.360001  156.809998  156.809998   
 
                volume ticker  
 1980-12-12  469033600   AAPL  
 1980-12-15  17588480