### Setup

In [70]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf
from alpha_vantage.timeseries import TimeSeries
import time
import datetime
import matplotlib as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
%matplotlib inline

from keys import client_id, secret_key, av_key

### Datasets

#### CSV Files in `data/`

In [29]:
pd.set_option('display.max_rows', 150)

In [7]:
gme = pd.read_csv('data/STOCK_US_XNYS_GME.csv') # dataset is TINY. NEED MORE DATA
wsb_posts = pd.read_csv('data/reddit_wsb_posts.csv', index_col=0)

In [30]:
wsb_comments = pd.read_csv('data/wsb_comments_raw.csv', 
                           nrows=20000, 
                           error_bad_lines=False) # dataset is HUGE. Restricting to 20,000

#### Yahoo Finance

In [93]:
gme_df = yf.download('GME', start='2020-01-01', end='2021-03-28', progress=True)
gme_df['Change'] = gme_df.Close - gme_df.Open
gme_df['Put/Call OI Ratio'] = float(0)

[*********************100%***********************]  1 of 1 completed


In [96]:
gme_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Change,Put/Call OI Ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02,6.140000,6.470000,6.070000,6.310000,6.310000,4453600,0.170000,0.0
2020-01-03,6.210000,6.250000,5.840000,5.880000,5.880000,3543900,-0.330000,0.0
2020-01-06,5.800000,5.910000,5.600000,5.850000,5.850000,3394800,0.050000,0.0
2020-01-07,5.770000,5.830000,5.440000,5.520000,5.520000,5228000,-0.250000,0.0
2020-01-08,5.490000,5.850000,5.410000,5.720000,5.720000,5629400,0.230000,0.0
...,...,...,...,...,...,...,...,...
2021-03-22,205.259995,210.360001,186.199997,194.490005,194.490005,10061500,-10.769989,0.0
2021-03-23,197.500000,201.750000,177.550003,181.750000,181.750000,14429100,-15.750000,0.0
2021-03-24,157.979996,166.970001,118.620003,120.339996,120.339996,24177900,-37.639999,0.0
2021-03-25,123.489998,187.500000,116.900002,183.750000,183.750000,50962300,60.260002,0.0


#### Alpha Vantage API

In [100]:
ts = TimeSeries(key=av_key, output_format='pandas')
data, meta_data = ts.get_intraday(symbol='GME', interval='1min', outputsize='full')

In [102]:
data

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-03-26 20:00:00,178.51,179.00,178.50,178.50,2653.0
2021-03-26 19:59:00,179.01,179.25,179.00,179.00,1845.0
2021-03-26 19:58:00,179.02,179.02,179.02,179.02,384.0
2021-03-26 19:57:00,179.30,179.30,179.01,179.01,910.0
2021-03-26 19:56:00,179.75,179.75,179.40,179.40,795.0
...,...,...,...,...,...
2021-03-15 04:05:00,273.16,274.80,273.00,273.00,3254.0
2021-03-15 04:04:00,271.93,273.00,271.93,272.99,3584.0
2021-03-15 04:03:00,270.16,271.99,270.10,271.99,2736.0
2021-03-15 04:02:00,268.00,269.00,268.00,269.00,1417.0


In [111]:
def ticker_df(ticker_list):
    for ticker in ticker_list:
        ts = TimeSeries(key=av_key, output_format='pandas')
        f'{ticker}'_df, f'{ticker}'_meta = ts.get_intraday(symbol=f'{ticker.upper()}', interval='1min', outputsize ='full')
    print(f'{ticker_list} have been imported to DataFrame. \To call the df, type (ticker)_df')

SyntaxError: invalid syntax (<ipython-input-111-67714b1f59e2>, line 4)

In [103]:
qqq_ts = TimeSeries(key=av_key, output_format='pandas')
qqq_df, qqq_meta = qqq_ts.get_intraday(symbol='QQQ', interval='1min', outputsize='full')

In [104]:
spy_ts = TimeSeries(key=av_key, output_format='pandas')
spy_df, spy_meta = spy_ts.get_intraday(symbol='SPY', interval='1min', outputsize='full')

In [105]:
XRT_ts = TimeSeries(key=av_key, output_format='pandas')
XRT_df, XRT_meta = XRT_ts.get_intraday(symbol='XRT', interval='1min', outputsize='full')

In [106]:
iwm_ts = TimeSeries(key=av_key, output_format='pandas')
iwm_df, iwm_meta = iwm_ts.get_intraday(symbol='IWM', interval='1min', outputsize='full')

In [107]:
dow_ts = TimeSeries(key=av_key, output_format='pandas')
dow_df, dow_meta = dow_ts.get_intraday(symbol='DIA', interval='1min', outputsize='full')

In [108]:
vxx_ts = TimeSeries(key=av_key, output_format='pandas')
vxx_df, vxx_meta = vxx_ts.get_intraday(symbol='VXX', interval='1min', outputsize='full')

In [109]:
dow_df

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-03-26 19:59:00,330.5200,330.5200,330.5200,330.5200,1744.0
2021-03-26 19:53:00,330.7600,330.7600,330.6100,330.6100,426.0
2021-03-26 19:44:00,330.5300,330.5300,330.5300,330.5300,412.0
2021-03-26 19:43:00,330.7800,330.7800,330.7800,330.7800,826.0
2021-03-26 19:23:00,330.7700,330.7700,330.7700,330.7700,302.0
...,...,...,...,...,...
2021-03-15 04:22:00,328.6032,328.6032,328.6032,328.6032,210.0
2021-03-15 04:16:00,328.7230,328.7230,328.7230,328.7230,100.0
2021-03-15 04:13:00,328.4935,328.4935,328.4735,328.4735,5710.0
2021-03-15 04:09:00,328.5034,328.5034,328.5034,328.5034,101.0


# Cleaning

# EDA

# Feature Engineering

**QQQ** — Nasdaq  
**SPY** — S&P 500  
**XRT** — Retail (GME among these)  
**IWM** — Russel 2000  
**DJIA** — Dow Jones Industrial Average  
**VXX** — Barclay's ETF for VIX CBOE Volatility Index

# Modeling

# TBD

#### Reddit API

#### Pandas DataReader

In [83]:
# start = datetime.datetime(2020, 1, 1)
# end = datetime.datetime.now()
# df = web.DataReader("GME", start, end)
# df.reset_index(inplace=True)
# df.set_index("Date", inplace=True)
# df = df.drop("Symbol", axis=1)

# print(df.head())

NotImplementedError: data_source=datetime.datetime(2020, 1, 1, 0, 0) is not implemented

In [58]:
# wsb_comments[wsb_comments.associated_award != 'NaN']
wsb_comments[wsb_comments.author == 'LazyMeal']

Unnamed: 0_level_0,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,...,subreddit_id,total_awards_received,treatment_tags,top_awarded_type,edited,distinguished,comment_type,author_cakeday,editable,media_metadata
all_awardings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
[],,LazyMeal,,,[],,,,text,t2_4yxnnufz,...,t5_2th52,0,,,,,,,,
[],,LazyMeal,,,[],,,,text,t2_4yxnnufz,...,t5_2th52,0,,,,,,,,
[],,LazyMeal,,,[],,,,text,t2_4yxnnufz,...,t5_2th52,0,,,,,,,,


In [66]:
len(gme)

23

NASDAQ GME dataset is ridiculously short. Need to find a better dataset.

In [59]:
wsb_posts.head()

Unnamed: 0_level_0,score,id,url,comms_num,created,body,timestamp
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"It's not about the money, it's about sending a message. 🚀💎🙌",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
Math Professor Scott Steiner says the numbers spell DISASTER for Gamestop shorts,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
NEW SEC FILING FOR GME! CAN SOMEONE LESS RETARDED THAN ME PLEASE INTERPRET?,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
"Not to distract from GME, just thought our AMC brothers should be aware of this",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56
