In [0]:
import nltk
import gensim
import pandas as pd

import datetime

In [0]:
import spacy
# Download the large English model so that spacy.load("en_core_web_lg") can
# find it.
# NOTE(review): `!python` may resolve to a different interpreter than the one
# the kernel runs on — confirm the model lands in the kernel's environment.
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
# Load spaCy's large English pipeline once; aggregate_score reads this
# module-level `nlp` to turn each piece of text into a document vector.
nlp = spacy.load("en_core_web_lg")

In [0]:
def aggregate_score(all_relevant_entries, vector_size=300):
  """Sum the score-weighted spaCy vectors of every entry in a group.

  Columns are read by position: the first must hold the text to embed and the
  second its numeric weight (for the Reddit data these are title and score).

  Args:
    all_relevant_entries: DataFrame whose rows are the entries to combine.
    vector_size: Length of the vectors produced by the module-level `nlp`
      pipeline. Defaults to 300, the value previously hard-coded here.

  Returns:
    A numpy array of length `vector_size` holding the sum of
    `nlp(text).vector * weight` over all rows. For an empty frame the
    untouched start value, a plain list of `vector_size` zeros, is returned.
  """
  # Named `total` so the builtin sum() is not shadowed.
  total = [0] * vector_size
  for row in all_relevant_entries.values:
    text, weight = row[0], row[1]
    # Note: I'm using SpaCy's vectorizer here, but transition to Google's thing
    # if this doesn't work at all
    # list + ndarray is resolved by numpy as an element-wise add, so `total`
    # becomes an array after the first row.
    total = total + nlp(text).vector * weight
  return total

# Dealing with the Reddit data

In [0]:
# Load the Reddit posts; the first CSV column is used as the row index, leaving
# title, score and created_utc as the data columns.
red = pd.read_csv("reddits.csv", index_col=0)

In [0]:
def utc_to_date(utc):
  """Convert a Unix timestamp to its UTC calendar date as 'YYYY-MM-DD'.

  Args:
    utc: Seconds since the Unix epoch (int or float).

  Returns:
    The UTC date of that instant formatted as a 'YYYY-MM-DD' string.
  """
  # utcfromtimestamp() is deprecated since Python 3.12; an aware datetime
  # built with tz=UTC formats to the same date string.
  moment = datetime.datetime.fromtimestamp(utc, tz=datetime.timezone.utc)
  return moment.strftime('%Y-%m-%d')

In [0]:
# Preview the first rows to confirm the columns loaded as expected.
red.head()

Unnamed: 0,title,score,created_utc
0,802.11n officially makes its way to the Airpor...,0.300062,1205778551
1,Safari 3.1 available: busts out HTML 5 support...,0.334054,1205861839
2,How Apple Got Everything Right By Doing Everyt...,0.387689,1205908607
3,Unlimited iTunes Downloads Coming Soon?,0.320115,1205953302
4,Secrets: all your undocumented OS X settings i...,0.336608,1206037963


In [0]:
# Build a copy of the posts with an extra `date` column (the 'YYYY-MM-DD'
# string for each created_utc timestamp); the loaded frame `red` is untouched.
red2 = red.assign(date=red["created_utc"].apply(utc_to_date))

In [0]:
# Collapse each day's posts into a single score-weighted vector, one entry per
# date. aggregate_score reads columns by position, so every group must keep
# title as its first column and score as its second.
aggs = red2.groupby("date").apply(aggregate_score)

In [0]:
# Inspect the per-date aggregated vectors.
aggs

date
2008-03-17    [0.06911545991897583, 0.0015553233679383993, -...
2008-03-18    [-0.011221963912248611, 0.008152482099831104, ...
2008-03-19    [-0.05266345012933016, 0.14257743582129478, -0...
2008-03-20    [0.029394835233688354, 0.06772486865520477, -0...
2008-03-21    [-0.07712917774915695, 0.04496147111058235, -0...
                                    ...                        
2020-01-02    [-0.08120829425752163, 0.4266323000192642, -0....
2020-01-03    [-0.11732648708857596, 0.9997946191579103, -0....
2020-01-04    [-0.17635978001635522, 0.5973860220983624, -0....
2020-01-05    [-0.024435298517346382, 0.0701550617814064, -0...
2020-01-28    [-0.22190772369503975, 0.30561483185738325, -0...
Length: 3942, dtype: object

In [0]:
# Wrap the per-date vectors in a one-column frame. Passing index= aligns aggs
# to the dates in their order of first appearance in red2.
final_vectors = pd.DataFrame(data={"vector": aggs}, index=red2.date.unique())
final_vectors

Unnamed: 0,vector
2008-03-17,"[0.06911545991897583, 0.0015553233679383993, -..."
2008-03-18,"[-0.011221963912248611, 0.008152482099831104, ..."
2008-03-19,"[-0.05266345012933016, 0.14257743582129478, -0..."
2008-03-20,"[0.029394835233688354, 0.06772486865520477, -0..."
2008-03-21,"[-0.07712917774915695, 0.04496147111058235, -0..."
...,...
2020-01-02,"[-0.08120829425752163, 0.4266323000192642, -0...."
2020-01-03,"[-0.11732648708857596, 0.9997946191579103, -0...."
2020-01-04,"[-0.17635978001635522, 0.5973860220983624, -0...."
2020-01-05,"[-0.024435298517346382, 0.0701550617814064, -0..."


In [0]:
# Load the daily stock prices; the first CSV column is used as the row index.
stock_market = pd.read_csv("stock_market.csv", index_col=0)

In [0]:
# Keep only the price columns by discarding the calendar and timestamp fields.
sm = stock_market.drop(["year", "month", "day", "utc"], axis="columns")
sm

Unnamed: 0_level_0,1. open,4. close
index,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-21,318.62,313.05
2020-02-20,322.63,320.30
2020-02-19,320.00,323.62
2020-02-18,315.36,319.00
2020-02-14,324.73,324.95
...,...,...
2000-02-28,110.12,113.25
2000-02-25,114.81,110.37
2000-02-24,117.31,115.20
2000-02-23,113.23,116.25


In [0]:
# Rebuild `sm` from `stock_market` instead of mutating the previous `sm`, so
# re-running this cell cannot shift `open` a second time.
# Rows are ordered newest-first, so shift(1) pairs each date's close with the
# open of the next (more recent) trading day; the newest row has no such open
# and is removed by dropna.
sm = (
  stock_market
  .drop(columns=["year", "month", "day", "utc"])
  .rename(columns={"1. open": "open", "4. close": "close"})
  .assign(open=lambda prices: prices["open"].shift(1))
  .dropna(axis=0)
)
sm

Unnamed: 0_level_0,open,close
index,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-20,318.62,320.30
2020-02-19,322.63,323.62
2020-02-18,320.00,319.00
2020-02-14,315.36,324.95
2020-02-13,324.73,324.87
...,...,...
2000-02-28,113.56,113.25
2000-02-25,110.12,110.37
2000-02-24,114.81,115.20
2000-02-23,117.31,116.25


In [0]:
# Attach each trading day's aggregated Reddit vector. `on="index"` matches the
# "index" level of sm against the date index of final_vectors, and the inner
# join keeps only dates present in both tables.
# The price columns were already renamed to open/close when sm was built, so
# no further renaming is needed here.
newsm = sm.join(final_vectors, on="index", how="inner")
newsm

Unnamed: 0_level_0,open,close,vector
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-28,324.45,317.69,"[-0.22190772369503975, 0.30561483185738325, -0..."
2020-01-03,293.79,297.43,"[-0.11732648708857596, 0.9997946191579103, -0...."
2020-01-02,297.15,300.35,"[-0.08120829425752163, 0.4266323000192642, -0...."
2019-11-19,265.54,266.29,"[-0.05377985071390867, 0.2639914508908987, -0...."
2019-11-06,258.74,257.24,"[-0.2616723934188485, 1.3723224624991417, -0.8..."
...,...,...,...
2008-06-09,180.33,181.61,"[0.020789338275790215, 0.27000209502875805, -0..."
2008-03-20,134.01,133.27,"[0.029394835233688354, 0.06772486865520477, -0..."
2008-03-19,131.12,129.67,"[-0.05266345012933016, 0.14257743582129478, -0..."
2008-03-18,133.12,132.82,"[-0.011221963912248611, 0.008152482099831104, ..."


In [0]:
# Bind the joined table to its final name (an alias of newsm, not a copy) and
# export it.
# NOTE(review): CSV stores each vector as its string representation, so the
# arrays will not come back as arrays on reload — confirm this file is only
# meant for inspection.
all_data = newsm
all_data.to_csv("all_data.csv")

In [0]:
# Display the exported table as a final check.
all_data

Unnamed: 0_level_0,open,close,vector
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-28,324.45,317.69,"[-0.22190772369503975, 0.30561483185738325, -0..."
2020-01-03,293.79,297.43,"[-0.11732648708857596, 0.9997946191579103, -0...."
2020-01-02,297.15,300.35,"[-0.08120829425752163, 0.4266323000192642, -0...."
2019-11-19,265.54,266.29,"[-0.05377985071390867, 0.2639914508908987, -0...."
2019-11-06,258.74,257.24,"[-0.2616723934188485, 1.3723224624991417, -0.8..."
...,...,...,...
2008-06-09,180.33,181.61,"[0.020789338275790215, 0.27000209502875805, -0..."
2008-03-20,134.01,133.27,"[0.029394835233688354, 0.06772486865520477, -0..."
2008-03-19,131.12,129.67,"[-0.05266345012933016, 0.14257743582129478, -0..."
2008-03-18,133.12,132.82,"[-0.011221963912248611, 0.008152482099831104, ..."


# Making a Model

In [0]:
import pickle

In [0]:
# Persist the joined frame (prices plus vectors) as a pickle file.
with open('data.pkl', 'wb') as pickle_file:
  pickle.dump(all_data, pickle_file)