In [1]:
import matplotlib.pyplot as plt
import numpy as np
import copy
import pandas as pd
import json
from statistics import mean
from statsmodels.tsa.stattools import grangercausalitytests
import datetime as dt
from datetime import datetime, timedelta, date
import torch
from torch import nn
from pyro.nn import PyroModule

assert issubclass(PyroModule[nn.Linear], nn.Linear)
assert issubclass(PyroModule[nn.Linear], PyroModule)

In [2]:
# read in data
sample_stock = pd.read_csv('COMS6998-Project/financial/AMZN_financial.csv')
sample_tweet = pd.read_csv('COMS6998-Project/sentiments/Amazon_tweet_sentiment.csv')

In [3]:
january = sample_stock[:22][:]
january

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,2018-12-31,1510.800049,1520.76001,1487.0,1501.969971,1501.969971,6954500
1,1,2019-01-02,1465.199951,1553.359985,1460.930054,1539.130005,1539.130005,7983100
2,2,2019-01-03,1520.01001,1538.0,1497.109985,1500.280029,1500.280029,6975600
3,3,2019-01-04,1530.0,1594.0,1518.310059,1575.390015,1575.390015,9182600
4,4,2019-01-07,1602.310059,1634.560059,1589.189941,1629.51001,1629.51001,7993200
5,5,2019-01-08,1664.689941,1676.609985,1616.609985,1656.579956,1656.579956,8881400
6,6,2019-01-09,1652.97998,1667.800049,1641.400024,1659.420044,1659.420044,6348800
7,7,2019-01-10,1641.01001,1663.25,1621.619995,1656.219971,1656.219971,6507700
8,8,2019-01-11,1640.550049,1660.290039,1636.219971,1640.560059,1640.560059,4686200
9,9,2019-01-14,1615.0,1648.199951,1595.150024,1617.209961,1617.209961,6005900


In [4]:
times = []
for time in sample_tweet['Time']:
    date = time.split()[0]
    times.append(date)

In [5]:
sample_tweet['Time'] = times
sample_tweet

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Time,Text,Sentiment
0,0,0,2019-01-01,how train amazon’s alexa recognize different p...,"[25.824289998295193, -3.393862700085713, -10.6..."
1,1,1,2020-11-10,our best selling fire tv alexa. starting .,"[18.373346870949714, -3.6249137586891753, -11...."
2,2,2,2019-01-01,dantwitwit freetotweet torontostar ddale colle...,"[97.8097246096391, 105.56702201940892, -5.9932..."
3,3,3,2019-01-01,cosmic carol well done have read amazon got co...,"[41.73129325240489, 30.821970932232297, -10.08..."
4,4,4,2019-01-01,fat sushi roll amazonhelp newegg yikes. amazon...,"[50.74213731507144, 31.57358307433904, -10.536..."
...,...,...,...,...,...
1525,1525,1525,2019-01-30,"amazon , i considering cancel credit card futu...","[25.449361421899287, 3.7985175769405113, -13.9..."
1526,1526,1526,2019-01-30,"love ... new amazon review my book, jesus econ...","[69.24814373571483, 46.21258844375352, 1.73503..."
1527,1527,1527,2019-01-30,my amazon merch account got shut dow two desig...,"[32.12811082733438, -1.6580737853152958, -10.6..."
1528,1528,1528,2019-01-30,teamtforce customer service absolutely abhorre...,"[56.3848338674943, 34.02731093082885, -8.43321..."


In [14]:
def uni_lag(stocks, tweets):
    prev_stock_close = []
    curr_stock_close = []
    avg_pos = []
    avg_neg = []

    prev_time = datetime.strptime(tweets['Time'][0], '%Y-%m-%d')
    curr_time = None
    for date in stocks['Date']:
        if date == stocks['Date'][0]:
            continue
        index = stocks[stocks['Date'] == date].index[0] # getting current date's index in series
        prev_stock_close.append(stocks['Close'][index-1]) # lag = 1
        curr_stock_close.append(stocks['Close'][index])
        pos = []
        neg = []
        for time in tweets['Time']:
            start = datetime.strptime(copy.copy(time), '%Y-%m-%d')
            curr_time = datetime.strptime(date, '%Y-%m-%d')
            if (start.date() >= prev_time.date()) and (start.date() < curr_time.date()):
                index = tweets[tweets['Time'] == time].index[0]
                pos.append(json.loads(tweets['Sentiment'][index])[0])
                neg.append(json.loads(tweets['Sentiment'][index])[1])
        avg_pos.append(mean(pos))
        avg_neg.append(mean(neg))
        prev_time = copy.copy(curr_time)
    
    return prev_stock_close, curr_stock_close, avg_pos, avg_neg

In [None]:
def multi_lag(stocks, tweets, lag):
    prev_stock_close = []
    curr_stock_close = []
    avg_pos = []
    avg_neg = []

    prev_time = datetime.strptime(tweets['Time'][0], '%Y-%m-%d')
    curr_time = None
    for date in stocks['Date']:
        if datetime.strptime(date, '%Y-%m-%d').date() < datetime.strptime(stocks['Date'][lag], '%Y-%m-%d').date():
            continue
        index = stocks[stocks['Date'] == date].index[0] # getting current date's index in series
        prev_stock_close.append(stocks['Close'][index-lag])
        curr_stock_close.append(stocks['Close'][index])
        pos = []
        neg = []
        for time in tweets['Time']:
            start = datetime.strptime(copy.copy(time), '%Y-%m-%d')
            curr_time = datetime.strptime(date, '%Y-%m-%d')
            next_day = prev_time + timedelta(days = 1) 
            if (start.date() >= prev_time.date()) and (start.date() < next_day.date()):
                index = tweets[tweets['Time'] == time].index[0]
                pos.append(json.loads(tweets['Sentiment'][index])[0])
                neg.append(json.loads(tweets['Sentiment'][index])[1])
        avg_pos.append(mean(pos))
        avg_neg.append(mean(neg))
        prev_time = copy.copy(curr_time)
    
    return prev_stock_close, curr_stock_close, avg_pos, avg_neg

In [None]:
prev_stock_close, curr_stock_close, avg_pos, avg_neg = multi_lag(january, sample_tweet, 5)
data = {'curr_close': curr_stock_close,
        'prev_close': prev_stock_close,
        'pos_sentiment': avg_pos,
        'neg_sentiment': avg_neg}
df = pd.DataFrame(data, columns = ['curr_close','prev_close', 'pos_sentiment', 'neg_sentiment'])
df

In [16]:
# Gauri --> after cDPM: with topics, it would be current day close, prev day close, prev day topic (?)

prev_stock_close, curr_stock_close, avg_pos, avg_neg = uni_lag(january, sample_tweet)
data = {'curr_close': curr_stock_close,
        'prev_close': prev_stock_close,
        'pos_sentiment': avg_pos,
        'neg_sentiment': avg_neg}
df = pd.DataFrame(data, columns = ['curr_close','prev_close', 'pos_sentiment', 'neg_sentiment'])
df

Unnamed: 0,curr_close,prev_close,pos_sentiment,neg_sentiment
0,1539.130005,1501.969971,25.82429,-3.393863
1,1500.280029,1539.130005,138.404674,97.169707
2,1575.390015,1500.280029,98.321253,90.279607
3,1629.51001,1575.390015,55.811772,41.735355
4,1656.579956,1629.51001,29.60806,21.908681
5,1659.420044,1656.579956,42.887675,32.37915
6,1656.219971,1659.420044,74.385061,60.598953
7,1640.560059,1656.219971,18.517423,9.034163
8,1617.209961,1640.560059,29.640189,15.669504
9,1674.560059,1617.209961,10.022896,-0.132795


In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sample_stock['Date'], sample_stock['Close'], "o")
ax.set(xlabel='Date',
          ylabel='Closing Price',
          title='HK')

In [None]:
# reformatting data for VAR model
y = torch.tensor(df['curr_close'].values, dtype=torch.float)
pos_x = torch.tensor(df[['prev_close', 'pos_sentiment']].values, dtype=torch.float)
neg_x = torch.tensor(df[['prev_close', 'neg_sentiment']].values, dtype=torch.float)
y

In [None]:
def var_model(x, y):
    # Regression model
    linear_reg_model = PyroModule[nn.Linear](2, 1)

    # Define loss and optimize
    loss_fn = torch.nn.MSELoss(reduction='sum')
    optim = torch.optim.Adam(linear_reg_model.parameters(), lr=0.05)
    num_iterations = 1500
    
    def train():
        # run the model forward on the data
        y_pred = linear_reg_model(x).squeeze(-1)
        # calculate the mse loss
        loss = loss_fn(y_pred, y)
        # initialize gradients to zero
        optim.zero_grad()
        # backpropagate
        loss.backward()
        # take a gradient step
        optim.step()
        return loss

    for j in range(num_iterations):
        loss = train()
        if (j + 1) % 50 == 0:
            print("[iteration %04d] loss: %.4f" % (j + 1, loss.item()))

    # Inspect learned parameters
    print("Learned parameters:")
    for name, param in linear_reg_model.named_parameters():
        print(name, param.data.numpy())

In [None]:
var_model(pos_x, y)

In [None]:
var_model(neg_x, y)

In [None]:
# one equation for each company (with pos + neg sentiment)
# experiment with lag = 1, 3, 5 days
# one equation for each topic in each company

In [19]:
# granger causality
granger_test_result = grangercausalitytests(df[['prev_close','pos_sentiment']], maxlag=1, verbose=True)
granger_test_result = grangercausalitytests(df[['prev_close','neg_sentiment']], maxlag=1, verbose=True)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=7.0507  , p=0.0167  , df_denom=17, df_num=1
ssr based chi2 test:   chi2=8.2950  , p=0.0040  , df=1
likelihood ratio test: chi2=6.9390  , p=0.0084  , df=1
parameter F test:         F=7.0507  , p=0.0167  , df_denom=17, df_num=1

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.9144  , p=0.1844  , df_denom=17, df_num=1
ssr based chi2 test:   chi2=2.2523  , p=0.1334  , df=1
likelihood ratio test: chi2=2.1342  , p=0.1440  , df=1
parameter F test:         F=1.9144  , p=0.1844  , df_denom=17, df_num=1


In [None]:
# pyro predictive model for evaluation