In [1]:
import matplotlib.pyplot as plt
import numpy as np
import copy
import pandas as pd
import json
from statistics import mean
from statsmodels.tsa.stattools import grangercausalitytests
import datetime as dt
from datetime import datetime, timedelta, date
import torch
from torch import nn
from pyro.nn import PyroModule

assert issubclass(PyroModule[nn.Linear], nn.Linear)
assert issubclass(PyroModule[nn.Linear], PyroModule)

In [2]:
# read in data
companies = [('AMZN', 'Amazon'), ('AAPL', 'Apple'), ('MSFT', 'Microsoft'),
             ('DIS', 'Disney'), ('GOOG', 'Google'), ('CVS', 'CVS'),
             ('GE', 'General Electric'), ('SAN', 'Santander'),
             ('GS', 'Goldman Sachs'), ('CICHY', 'China Construction Bank')]
stock_data = []
tweet_data = []
for company in companies:
    abbr = company[0]
    stock_data.append(pd.read_csv('./COMS6998-Project/financial/' + abbr + '_financial.csv'))
    curr = pd.read_csv('COMS6998-Project/sentiment/' + abbr + '_sentiment.csv')
    times = []
    for time in curr['Time']:
        date = time.split()[0]
        times.append(date)
    curr['Time'] = times
    tweet_data.append(curr)

In [3]:
def calculate_lag(stocks, tweets, lag):
    prev_stock_close = []
    curr_stock_close = []
    avg_pos = []
    avg_neg = []

    prev_time = datetime.strptime(tweets['Time'][0], '%Y-%m-%d')
    curr_time = None
    for date in stocks['Date']:
        if datetime.strptime(date, '%Y-%m-%d').date() < datetime.strptime(stocks['Date'][lag], '%Y-%m-%d').date():
            continue
        index = stocks[stocks['Date'] == date].index[0] # getting current date's index in series
        prev_stock_close.append(stocks['Close'][index-lag])
        curr_stock_close.append(stocks['Close'][index])
        pos = []
        neg = []
        for time in tweets['Time']:
            start = datetime.strptime(copy.copy(time), '%Y-%m-%d')
            curr_time = datetime.strptime(date, '%Y-%m-%d')
            next_day = prev_time + timedelta(days = 1)
            if lag == 1: # accounting for weekends, which 3 and 5-day lags skip over
                if (start.date() >= prev_time.date()) and (start.date() < curr_time.date()):
                    index = tweets[tweets['Time'] == time].index[0]
                    pos.append(json.loads(tweets['Sentiment'][index])[0])
                    neg.append(json.loads(tweets['Sentiment'][index])[1])
            else:
                if (start.date() >= prev_time.date()) and (start.date() < next_day.date()):
                    index = tweets[tweets['Time'] == time].index[0]
                    pos.append(json.loads(tweets['Sentiment'][index])[0])
                    neg.append(json.loads(tweets['Sentiment'][index])[1])
#             print(curr_time)
        if len(pos) > 0:
            avg_pos.append(mean(pos))
        else:
            avg_pos.append(0) # less popular companies may not have any tweets mentioning them on a particular day, so we add a 0-value sentiment
        if len(neg) > 0:
            avg_neg.append(mean(neg))
        else:
            avg_neg.append(0)
        prev_time = copy.copy(curr_time)
    
    return prev_stock_close, curr_stock_close, avg_pos, avg_neg

In [4]:
lags = [1, 3, 5]
lag1_data = []
lag3_data = []
lag5_data = []
for l in lags:
    for i in range(len(stock_data)):
        prev_stock_close, curr_stock_close, avg_pos, avg_neg = calculate_lag(stock_data[i], tweet_data[i], l)
        data = {'curr_close': curr_stock_close,
                'prev_close': prev_stock_close,
                'pos_sentiment': avg_pos,
                'neg_sentiment': avg_neg}
        df = pd.DataFrame(data, columns = ['curr_close','prev_close', 'pos_sentiment', 'neg_sentiment'])
        print("added data for", companies[i][1], "at lag =", l, "...")
        if l == 1:
            lag1_data.append(df)
        elif l == 3:
            lag3_data.append(df)
        else:
            lag5_data.append(df)
print("finished!")

added data for Amazon at lag = 1 ...
added data for Apple at lag = 1 ...
added data for Microsoft at lag = 1 ...
added data for Disney at lag = 1 ...
added data for Google at lag = 1 ...
added data for CVS at lag = 1 ...
added data for General Electric at lag = 1 ...
added data for Santander at lag = 1 ...
added data for Goldman Sachs at lag = 1 ...
added data for China Construction Bank at lag = 1 ...
added data for Amazon at lag = 3 ...
added data for Apple at lag = 3 ...
added data for Microsoft at lag = 3 ...
added data for Disney at lag = 3 ...
added data for Google at lag = 3 ...
added data for CVS at lag = 3 ...
added data for General Electric at lag = 3 ...
added data for Santander at lag = 3 ...
added data for Goldman Sachs at lag = 3 ...
added data for China Construction Bank at lag = 3 ...
added data for Amazon at lag = 5 ...
added data for Apple at lag = 5 ...
added data for Microsoft at lag = 5 ...
added data for Disney at lag = 5 ...
added data for Google at lag = 5 ...
a

In [22]:
# def var_model(x, y):
def var_model(x, y, num_iterations):
    
    # Regression model
    linear_reg_model = PyroModule[nn.Linear](2, 1)

    # Define loss and optimize
    loss_fn = torch.nn.MSELoss(reduction='sum')
    optim = torch.optim.Adam(linear_reg_model.parameters(), lr=0.05)
    # num_iterations = 1500

    def train():
        # run the model forward on the data
        y_pred = linear_reg_model(x).squeeze(-1)
        # calculate the mse loss
        loss = loss_fn(y_pred, y)
        # initialize gradients to zero
        optim.zero_grad()
        # backpropagate
        loss.backward()
        # take a gradient step
        optim.step()
        return loss

    for j in range(num_iterations):
        loss = train()
        if (j + 1) % 50 == 0:
            print("[iteration %04d] loss: %.4f" % (j + 1, loss.item()))

    # Inspect learned parameters
    print("Learned parameters:")
    for name, param in linear_reg_model.named_parameters():
        print(name, param.data.numpy())

In [23]:
def convert_data(df):
    y = torch.tensor(df['curr_close'].values, dtype=torch.float)
    pos_x = torch.tensor(df[['prev_close', 'pos_sentiment']].values, dtype=torch.float)
    neg_x = torch.tensor(df[['prev_close', 'neg_sentiment']].values, dtype=torch.float)
    return y, pos_x, neg_x

In [26]:
iterations = 100
for i in range(len(companies)):
    y, pos_x, neg_x = convert_data(lag1_data[i])
    print("\n", companies[i][1], "with positive sentiment: ")
    var_model(pos_x, y, iterations)
    granger_test_result = grangercausalitytests(lag1_data[i][['prev_close','pos_sentiment']], maxlag=1, verbose=True)

    print("\n", companies[i][1], "with negative sentiment: ")
    var_model(neg_x, y, iterations)
    granger_test_result = grangercausalitytests(lag1_data[i][['prev_close','neg_sentiment']], maxlag=1, verbose=True)


 Amazon with positive sentiment: 
[iteration 0050] loss: 9060047.0000
[iteration 0100] loss: 398935.1250
Learned parameters:
weight [[0.9682846 1.4361296]]
bias [1.8256166]

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.3122  , p=0.5768  , df_denom=248, df_num=1
ssr based chi2 test:   chi2=0.3160  , p=0.5740  , df=1
likelihood ratio test: chi2=0.3158  , p=0.5741  , df=1
parameter F test:         F=0.3122  , p=0.5768  , df_denom=248, df_num=1

 Amazon with negative sentiment: 
[iteration 0050] loss: 166688.6562
[iteration 0100] loss: 163808.7500
Learned parameters:
weight [[0.9945425  0.29776064]]
bias [0.20102845]

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.8893  , p=0.1705  , df_denom=248, df_num=1
ssr based chi2 test:   chi2=1.9121  , p=0.1667  , df=1
likelihood ratio test: chi2=1.9049  , p=0.1675  , df=1
parameter F test:         F=1.8893  , p=0.1705  , df_denom=248, df_num=1

 Apple with positive sentiment: 
[iteration 0

ssr based F test:         F=0.0079  , p=0.9291  , df_denom=248, df_num=1
ssr based chi2 test:   chi2=0.0080  , p=0.9286  , df=1
likelihood ratio test: chi2=0.0080  , p=0.9286  , df=1
parameter F test:         F=0.0079  , p=0.9291  , df_denom=248, df_num=1

 China Construction Bank with negative sentiment: 
[iteration 0050] loss: 3276.1072
[iteration 0100] loss: 22.9607
Learned parameters:
weight [[0.9106747  0.00511646]]
bias [1.1714329]

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0034  , p=0.9533  , df_denom=248, df_num=1
ssr based chi2 test:   chi2=0.0035  , p=0.9530  , df=1
likelihood ratio test: chi2=0.0035  , p=0.9530  , df=1
parameter F test:         F=0.0034  , p=0.9533  , df_denom=248, df_num=1


In [None]:
# fig, ax = plt.subplots(figsize=(12, 6))
# ax.plot(sample_stock['Date'], sample_stock['Close'], "o")
# ax.set(xlabel='Date',
#           ylabel='Closing Price',
#           title='HK')

In [None]:
# pyro predictive model for evaluation