In [7]:
# imports
import re
import sklearn
from scipy.optimize import fminbound
from sklearn import preprocessing
# import scikit-learn
import numpy as np
from numpy.linalg import inv
from pathlib import Path
from bs4 import BeautifulSoup as bs
# from textblob import TextBlob as tb
import math
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import csv
import os
import yfinance as yf
import json
import sys

In [2]:
# English stop words from the NLTK stopwords corpus; html_to_bow discards lemmas found in this set.
STOP_WORDS = set(stopwords.words('english'))
# Hyperparameters read by gen_sent_word_list -- NOTE(review): original note says "rolling window in future",
# presumably meaning these should later be tuned over a rolling window; confirm.
ALPHA_PLUS  = 0     # margin above pi for a word to count as sentiment-charged; default for example: 0
ALPHA_MINUS = 0.2   # margin below pi for a word to count as sentiment-charged; default for example: 0.2
KAPPA       = 1     # minimum total count a word needs to be sentiment-charged; default for example: 1

# the complete sestm function list
def calc_returns(article):
    """Compute the price change over an article's market window and its sign.

    Args:
        article: dict with a 'mrkt_info' dict whose 'open' and 'close'
            entries can be converted with float().

    Returns:
        Tuple (returns, sgn_a). returns is close minus open, so a price rise
        is positive. sgn_a is 1 when returns is strictly positive and -1
        otherwise (a zero return counts as -1).
    """
    mrkt_info = article['mrkt_info']
    # A gain means the close is above the open; the previous open - close
    # form inverted the sign and labelled rising prices as negative.
    returns = float(mrkt_info['close']) - float(mrkt_info['open'])
    sgn_a = 1 if returns > 0 else -1
    return (returns, sgn_a)

_EN_WORDS_CACHE = None  # lazily-built set of the NLTK 'words' corpus, shared by all calls


def _english_vocabulary():
    """Return the NLTK English word list as a set, building it only on first use."""
    global _EN_WORDS_CACHE
    if _EN_WORDS_CACHE is None:
        _EN_WORDS_CACHE = set(nltk.corpus.words.words())
    return _EN_WORDS_CACHE


def html_to_bow(html):
    """Convert an HTML (or plain-text) snippet into a bag of words.

    The text is lower-cased, reduced to the letters a-z and spaces, tokenised
    and lemmatised as verbs. Lemmas that are stop words or are missing from
    the NLTK English word list are dropped.

    Args:
        html: markup or text to parse with BeautifulSoup's 'lxml' parser.

    Returns:
        dict mapping each kept lemma to the number of times it occurs.
    """
    readable_text = bs(html, 'lxml').get_text().lower()
    # Normalise every whitespace run (newlines, tabs, non-breaking spaces...)
    # to a single space first, so stripping non-letters cannot glue two
    # neighbouring words together.
    readable_text = re.sub(r'\s+', ' ', readable_text)
    readable_text = re.sub(r'[^a-z ]', '', readable_text)
    # Removing punctuation can leave doubled spaces behind; collapse them.
    readable_text = re.sub(r'\s+', ' ', readable_text)

    words = nltk.wordpunct_tokenize(readable_text)
    lemmatizer = WordNetLemmatizer()
    en_words = _english_vocabulary()

    bow_art = {}
    for w in words:
        rootword = lemmatizer.lemmatize(w, pos="v")
        if rootword not in STOP_WORDS and rootword in en_words:
            # Count the lemma, not the surface form, so inflections of the
            # same verb share a single entry.
            bow_art[rootword] = bow_art.get(rootword, 0) + 1

    return bow_art

def calc_f(d, sgn):
    """Tally word counts overall and within positively-labelled documents.

    Args:
        d: sequence of bag-of-words dicts (word -> count), one per document.
        sgn: sequence of labels aligned with d; only a label equal to 1
            marks a document as positive.

    Returns:
        Tuple (pos_j, total_j, f) of dicts keyed by word: the count inside
        positive documents, the count over all documents, and the ratio
        pos_j / total_j.
    """
    pos_j, total_j, f = {}, {}, {}
    for doc_idx, doc in enumerate(d):
        for word, count in doc.items():
            # 1 for a positive document, 0 otherwise
            weight = 1 if sgn[doc_idx] == 1 else 0
            total_j[word] = total_j.get(word, 0) + count
            pos_j[word] = pos_j.get(word, 0) + count * weight
            f[word] = pos_j[word] / total_j[word]
    return (pos_j, total_j, f)

def gen_sent_word_list(total_j,sgn,f):
    """Partition the vocabulary into sentiment-charged and neutral words.

    A word is sentiment-charged when its positive fraction f lies at least
    ALPHA_PLUS above, or at least ALPHA_MINUS below, the share of positive
    labels, its total count is at least KAPPA, and it is longer than one
    character. The positive share is printed as a side effect.

    Args:
        total_j: dict word -> total count.
        sgn: sequence of labels; values greater than 0 count as positive.
        f: dict word -> fraction of occurrences in positive documents.

    Returns:
        Tuple (sentiment_words, neutral_words) of word lists.
    """
    pi = sum(label > 0 for label in sgn) / len(sgn)
    print(pi)
    upper_cut = pi + ALPHA_PLUS
    lower_cut = pi - ALPHA_MINUS
    sentiment_words, neutral_words = [], []   # S and N
    for word, count in total_j.items():
        is_extreme = f[word] >= upper_cut or f[word] <= lower_cut
        is_charged = is_extreme and count >= KAPPA and len(word) > 1
        (sentiment_words if is_charged else neutral_words).append(word)
    return (sentiment_words, neutral_words)

# Calculates p_i
def calc_p(y):
    """Map each value of y to its ascending rank divided by len(y).

    The smallest value receives 1/len(y) and the largest 1.0; equal values
    are ranked in their original order.
    """
    n = len(y)
    ascending = sorted(range(n), key=lambda pos: y[pos])
    p = [0] * n
    for rank, pos in enumerate(ascending, start=1):
        p[pos] = float(rank / n)
    return p

# Calculates s_i
def calc_s(sentiment_words, d):
    """Count sentiment-charged words in every document.

    Args:
        sentiment_words: list of sentiment-charged words.
        d: sequence of bag-of-words dicts (word -> count).

    Returns:
        Tuple (s, d_s): d_s[i] lists document i's count for each sentiment
        word (0 when absent), in sentiment_words order, and s[i] is the sum
        of that list.
    """
    d_s = [[doc.get(word, 0) for word in sentiment_words] for doc in d]
    s = [sum(counts) for counts in d_s]
    return (s, d_s)

# Calculates h_i
def calc_h(sentiment_words, d, s, d_s):
    """Build the matrix of sentiment-word frequencies per document.

    Args:
        sentiment_words: list of sentiment-charged words (sets the width).
        d: sequence of documents (sets the number of rows).
        s: per-document total count of sentiment words.
        d_s: per-document list of counts, one per sentiment word.

    Returns:
        Array of shape (len(d), len(sentiment_words)); row i is d_s[i]
        divided by s[i], or all zeros when s[i] is 0.
    """
    n_docs = len(d)
    h = np.zeros((n_docs, len(sentiment_words)))
    for row in range(n_docs):
        total = s[row]
        # rows start out as zeros, so documents without sentiment words need no work
        if total != 0:
            h[row] = np.array([count / total for count in d_s[row]])
    return h

# Calculates O
def calc_o(p,h):
    """Estimate the two-column matrix O by regressing h on p and 1 - p.

    Args:
        p: sequence of per-document scores.
        h: array with one row per document and one column per sentiment word.

    Returns:
        Array with one row per sentiment word and two columns. Negative
        estimates are clipped to zero and each column is then scaled to
        unit l1 norm.
    """
    # W has two rows: p and its complement 1 - p
    complement = [(1 - score) for score in p]
    W = np.column_stack((p, complement)).T
    # h' W' (W W')^-1
    gram = W @ W.T
    projector = W.T @ inv(gram)
    O = h.T @ projector
    O[O < 0] = 0  # negative entries are not allowed in O
    # normalize() scales rows, so the columns are normalised via the transpose
    return sklearn.preprocessing.normalize(O.T, norm='l1').T

# lam = 3 is what i normally use
def equation_to_solve(O, p_solve, new_bow, sentiment_words, new_s, lam):
    """Negated penalised log-likelihood of the score p_solve for one article.

    The value is negated so that a minimiser finds the maximising score.

    Args:
        O: indexable of rows [o_plus, o_minus], one row per sentiment word,
            in the same order as sentiment_words.
        p_solve: candidate score being evaluated.
        new_bow: dict word -> count for the article.
        sentiment_words: list of sentiment-charged words.
        new_s: total count of sentiment words in the article; 0 is accepted.
        lam: weight of the penalty term p_solve * (1 - p_solve).

    Returns:
        -(sum_j d_j * log(p*O[j][0] + (1-p)*O[j][1]) / new_s
          + lam * p * (1 - p)), where words whose log argument is 0 are
        skipped.
    """
    log_likelihood = 0
    for row, word in enumerate(sentiment_words):
        d_j = new_bow.get(word, 0)
        in_log = p_solve*O[row][0] + (1-p_solve)*O[row][1]
        # log(0) is undefined, so words with a zero mixture weight are skipped
        if in_log != 0:
            log_likelihood += d_j * math.log(in_log)

    # An article without sentiment words used to raise ZeroDivisionError
    # here; dividing by 1 instead leaves just the penalty term.
    log_likelihood /= new_s if new_s != 0 else 1
    penalty = lam*(p_solve*(1-p_solve))
    return -(log_likelihood + penalty)  # flipped for argmin

Next, we import all of the headlines from the Kaggle dataset and pull the required stock information, compiling a JSON list of articles in the same format we normally use.

In [3]:
# Read every headline row from the Kaggle CSV into a list of article dicts
article_list = []
file_name = '/home/josh/Documents/year-4/thesis/code/kaggle-dataset-training/archive/analyst_ratings_processed.csv'
# newline='' is what the csv module expects, so that line breaks inside
# quoted headlines are parsed by the reader itself; the explicit encoding
# removes the dependence on the platform default.
with open(file_name, newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    # FORMAT: line#,headline,date,stock
    for row in csv_reader:
        # the first row is the header; rows without exactly 4 fields are unusable
        if line_count > 0 and len(row) == 4:
            new_art = {
                'headline': row[1],
                'date': row[2],
                'ticker': row[3]
            }
            article_list.append(new_art)
        line_count += 1
    print(f'Processed {line_count} lines generating {len(article_list)} usable headlines')

Processed 1400470 lines generating 1397891 usable headlines


In [4]:
# Raw timestamp strings of all headlines; min/max compare them as text
list_dates = [a['date'] for a in article_list]
print(f'Min date {min(list_dates)} and max date {max(list_dates)}')
# Count the headlines whose timestamp string sorts between the two year boundaries
for year in range(2009,2021):
    year_count = sum(
        str(year) + '-01-01' < stamp < str(year+1) + '-01-01'
        for stamp in list_dates
    )
    print(f'No. articles in {year}: {year_count}')

Min date 2009-02-14 14:02:00-05:00 and max date 2020-06-11 17:12:00-04:00
No. articles in 2009: 14321
No. articles in 2010: 81144
No. articles in 2011: 132333
No. articles in 2012: 122234
No. articles in 2013: 121252
No. articles in 2014: 129949
No. articles in 2015: 132877
No. articles in 2016: 141315
No. articles in 2017: 120298
No. articles in 2018: 146413
No. articles in 2019: 150080
No. articles in 2020: 105675


In [6]:
# Unique tickers in first-seen order
list_tickers = [a['ticker'] for a in article_list]
list_tickers = list(dict.fromkeys(list_tickers))
stock_data = {}
failed_stocks = []
print('pulling stocks...')

# One pass over the articles to find each ticker's smallest and largest
# timestamp string, instead of rescanning every article once per ticker.
ticker_date_range = {}
for a in article_list:
    first_seen, last_seen = ticker_date_range.get(a['ticker'], (a['date'], a['date']))
    ticker_date_range[a['ticker']] = (min(first_seen, a['date']), max(last_seen, a['date']))

curr_index = 0
TOTAL_TICKERS = len(list_tickers)
for t in list_tickers:
    first_seen, last_seen = ticker_date_range[t]
    # pad the download window by five days on both sides
    end_date = datetime.strptime(last_seen, '%Y-%m-%d %H:%M:%S%z') + dt.timedelta(days=5)
    start_date = datetime.strptime(first_seen, '%Y-%m-%d %H:%M:%S%z') - dt.timedelta(days=5)
    try:
        data = yf.download(tickers = t, end=str(end_date.date()), start=str(start_date.date()), progress=False, show_errors=False)
        if len(data) > 0:
            # The file name previously used `s`, a name this cell never
            # defines; the resulting NameError was swallowed by a bare
            # except and marked every ticker as failed.
            # NOTE(review): to_json() already returns a JSON string, so
            # json.dump stores it as one JSON-encoded string; readers have
            # to decode twice. Kept as-is for compatibility with existing files.
            with open('/home/josh/Documents/year-4/thesis/code/kaggle-dataset-training/processed-data/' + t + '.json', 'w') as json_file:
                json.dump(data.to_json(), json_file)
            stock_data[t] = data
        else:
            failed_stocks.append(t)
    except Exception:
        # Best effort: any download or write error marks the ticker as
        # failed, but KeyboardInterrupt can still stop the loop.
        failed_stocks.append(t)
    sys.stdout.write('\r')
    j = (curr_index + 1) / TOTAL_TICKERS
    sys.stdout.write("[%-20s] %d%% %d out of %d (%d)" % ('='*int(20*j), 100*j, curr_index, TOTAL_TICKERS, len(failed_stocks)))
    sys.stdout.flush()
    curr_index += 1
print("Failed stocks = " + str(failed_stocks))
# for a in article_list:


pulling stocks...


In [5]:
print(failed_stocks)
# Write every downloaded price table to <ticker>.json in the processed-data folder
for s, frame in stock_data.items():
    out_path = '/home/josh/Documents/year-4/thesis/code/kaggle-dataset-training/processed-data/' + s + '.json'
    with open(out_path, 'w') as json_file:
        json.dump(frame.to_json(), json_file)



NameError: name 'failed_stocks' is not defined

In [27]:
print('Generating list of articles with associated market info...')
# Look tickers up in a set: testing each article against the failed_stocks
# list scans that list once per article.
failed_ticker_set = set(failed_stocks)
article_list_updated = [a for a in article_list if a['ticker'] not in failed_ticker_set]


Generating list of articles with associated market info...


In [35]:
article_list_mrkt = []
print(len(article_list))
print(len(article_list_updated))
curr_index = 0
TOTAL_ARTS = len(article_list_updated)
print('Done, assigning stock data to articles...')

# Per-ticker cache of (set of trading dates, first date, last date).
# Recomputing max()/min() over the price index and testing string membership
# inside the loops below, for every article, made this cell impractically slow.
trading_calendar = {}
ONE_DAY = dt.timedelta(days=1)

for a in article_list_updated:
    ticker = a['ticker']
    if ticker not in trading_calendar:
        ticker_index = stock_data[ticker].index
        trading_calendar[ticker] = (
            set(ticker_index.date),
            ticker_index.min().date(),
            ticker_index.max().date(),
        )
    trading_days, first_day, last_day = trading_calendar[ticker]

    article_date = datetime.strptime(a['date'], '%Y-%m-%d %H:%M:%S%z')
    day_t = article_date.date()
    # headlines published at 16:00 or later are attributed to the next day
    if article_date.hour > 15:
        day_t = day_t + ONE_DAY
    # move day_t forward to the first date that has price data
    while day_t not in trading_days and day_t < last_day:
        day_t += ONE_DAY
    from_stock = day_t - dt.timedelta(days=2)
    to_stock = day_t + ONE_DAY
    # widen the window outwards until both ends fall on dates with price data
    while from_stock not in trading_days and from_stock >= first_day:
        from_stock -= ONE_DAY
    while to_stock not in trading_days and to_stock <= last_day:
        to_stock += ONE_DAY
    # keep only articles whose window lies strictly inside the available data
    if to_stock < last_day and from_stock > first_day:
        # NOTE(review): 'open' and 'close' hold the window's boundary dates,
        # not prices, while calc_returns calls float() on these fields --
        # confirm where the dates are converted to prices.
        new_art = {
            'headline': a['headline'],
            'mrkt_info': {
                'open': from_stock,
                'close': to_stock
            },
            'date': a['date'],
            'ticker': ticker
        }
        article_list_mrkt.append(new_art)
    # refresh the progress bar only every 1000 articles (and on the last one)
    # rather than writing to stdout for each article
    if curr_index % 1000 == 0 or curr_index == TOTAL_ARTS - 1:
        sys.stdout.write('\r')
        j = (curr_index + 1) / TOTAL_ARTS
        sys.stdout.write("[%-20s] %d%% %d out of %d" % ('='*int(20*j), 100*j, curr_index, TOTAL_ARTS))
        sys.stdout.flush()
    curr_index += 1

1397891
1053226
Done, assigning stock data to articles...
[                    ] 4% 48147 out of 1053226

KeyboardInterrupt: 