In [1]:
# imports
import re
import sklearn
from scipy.optimize import fminbound
from sklearn import preprocessing
# import scikit-learn
import numpy as np
from numpy.linalg import inv
from pathlib import Path
from bs4 import BeautifulSoup as bs
# from textblob import TextBlob as tb
import math
import matplotlib.pyplot as plt
from datetime import datetime
import pytz
import datetime as dt
import nltk
from dateutil.relativedelta import relativedelta
from pandas import DataFrame
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import csv
import os
import yfinance as yf
import json
import sys

This is the big list of functions for the actual SESTM computation. We'll use this later

In [2]:
# English stop words from the NLTK corpus; text_to_bow drops these tokens
STOP_WORDS = set(stopwords.words('english'))
# Hyper-parameters -- TODO: choose these with a rolling window in future
# NOTE(review): gen_sent_word_list reads ALPHA_PLUS, ALPHA_MINUS and KAPPA,
# but none of them is assigned in this cell -- confirm where they are set.
# Shared lemmatiser instance used by text_to_bow
lemmatizer = WordNetLemmatizer()
# Vocabulary from the NLTK `words` corpus; text_to_bow keeps only lemmas found here
en_words = set(nltk.corpus.words.words())

# the complete sestm function list
def calc_returns(article):
    """Return the price change over an article's event window and its sign.

    Parameters
    ----------
    article : dict
        Must provide ``article['mrkt_info']['open']`` (price at the start of
        the window) and ``article['mrkt_info']['close']`` (price at the end
        of the window). Both are passed through ``float`` so numeric strings
        are accepted.

    Returns
    -------
    tuple
        ``(returns, sgn_a)`` where ``returns`` is ``close - open`` and
        ``sgn_a`` is ``1`` when ``returns`` is strictly positive and ``-1``
        otherwise (a zero change counts as -1).

    Raises
    ------
    KeyError, ValueError
        If the price fields are missing or cannot be converted to float.
    """
    mrkt_info = article['mrkt_info']
    # A gain is "end price minus start price". The previous ordering
    # (open - close) produced a positive value, and therefore a positive
    # label, exactly when the price had fallen.
    # NOTE(review): this is an absolute price change, not a percentage return.
    returns = float(mrkt_info['close']) - float(mrkt_info['open'])
    sgn_a = 1 if returns > 0 else -1
    return (returns, sgn_a)

def text_to_bow(text):
    """Turn raw text into a bag of words (lemma -> occurrence count).

    The text is lower-cased, newlines become spaces, every character other
    than ``a-z`` and space is removed, and runs of whitespace are collapsed.
    Tokens are then lemmatised as verbs; a lemma is counted only if it is not
    in ``STOP_WORDS`` and is present in ``en_words``.

    Returns an empty dict when no token survives the filtering.
    """
    # normalise: newlines -> spaces, strip non-letters, collapse whitespace
    cleaned = re.sub(r'\n', ' ', text.lower())
    cleaned = re.sub(r'[^a-z ]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)

    bow_art = {}
    for token in nltk.wordpunct_tokenize(cleaned):
        rootword = lemmatizer.lemmatize(token, pos="v")
        # skip stop words and anything outside the English vocabulary
        if rootword in STOP_WORDS or rootword not in en_words:
            continue
        bow_art[rootword] = bow_art.get(rootword, 0) + 1

    return bow_art

def calc_f(d, sgn):
    """Count word occurrences overall and in positively-labelled documents.

    Parameters
    ----------
    d : sequence of dict
        One bag of words (word -> count) per document.
    sgn : sequence
        ``sgn[i] == 1`` marks document ``i`` as positive; any other value
        contributes nothing to the positive counts.

    Returns
    -------
    tuple
        ``(pos_j, total_j, f)``: counts of each word in positive documents,
        counts of each word in all documents, and the ratio of the two.
    """
    pos_j, total_j, f = {}, {}, {}
    for doc_idx, doc in enumerate(d):
        for word in doc:
            # 1 when the owning document is labelled positive, else 0
            is_positive = 1 if sgn[doc_idx] == 1 else 0
            count = doc[word]
            total_j[word] = total_j.get(word, 0) + count
            pos_j[word] = pos_j.get(word, 0) + count * is_positive
            # keep the fraction in step with the running counts
            f[word] = pos_j[word] / total_j[word]
    return (pos_j, total_j, f)

def gen_sent_word_list(total_j, sgn, f, alpha_plus=None, alpha_minus=None, kappa=None):
    """Split the vocabulary into sentiment-charged and neutral words.

    A word is sentiment-charged when its positive fraction ``f[word]`` is at
    least ``pi + alpha_plus`` or at most ``pi - alpha_minus`` (``pi`` being
    the share of entries in ``sgn`` that are > 0), its total count is at
    least ``kappa``, and it is longer than one character. Every other word is
    neutral.

    Parameters
    ----------
    total_j : dict
        word -> total count over all documents.
    sgn : sequence
        Per-document labels; must be non-empty.
    f : dict
        word -> fraction of occurrences in positive documents.
    alpha_plus, alpha_minus, kappa : number, optional
        Thresholds. When left as ``None`` the module-level ``ALPHA_PLUS``,
        ``ALPHA_MINUS`` and ``KAPPA`` are used, which is the behaviour of the
        original three-argument call.

    Returns
    -------
    tuple
        ``(sentiment_words, neutral_words)``, both lists in ``total_j`` order.

    Raises
    ------
    ZeroDivisionError
        If ``sgn`` is empty.
    NameError
        If a threshold is not passed and the matching global is undefined.
    """
    # Fall back to the globals only for thresholds the caller did not supply,
    # so the function can be used (and tested) without hidden notebook state.
    if alpha_plus is None:
        alpha_plus = ALPHA_PLUS
    if alpha_minus is None:
        alpha_minus = ALPHA_MINUS
    if kappa is None:
        kappa = KAPPA

    pi = sum(sgn_i > 0 for sgn_i in sgn)/len(sgn)
    print(pi)
    upper = pi + alpha_plus
    lower = pi - alpha_minus

    sentiment_words = [] # S
    neutral_words = []   # N
    for word, count in total_j.items():
        charged = f[word] >= upper or f[word] <= lower
        if charged and count >= kappa and len(word) > 1:
            sentiment_words.append(word)
        else:
            neutral_words.append(word)
    return (sentiment_words, neutral_words)

# Calculates p_i
def calc_p(y):
    """Return the normalised ascending rank of every element of ``y``.

    The smallest value gets ``1/len(y)`` and the largest gets ``1.0``; ties
    keep their original relative order because the sort is stable. The
    result is a list aligned with ``y`` (empty when ``y`` is empty).
    """
    n = len(y)
    # indices of y ordered from smallest to largest value
    order = sorted(range(n), key=lambda idx: y[idx])
    p = [0] * n
    for rank, idx in enumerate(order, start=1):
        p[idx] = float(rank / n)
    return p

# Calculates s_i
def calc_s(sentiment_words, d):
    """Count sentiment-word occurrences per document.

    Parameters
    ----------
    sentiment_words : list
        Sentiment-charged words, in the order used for the count vectors.
    d : iterable of dict
        One bag of words (word -> count) per document.

    Returns
    -------
    tuple
        ``(s, d_s)`` where ``d_s[i]`` lists the count of each sentiment word
        in document ``i`` (0 when absent) and ``s[i]`` is the sum of that list.
    """
    totals = []
    per_word_counts = []
    for doc in d:
        counts = [doc.get(word, 0) for word in sentiment_words]
        per_word_counts.append(counts)
        totals.append(sum(counts))
    return (totals, per_word_counts)

# Calculates h_i
def calc_h(sentiment_words, d, s, d_s):
    """Build the matrix of sentiment-word frequencies per document.

    Row ``i`` holds ``d_s[i]`` divided element-wise by ``s[i]``; rows whose
    ``s[i]`` is 0 stay all-zero. The result has shape
    ``(len(d), len(sentiment_words))``.
    """
    n_docs = len(d)
    h = np.zeros((n_docs, len(sentiment_words)))
    for row in range(n_docs):
        total = s[row]
        if total == 0:
            # nothing to normalise; the row is already zero
            continue
        h[row] = np.array([count / total for count in d_s[row]])
    return h

# Calculates O
def calc_o(p,h):
    """Estimate the matrix O from the values ``p`` and the frequency matrix ``h``.

    Builds the two-column matrix ``[p, 1 - p]``, multiplies ``h`` transposed
    by that matrix times the inverse of its Gram matrix, sets negative
    entries to 0 and rescales each of the two columns of the result to unit
    l1 norm.

    ``h`` must have one row per entry of ``p``; the result has one row per
    column of ``h`` and two columns.
    """
    complement = [(1 - val) for val in p]
    design = np.column_stack((p, complement))   # one row per document, 2 columns
    gram = design.transpose() @ design          # 2 x 2
    projector = design @ inv(gram)              # one row per document, 2 columns
    O = h.transpose() @ projector
    O = np.where(O < 0, 0, O)                   # remove negative entries of O
    # normalize() works row-wise, so transpose to rescale the columns of O
    O = sklearn.preprocessing.normalize(O.transpose(), norm='l1')
    return O.transpose()

# lam = 3 is what i normally use
def equation_to_solve(p_solve, O, new_bow, sentiment_words, new_s, lam):
    """Negated penalised log-likelihood of one document, for minimisation.

    Parameters
    ----------
    p_solve : float
        Candidate value being scored.
    O : indexable
        ``O[i][0]`` and ``O[i][1]`` are the two entries for the ``i``-th word
        of ``sentiment_words``.
    new_bow : dict
        Bag of words (word -> count) of the document.
    sentiment_words : sequence
        Sentiment-charged words, aligned with the rows of ``O``.
    new_s : number
        Divisor applied to the summed log terms.
    lam : number
        Weight of the penalty term ``p_solve * (1 - p_solve)``.

    Returns
    -------
    float
        ``-(sum_j d_j * log(p*O[j][0] + (1-p)*O[j][1]) / new_s
        + lam * p * (1 - p))``. Terms whose log argument is exactly 0 are
        skipped. When ``new_s`` is 0 the division is skipped, so only the
        penalty term remains instead of raising ``ZeroDivisionError``.
    """
    log_likelihood = 0
    for i, j in enumerate(sentiment_words):
        d_j = new_bow.get(j, 0)
        in_log = p_solve*O[i][0] + (1-p_solve)*O[i][1]
        # log(0) is undefined; such terms are left out of the sum
        if in_log != 0:
            log_likelihood += d_j * math.log(in_log)

    # Guard the division: new_s == 0 previously crashed the optimiser
    if new_s != 0:
        log_likelihood /= new_s

    penalty = lam * (p_solve * (1 - p_solve))
    # flip the sign so a minimiser finds the maximum
    return -(log_likelihood + penalty)

Now we will import all of the headlines from the Kaggle dataset and pull the required stock information, so that we can compile a list of articles in the same format we normally use.

In [3]:
# Read the raw headlines CSV into a list of article dicts
article_list = []
file_name = './archive/analyst_ratings_processed.csv'
# newline='' is what the csv module expects: it lets the reader handle line
# endings itself, so a quoted headline containing a newline stays one row
with open(file_name, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    # FORMAT: line#,headline,date,stock
    for row in csv_reader:
        # line 0 is the header; rows without exactly four fields are unusable
        if line_count > 0 and len(row) == 4:
            new_art = {
                'headline': row[1],
                'date': row[2],
                'ticker': row[3]
            }
            article_list.append(new_art)
        line_count += 1
    print(f'Processed {line_count} lines generating {len(article_list)} usable headlines')

Processed 1400470 lines generating 1397891 usable headlines


This cell just lists how many articles we have per year.

In [24]:
# All article timestamps, as the strings stored in article_list
list_dates = [a['date'] for a in article_list]
print(f'Min date {min(list_dates)} and max date {max(list_dates)}')
for year in range(2009,2021):
    # the dates are strings, so these bounds are compared lexicographically
    lower_bound = f'{year}-01-01'
    upper_bound = f'{year+1}-01-01'
    year_count = sum(1 for date in list_dates if date <= upper_bound and date >= lower_bound)
    print(f'No. articles in {year}: {year_count}')

Min date 2009-04-27 14:39:00-04:00 and max date 2020-06-11 15:32:00-04:00
No. articles in 2009: 10915
No. articles in 2010: 57302
No. articles in 2011: 89757
No. articles in 2012: 82527
No. articles in 2013: 81226
No. articles in 2014: 82745
No. articles in 2015: 87227
No. articles in 2016: 100007
No. articles in 2017: 91921
No. articles in 2018: 120679
No. articles in 2019: 127941
No. articles in 2020: 95286


Run this to pull stock information about the tickers. It should already be downloaded in `processed-data`.

In [6]:
# First and last article timestamp per ticker, gathered in one pass over the
# articles. Rescanning article_list for every ticker (as before) costs
# len(list_tickers) * len(article_list) comparisons.
ticker_date_range = {}
for a in article_list:
    bounds = ticker_date_range.get(a['ticker'])
    if bounds is None:
        ticker_date_range[a['ticker']] = [a['date'], a['date']]
    else:
        if a['date'] < bounds[0]:
            bounds[0] = a['date']
        if a['date'] > bounds[1]:
            bounds[1] = a['date']

# unique tickers, in order of first appearance
list_tickers = list(ticker_date_range)
stock_data = {}
failed_stocks = []
print('pulling stocks...')
TOTAL_TICKERS = len(list_tickers)
for curr_index, t in enumerate(list_tickers):
    first_date, last_date = ticker_date_range[t]
    # pad the requested range by five days on each side of the ticker's articles
    end_date = datetime.strptime(last_date, '%Y-%m-%d %H:%M:%S%z') + dt.timedelta(days=5)
    start_date = datetime.strptime(first_date, '%Y-%m-%d %H:%M:%S%z') - dt.timedelta(days=5)
    try:
        # NOTE(review): show_errors is kept from the original call -- confirm
        # the installed yfinance version still accepts it.
        data = yf.download(tickers = t, end=str(end_date.date()), start=str(start_date.date()), progress=False, show_errors=False)
    except Exception:
        # a failed download marks the ticker as failed; KeyboardInterrupt is
        # no longer swallowed, so the long-running loop can be stopped
        data = None
    # was `len(data > 0)`, which measured a boolean frame rather than comparing a length
    if data is not None and len(data) > 0:
        stock_data[t] = data
    else:
        failed_stocks.append(t)
    sys.stdout.write('\r')
    j = (curr_index + 1) / TOTAL_TICKERS
    sys.stdout.write("[%-20s] %d%% %d out of %d (%d)" % ('='*int(20*j), 100*j, curr_index, TOTAL_TICKERS, len(failed_stocks)))
    sys.stdout.flush()
print("Failed stocks = " + str(failed_stocks))


pulling stocks...
[                    ] 1% 63 out of 6192 (21)

This generates the aforementioned files so you don't spend 2 hours downloading every time.

In [12]:
# Dump the stock data, one JSON file per ticker (probably don't do this
# though, it takes up a fair bit of space).
# to_json() already returns JSON text and json.dump encodes that text again,
# so each file holds a JSON string that has to be decoded twice when read.
for ticker, frame in stock_data.items():
    with open(f'./processed-data/{ticker}.json', 'w') as json_file:
        json.dump(frame.to_json(), json_file)

Some stocks are private, so we are unable to pull stock information about their tickers. This segment removes any articles with these tickers.

In [4]:
# Progress message only. NOTE(review): the statements that follow in this cell
# define a hard-coded failed_stocks list and filter article_list by ticker;
# no market info is attached to the articles here.
print('Generating list of articles with associated market info...')
failed_stocks = ['AAN', 'AAV', 'AAVL', 'ABAC', 'ABCW', 'ABDC', 'ABGB', 'ABTL', 'ABX', 'ABY', 'ACAS', 'ACAT', 'ACCU', 'ACE', 'ACG', 'ACHN', 'ACMP', 'ACPW', 'ACSF', 'ACT', 'ACTS', 'ACXM', 'ADAT', 'ADEP', 'ADGE', 'ADHD', 'ADK', 'ADMS', 'ADNC', 'ADRA', 'ADVS', 'AEC', 'AEGN', 'AEGR', 'AEPI', 'AETI', 'AF', 'AFA', 'AFC', 'AFFX', 'AFH', 'AFOP', 'AGC', 'AGII', 'AGN', 'AGNCB', 'AGOL', 'AGU', 'AHC', 'AHP', 'AI', 'AIB', 'AIRM', 'AIXG', 'AKAO', 'AKER', 'AKG', 'AKP', 'AKRX', 'AKS', 'ALDR', 'ALDW', 'ALJ', 'ALLB', 'ALQA', 'ALSK', 'ALTV', 'ALU', 'ALXA', 'ALXN', 'AMAG', 'AMBR', 'AMCC', 'AMCO', 'AMDA', 'AMFW', 'AMIC', 'AMID', 'AMPS', 'AMRB', 'AMRE', 'AMRI', 'AMSG', 'AMTG', 'AMZG', 'ANAC', 'ANAD', 'ANCI', 'AND', 'ANH', 'ANW', 'AOI', 'AOL', 'APAGF', 'APC', 'APF', 'API', 'APL', 'APOL', 'APP', 'APPY', 'APRI', 'APSA', 'AQQ', 'AQXP', 'ARCI', 'ARCX', 'ARDM', 'AREX', 'ARGS', 'ARIA', 'ARIS', 'ARMH', 'ARO', 'ARPI', 'ARQL', 'ARRS', 'ARRY', 'ARTX', 'ASBI', 'ASCMA', 'ASFI', 'ASMI', 'ASNA', 'ASPX', 'AST', 'AT', 'ATE', 'ATHN', 'ATK', 'ATL', 'ATLS', 'ATML', 'ATNY', 'ATRM', 'ATTU', 'ATU', 'ATV', 'ATW', 'AUMA', 'AUMAU', 'AUQ', 'AUXL', 'AV', 'AVG', 'AVH', 'AVHI', 'AVIV', 'AVL', 'AVNR', 'AVOL', 'AVP', 'AVX', 'AXE', 'AXJS', 'AXLL', 'AXN', 'AXPW', 'AXX', 'AYR', 'AZIA', 'BAA', 'BABS', 'BABY', 'BAF', 'BAGR', 'BALT', 'BAMM', 'BAS', 'BASI', 'BBCN', 'BBF', 'BBG', 'BBK', 'BBLU', 'BBNK', 'BBRC', 'BBRY', 'BBT', 'BBX', 'BCA', 'BCOM', 'BCR', 'BDBD', 'BDCV', 'BDE', 'BDGE', 'BEAT', 'BEE', 'BEL', 'BF', 'BFR', 'BFY', 'BGCA', 'BGG', 'BHBK', 'BHI', 'BHL', 'BID', 'BIK', 'BIN', 'BIND', 'BIOA', 'BIOD', 'BIOS', 'BIRT', 'BITA', 'BKJ', 'BKK', 'BKMU', 'BKS', 'BKYF', 'BLOX', 'BLT', 'BLVD', 'BLVDU', 'BMR', 'BMTC', 'BNCL', 'BNCN', 'BOBE', 'BOCH', 'BOFI', 'BONA', 'BONE', 'BONT', 'BORN', 'BOTA', 'BOXC', 'BPFH', 'BPFHW', 'BPI', 'BPL', 'BPOPN', 'BQH', 'BRAF', 'BRAQ', 'BRAZ', 'BRCD', 'BRCM', 'BRDR', 'BREW', 'BRK', 'BRKS', 'BRLI', 'BRSS', 'BRXX', 'BSCG', 'BSD', 'BSDM', 'BSE', 'BSFT', 'BSI', 'BSTC', 'BT', 'BTE', 'BTUI', 'BUNL', 'BUNT', 
'BVA', 'BVSN', 'BVX', 'BWC', 'BWINA', 'BWINB', 'BWLD', 'BWS', 'BXE', 'BXS', 'BZC', 'BZM', 'CAB', 'CACGU', 'CACQ', 'CADC', 'CADT', 'CAFE', 'CAK', 'CAM', 'CAP', 'CAPN', 'CARB', 'CARO', 'CART', 'CAS', 'CASM', 'CATM', 'CAW', 'CBAK', 'CBB', 'CBDE', 'CBF', 'CBG', 'CBIN', 'CBK', 'CBLI', 'CBM', 'CBMG', 'CBMX', 'CBNJ', 'CBPO', 'CBPX', 'CBR', 'CBRX', 'CBS', 'CBSHP', 'CBST', 'CCC', 'CCCL', 'CCCR', 'CCE', 'CCG', 'CCSC', 'CCV', 'CCX', 'CCXE', 'CDC', 'CDI', 'CECO', 'CEL', 'CELGZ', 'CEMP', 'CERE', 'CERU', 'CETV', 'CFD', 'CFN', 'CFNL', 'CFP', 'CFRXW', 'CFRXZ', 'CGG', 'CGI', 'CGIX', 'CH', 'CHA', 'CHEV', 'CHFC', 'CHK', 'CHKE', 'CHL', 'CHLN', 'CHMT', 'CHOC', 'CHOP', 'CHSP', 'CHU', 'CHXF', 'CHYR', 'CIE', 'CIFC', 'CIMT', 'CISG', 'CIU', 'CJES', 'CKEC', 'CKH', 'CKP', 'CKSW', 'CLAC', 'CLC', 'CLCT', 'CLD', 'CLDN', 'CLGX', 'CLI', 'CLMS', 'CLNT', 'CLNY', 'CLRX', 'CLTX', 'CLUB', 'CLY', 'CMCSK', 'CMD', 'CMFN', 'CMGE', 'CMLP', 'CMN', 'CMSB', 'CNBKA', 'CNCO', 'CNDA', 'CNDO', 'CNIT', 'CNNX', 'CNTF', 'CNV', 'CNW', 'CNYD', 'COB', 'COBK', 'COCO', 'CODE', 'COH', 'COOL', 'COR', 'CORE', 'COSI', 'COT', 'COV', 'COVR', 'COVS', 'CPAH', 'CPGI', 'CPHD', 'CPHR', 'CPL', 'CPN', 'CPST', 'CPTA', 'CRAY', 'CRBQ', 'CRC', 'CRCM', 'CRD', 'CRDC', 'CRDS', 'CRDT', 'CRED', 'CREE', 'CRME', 'CRR', 'CRRC', 'CRRS', 'CRV', 'CRWN', 'CRZO', 'CSC', 'CSFL', 'CSG', 'CSH', 'CSJ', 'CSOD', 'CSRE', 'CSS', 'CST', 'CSUN', 'CTCT', 'CTF', 'CTL', 'CTNN', 'CTRL', 'CTRX', 'CTV', 'CTWS', 'CU', 'CUB', 'CUI', 'CUNB', 'CUO', 'CUR', 'CVA', 'CVC', 'CVD', 'CVOL', 'CVSL', 'CVTI', 'CWEI', 'CXA', 'CXO', 'CXP', 'CY', 'CYBX', 'CYN', 'CYNI', 'CYOU', 'CYT', 'CYTX', 'CZFC', 'CZZ', 'DAEG', 'DAKP', 'DANG', 'DARA', 'DATA', 'DATE', 'DBBR', 'DBMX', 'DBU', 'DBUK', 'DCA', 'DCIX', 'DCM', 'DCT', 'DDC', 'DDR', 'DEG', 'DEJ', 'DEPO', 'DEST', 'DF', 'DFRG', 'DFT', 'DGAS', 'DGI', 'DGSE', 'DHRM', 'DIVI', 'DKT', 'DLBL', 'DLPH', 'DM', 'DMD', 'DMND', 'DNB', 'DNBF', 'DNKN', 'DNO', 'DNR', 'DO', 'DOM', 'DOVR', 'DPLO', 'DPM', 'DPRX', 'DPW', 'DRAD', 'DRAM', 'DRC', 'DRII', 'DRL', 
'DRNA', 'DRWI', 'DRYS', 'DSCI', 'DSCO', 'DSE', 'DSKX', 'DSKY', 'DSUM', 'DTLK', 'DTSI', 'DTV', 'DUC', 'DV', 'DVCR', 'DVD', 'DW', 'DWA', 'DWRE', 'DWTI', 'DXB', 'DXJF', 'DXJR', 'DXKW', 'DXM', 'DXPS', 'DYAX', 'DYN', 'EAC', 'EBIO', 'EBSB', 'ECA', 'ECR', 'ECT', 'ECTE', 'EDE', 'EDR', 'EDS', 'EE', 'EEHB', 'EEI', 'EEME', 'EEML', 'EFF', 'EFII', 'EFUT', 'EGAS', 'EGI', 'EGLT', 'EGOV', 'EGRW', 'EGT', 'EHIC', 'EIGI', 'EIV', 'EJ', 'ELGX', 'ELLI', 'ELNK', 'ELOS', 'ELRC', 'ELX', 'EMBB', 'EMCD', 'EMCI', 'EMCR', 'EMDI', 'EMES', 'EMEY', 'EMQ', 'EMSA', 'EMXX', 'ENBL', 'ENFC', 'ENGN', 'ENH', 'ENI', 'ENL', 'ENOC', 'ENRJ', 'ENT', 'ENVI', 'ENY', 'ENZY', 'EOC', 'EOPN', 'EOX', 'EPAX', 'EPE', 'EPIQ', 'EPRS', 'EQM', 'EQY', 'ERA', 'ERB', 'ERO', 'EROS', 'ERS', 'ESBF', 'ESCR', 'ESL', 'ESR', 'ESSX', 'ESV', 'ESYS', 'ETAK', 'ETE', 'ETF', 'ETFC', 'ETH', 'ETM', 'ETRM', 'EU', 'EV', 'EVAL', 'EVAR', 'EVBS', 'EVDY', 'EVEP', 'EVJ', 'EVLV', 'EVRY', 'EWCS', 'EWHS', 'EWSS', 'EXA', 'EXAM', 'EXAR', 'EXFO', 'EXL', 'EXLP', 'EXXI', 'FAC', 'FAV', 'FBNK', 'FBSS', 'FCAU', 'FCE', 'FCF', 'FCH', 'FCHI', 'FCLF', 'FCS', 'FCSC', 'FDEF', 'FDI', 'FDML', 'FDO', 'FEIC', 'FELP', 'FES', 'FEYE', 'FFG', 'FGP', 'FHCO', 'FHY', 'FI', 'FIG', 'FISH', 'FLIR', 'FLML', 'FLTX', 'FLXN', 'FLY', 'FMD', 'FMER', 'FNBC', 'FNFG', 'FNFV', 'FNJN', 'FNSR', 'FOMX', 'FPO', 'FPRX', 'FRAN', 'FRED', 'FREE', 'FRM', 'FRP', 'FRS', 'FRSH', 'FSAM', 'FSBK', 'FSC', 'FSGI', 'FSIC', 'FSL', 'FSNN', 'FSRV', 'FSYS', 'FTD', 'FTR', 'FTT', 'FUEL', 'FULL', 'FUR', 'FWM', 'FWV', 'FXCB', 'FXCM', 'GAI', 'GAINO', 'GALTU', 'GARS', 'GBSN', 'GCA', 'GCAP', 'GCH', 'GCVRZ', 'GDAY', 'GDEF', 'GDF', 'GDP', 'GEVA', 'GFA', 'GFIG', 'GFNCP', 'GFY', 'GG', 'GGAC', 'GGE', 'GGM', 'GGOV', 'GGP', 'GHDX', 'GHI', 'GIG', 'GIMO', 'GK', 'GKNT', 'GLDC', 'GLDX', 'GLOG', 'GLPW', 'GLUU', 'GMCR', 'GMFS', 'GMK', 'GMLP', 'GMO', 'GMT', 'GMZ', 'GNC', 'GNI', 'GNMK', 'GNVC', 'GOMO', 'GOODO', 'GOODP', 'GOV', 'GPIC', 'GPM', 'GPOR', 'GPX', 'GRAM', 'GRH', 'GRIF', 'GRN', 'GRO', 'GRT', 'GSB', 'GSH', 'GSI', 'GSOL', 
'GST', 'GSVC', 'GTAA', 'GTI', 'GTIV', 'GTT', 'GTU', 'GTWN', 'GTXI', 'GUID', 'GUR', 'GURX', 'GWL', 'GWPH', 'GWR', 'GY', 'GYEN', 'GZT', 'HABT', 'HAR', 'HAWKB', 'HBHC', 'HBK', 'HBNK', 'HBOS', 'HCAC', 'HCAP', 'HCBK', 'HCHC', 'HCLP', 'HCN', 'HCOM', 'HCP', 'HDRA', 'HDRAU', 'HDS', 'HDY', 'HEB', 'HELI', 'HEOP', 'HF', 'HFBC', 'HFFC', 'HGG', 'HGI', 'HGR', 'HGT', 'HH', 'HIIQ', 'HILL', 'HILO', 'HK', 'HKOR', 'HKTV', 'HLS', 'HLSS', 'HME', 'HMPR', 'HMSY', 'HNH', 'HNR', 'HNSN', 'HNT', 'HOS', 'HOTRW', 'HPJ', 'HPT', 'HPTX', 'HPY', 'HRC', 'HRS', 'HRT', 'HSEA', 'HSGX', 'HSNI', 'HSOL', 'HSP', 'HTCH', 'HTF', 'HTR', 'HTS', 'HTWO', 'HTWR', 'HTZ', 'HUB', 'HVB', 'HW', 'HWAY', 'HWCC', 'HYGS', 'HYH', 'IACI', 'IBCA', 'IBCC', 'IBKC', 'IBLN', 'ICA', 'ICB', 'ICEL', 'ICI', 'ICN', 'ICON', 'IDHB', 'IDI', 'IDSY', 'IDTI', 'IDXJ', 'IEC', 'IFMI', 'IFON', 'IFT', 'IG', 'IGLD', 'IGTE', 'IGU', 'IID', 'IILG', 'IJNK', 'IKAN', 'IKGH', 'IKNX', 'IL', 'IM', 'IMDZ', 'IMI', 'IMMU', 'IMN', 'IMNP', 'IMPR', 'IMRS', 'IMS', 'IMUC', 'INAP', 'INB', 'INCR', 'IND', 'INF', 'INFA', 'ININ', 'INP', 'INPH', 'INS', 'INSY', 'INTL', 'INVN', 'INWK', 'INXN', 'INXX', 'INY', 'IOC', 'IOIL', 'IOT', 'IPCI', 'IPCM', 'IPD', 'IPF', 'IPHS', 'IPK', 'IPU', 'IPW', 'IQNT', 'IRC', 'IRDMB', 'IRDMZ', 'IRE', 'IRET', 'IRF', 'IRR', 'ISCA', 'ISF', 'ISH', 'ISIL', 'ISIS', 'ISLE', 'ISNS', 'ISRL', 'ISSI', 'IST', 'ITC', 'ITF', 'ITG', 'ITLT', 'ITLY', 'ITR', 'IVAN', 'IVOP', 'IXYS', 'JAH', 'JASN', 'JASO', 'JAXB', 'JCOM', 'JCP', 'JDD', 'JDSU', 'JEC', 'JFC', 'JGBD', 'JGBL', 'JGBS', 'JGBT', 'JGW', 'JIVE', 'JJA', 'JJM', 'JJN', 'JJP', 'JJT', 'JJU', 'JMEI', 'JMLP', 'JMP', 'JNS', 'JO', 'JOEZ', 'JONE', 'JOY', 'JPEP', 'JPP', 'JRN', 'JSC', 'JST', 'JTA', 'JTD', 'JTP', 'JUNR', 'JW', 'JYN', 'KATE', 'KBIO', 'KBSF', 'KBWC', 'KBWI', 'KCAP', 'KCC', 'KCG', 'KEF', 'KEG', 'KEM', 'KFH', 'KFI', 'KFX', 'KHI', 'KIN', 'KITE', 'KKD', 'KME', 'KNL', 'KNM', 'KONA', 'KONE', 'KOOL', 'KORS', 'KROO', 'KRU', 'KST', 'KSU', 'KTEC', 'KUTV', 'KWT', 'KYO', 'KYTH', 'KZ', 'LABC', 'LABL', 'LACO', 
'LAS', 'LBF', 'LBIX', 'LBMH', 'LBY', 'LDL', 'LDR', 'LDRH', 'LEI', 'LEVY', 'LEVYU', 'LG', 'LGCY', 'LGF', 'LINE', 'LION', 'LIOX', 'LIQD', 'LLDM', 'LLEM', 'LLEX', 'LLSC', 'LLTC', 'LM', 'LMCA', 'LMCB', 'LMCK', 'LMIA', 'LMLP', 'LMNX', 'LMOS', 'LMRK', 'LNBB', 'LNCO', 'LNKD', 'LOCK', 'LOCM', 'LOGM', 'LOJN', 'LONG', 'LOOK', 'LORL', 'LPHI', 'LPT', 'LPTN', 'LRAD', 'LRE', 'LSC', 'LSG', 'LTM', 'LTS', 'LTXB', 'LUX', 'LVLT', 'LVNTA', 'LWC', 'LXFT', 'LXK', 'MAGS', 'MAMS', 'MBFI', 'MBLX', 'MBLY', 'MBRG', 'MBTF', 'MBVT', 'MCC', 'MCF', 'MCGC', 'MCOX', 'MCP', 'MCRL', 'MCUR', 'MCV', 'MCZ', 'MDAS', 'MDCA', 'MDCO', 'MDD', 'MDGN', 'MDLY', 'MDP', 'MDSO', 'MDSY', 'MDVN', 'MDVXU', 'MDW', 'MEA', 'MEET', 'MEG', 'MELA', 'MELR', 'MEN', 'MENT', 'MEP', 'MERU', 'MES', 'METR', 'MFI', 'MFLX', 'MFNC', 'MFRI', 'MFRM', 'MFSF', 'MFT', 'MGCD', 'MGH', 'MGLN', 'MGN', 'MGT', 'MHE', 'MHFI', 'MHGC', 'MHR', 'MIE', 'MIFI', 'MIK', 'MIL', 'MILL', 'MINI', 'MJN', 'MKTO', 'MLHR', 'MLNK', 'MLNX', 'MLPJ', 'MLPL', 'MM', 'MMAC', 'MNE', 'MNGA', 'MNI', 'MNK', 'MNRK', 'MNTA', 'MOBI', 'MOC', 'MOCO', 'MOG', 'MOKO', 'MOLG', 'MON', 'MONY', 'MORE', 'MPEL', 'MPET', 'MPO', 'MRD', 'MRH', 'MRKT', 'MRVC', 'MSBF', 'MSF', 'MSG', 'MSLI', 'MSO', 'MSON', 'MSP', 'MSTX', 'MTK', 'MTS', 'MTSC', 'MTSL', 'MTSN', 'MTT', 'MTU', 'MUH', 'MUS', 'MVC', 'MVG', 'MVNR', 'MW', 'MWE', 'MWIV', 'MWV', 'MXIM', 'MXWL', 'MY', 'MYCC', 'MYF', 'MYL', 'MYOS', 'MZF', 'NADL', 'NAME', 'NANO', 'NAO', 'NATL', 'NAV', 'NBBC', 'NBG', 'NBL', 'NBS', 'NBTF', 'NCB', 'NCFT', 'NCI', 'NCIT', 'NCQ', 'NDRO', 'NE', 'NEOT', 'NETE', 'NEWM', 'NEWS', 'NFEC', 'NGHC', 'NGHCP', 'NGLS', 'NHF', 'NHTB', 'NJ', 'NJV', 'NKA', 'NKY', 'NLNK', 'NMBL', 'NMO', 'NMRX', 'NMY', 'NNA', 'NNC', 'NOR', 'NORD', 'NPBC', 'NPD', 'NPP', 'NPSP', 'NRCIA', 'NRF', 'NRX', 'NSAM', 'NSH', 'NSPH', 'NSR', 'NTI', 'NTK', 'NTL', 'NTLS', 'NTN', 'NTRSP', 'NTT', 'NTX', 'NU', 'NUTR', 'NVDQ', 'NVGN', 'NVSL', 'NVX', 'NWHM', 'NWY', 'NXQ', 'NXR', 'NXTDW', 'NXTM', 'NYLD', 'NYMTP', 'NYNY', 'NYV', 'OAK', 'OAKS', 'OB', 'OCIR', 
'OCLS', 'OCR', 'OCRX', 'OGXI', 'OHAI', 'OHGI', 'OHRP', 'OIBR', 'OILT', 'OKS', 'OKSB', 'OLO', 'OMAM', 'OME', 'OMED', 'OMG', 'OMN', 'ONEF', 'ONFC', 'ONNN', 'ONP', 'ONTY', 'ONVI', 'OPB', 'OPHT', 'OPWR', 'OPXA', 'ORB', 'ORBC', 'ORBK', 'OREX', 'ORIT', 'ORM', 'ORPN', 'OSGB', 'OSHC', 'OSIR', 'OSM', 'OSN', 'OTEL', 'OTIV', 'OUTR', 'OVTI', 'OWW', 'OXFD', 'OXLCO', 'OZM', 'OZRK', 'PACD', 'PAF', 'PAGG', 'PAH', 'PAL', 'PARN', 'PAY', 'PBCP', 'PBIB', 'PBM', 'PBMD', 'PBY', 'PCI', 'PCL', 'PCLN', 'PCMI', 'PCO', 'PCP', 'PCYC', 'PDII', 'PDLI', 'PE', 'PEGI', 'PEIX', 'PENX', 'PEOP', 'PER', 'PERF', 'PERM', 'PES', 'PETM', 'PETX', 'PFBI', 'PFK', 'PFNX', 'PFPT', 'PGI', 'PGM', 'PGN', 'PGNX', 'PHF', 'PHII', 'PHIIK', 'PHMD', 'PICO', 'PIH', 'PIP', 'PIR', 'PJC', 'PKD', 'PKO', 'PKY', 'PLCM', 'PLKI', 'PLMT', 'PLND', 'PLNR', 'PLPM', 'PLT', 'PLTM', 'PMBC', 'PMC', 'PMCS', 'PMFG', 'PNRA', 'PNTR', 'PNX', 'POL', 'POM', 'POPE', 'POT', 'POWR', 'POZN', 'PPHM', 'PPHMP', 'PPO', 'PPP', 'PPR', 'PPS', 'PQ', 'PRAH', 'PRAN', 'PRB', 'PRCP', 'PRE', 'PRGN', 'PRGX', 'PRLS', 'PRSC', 'PRTO', 'PRXI', 'PRXL', 'PRY', 'PSAU', 'PSBH', 'PSDV', 'PSEM', 'PSG', 'PSTB', 'PSTR', 'PSUN', 'PTBI', 'PTIE', 'PTLA', 'PTM', 'PTP', 'PTRY', 'PTX', 'PULB', 'PVA', 'PVTB', 'PWE', 'PWRD', 'PWX', 'PXMC', 'PXR', 'PXSC', 'PZI', 'Q', 'QADA', 'QCAN', 'QDEM', 'QDXU', 'QEH', 'QEM', 'QEP', 'QEPM', 'QGBR', 'QIHU', 'QKOR', 'QLGC', 'QLIK', 'QLTB', 'QLTC', 'QLTI', 'QLTY', 'QSII', 'QTM', 'QTS', 'QTWN', 'QTWW', 'QUNR', 'QVCA', 'QVCB', 'QXUS', 'RAI', 'RALY', 'RAS', 'RATE', 'RAVN', 'RAX', 'RBC', 'RBL', 'RBPAA', 'RBS', 'RBY', 'RCAP', 'RCPI', 'RCPT', 'RDC', 'RDEN', 'RDS', 'RECN', 'REE', 'REMY', 'REN', 'RENT', 'RESI', 'REXI', 'REXX', 'RFT', 'RGDO', 'RGDX', 'RGSE', 'RHP', 'RHS', 'RHT', 'RIC', 'RICE', 'RIF', 'RIGP', 'RIO', 'RIOM', 'RIT', 'RIVR', 'RJET', 'RJF', 'RKT', 'RKUS', 'RLD', 'RLH', 'RLOC', 'RLOG', 'RLYP', 'RNA', 'RNDY', 'RNET', 'RNF', 'RNN', 'ROC', 'ROIAK', 'ROIQ', 'ROIQU', 'ROIQW', 'ROKA', 'RORO', 'ROSE', 'ROVI', 'ROX', 'RP', 'RPAI', 'RPRX', 'RPRXW', 
'RPRXZ', 'RPTP', 'RPX', 'RRST', 'RSE', 'RSH', 'RSO', 'RST', 'RSTI', 'RT', 'RTEC', 'RTGN', 'RTI', 'RTIX', 'RTK', 'RTR', 'RTRX', 'RUBI', 'RUK', 'RVBD', 'RVLT', 'RVM', 'RWC', 'RWV', 'RWXL', 'RXDX', 'RXII', 'RYL', 'S', 'SAAS', 'SAEX', 'SAJA', 'SALT', 'SAPE', 'SARA', 'SBBX', 'SBGL', 'SBRAP', 'SBSA', 'SBY', 'SCAI', 'SCHB', 'SCHF', 'SCHL', 'SCHP', 'SCLN', 'SCMP', 'SCOK', 'SCPB', 'SCSC', 'SCSS', 'SCTY', 'SCU', 'SCVL', 'SCZ', 'SD', 'SDLP', 'SDR', 'SDRL', 'SDT', 'SE', 'SEIC', 'SEMG', 'SEMI', 'SERV', 'SEV', 'SFB', 'SFG', 'SFL', 'SFLA', 'SFLY', 'SFN', 'SFNC', 'SFS', 'SFXE', 'SGAR', 'SGB', 'SGBK', 'SGF', 'SGG', 'SGM', 'SGNL', 'SGNT', 'SGOC', 'SGY', 'SGYP', 'SGYPU', 'SGYPW', 'SHLD', 'SHLO', 'SHM', 'SHOR', 'SHOS', 'SIAL', 'SIBC', 'SIFI', 'SIGM', 'SIMG', 'SINA', 'SINO', 'SIRO', 'SKBI', 'SKH', 'SKIS', 'SKUL', 'SLCT', 'SLH', 'SLI', 'SLTC', 'SLW', 'SLXP', 'SMACU', 'SMI', 'SMK', 'SMRT', 'SMTP', 'SMTX', 'SN', 'SNAK', 'SNBC', 'SNC', 'SNDK', 'SNH', 'SNOW', 'SNR', 'SNSS', 'SNTA', 'SORL', 'SPA', 'SPAN', 'SPAR', 'SPEX', 'SPF', 'SPKE', 'SPLS', 'SPNC', 'SPP', 'SPPR', 'SPPRO', 'SPRT', 'SPU', 'SPW', 'SQBG', 'SQBK', 'SQI', 'SQNM', 'SRSC', 'SSE', 'SSFN', 'SSH', 'SSI', 'SSLT', 'SSN', 'SSNI', 'SSRG', 'SSRI', 'SSS', 'SSW', 'STAY', 'STCK', 'STEM', 'STI', 'STJ', 'STML', 'STMP', 'STNR', 'STO', 'STR', 'STRN', 'STRZA', 'STRZB', 'STS', 'SUBK', 'SUNE', 'SUSQ', 'SUTR', 'SVLC', 'SWHC', 'SWY', 'SXCP', 'SXE', 'SYA', 'SYKE', 'SYMC', 'SYMX', 'SYNC', 'SYRG', 'SYRX', 'SYT', 'SYUT', 'SYX', 'SZYM', 'TAHO', 'TAOM', 'TAS', 'TASR', 'TAT', 'TAX', 'TAXI', 'TBAR', 'TBIO', 'TCAP', 'TCBIP', 'TCCA', 'TCHI', 'TCK', 'TCO', 'TCP', 'TCPI', 'TCRD', 'TDA', 'TDD', 'TDI', 'TE', 'TEAR', 'TECD', 'TECU', 'TEG', 'TERP', 'TESO', 'TEU', 'TFM', 'TFSCU', 'TGC', 'TGD', 'TGE', 'THOR', 'THRX', 'THTI', 'TI', 'TICC', 'TIF', 'TIK', 'TIME', 'TINY', 'TISA', 'TIVO', 'TKAI', 'TKMR', 'TLF', 'TLI', 'TLL', 'TLLP', 'TLM', 'TLMR', 'TLO', 'TLP', 'TLR', 'TMH', 'TMK', 'TNAV', 'TNDQ', 'TNGO', 'TOF', 'TOO', 'TOT', 'TOWR', 'TPI', 'TPLM', 'TPRE', 'TPUB', 'TRAK', 
'TRCB', 'TRCH', 'TRCO', 'TRF', 'TRGT', 'TRIL', 'TRIV', 'TRK', 'TRLA', 'TRMR', 'TRND', 'TROV', 'TROVU', 'TROVW', 'TRR', 'TRTL', 'TRW', 'TRXC', 'TSL', 'TSLF', 'TSO', 'TSRA', 'TSRE', 'TSS', 'TST', 'TSU', 'TSYS', 'TTF', 'TTFS', 'TTHI', 'TTPH', 'TTS', 'TUBE', 'TUES', 'TVIZ', 'TWC', 'TWMC', 'TXTR', 'TYC', 'TYPE', 'UACL', 'UAM', 'UBC', 'UBIC', 'UBNK', 'UBSH', 'UCD', 'UCFC', 'UCP', 'UDF', 'UFS', 'UHN', 'UIL', 'ULTI', 'ULTR', 'UMX', 'UN', 'UNIS', 'UNT', 'UNXL', 'UPIP', 'UPL', 'URZ', 'USAG', 'USAT', 'USBI', 'USCR', 'USG', 'USMD', 'USMI', 'USTR', 'UTEK', 'UTIW', 'UWTI', 'VA', 'VAL', 'VAR', 'VCO', 'VDSI', 'VGGL', 'VIAB', 'VIAS', 'VICL', 'VIEW', 'VII', 'VIMC', 'VIP', 'VISI', 'VISN', 'VLCCF', 'VLTC', 'VMEM', 'VNTV', 'VOLC', 'VRD', 'VRML', 'VRNG', 'VRTB', 'VRTU', 'VSAR', 'VSCI', 'VSCP', 'VSI', 'VSLR', 'VSR', 'VTAE', 'VTG', 'VTL', 'VTSS', 'VTTI', 'VVUS', 'VYFC', 'WAC', 'WAGE', 'WAIR', 'WAVX', 'WBAI', 'WBC', 'WBIH', 'WBMD', 'WCG', 'WDR', 'WDTI', 'WEBK', 'WEET', 'WFBI', 'WFD', 'WFM', 'WFT', 'WG', 'WGA', 'WGBS', 'WGP', 'WHX', 'WHZ', 'WIBC', 'WIFI', 'WILN', 'WIN', 'WITE', 'WLB', 'WLH', 'WLRHU', 'WLT', 'WMAR', 'WMGI', 'WMLP', 'WNR', 'WNRL', 'WPCS', 'WPG', 'WPPGY', 'WPT', 'WPX', 'WR', 'WRES', 'WSH', 'WSTC', 'WTR', 'WTSL', 'WUBA', 'WWAV', 'WYN', 'XBKS', 'XCO', 'XEC', 'XGTI', 'XGTIW', 'XIV', 'XL', 'XLRN', 'XLS', 'XNY', 'XON', 'XONE', 'XOOM', 'XOVR', 'XRA', 'XRS', 'XUE', 'XXV', 'YAO', 'YDKN', 'YDLE', 'YGE', 'YHOO', 'YOD', 'YOKU', 'YRCW', 'YUMA', 'YUME', 'YZC', 'ZA', 'ZAGG', 'ZAYO', 'ZBB', 'ZFC', 'ZFGN', 'ZINC', 'ZIONW', 'ZIXI', 'ZLTQ', 'ZMH', 'ZN', 'ZPIN', 'ZQK', 'ZSPH', 'ZU', 'ZX']
# Keep only articles whose ticker is not in failed_stocks. The lookup goes
# through a set: testing membership in the failed_stocks list would scan
# that whole list once per article.
failed_stock_set = set(failed_stocks)
article_list_updated = [a for a in article_list if a['ticker'] not in failed_stock_set]


Generating list of articles with associated market info...


This segment pulls the stock data from the file rather than YAHOO FINANCE API. It's much much much quicker so if you have the file, run this

In [5]:
# Recreate the per-ticker stock data from the JSON files in processed-data
ticker_path = './processed-data/'
pathlist = Path(ticker_path).rglob('*.json')
stock_data = {}
i = 0
for path in pathlist:
    sys.stdout.write('\r')
    sys.stdout.write("pulling file no %d" % (i))
    sys.stdout.flush()
    i += 1
    with open(str(path)) as json_file:
        # the file holds a JSON string that itself contains JSON text
        # (it was written with json.dump(frame.to_json())), hence two decodes
        data = json.loads(json.load(json_file))
    datetime_dict = {}
    # only the 'Close' column is kept
    if 'Close' in data:
        # Keys are epoch milliseconds. Convert them in UTC: a bare
        # fromtimestamp() uses the machine's local timezone, which moves every
        # date one day back on machines west of UTC.
        datetime_dict['Close'] = {
            datetime.fromtimestamp(float(k)/1000.0, tz=dt.timezone.utc).date(): close
            for k, close in data['Close'].items()
        }
    # file name without the '.json' suffix is the ticker
    stock_data[os.path.basename(str(path))[:-5]] = DataFrame.from_dict(datetime_dict)

pulling file no 4144

With the stock data all pulled, we now need to assign it to the appropriate articles. It's important to note that stock market information is not available for every single day, so if a day is missing, we just take the next available day with information and call this day $t$. Doing this for each article individually is incredibly time-consuming and takes around 30 hours, so instead we create a lookup table here. It holds the adjusted stock data required, keyed by the stock ticker and the date the article was released.

In [6]:
def _seek_trading_day(day, step, trading_days, start, end):
    """Shift `day` by `step` until it is in `trading_days` or leaves [start, end]."""
    while day not in trading_days and start <= day <= end:
        day += step
    return day


# Lookup table: ticker -> calendar date -> trading day t plus the closing
# prices at the start and end of the window around t.
day_t_data = {}
TOTAL_ARTS = len(stock_data)
one_day = dt.timedelta(days=1)
for curr_index, t in enumerate(stock_data):
    trading_days = stock_data[t].index
    start = min(trading_days)
    end = max(trading_days)
    closes = stock_data[t]['Close']
    ticker_date_data = {}
    curr_date = start
    while curr_date < end:
        # day t: first date with data on or after curr_date
        day_t = _seek_trading_day(curr_date, one_day, trading_days, start, end)
        # window start: two calendar days before t, moved back to a date with data
        from_stock = _seek_trading_day(day_t - 2 * one_day, -one_day, trading_days, start, end)
        # window end: one calendar day after t, moved forward to a date with data
        to_stock = _seek_trading_day(day_t + one_day, one_day, trading_days, start, end)
        # NOTE(review): both comparisons are strict, so windows that touch the
        # first or last available date are dropped -- confirm this is intended.
        if from_stock > start and to_stock < end:
            ticker_date_data[curr_date] = {'day_t': day_t, 'from_stock': closes[from_stock], 'to_stock': closes[to_stock]}
        curr_date += one_day
    day_t_data[t] = ticker_date_data
    sys.stdout.write('\r')
    j = (curr_index + 1) / TOTAL_ARTS
    sys.stdout.write("[%-20s] %d%% %d out of %d" % ('='*int(20*j), 100*j, curr_index, TOTAL_ARTS))
    sys.stdout.flush()



With all the tools at our disposal, we now need to compile the updated list of articles. This segment puts them in a file for later use, so we don't even need to do any of the things previously ever again.

In [7]:
# Attach the window prices to each article and write the result to
# archive/analyst_ratings_mrkt.csv (columns: headline,date,ticker,open,close).
article_list_mrkt = []
print(len(article_list))
print(len(article_list_updated))
curr_index = 0
TOTAL_ARTS = len(article_list_updated)
print('Done, assigning stock data to articles...')
with open('archive/analyst_ratings_mrkt.csv', 'w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['headline','date','ticker','open','close'])
    for a in article_list_updated:
        article_date = datetime.strptime(a['date'], '%Y-%m-%d %H:%M:%S%z')
        day_t = article_date.date()
        # articles stamped 16:00 or later (in the timestamp's own UTC offset)
        # are assigned to the following day
        if article_date.hour > 15:
            day_t = day_t + dt.timedelta(days=1)
        # .get() on the ticker: a ticker with no entry in day_t_data is
        # skipped instead of raising KeyError part-way through the file
        window = day_t_data.get(a['ticker'], {}).get(day_t)
        if window is None:
            continue
        # 'open' is the price at the start of the window, 'close' at its end
        csvwriter.writerow([a['headline'], a['date'], a['ticker'], str(window['from_stock']), str(window['to_stock'])])

1397891
1048485
Done, assigning stock data to articles...



# START HERE IF `archive/analyst_ratings_mrkt.csv` EXISTS
Now that we have everything at our disposal, we can actually begin testing some things. Let's first read the file into the notebook so we can begin doing some computation

In [5]:
# Import the articles together with their market data
article_list = []
file_name = './archive/analyst_ratings_mrkt.csv'
# newline='' is what the csv module expects: it lets the reader handle line
# endings itself, so a quoted headline containing a newline stays one row
with open(file_name, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    # FORMAT: headline,date,ticker,open,close
    for row in csv_reader:
        # line 0 is the header
        if line_count > 0:
            new_art = {
                'headline': row[0],
                'date': row[1],
                'mrkt_info': {
                    'open': row[3],
                    'close': row[4]
                },
                'ticker': row[2]
            }
            article_list.append(new_art)
        line_count += 1
    print(f'Processed {line_count-1} lines generating {len(article_list)} usable headlines')

Processed 1027533 lines generating 1027533 usable headlines


## Training the model
The paper 'predicting stock returns with text data' used a rolling window method to train and validate the model. In each window they estimate the model on the training sample and choose the tuning parameters that minimise a loss function on the validation sample. The tuning parameters are $(\alpha_+, \alpha_-, \kappa, \lambda)$. In each case, a limited number of choices were used.
- For the two $\alpha$ params, they are set such that the number of words in each group (both positive and negative) is either 25, 50, or 100.
- For $\kappa$, five choices are considered, at: 86%, 88%, 90%, 92% and 94% quantiles of the count distribution each year
- For $\lambda$, three choices: 1, 5, and 10
- This totals 45 configurations

The loss function they used is the $\ell^1$-norm of the differences between estimated article sentiment scores and the corresponding standardised return ranks for all events in the validation sample.

They had data all the way from 1989, but we don't have that luxury: our articles span 2009-2020. For this reason, we will use slightly different window sizes. We will attempt to estimate and validate the model around 11 times, so each window will be three years in total: the training sample is two years and the validation sample is the following year. We then move the window along by four months, giving us enough evaluations.

**IMPORTANT NOTES**
- It is entirely possible that the news articles in a certain month will carry certain sentiment, so investigation may be necessary into random 8 months of the year.
- 2009 and 2010 are comparatively smaller datasets, so I will combine these two years to create a single 'year' `(!!to implement. I am currently ignoring 2009)`
- 2020 has a similar number of headlines to the other years, but only has headlines up to June. The training and validation samples in this year will therefore be about half the size of those in other years.


In [3]:
# Start from a clean slate: clear every notebook-level accumulator that the
# code below appends to.
sgn, y, dates, d = [], [], [], []
global_bow = {}

# Candidate values for the tuning parameters searched in each window.
kappa_configs = [86, 88, 90, 92, 94]
alpha_configs = [25, 50, 100]
lambda_configs = [1, 5, 10]

# First rolling window starts on 1 January 2010.
curr_year, curr_month = 2010, 1
opening_date = datetime(curr_year, curr_month, 1)

# The latest article timestamp marks where the rolling windows stop.
list_dates = [art['date'] for art in article_list]
end_date = datetime.strptime(max(list_dates), '%Y-%m-%d %H:%M:%S%z')
utc = pytz.UTC


def _month_start(year, month):
    """Return 'YYYY-MM-01', zero-padded so it compares correctly against the article date strings."""
    return f"{year:04d}-{month:02d}-01"


def _next_window(year, month, step_months=4):
    """Return the (year, month) lying `step_months` after the given window start."""
    months_from_zero = (month - 1) + step_months
    return year + months_from_zero // 12, months_from_zero % 12 + 1


# Rolling-window estimation: for every window, train on two years, validate on
# the following year, grid-search the tuning parameters and save the best one.
while datetime(curr_year, curr_month, 1).date() < end_date.date():
    #SELECT ARTICLES
    # Keep opening_date in step with the window so each saved configuration and
    # word list is labelled with its own start date.
    opening_date = datetime(curr_year, curr_month, 1)
    val_date_start = datetime(curr_year+2, curr_month, 1)
    val_date_end = datetime(curr_year+3, curr_month, 1)
    # Article dates are 'YYYY-MM-DD HH:MM:SS+zz:zz' strings, so the bounds must
    # be zero-padded for lexicographic comparison to match date order.
    train_start = _month_start(curr_year, curr_month)
    val_start = _month_start(curr_year+2, curr_month)
    val_end = _month_start(curr_year+3, curr_month)
    print('Selecting articles by date...')
    print("New training window: " + train_start + " to " + val_start)
    print("New validation window: " + val_start + " to " + val_end)
    training_arts   = [a for a in article_list if train_start <= a['date'] < val_start]
    validation_arts = [a for a in article_list if val_start <= a['date'] < val_end]

    # Windows reaching past the data have nothing to train or validate on;
    # skip them rather than dividing by zero or saving a meaningless config.
    if not training_arts or not validation_arts:
        print('Skipping window: no training or validation articles in range')
        curr_year, curr_month = _next_window(curr_year, curr_month)
        continue

    #PRE-PROCESS
    print('Preprocessing data...')
    train_d = []
    train_sgn = []
    train_y = []
    val_d = []
    val_y = []
    for train_a in training_arts:
        train_bow = text_to_bow(train_a['headline'])
        (returns, sgn_a) = calc_returns(train_a)
        train_d.append(train_bow)
        train_sgn.append(sgn_a)
        train_y.append(returns)
    for val_a in validation_arts:
        val_bow = text_to_bow(val_a['headline'])
        (returns, sgn_a) = calc_returns(val_a)
        val_d.append(val_bow)
        val_y.append(returns)

    # Fraction of training articles with a positive return sign.
    train_pi = sum(sgn_i > 0 for sgn_i in train_sgn)/len(train_sgn)

    # These depend only on the window's data, not on any tuning parameter, so
    # compute them once per window.
    # NOTE(review): assumes calc_f / calc_p are pure functions of their inputs.
    (pos_j, total_j, f) = calc_f(train_d, train_sgn)
    word_counts = np.array(list(total_j.values()))
    p = calc_p(train_y)
    val_p = calc_p(val_y)

    #PARAM GRID
    print('Beginning trials')
    trials = []
    for alpha in alpha_configs:
        for KAPPA in kappa_configs:
            #TRAINING (independent of lambda, so done once per alpha/kappa pair)
            print('training...')
            kappa_percentile = np.percentile(word_counts, KAPPA) # the KAPPA-th percentile of word appearance counts
            frequent_words = [w for w in total_j if total_j[w] >= kappa_percentile]

            # Widen each threshold in 0.01 steps until fewer than `alpha`
            # frequent words remain on that side of train_pi.
            ALPHA_PLUS = ALPHA_MINUS = 0
            while len([w for w in frequent_words if f[w] >= train_pi + ALPHA_PLUS]) >= alpha:
                ALPHA_PLUS += 0.01
            while len([w for w in frequent_words if f[w] <= train_pi - ALPHA_MINUS]) >= alpha:
                ALPHA_MINUS += 0.01
            sentiment_words = [w for w in frequent_words if (f[w] >= train_pi + ALPHA_PLUS or f[w] <= train_pi - ALPHA_MINUS)]
            print(len(sentiment_words))

            # NOTE(review): assumes calc_s / calc_h / calc_o do not mutate
            # their arguments, since p is shared across configurations.
            (s, d_s)    = calc_s(sentiment_words, train_d)
            h           = calc_h(sentiment_words, train_d, s, d_s)
            O           = calc_o(p,h)

            for LAM in lambda_configs:
                #VALIDATING
                print('validating...')
                errors = []
                for val_index, val_bow in enumerate(val_d):
                    # Articles with no sentiment words keep the neutral score 0.5.
                    est_p = 0.5
                    testing_s = sum(val_bow.get(w,0) for w in sentiment_words)
                    if (testing_s > 0):
                        est_p = fminbound(equation_to_solve, 0, 1, (O,val_bow, sentiment_words,testing_s,LAM))
                    errors.append(est_p - val_p[val_index])
                # l1-norm of (estimated score - validation target).
                normalised_error = np.linalg.norm(np.ravel(np.array(errors, dtype=float)), 1)
                trials.append({
                    'alpha_plus': ALPHA_PLUS,
                    'alpha_minus': ALPHA_MINUS,
                    'kappa': KAPPA,
                    'lam': LAM,
                    'o': O,
                    'sentiment_words': sentiment_words,
                    'norm_err': normalised_error
                })
                print('Trial complete')

    #RECORD BEST CONFIG
    print("Saving best config...")
    best_config = min(trials, key=lambda x:x['norm_err'])
    os.makedirs('data/word-lists', exist_ok=True)
    # file.write() only accepts a string; rows must go through csv.writer.
    with open('data/configurations.csv', 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow([str(opening_date), str(best_config['alpha_plus']), str(best_config['alpha_minus']), str(best_config['kappa']), str(best_config['lam'])])
    with open('data/word-lists/' + str(opening_date) + '.csv', 'w', newline='') as csv_file:
        word_writer = csv.writer(csv_file)
        #write header
        word_writer.writerow(['word', 'O+ value', 'O- value'])
        for ind, word in enumerate(best_config['sentiment_words']):
            word_writer.writerow([str(word), str(best_config['o'][ind][0]), str(best_config['o'][ind][1])])

    #MOVE WINDOW
    curr_year, curr_month = _next_window(curr_year, curr_month)

#2020 SPECIAL CASE


# Convert every article into its bag of words, return and return sign,
# appending to the notebook-level lists and reporting progress as it goes.
TOTAL_ARTS = len(article_list)
curr_thousand = 0  # index at which the next progress line is due
for curr_index, article in enumerate(article_list):
    headline_text = article['headline']
    if headline_text:
        bow_art = text_to_bow(headline_text)
        (returns, sgn_a) = calc_returns(article)
        dates.append(article['date'])
        d.append(bow_art)
        sgn.append(sgn_a)
        y.append(returns)
    # Rewrite the same console line once every 10000 articles.
    if curr_index >= curr_thousand:
        curr_thousand += 10000
        j = (curr_index + 1) / TOTAL_ARTS  # fraction done (not displayed)
        sys.stdout.write('\r' + "%d out of %d articles processed" % (curr_index, TOTAL_ARTS))
        sys.stdout.flush()
# Leave the counter equal to the number of articles seen, as before.
curr_index = TOTAL_ARTS

NameError: name 'article_list' is not defined