In [1]:
from preprocess import Parser
from classifier import DiscriminativeClassifier, BinaryGenerativeClassifier
import pandas as pd
from io import StringIO
# from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
# from nltk import word_tokenize
# import nltk
# import re
# import operator
from topicmodel import LDA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.dates as mdates

# from wordcloud import WordCloud, STOPWORDS
np.random.seed(1)

In [2]:
data = pd.read_table(StringIO(''.join(l.replace('\u2028', ' ') for l in open('minutes_data.txt'))),
                    parse_dates=['meeting'])
data.head()

Unnamed: 0,meeting,text,seq,D_NBER
0,197601,"By unanimous vote, the Federal Reserve Bank o...",0,0
1,197601,"Committee, to execute transactions in the Sys...",1,0
2,197601,The information reviewed at this meeting sugg...,2,0
3,197601,"sales rose sharply, but the increase in the f...",3,0
4,197601,The exchange value of the dollar against lead...,4,0


In [3]:
grouped_data = data.groupby('meeting', as_index=False)
full_minutes = pd.concat([grouped_data['text'].apply(' '.join), grouped_data.first()], axis=1)
full_minutes.drop(['text', 'seq'], axis=1, inplace=True)
full_minutes.rename(columns={0: 'text'}, inplace=True)
# full_minutes.head()

In [4]:
nber = pd.read_csv('nber.csv', parse_dates=['date'])
nber['meeting'] = nber['meeting'].astype(str)

nber_lags = pd.concat([nber['meeting'], nber['D_NBER'].shift(-1), 
                       nber['D_NBER'].shift(-3), nber['D_NBER'].shift(-6), 
                       nber['D_NBER'].shift(-12),
                       nber['date']], 
                      axis=1)
nber_lags.columns = ['meeting', 'D_NBER_1', 'D_NBER_3', 'D_NBER_6', 
                     'D_NBER_12',
                     'meeting_date']

full_mins_lags = pd.merge(full_minutes, nber_lags, how='inner', on='meeting')

In [5]:
full_mins_lags.head()

Unnamed: 0,text,meeting,D_NBER,D_NBER_1,D_NBER_3,D_NBER_6,D_NBER_12,meeting_date
0,"By unanimous vote, the Federal Reserve Bank o...",197601,0,0.0,0.0,0.0,0.0,1976-01-20
1,"By unanimous vote, the Federal Reserve Bank o...",197602,0,0.0,0.0,0.0,0.0,1976-02-18
2,"By unanimous vote, the Federal Reserve Bank o...",197603,0,0.0,0.0,0.0,0.0,1976-03-16
3,"By unanimous vote, the Federal Reserve Bank o...",197604,0,0.0,0.0,0.0,0.0,1976-04-20
4,"With Mr. Coldwell, dissenting, the Federal Re...",197605,0,0.0,0.0,0.0,0.0,1976-05-18


# Discriminative classifier

In [6]:
train = full_mins_lags[:200]
test = full_mins_lags[200:]

In [7]:
parser = Parser(lemmatise=False, stem=True, replace_ngrams=True)

In [8]:
full_docs, full_vocab = parser.parse_vocab(full_mins_lags['text'])
full_corpus = parser.parse_corpus(full_docs, full_vocab)

Successfully parsed the corpus. # docs:  322
Vocabulary size, # tokens:  4240


In [89]:
docs, vocab = parser.parse_vocab(train['text'])
train_corpus = parser.parse_corpus(docs, vocab)

test_docs, _ = parser.parse_vocab(test['text'])
test_corpus = parser.parse_corpus(test_docs, vocab)

Successfully parsed the corpus. # docs:  200
Vocabulary size, # tokens:  2680
Successfully parsed the corpus. # docs:  122
Vocabulary size, # tokens:  2680


In [13]:
topic_model = LDA(K=50, alpha=1, eta=0.01)

In [14]:
theta_train, beta, elbo = topic_model.fit(train_corpus, full_vocab, max_iter=1000, verbose=False)
theta_test, elbo_test = topic_model.infer(test_corpus)

Learning completed!
Total time taken: 72s
ELBO = -980042.328727


In [20]:
topic_model.alpha

array([ 2.55116177,  0.30859537,  0.89393774,  0.37074126,  0.41378881,
        0.24315921,  2.66765519,  0.26799801,  0.54033646,  0.17832162,
        1.1371542 ,  0.32263961,  0.87253761,  0.3145196 ,  0.30489355,
        0.2203652 ,  0.26936588,  0.72745534,  0.2592859 ,  0.91879424,
        0.22224837,  1.29277881,  0.46490849,  0.72415806,  3.64086494,
        0.37580007,  0.16948498,  0.1363942 ,  0.57933333,  0.66670881,
        7.23723886,  2.18771144,  0.46065209,  0.31567188,  0.69707819,
        0.29444979,  0.66495872,  0.29481655,  0.59642182,  0.35016847,
        0.53998118,  0.47425719,  0.17569856,  0.27987473,  0.48828804,
        0.19205539,  0.25791276,  0.56347428,  1.18211378,  0.39478349])

In [97]:
predictions = []
predictions_proba = []
coefs = []

In [98]:
theta_train_rolling = theta_train[:]
theta_test_rolling = theta_test[:]
train_rolling = train['D_NBER']

In [99]:
for n in range(theta_test.shape[0]):
    classifier = LogisticRegressionCV(n_jobs=-1, cv=5)
    classifier.fit(theta_train_rolling, train_rolling)
    
    new_doc = theta_test[n, :]
    new_y = test['D_NBER'][200+n]
    
    pred = classifier.predict(new_doc.reshape(1,-1))
    pred_proba = classifier.predict_proba(new_doc.reshape(1,-1))
    
    predictions.append(pred)
    predictions_proba.append(pred_proba)
    coefs.append(classifier.coef_)
    
    theta_train_rolling = np.append(theta_train_rolling, new_doc.reshape(1, -1), axis=0)
    theta_test_rolling = np.delete(theta_test_rolling, 0, 0)
    
    train_rolling = np.append(train_rolling, new_y)

In [30]:
AUC = roc_auc_score(test['D_NBER'], np.array(predictions_proba)[:,0][:,1])
accuracy = accuracy_score(test['D_NBER'], predictions)

print(AUC)

0.8717086834733894


In [102]:
theta_tr = pd.read_csv('theta_train_good.csv')
theta_te = pd.read_csv('theta_test_good.csv')

# Gensim

In [475]:
vocab = Dictionary(docs)

In [460]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

vocab = Dictionary(docs)
corpus = [vocab.doc2bow(doc) for doc in docs]
test_corpus = [vocab.doc2bow(doc) for doc in test_docs]

In [172]:
train_rolling = train['D_NBER']
predictions = []
predictions_proba = []
coefs = []

In [161]:
for n in range(test.shape[0]):
    
    gamma = lda.inference(corpus)[0]
    theta_train_rolling = gamma / gamma.sum(axis=1).reshape(-1,1)
    classifier = LogisticRegressionCV(n_jobs=-1, cv=5, random_state=0)
    classifier.fit(theta_train_rolling, train_rolling)
    
    new_d = test_docs[n]
    new_doc = vocab.doc2bow(new_d)
    
    new_y = test['D_NBER'][200+n]
    
    lda.update([new_doc])
    
    gamma_test = lda.inference([new_doc])[0]
    theta_test_doc = gamma_test / gamma_test.sum()
    
    pred = classifier.predict(theta_test_doc.reshape(1,-1))
    pred_proba = classifier.predict_proba(theta_test_doc.reshape(1,-1))
    
    predictions.append(pred)
    predictions_proba.append(pred_proba)
    coefs.append(classifier.coef_)
    
    corpus.append(new_doc)
    test_corpus = test_corpus[1:]
    
    train_rolling = np.append(train_rolling, new_y)
    

In [173]:
for n in range(test.shape[0]):
    
    lda = LdaModel(corpus, 
            num_topics=50,
            id2word=vocab, 
            alpha='auto',
            eta=0.01,
            iterations=1000,
              passes=5)
    
    gamma = lda.inference(corpus)[0]
    theta_train_rolling = gamma / gamma.sum(axis=1).reshape(-1,1)
    
    classifier = LogisticRegressionCV(n_jobs=-1, cv=5, random_state=0)
    classifier.fit(theta_train_rolling, train_rolling)
    
    new_d = test_docs[n]
    new_doc = vocab.doc2bow(new_d, allow_update=False)
    
    new_y = test['D_NBER'][200+n]
    
    gamma_test = lda.inference([new_doc])[0]
    theta_test_doc = gamma_test / gamma_test.sum()
    
    pred = classifier.predict(theta_test_doc.reshape(1,-1))
    pred_proba = classifier.predict_proba(theta_test_doc.reshape(1,-1))
    
    predictions.append(pred)
    predictions_proba.append(pred_proba)
    coefs.append(classifier.coef_)
    
    new_doc = vocab.doc2bow(new_d, allow_update=True)
    
    corpus.append(new_doc)
    test_corpus = test_corpus[1:]
    
    train_rolling = np.append(train_rolling, new_y)

In [496]:
roc_auc_score(test['D_NBER'], list(map(lambda x: x[1], predictions_proba))

0.78095238095238095

In [383]:
sum([x > 0.022 for x in temp.predict_proba(theta_test_rolling)[:,1]] == test['D_NBER']) / test.shape[0]

0.87704918032786883

In [None]:
pd.DataFrame(theta_train).to_csv('theta_train.csv')
pd.DataFrame(theta_test).to_csv('theta_test.csv')

In [25]:
pd.DataFrame(beta).to_csv('beta.csv', index=False)

In [24]:
theta_train

array([[ 0.06004202,  0.00150824,  0.03139984, ...,  0.00288369,
         0.0067228 ,  0.00238308],
       [ 0.09128644,  0.00196479,  0.04509662, ...,  0.0028223 ,
         0.00570969,  0.00845637],
       [ 0.05396986,  0.01103424,  0.02498124, ...,  0.00255555,
         0.00561184,  0.00239275],
       ..., 
       [ 0.04430849,  0.00643218,  0.059252  , ...,  0.01256581,
         0.00547095,  0.00019677],
       [ 0.04016044,  0.01423206,  0.04415131, ...,  0.02127906,
         0.00319519,  0.00134138],
       [ 0.04520864,  0.00900138,  0.04339314, ...,  0.02201024,
         0.00280463,  0.00023979]])

In [402]:
n_top_words = 10
for i, topic_dist in enumerate(beta):
    topic_words = np.argsort(topic_dist)[:-(n_top_words+1):-1]
    top_words = []
    for j in topic_words:
        top_words.append(topic_model._vocab[j])
    print('Topic {}: {}'.format(i, ' '.join(top_words)))

Topic 0: report number effect adjust ahead effort countri advers quit discuss
Topic 1: consum produc lower improv price legisl estim econom spend expans
Topic 2: price reserv stabil slightli intermeet restraint period economi spend toward
Topic 3: eas countri export capit perform sale uncertainti yen express turmoil
Topic 4: longterm tax weak littl concern temporari interestr appreci consider comput
Topic 5: sever indic demand reduc weather uncertainti effect unusu result associ
Topic 6: inflationari import invest earlier avail event export polici provid posit
Topic 7: current economi forecast refer support product anticip inflat project stanc
Topic 8: rate year quarter growth busi rang debt committe pressur expans
Topic 9: price economi condit risk inventori increas financi demand appreci tight
Topic 10: polici tighten time increas action point concern potenti broad suppli
Topic 11: household twelv increas previou slightli area food larger amount prospect
Topic 12: rel growth near inf