In [None]:
import pandas as pd
import os
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from allennlp.modules.elmo import Elmo, batch_to_ids
import spacy
import sys
import numpy as np

In [None]:
# load up spacy
nlp = spacy.load('en_core_web_lg', disable=['ner'])

In [None]:
# https://www.consumerfinance.gov/data-research/hmda/

# Directory holding the complaints export. FINANCIAL_DATA_DIR overrides the
# original location so the notebook is not tied to one machine's home directory.
base_path = os.environ.get('FINANCIAL_DATA_DIR', '/home/datawrestler/data/financial')
fname = 'financial.csv'
full_path = os.path.join(base_path, fname)

df = pd.read_csv(full_path, low_memory=False)
# shuffle the rows; the fixed seed keeps the subset that is taken further down
# (and everything computed from it) identical from run to run
df = df.sample(frac=1, random_state=42)

In [None]:
# keep only rows that have a narrative; .copy() makes this an independent frame
# so the columns added to it in later cells do not raise SettingWithCopyWarning
consumer_complaints = df.loc[df['Consumer complaint narrative'].notnull()].copy()

In [None]:
consumer_complaints.head()

In [None]:
# get a sense for the number of distinct complaint IDs filed under each product
consumer_complaints.groupby('Product')['Complaint ID'].nunique()

In [None]:
# get a sense for how long these narratives are
consumer_complaints['wrdCount'] = consumer_complaints['Consumer complaint narrative'].apply(lambda x: len(x.split()))

In [None]:
# these are very long narratives - lets split them on paragraphs and align with 
# doc id so we have a unique docid for each paragraph that can resolve back to 
# the original docid
consumer_complaints['wrdCount'].describe()

In [None]:
def make_paragraph(text):
    """Split a complaint narrative into its paragraphs.

    Paragraphs are separated by one or more newlines. Empty and
    whitespace-only pieces (produced by runs of three or more newlines, or by
    a trailing newline) are dropped so they never become documents of their own.

    Parameters
    ----------
    text : str
        The raw narrative.

    Returns
    -------
    list of str
        The non-blank paragraphs, in their original order and unmodified.
    """
    return [paragraph for paragraph in text.split('\n') if paragraph.strip()]

consumer_complaints['paragraphs'] = (consumer_complaints['Consumer complaint narrative']
                                     .apply(lambda x: make_paragraph(x)))

In [None]:
# split out so we have one row per paragraph
# expand out topics to one topic per row
tmp = (consumer_complaints.set_index('Complaint ID')['paragraphs']
       .apply(pd.Series)
       .stack()
       .reset_index()
       .drop('level_1', axis=1))

In [None]:
# number the paragraphs of every complaint 1, 2, 3, ... as a running sum of
# ones taken within each 'Complaint ID' group
tmp['docid'] = tmp.assign(index=1).groupby('Complaint ID')['index'].cumsum()

In [None]:
# drop the helper 'index' column if it is present; the counter cell builds it on
# a temporary copy (via assign), so `tmp` itself normally has no such column and
# an unconditional drop raises KeyError
tmp = tmp.drop('index', axis=1, errors='ignore')

In [None]:
# rename 
tmp = tmp.rename(columns={0: 'complaint'})

In [None]:
# build the unique paragraph key "<Complaint ID>_<paragraph number>" from the
# complaint id and the per-complaint running counter
tmp['docid'] = ['{}_{}'.format(complaint_id, paragraph_no)
                for complaint_id, paragraph_no in zip(tmp['Complaint ID'], tmp['docid'])]

In [None]:
docs = tmp['complaint'].tolist()
docids = tmp['docid'].tolist()

In [None]:
assert len(docs) == len(set(docids)), """docids not unique"""

In [None]:
# take subset
max_ids = 20000
docs = docs[0:max_ids]
docids = docids[0:max_ids]

In [None]:
def clean_complaints(text):
    """Flatten a complaint paragraph onto one line with single spaces.

    Newlines are turned into spaces rather than removed, so the words on either
    side of a line break are not fused together; runs of whitespace are then
    collapsed by gensim's ``strip_multiple_whitespaces``.

    Parameters
    ----------
    text : str
        Paragraph text.

    Returns
    -------
    str
        The text without newlines and without repeated whitespace.
    """
    text = text.replace('\n', ' ')
    return strip_multiple_whitespaces(text)

docs = [clean_complaints(complaint) for complaint in docs]

In [None]:
import torch
# specify device type
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

In [None]:
# convert to sentences for ELMO
# `paragraphs` collects one list of token strings per entry of `docs`, in order
paragraphs = []


# stream the documents through the spaCy pipeline
for ii, doc in enumerate(nlp.pipe(docs, batch_size=10000, n_threads=12)):
    # NOTE(review): `docid` is looked up here but not used anywhere in this loop
    docid = docids[ii]
    # '\r' rewrites the same output line, giving a single-line progress counter
    sys.stdout.write('\rIndex: {}'.format(ii))
    sys.stdout.flush()
    tokens = [tok.text for tok in doc]
    paragraphs.append(tokens)

In [None]:
def chunks(l, n):
    """Yield successive n-sized chunks from l, split into two parallel lists.

    Every element of ``l`` is indexed as a pair. For each window of at most
    ``n`` consecutive elements the generator yields a tuple
    ``(firsts, seconds)`` where ``firsts`` holds item 0 and ``seconds`` holds
    item 1 of every element in the window.
    """
    for start in range(0, len(l), n):
        window = l[start:start + n]
        firsts = [pair[0] for pair in window]
        seconds = [pair[1] for pair in window]
        yield firsts, seconds

In [None]:
# large weights file

large_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

large_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

In [None]:
# config file
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
# preliminary weights file
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

In [None]:
from allennlp.commands.elmo import ElmoEmbedder
elmo = ElmoEmbedder(options_file=options_file, weight_file=weight_file, cuda_device=0)


In [None]:
# pair every paragraph id with its token list: `chunks` and the later
# sentence-length / batching cells index each element as (docid, tokens).
# (Calling chunks() without arguments raises TypeError.)
sentences = list(zip(docids, paragraphs))
sents = [sent[1] for sent in sentences]

In [None]:
doclen = [len(doc) for doc in docs]
np.mean(doclen)

In [None]:
# trim docs
# NOTE(review): the entries of `docs` appear to be strings at this point (they
# come out of the text-cleaning step), so this slice keeps the first 150
# characters of each document, not the first 150 tokens - confirm intent
max_doc_len = 150
docs = [doc[0:max_doc_len] for doc in docs]

In [None]:
# embed_sentences expects tokenized sentences (lists of strings); a raw string
# would be iterated character by character and every character embedded as a
# "token". Tokenize with the spaCy tokenizer so the token positions line up
# with the spaCy docs that are built from the same text further down.
vectors = elmo.embed_sentences([[tok.text for tok in nlp.tokenizer(doc)] for doc in docs])

In [None]:
# materialise the embeddings produced by the embedder into a list, in order
paragraph_vector = list(vectors)

In [None]:
sentdf = pd.DataFrame({'sents': docs, 
                      'docid': docids,
                      'embedding': paragraph_vector})

In [None]:
# Next we need a word-to-vector lookup for our topic model. Topic modelling is
# very sensitive to which word types it sees, so it needs heavier preprocessing
# than ELMo did - but when punctuation, stopwords, etc. are dropped we still
# have to keep track of each word's position in the sentence.

def norm_text(input_sentence):
    """Run ``input_sentence`` through the module-level spaCy pipeline ``nlp``.

    Returns whatever ``nlp(input_sentence)`` returns (the parsed spaCy doc).
    """
    return nlp(input_sentence)

sentdf['spacyDoc'] = sentdf['sents'].apply(lambda x: norm_text(x))

In [None]:
sentdf['sents'].values[0]

In [None]:
sentdf['spacyDoc'].values[0]

In [None]:
def lemmatize(spacy_doc):
    """Return the ``lemma_`` attribute of every token in ``spacy_doc``, in order."""
    lemmas = []
    for token in spacy_doc:
        lemmas.append(token.lemma_)
    return lemmas

sentdf['lemmas'] = sentdf['spacyDoc'].apply(lambda x: lemmatize(x))

In [None]:
from collections import defaultdict

def lemma_to_vec(row):
    """Map each lemma of a sentence to its averaged embedding vector.

    Parameters
    ----------
    row : mapping
        Must provide ``row['lemmas']`` (a sequence of lemma strings) and
        ``row['embedding']`` (an array indexed as ``[layer][token]``).

    Returns
    -------
    defaultdict or None
        Lemma mapped to the layer-0 vector averaged over every position at
        which that lemma occurs. ``None`` when the number of lemmas differs
        from ``embedding.shape[1]``, because positions then cannot be matched.
    """
    lemmas = row['lemmas']
    vector = row['embedding']

    # positions can only be matched up when there is one embedding per lemma
    if len(lemmas) != vector.shape[1]:
        return None

    # `list` as the default factory (instead of a lambda) keeps the returned
    # mapping picklable, e.g. when the surrounding DataFrame is cached to disk
    lemma2vec = defaultdict(list)

    for idx, lemma in enumerate(lemmas):
        # only layer 0 is used; averaging all layers would be an alternative
        lemma2vec[lemma].append(vector[0][idx])

    # a lemma that appears several times gets the mean of its vectors
    for key in lemma2vec:
        lemma2vec[key] = np.mean(np.array(lemma2vec[key]), axis=0)

    return lemma2vec
        
sentdf['lemma2vec'] = sentdf.apply(lambda row: lemma_to_vec(row), axis=1)

##### Topic Modelling

Now we are ready to perform topic modelling. We do one final pass over each document to keep only the terms we want to model
(dropping punctuation and pronouns and keeping only nouns and verbs), then feed the result into gensim's `LdaMulticore` model.

In [None]:
# download smart stopwords list
import requests
from bs4 import BeautifulSoup

smart = 'http://www.lextek.com/manuals/onix/stopwords2.html'

soup = BeautifulSoup(requests.get(smart).content, 'html.parser')

pre = soup.find('pre').text

In [None]:
smartwords = [line for line in pre.split('\n') if not line.startswith('#') and line != '']
smartwords = [token.lemma_ for line in smartwords for token in nlp(line)]

In [None]:
import string
def tm_text(doc):
    """Select the lemmas of ``doc`` that are passed on to the topic model.

    A token contributes its lemma when the lemma is not '-PRON-', its text is
    not found in ``string.punctuation`` and its part of speech is NOUN or VERB.
    """
    kept = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            continue
        if token.text in string.punctuation:
            continue
        if token.pos_ not in ['NOUN', 'VERB']:
            continue
        kept.append(token.lemma_)
    return kept

sentdf['tm_tokens'] = sentdf['spacyDoc'].apply(lambda x: tm_text(x))

In [None]:
# filter low wrdcount rows
sentdf['wrdCount'] = sentdf['tm_tokens'].apply(lambda x: len(x))

In [None]:
# high word count: keep rows with more than two topic-model tokens.
# .copy() makes this an independent frame, so the columns added to sentdf_long
# in later cells do not raise pandas' SettingWithCopyWarning
sentdf_long = sentdf.loc[sentdf['wrdCount'] > 2].copy()

In [None]:
from gensim.corpora import Dictionary
import gensim.corpora as corpora

id2word = Dictionary(sentdf_long['tm_tokens'].tolist())

In [None]:
id2word.filter_extremes(no_below=2, no_above=0.5)

In [None]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in sentdf_long['tm_tokens']]

In [None]:
from gensim.models.ldamulticore import LdaMulticore

In [None]:
# random_state fixes the model's random initialisation; topic numbers are
# otherwise arbitrary from run to run, and a later cell selects one topic by
# its number
lda = LdaMulticore(num_topics=12, id2word=id2word, corpus=corpus, passes=10, minimum_probability=0.15, 
                  per_word_topics=True, minimum_phi_value=0.15, random_state=42)

In [None]:
lda.show_topics(num_topics=20)

In [None]:
text = sentdf_long['tm_tokens'].values[20]

In [None]:
doctopics, words, phi = lda.get_document_topics(id2word.doc2bow(text), per_word_topics=True, 
                                            minimum_phi_value=0.15, minimum_probability=0.15, 
                                            )

In [None]:
doctopics

In [None]:
def return_doc_topics(text):
    """Return a document's topics and, per topic, the words assigned to it.

    ``text`` is converted to a bag of words with the module-level ``id2word``
    and scored by the module-level ``lda`` model (per-word topics enabled,
    both thresholds 0.15).

    Returns
    -------
    tuple
        ``(doctopics, topicwords)``: the first value returned by
        ``get_document_topics``, and a defaultdict mapping each document topic
        number to the words (looked up in ``lda.id2word``) whose topic list
        contains that number.
    """
    bow = id2word.doc2bow(text)
    doctopics, words, phi = lda.get_document_topics(bow, per_word_topics=True,
                                                    minimum_phi_value=0.15,
                                                    minimum_probability=0.15)

    doctopicnums = [entry[0] for entry in doctopics]
    topicwords = defaultdict(list)

    for entry in words:
        term_id = entry[0]
        term_topics = entry[1]
        # keep the document-topic order when a word belongs to several topics
        for topicnum in (num for num in doctopicnums if num in term_topics):
            topicwords[topicnum].append(lda.id2word[term_id])

    return doctopics, topicwords
    
sentdf_long['topics'] = sentdf_long['tm_tokens'].apply(lambda x: return_doc_topics(x))

In [None]:
sentdf_long['topicnum'] = sentdf_long['topics'].apply(lambda x: x[0])
sentdf_long['topics2words'] = sentdf_long['topics'].apply(lambda x: x[1])

In [None]:
sentdf_long['topicnum'].values[0]

In [None]:
# we need a unique doc and sentence id
sentdf_long['sentid'] = sentdf_long.index

sentdf_long['docid'] = sentdf_long.apply(lambda x: '{}_{}'.format(x['docid'], x['sentid']), axis=1)

In [None]:
# expand out topics to one topic per row
tmp = (sentdf_long.set_index('docid')['topicnum']
       .apply(pd.Series)
       .stack()
       .reset_index()
       .drop('level_1', axis=1))

In [None]:
tmp['topicnum'] = tmp[0].apply(lambda x: x[0])
tmp['topicprob'] = tmp[0].apply(lambda x: x[1])
tmp = tmp.drop(0, axis=1)

In [None]:
tmp.head()

In [None]:
sentdf_long.shape

In [None]:
sentdf_long['docid'].nunique()

In [None]:
sentdf_long = sentdf_long.drop(['topics', 'topicnum'], axis=1)

sentdf_long = pd.merge(sentdf_long, tmp, on='docid')
assert sentdf_long.shape[0] == tmp.shape[0]

In [None]:
# filter topics2words to topic of row
sentdf_long['topics2words'] = sentdf_long.apply(lambda x: x['topics2words'][x['topicnum']], axis=1)

In [None]:
sentdf_long['topics2words'].values[100]

In [None]:
sentdf_long.head()

In [None]:
sentdf_long.loc[sentdf_long['lemma2vec'].isnull()].shape

In [None]:
# now get average embedding for topic words

def average_topic_embedding(row):
    """Average the vectors of a row's topic words.

    Every word in ``row['topics2words']`` is looked up in ``row['lemma2vec']``
    and the element-wise mean of those vectors (``np.mean`` over axis 0) is
    returned.
    """
    lookup = row['lemma2vec']
    stacked = np.array([lookup[word] for word in row['topics2words']])
    return np.mean(stacked, axis=0)

sentdf_long = sentdf_long.loc[sentdf_long['lemma2vec'].notnull()]
sentdf_long['topicvector'] = sentdf_long.apply(lambda row: average_topic_embedding(row), axis=1)

In [None]:
lda.show_topics(num_topics=12)

In [None]:
sentdf_long.groupby('topicnum')['docid'].nunique()

In [None]:
# isolate a single topic for clustering - the filter selects topic number 11
# (the `topic4` name is kept because later cells refer to it). .copy() lets
# those cells add cluster columns without a SettingWithCopyWarning.
topic4 = sentdf_long.loc[sentdf_long['topicnum'] == 11].copy()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
# make_blobs is exported from sklearn.datasets; the old
# sklearn.datasets.samples_generator module path was deprecated and later
# removed, so importing from it fails on current scikit-learn releases
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# one averaged topic vector per row of topic4
X = topic4['topicvector'].tolist()

# standardise every embedding dimension before the density-based clustering
X = StandardScaler().fit_transform(X)

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.1, min_samples=10).fit(X)
# one cluster label per row of X
labels = db.labels_

In [None]:
topic4['cluster'] = labels

In [None]:
topic4.groupby('cluster')['docid'].nunique()

In [None]:
topic4.columns

In [None]:
topic4.loc[topic4['cluster'] == -1, 'sents'].tolist()

In [None]:
topic4.loc[topic4['cluster'] == 1, 'sents'].tolist()

In [None]:
cluster = topic4.loc[topic4['cluster'] != -1]
X_sne = cluster['topicvector'].tolist()

In [None]:
import numpy as np
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=2, init='pca').fit_transform(X_sne)
X_embedded.shape

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.utils import check_random_state

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# print __version__ # requires version >= 1.9.0


def matplotlib_to_plotly(cmap, pl_entries):
    """Sample a colormap at evenly spaced points to build a plotly colorscale.

    Parameters
    ----------
    cmap : callable
        Called with a float between 0 and 1; the first three items of its
        result are used as red, green and blue and are scaled by 255
        (e.g. ``plt.cm.rainbow``).
    pl_entries : int
        Number of samples. The step is ``1 / (pl_entries - 1)``, so a value of
        1 raises ZeroDivisionError.

    Returns
    -------
    list of [float, str]
        ``[position, 'rgb(r, g, b)']`` entries, positions running from 0 to 1.
    """
    h = 1.0/(pl_entries-1)
    pl_colorscale = []

    for k in range(pl_entries):
        # Scale to 0-255 and truncate through uint8 as before, then convert to
        # plain Python ints: str() of a tuple uses repr() on its items, and
        # NumPy scalars may repr as 'np.uint8(255)', which would yield an
        # invalid colour string.
        C = [int(np.uint8(component)) for component in np.array(cmap(k*h)[:3])*255]
        pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])

    return pl_colorscale

cmap = matplotlib_to_plotly(plt.cm.rainbow, 4)


init_notebook_mode(connected=True)

In [None]:
cluster.head()

In [None]:
tsne_data = X_embedded.T

trace = go.Scatter(x=tsne_data[0], y=tsne_data[1], 
                   mode='markers', 
                   marker=dict(# color=colors, 
                               colorscale=cmap,
                               showscale=False,
                               line=dict(color='black', width=1)), 
                  text=cluster['sents'].tolist())

iplot([trace])

In [None]:
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
# make_blobs is exported from sklearn.datasets; the old
# sklearn.datasets.samples_generator module path was deprecated and later
# removed, so importing from it fails on current scikit-learn releases
from sklearn.datasets import make_blobs


# one averaged topic vector per row of topic4 (not standardised in this cell)
X = topic4['topicvector'].tolist()
# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
# one cluster label per row of X
labels = af.labels_

# number of clusters = number of exemplar indices found
n_clusters_ = len(cluster_centers_indices)

In [None]:
n_clusters_

In [None]:
topic4['clusternum'] = labels

topic4.groupby('clusternum')['docid'].nunique()

In [None]:

# Compute two different representation for each token.
# Each representation is a linear weighted combination for the
# 3 layers in ELMo (i.e., charcnn, the outputs of the two BiLSTM))
elmo = Elmo(options_file, weight_file, 2, dropout=0)
# move to GPU
elmo = elmo.to(device)

In [None]:
import numpy as np

# get sentence lengths
sent_len = [len(x[1]) for x in sentences]

In [None]:
np.mean(sent_len)

In [None]:
dir(character_ids)

In [None]:
MAX_SENT_LEN = 30


all_embeddings = []
all_sentences = []
all_docids = []
for ii, chunk in enumerate(chunks(sentences, 8)):
    print(ii)
    ids = chunk[0]
    sents = chunk[1]
    # truncate each sentence to at most MAX_SENT_LEN tokens to prevent gpu
    # memory issues. Slicing already copes with shorter sentences; the limit
    # must not be derived from len(sents), which is the batch size and capped
    # every sentence at 8 tokens.
    sents = [sent[:MAX_SENT_LEN] for sent in sents]
    all_sentences.append(sents)
    all_docids.append(ids)
    character_ids = batch_to_ids(sents)
    # move to GPU
    character_ids = character_ids.to(device)
    embeddings = elmo(character_ids)
    all_embeddings.append(embeddings)
    # release the input batch before the next iteration
    del character_ids

In [None]:
embeddings = elmo(character_ids)

# The first layer corresponds to the context insensitive token representation, 
# followed by the two LSTM layers. See the ELMo paper or follow up work at EMNLP 2018 
# for a description of what types of information is captured in each layer.

# embeddings['elmo_representations'] is length two list of tensors.
# Each element contains one layer of ELMo representations with shape
# (2, 3, 1024).
#   2    - the batch size
#   3    - the sequence length of the batch
#   1024 - the length of each ELMo vector

In [None]:
# embed one batch of 128 sentences in evaluation mode
chunker = chunks(sentences, 128)
chunk = next(chunker)
ids = chunk[0]
sents = chunk[1]
elmo.eval()
# the model was moved to `device` when it was created, so its input has to be
# moved there as well or the forward pass fails with a device mismatch on GPU
character_ids = batch_to_ids(sents).to(device)
embeddings = elmo(character_ids)

In [None]:
character_ids

In [None]:
# Module.train() takes an optional boolean mode flag, not an input batch;
# passing the character-id tensor is rejected (or silently stored as the
# training flag, depending on the PyTorch version)
elmo.train()

In [None]:
embeddings.keys()

In [None]:
embeddings['elmo_representations'][0].shape

In [None]:
dir(elmo)

In [None]:
from tqdm import trange
from time import sleep

from tqdm import tqdm_notebook as tqdm

def clip_grads(model, clip_weight=0.25):
    """Clip the gradient norm of ``model`` and apply one plain SGD update.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose parameters carry the gradients of the last backward pass.
    clip_weight : float, optional
        Maximum total gradient norm handed to ``clip_grad_norm_``.

    Notes
    -----
    The step size is read from the module-level name ``learning_rate``.

    NOTE(review): the training loop in this file calls ``optimizer.step()``
    right after this function, so parameters are updated twice per batch -
    confirm which of the two updates is intended.
    """
    # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_weight)
    for p in model.parameters():
        # parameters that took no part in the backward pass have no gradient
        if p.grad is None:
            continue
        # p = p - learning_rate * grad; the keyword form replaces the
        # deprecated positional add_(scalar, tensor) signature
        p.data.add_(p.grad.data, alpha=-learning_rate)
    

def train_model(num_epochs=10):
    """Run ``num_epochs`` training epochs and print the losses after each one.

    The function reads these module-level names: ``train_iter``,
    ``valid_iter``, ``train``, ``model``, ``elmo``, ``criterion``,
    ``optimizer``, ``n_tokens``, ``validation_loss`` and ``clip_grads``.

    Parameters
    ----------
    num_epochs : int, optional
        Number of passes over ``train_iter``.

    Notes
    -----
    NOTE(review): train mode and ``zero_grad`` are applied to ``elmo`` while
    the forward pass, gradient clipping and validation use ``model`` - confirm
    that both names refer to the same network (or that ``model`` wraps
    ``elmo``); otherwise gradients of ``model`` are never reset.
    """

    for epoch in range(0, num_epochs):
        epoch_loss = 0
        t = tqdm(train_iter)
        for batch in t:
            # turn on training mode
            elmo.train()
            t.set_description('Epoch: {}'.format(epoch))
            t.refresh()

            elmo.zero_grad()

            text, targets = batch.text, batch.target
            prediction = model(text)
            # pytorch currently only supports cross entropy loss for inputs of 2 or 4 dimensions.
            # we therefore flatten the predictions out across the batch axis so that it becomes
            # shape (batch_size * sequence_length, n_tokens)
            # in accordance to this, we reshape the targets to be
            # shape (batch_size * sequence_length)
            loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
            loss.backward()

            # clip gradients
            clip_grads(model)

            optimizer.step()

            # accumulate the batch loss weighted by the first two prediction dimensions
            epoch_loss += loss.item() * prediction.size(0) * prediction.size(1)

        # normalise once per epoch, after all batches have been accumulated;
        # dividing inside the batch loop shrank the running total on every batch
        epoch_loss /= len(train.examples[0].text)

        # capture validation loss for each epoch
        valid_loss = validation_loss(valid_iter, model)
        print('Epoch: {} | Training Loss: {:.4f} | Valid Loss: {:.4f}'.format(epoch, 
                                                                             epoch_loss, 
                                                                             valid_loss))

    final_val_loss = validation_loss(valid_iter, model)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, 
                                                                             epoch_loss, 
                                                                             final_val_loss))

    
train_model(num_epochs=100)