In [1]:
import sys
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

from gensim import corpora, models
from gensim.models import LdaModel

from docx import Document

In [26]:
# --- Path configuration -----------------------------------------------------
# All paths are relative to the notebook's working directory (printed below).
model_path = '../../model/mallet_weights_50_2019_01_15'
dictionary_path = '../../data/processed/dictionary.dict'
label_definition_path = '../../data/processed/Topic Definition_2019_01_15.npy'
text_file_path = "../../documentation/sample_docs/5138964-v5-Brazil_2013_Article_IV_Consultation_-_Policy_Note.DOCX"

# Derive the output CSV name from the input document's file name.
# splitext() strips only the final extension, so a file name that itself
# contains dots (e.g. "report.v2.docx") keeps its full stem.
_text_file_stem = os.path.splitext(os.path.basename(text_file_path))[0]
processed_file_path = os.path.join('../../data/processed/', _text_file_stem + '.csv')

# Echo the working directory and key paths so a wrong cwd is spotted early.
print(os.getcwd())
print(model_path)
print(dictionary_path)

/mnt/notebook/poc
../../model/mallet_weights_50_2019_01_15
../../data/processed/dictionary.dict


#### Import Model, Dictionary, and Label (manually created)

In [3]:
# Load the trained topic model and the token dictionary from the configured paths.
# NOTE(review): gensim's load() presumably relies on pickle under the hood --
# only load files that come from a trusted source.
lda_model = LdaModel.load(model_path)
# `old_dict` is used later to convert tokenised paragraphs into bag-of-words vectors.
old_dict = corpora.Dictionary.load(dictionary_path)



In [4]:
# Load the manually created label definitions and turn them into a plain dict.
# allow_pickle=True is required on NumPy >= 1.16.3 when the .npy file holds
# Python objects (presumably a saved dict -- TODO confirm); it is harmless for
# plain numeric/string arrays.
# SECURITY: unpickling executes arbitrary code -- only load trusted files.
label_topic_dict = dict(np.load(label_definition_path, allow_pickle=True).tolist())

#### Create Model Topic Dictionary (Topic ID ~ Word List)

In [5]:
# Ask the model for its 50 topics, 15 highest-weighted words each, as raw
# (word, weight) pairs rather than a formatted string.
model_topic_list = dict(
    lda_model.show_topics(num_topics=50, num_words=15, formatted=False)
)

# Keep only the words (dropping the weights): topic id -> list of words.
model_topic_dict = {
    topic_id: [word for word, _weight in word_weights]
    for topic_id, word_weights in model_topic_list.items()
}

#### Generate Topic-Label Mapping by applying IoU (intersection over union) to manually created labels

In [17]:
def Calculate_Intersection_Over_Union(list_a, list_b):
    """Return the intersection-over-union (Jaccard index) of two collections.

    Both inputs are converted to sets, so duplicates and ordering are ignored.

    Parameters
    ----------
    list_a, list_b : iterable of hashable items

    Returns
    -------
    float
        ``len(A & B) / len(A | B)``, a value in ``[0, 1]``.  When both inputs
        are empty the union is empty as well; ``0.0`` is returned in that case
        instead of raising ``ZeroDivisionError``.
    """
    set_a, set_b = set(list_a), set(list_b)
    union = set_a | set_b

    # Two empty inputs share nothing: define their overlap as zero.
    if not union:
        return 0.0

    return len(set_a & set_b) / len(union)

def Map_Topic_Label(model_dict, label_dict):
    """Assign to every model topic the label whose word list overlaps it most.

    Parameters
    ----------
    model_dict : dict
        Topic id -> list of words for that topic.
    label_dict : dict
        Label -> list of words describing that label.

    Returns
    -------
    dict
        Topic id -> best-matching label key.  Overlap is scored with
        ``Calculate_Intersection_Over_Union``; on ties the label that comes
        first in ``label_dict`` wins (first maximum reported by ``argmax``).
    """
    label_names = list(label_dict.keys())
    topic_to_label = {}

    for topic_id, topic_words in model_dict.items():
        scores = [
            Calculate_Intersection_Over_Union(topic_words, label_words)
            for label_words in label_dict.values()
        ]
        best_position = np.array(scores).argmax()
        topic_to_label[topic_id] = label_names[best_position]

    return topic_to_label

# Topic id -> human-readable label, used later to annotate inferred topics.
topic_label_dict = Map_Topic_Label(
    model_dict=model_topic_dict,
    label_dict=label_topic_dict,
)

#### Load Text File

In [7]:
def Read_Doc(f_path,word_length_filter=20):
    """Read a .docx file and return its substantial paragraphs as strings.

    Parameters
    ----------
    f_path : str
        Path to the Word document.
    word_length_filter : int, default 20
        Only paragraphs with strictly more than this many
        whitespace-separated words are kept.

    Returns
    -------
    list of str
        Paragraph texts, in document order, with non-breaking spaces
        replaced by regular spaces.

    Raises
    ------
    FileNotFoundError
        If ``f_path`` is not an existing file.  (A subclass of ``Exception``,
        so callers that caught the previous generic ``Exception`` still work.)
    """
    # Guard clause: fail early with a specific, catchable error type.
    if not os.path.isfile(f_path):
        raise FileNotFoundError('File does not exist: {}'.format(f_path))

    doc = Document(f_path)

    text_list = []
    for paragraph in doc.paragraphs:
        text = paragraph.text
        # Skip very short fragments (10 characters or fewer).
        if len(text) <= 10:
            continue
        # Some clean up: normalise non-breaking spaces.
        text = text.replace('\xa0', ' ')
        if len(text.split()) > word_length_filter:
            text_list.append(text)

    return text_list

In [8]:
# Read the sample document into a list of paragraph strings.
new_text = Read_Doc(text_file_path)

# Preview the third retained paragraph as a sanity check.
# NOTE(review): raises IndexError if fewer than three paragraphs pass the filters.
new_text[2]

In [10]:
# spaCy pipeline used for tokenisation in Infer_One_Paragraph.
# NOTE(review): the 'en' shortcut presumably only resolves on older spaCy
# releases; newer ones need the full model name -- confirm the pinned version.
nlp = spacy.load('en') 

def Infer_One_Paragraph(paragraph, ldaModel):
    '''Infer the most probable topic for one raw paragraph.

    The paragraph is tokenised with the module-level spaCy pipeline `nlp`,
    turned into a bag-of-words with the module-level dictionary `old_dict`,
    and scored by `ldaModel`.

    Returns a tuple (parsed, top_id): `parsed` is the object returned by
    `nlp(paragraph)` (not a plain string) and `top_id` is the topic id with
    the highest probability among the pairs returned by `ldaModel[bow]`.
    '''
    # Tokenise with spaCy and map the tokens onto the loaded dictionary.
    parsed = nlp(paragraph)
    tokens = [token.text for token in parsed]
    bow = old_dict.doc2bow(tokens)

    # Score the paragraph: the model yields (topic id, probability) pairs.
    topic_ids, topic_probs = zip(*ldaModel[bow])
    best_position = np.array(topic_probs).argmax()
    top_id = np.array(topic_ids)[best_position]

    return parsed, top_id

In [11]:
# Run inference on every paragraph; each element is the
# (parsed paragraph, top topic id) tuple returned by Infer_One_Paragraph.
result = [Infer_One_Paragraph(paragraph, lda_model) for paragraph in new_text]

In [12]:
# Unzip the list of (paragraph, topic id) tuples into two parallel tuples.
# NOTE(review): raises ValueError when `result` is empty, and will not re-run
# cleanly once `result` has been rebound to a DataFrame in a later cell.
p, topic_id = zip(*result)

In [13]:
# Collect paragraphs and their inferred topic ids into a two-column table.
# Note: this rebinds `result` from a list of tuples to a DataFrame.
result = pd.DataFrame({'Paragraph': p, 'Topic ID' : topic_id})

In [18]:
# Convert the parsed paragraph objects to plain strings.  Series.apply returns
# a new Series, so the result must be assigned back to take effect.
result['Paragraph'] = result['Paragraph'].apply(str)

# Attach the human-readable label for each inferred topic id
# (raises KeyError if a topic id has no entry in topic_label_dict).
result['Label'] = result['Topic ID'].apply(lambda x: topic_label_dict[x])

In [28]:
# Persist the labelled paragraphs next to the other processed data.
# With default arguments to_csv also writes the DataFrame index as the first column.
result.to_csv(processed_file_path)