## Topic model

In [1]:
import pandas as pd
import numpy as np

from test_model import (get_patent_fields_list, get_ml_patents, 
                        create_title_abstract_col,trim_data, 
                        structure_dataframe, partition_dataframe, 
                        build_pipeline, process_docs)# TODO (Lee) resolve

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary, mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

%load_ext autoreload
%autoreload 2



In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs.
np.random.seed(3)

### Acquire data

In [3]:
%%capture
# Acquire the dataset of ML patents; per the original note, the helper makes an API call to the PatentsView API.
# The %%capture cell magic on the first line captures the cell's output instead of displaying it.
raw_data = get_ml_patents()

### Structure data

In [5]:
# Specify the fields (key:value pairs) to retain from the full set of fields returned by the API call.
retained_keys = ['patent_number', 'patent_date', 'patent_title', 'patent_abstract', 'inventors']
data = trim_data(data=raw_data, keys=retained_keys)

# Create a new key:value pair from the combined values of the patent_title and patent_abstract keys.
# Note that `data` is rebound to the helper's return value here.
data = create_title_abstract_col(data=data)

# Create the dataframe, organize its columns and sort by patent_date.
df = structure_dataframe(data=data)

#### Partition data

# Partition the dataframe into training and test sets; .8 is passed as the split argument
# (presumably the training fraction — confirm against partition_dataframe).
data_train, data_test = partition_dataframe(df, .8)

# Convert the combined title/abstract column of each dataframe to a plain Python list.
text_data = df.patent_title_abstract.tolist()
text_train = data_train.patent_title_abstract.tolist()
# Fixed: the test split was previously assigned to text_train, overwriting the training texts.
text_test = data_test.patent_title_abstract.tolist()

### Pre-process text data

In [8]:
# Gather the combined title/abstract text of every patent record into one list.
text_list = [record['patent_title_abstract'] for record in data]

In [11]:
text_list

["Statistical, noninvasive measurement of a patient's physiological state. Tools and techniques for the rapid, continuous, invasive and/or noninvasive measurement, estimation, and/or prediction of a patient's physiological state. In an aspect, some tools and techniques can estimate predict the onset of conditions intracranial pressure, an amount of blood volume loss, cardiovascular collapse, and/or dehydration. Some tools can recommend (and, in some cases, administer) a therapeutic treatment for the patient's condition. In another aspect, some techniques employ high speed software technology that enables active, long term learning from extremely large, continually changing datasets. In some cases, this technology utilizes feature extraction, state-of-the-art machine learning and/or statistical methods to autonomously build and apply relevant models in real-time.",
 'Determining a health condition of a structure. The disclosure relates to structural health monitoring (SHM). In particula

### Pre-process text data

In [None]:
# Construct the NLP pipeline and pre-process the documents.
# Uncomment the next line to download the spaCy English model ("en").
# !python -m spacy download en
# NOTE(review): the stop-word list is read from an absolute, user-specific path that is
# unlikely to resolve on another machine — consider a path relative to the project.
# NOTE(review): stop_words is not referenced again in the visible cells — confirm whether it is still needed.
stop_words = stopwords.words('/Users/lee/Documents/techniche/techniche/data/stopwords/english')

# Build the pipeline via the project helper and print its component names.
nlp = build_pipeline()
print(nlp.pipe_names)

# Pre-process the documents.
processed_docs = process_docs(text_list)

### Build corpus and dictionary

In [None]:
# Build the gensim dictionary from the pre-processed documents.
id_to_word = Dictionary(processed_docs)

# Apply term-document frequency: convert each document in the corpus to bag-of-words
# format, i.e. a list of (token_id, token_count) tuples.
# Fixed: removed a stale second assignment that rebuilt `corpus` from the undefined
# names `dictionary` and `docs` and therefore raised NameError.
corpus = [id_to_word.doc2bow(doc) for doc in processed_docs]

In [None]:
# Human-readable view of the corpus: every (token_id, count) pair becomes (token, count).
formatted_corpus = [
    [(id_to_word[token_id], count) for token_id, count in bow]
    for bow in corpus
]
#formatted_corpus

### Train model - model #1

In [None]:
# TODO (Lee) - deprecation warnings
# Construct the LDA model (model #1) on the bag-of-words corpus and its dictionary.
# random_state is fixed for the model; the topic count (25), chunk size (100) and
# number of passes (10) are hard-coded here.
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

In [None]:
# keywords in n topics in corpus
# pprint(model_lda.print_topics())

In [None]:
# most important keywords, and their respective weights, that form the topic with index 24
# pprint(model_lda.print_topic(24))

### Evaluate - model #1

In [None]:
# Calculate the perplexity metric for model #1.
# Note: this is evaluated on the same corpus the model was trained on.
perplexity = model_lda.log_perplexity(corpus)
print(perplexity)

In [None]:
# Calculate the overall 'c_v' coherence metric for model #1 from the pre-processed documents and the dictionary.
coherence = CoherenceModel(model=model_lda, texts=processed_docs, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
print(coherence_1)

In [None]:
# Calculate the coherence metric for each of the n topics.
# Stored under its own name so the aggregate score held in coherence_1 is not overwritten
# by a per-topic list.
coherence_per_topic_1 = coherence.get_coherence_per_topic()
coherence_per_topic_1

In [None]:
# Explore the topics of model #1 with pyLDAvis, rendered inside the notebook.
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_to_word)
viz_topics_1

### Model 2 - Mallet model

In [None]:
# Uncomment to download the Mallet topic model.
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# Update this path to point at the local Mallet executable.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
path_mallet = '/Users/lee/Documents/techniche/techniche/data/mallet-2.0.8/bin/mallet'

In [None]:
# Train model #2 through gensim's Mallet wrapper, using the same corpus, dictionary and topic count (25) as model #1.
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=25, id2word=id_to_word)

In [None]:
# Pretty-print the topics of the Mallet model, requested in unformatted form (formatted=False).
pprint(model_2.show_topics(formatted=False))

In [None]:
# Calculate the 'c_v' coherence metric for the Mallet model (model #2).
# Fixed: `texts` now receives the pre-processed documents, as for model #1; `data` holds
# the patent records (dicts), not tokenized texts. The CoherenceModel also gets its own
# name so that coherence_2 only ever refers to the resulting score.
coherence_model_2 = CoherenceModel(model=model_2, texts=processed_docs, dictionary=id_to_word, coherence='c_v')
coherence_2 = coherence_model_2.get_coherence()
print(coherence_2)

In [None]:
# TODO (Lee)
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=num_topics, id2word=dictionary)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

# model_list, coherence_values = compute_coherence_values(dictionary=id_to_word, corpus=corpus, texts=data, start=2, limit=40, step=6)

### Model 3 - Author topic model

In [None]:
# Construct the inventor-to-document mapping as a dataframe from the nested 'inventors'
# column of the JSON API response.
# Fixed: reads from raw_data (the name `results` is never defined in this notebook, and the
# equivalent cell below reads the same structure from raw_data), and keeps the sorted frame —
# sort_values returns a new dataframe, which was previously discarded.
df_inventors = json_normalize(raw_data, record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
df_inventors = df_inventors.sort_values(by=['patent_date'])
df_inventors.head(3)

In [None]:
# Construct the inventor-to-document mapping as a dataframe from the nested 'inventors'
# column of the JSON API response.
df_inventors = json_normalize(raw_data, record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
# Fixed: sort_values returns a new dataframe; the result was previously discarded,
# leaving df_inventors unsorted.
df_inventors = df_inventors.sort_values(by=['patent_date'])
df_inventors.head(3)

In [None]:
def flatten_pv_table(patents=None):
    """
    Flatten nested inventor records into an inventor-to-patent dataframe.

    Parameters:
    ----------
    patents : list of dict, optional
        Patent records, each carrying 'patent_number', 'patent_date' and a nested
        'inventors' list whose entries have an 'inventor_id'. When omitted, the
        notebook-level ``raw_data`` is used.

    Returns:
    -------
    pandas.DataFrame
        One row per inventor entry with the columns 'inventor_id', 'patent_number'
        and 'patent_date', sorted by 'patent_date'.
    """
    if patents is None:
        # The previous body read a global `results` that no cell defines; raw_data is
        # the notebook's patent list and is what the stand-alone cell normalizes.
        patents = raw_data
    df_inventors = json_normalize(patents, record_path=['inventors'], meta=['patent_number', 'patent_date'])
    df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
    # sort_values returns a new frame, so the result must be kept (and returned) rather
    # than discarded as before.
    return df_inventors.sort_values(by=['patent_date']).reset_index(drop=True)
    

In [None]:
# author2doc (dict of (str, list of int), optional)
# – A dictionary where the keys are the names of authors and the values are lists of document IDs that the author contributes to.
#
# doc2author (dict of (int, list of str), optional)
# – A dictionary where the keys are document IDs and the values are lists of author names.

In [None]:
# target mapping: {int(patent_number): [str(inventor_id), ...]}

In [None]:
# Scratch example: keep only the entries whose key starts with 'foo'.
# NOTE(review): `mydict` is not defined in any visible cell, so this fails on a fresh kernel unless it is defined elsewhere.
foodict = {k: v for k, v in mydict.items() if k.startswith('foo')}

In [None]:
# Scratch attempt at building a lookup from the raw patent records.
# NOTE(review): the bare dictionary['patent_number'] expression has no effect, and update()
# merges each record's keys into one dict, so (assuming the records are dicts) later records
# overwrite earlier ones for any keys they share.
new_dict = {}
for dictionary in raw_data:
    dictionary['patent_number']
    new_dict.update(dictionary)

In [None]:
# Scratch attempt.
# NOTE(review): new_dict is rebound on every iteration, so only the value built from the last
# record would remain; dict() also needs a mapping or an iterable of key/value pairs —
# confirm that patent_number is such a value.
for dictionary in raw_data:
    new_dict = dict(dictionary['patent_number'])

In [None]:
# Scratch example: copy a dict while dropping blacklisted keys and renaming others.
abc = {"type":"insecure","id":"1","name":"peter"}
black_list = {"type"}
rename ={"id":"identity"}  #use a mapping dictionary in case you want to rename multiple items
dic = {rename.get(key,key) : val for key ,val in abc.items() if key not in black_list}
# Fixed: the Python 2 print statement is a SyntaxError on Python 3; use the print() call
# form already used elsewhere in this notebook.
print(dic)

In [None]:
raw_data[0]

In [None]:
mapdict
{}

'patent_number', 'inventors'

In [None]:
# Keep only the patent number and the inventor list of the first raw record.
dict2 = dict((field, raw_data[0][field]) for field in ('patent_number', 'inventors'))

In [None]:
# Map each patent number to the list of its inventors' ids.
# Fixed: the previous loop rebuilt the whole dict on every iteration and mapped *every*
# patent number to the current patent's inventor list, so the final dict paired all patents
# with the last patent's inventors (while doing quadratic work). Each patent is now paired
# with its own inventors in a single pass.
new_dict = {
    patent['patent_number']: [inventor['inventor_id'] for inventor in patent['inventors']]
    for patent in raw_data
}

In [None]:
def pat_inv_map(data):
    """
    Build a mapping from patent number to the ids of that patent's inventors.

    Parameters:
    ----------
    data : iterable of dict
        Records with an 'inventors' entry (an iterable of dicts that each have an
        'inventor_id') and a 'patent_number' entry that int() can convert.

    Returns:
    -------
    dict
        Keys are patent numbers as int, values are lists of inventor ids. When the
        same patent number occurs more than once, the last record wins.
    """
    mapping = {}
    for record in data:
        inventor_ids = []
        for person in record['inventors']:
            inventor_ids.append(person['inventor_id'])
        mapping.update({int(record['patent_number']): inventor_ids})
    return mapping

In [None]:
len(data)

In [None]:
len(pat2inv.items())

In [None]:
len(corpus)

In [None]:
corpus[:1]

In [None]:
id_to_word

In [None]:
# Build the patent-number -> inventor-ids mapping from the patent records and display it.
pat2inv = pat_inv_map(data)
pat2inv

In [None]:
# Map each patent number to the list of its inventor ids (as strings).
# Fixed: list(str(v)) converted the printed representation of each group's Series into a
# list of single characters; the ids themselves are now collected instead.
pat2inv = {k: v.astype(str).tolist() for k, v in df_inventors.groupby("patent_number")["inventor_id"]}

In [None]:
# Re-key the patent -> inventors mapping through df_pat_idx.
# NOTE(review): df_pat_idx is not defined in the visible cells — confirm where it comes from.
patdf2inv = {df_pat_idx[key]: value for key, value in pat2inv.items()}
patdf2inv

#### Construct author-topic model

In [None]:
# Construct the author-topic model from the bag-of-words corpus, the dictionary and the
# document-to-inventors mapping.
# NOTE(review): pat2inv appears to be keyed by patent number, whereas the gensim notes pasted
# in this notebook describe doc2author keys as document IDs — confirm the keys line up with
# the documents in `corpus`.
# NOTE(review): num_topics is not passed here, so the library default applies rather than the
# 25 topics used for models #1 and #2.
model_at = AuthorTopicModel(corpus=corpus,
                         doc2author=pat2inv,
                         id2word=id_to_word)

In [None]:
len(raw_data)

In [None]:
len(pat2inv.items())

In [None]:
len(corpus)

In [None]:
# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

In [None]:
# Retrieve the topic distribution for an author using the model[name] syntax.
# Each topic has a probability of being expressed given the particular author, but only the ones above a certain threshold are shown.

model_at['7788103-1']

In [None]:
# def show_author(name):
#     print('\n%s' % name)
#     print('Docs:', model.author2doc[name])
#     print('Topics:')
#     pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [None]:
# calculate per-word bound, which is a measure of the model's predictive performance (reconstruction error?)

# build doc2author dictionary
# Fixed: the bare prose line above is now a comment (it was a SyntaxError), atmodel is
# imported before it is used, and the trained author-topic model is referenced by its
# actual name, model_at (`model` is not defined in this notebook).
from gensim.models import atmodel

doc2author = atmodel.construct_doc2author(model_at.corpus, model_at.author2doc)

In [None]:
# Build the document -> authors mapping from the trained author-topic model.
# Fixed: the model is named model_at in this notebook; `model` is undefined.
from gensim.models import atmodel
doc2author = atmodel.construct_doc2author(model_at.corpus, model_at.author2doc)

In [None]:
gensim.models.atmodel.construct_author2doc(doc2author)
# construct mapping from author IDs to document IDs.

# Parameters: doc2author (dict of (int, list of str)) – Mapping of document id to authors.
# Returns: Mapping of authors to document ids.
# Return type: dict of (str, list of int)

In [None]:
gensim.models.atmodel.construct_doc2author(corpus, author2doc)
# Construct mapping from document IDs to author IDs.
#
# Parameters:
# corpus (iterable of list of (int, float)) – Corpus in BoW format.
# author2doc (dict of (str, list of int)) – Mapping of authors to documents.
# Returns:
# Document to Author mapping.
#
# Return type:
# dict of (int, list of str)