## Topic model

In [4]:
import pandas as pd
import numpy as np

from test_model import (get_patent_fields_list, get_ml_patents, 
                        create_title_abstract_col,trim_data, 
                        structure_dataframe, partition_dataframe, 
                        build_pipeline, process_docs, pat_inv_map, get_topics)# TODO (Lee) resolve

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary, mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import requests
from bs4 import BeautifulSoup
import pickle

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
np.random.seed(3)

### Acquire data

In [6]:
%%capture
# acquire dataset of ML patents by call to PatentsView API 
raw_data = get_ml_patents()

### Structure data

In [7]:
# specify fields (key:val pairs) to retain from full api response
retained_keys = ['patent_number', 'patent_date', 'patent_title', 'patent_abstract', 'inventors']
data = trim_data(data=raw_data, keys=retained_keys)

# create new key:value pair by combining values from patent_title and patent_abstract keys
data = create_title_abstract_col(data=data)

# create dataframe, organize columns and sort by patent_date
df = structure_dataframe(data=data)

# partition data
data_train, data_test = partition_dataframe(df, .8)

# convert dataframe to list
text_data = df.patent_title_abstract.tolist()
text_train = data_train.patent_title_abstract.tolist()
text_train = data_test.patent_title_abstract.tolist()

In [8]:
# TODO (Lee) - this explores direct structuring from api response without df
text_list = []
for i in data:
    text_list.append(i['patent_title_abstract'])
# text_list

### Pre-process text data

In [9]:
# construct pipeline
# uncomment to download stop words
# !python -m spacy download en
stop_words = stopwords.words('/Users/lee/Documents/techniche/techniche/data/stopwords/english')

nlp = build_pipeline()
print(nlp.pipe_names)

# pre-process documents
processed_docs = process_docs(text_list)

['tagger', 'parser', 'ner']


### Build corpus and dictionary

In [10]:
# build dictionary
id_to_word = Dictionary(processed_docs)

# apply term document frequency
# converts documents in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_to_word.doc2bow(doc) for doc in processed_docs]

In [11]:
# view formatted corpus (term-doc-frequency)
formatted_corpus = [[(id_to_word[id], freq) for id, freq in text] for text in corpus]
#formatted_corpus
#id_to_word.token2id

### Train model - model #1

In [12]:
# TODO (Lee) - deprecation warnings
# construct LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [13]:
# keywords in n topics in corpus
# pprint(model_lda.print_topics())

In [14]:
# most important keywords, and the respective weight, that form topic with index 0
# pprint(model_lda.print_topic(24))

In [15]:
# pickle model
pickle.dump(model_lda, open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','wb'))

In [16]:
model_lda = pickle.load(open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','rb'))

### Predict

In [17]:
# extracts topics for given document from Gensim
def get_topics(doc, k=5, model_lda=model_lda):
    topic_id = sorted(model_lda[doc][0], key=lambda x: -x[1])
    top_k_topics = [x[0] for x in topic_id[:k]]
    return [(i, model_lda.print_topic(i)) for i in top_k_topics]

In [18]:
# `get_document_topics()` returns topic probability distribution for given document
# topic_dist_675_a = model_lda.get_document_topics(corpus[50])
# pprint(sorted(topic_dist_50_a))

In [19]:
# topicid = 3
# model_lda.get_topic_terms(topicid, topn=10)

In [20]:
# text_train[doc_id]
# doc_id = 675
# topic_dist_675_b = sorted(get_topics(corpus[doc_id], k=10)), text_train[doc_id]
# pprint(topic_dist_675_b)

In [21]:
# text = 'virtual dictionary lexicon enablement voice'.split()
text_input_1 = 'smart assistant transformer model translation'.split()

In [24]:
id_to_word.doc2bow(text_input_1)

[(104, 1), (159, 1), (309, 1)]

In [25]:
get_topics(id_to_word.doc2bow(text_input_1), k=10)

[(6,
  '0.114*"content" + 0.037*"items" + 0.033*"also" + 0.033*"translation" + 0.022*"machine" + 0.019*"requests" + 0.018*"human" + 0.018*"localization" + 0.015*"rendering" + 0.015*"translations"'),
 (0,
  '0.054*"machine" + 0.054*"learning" + 0.046*"classifier" + 0.036*"user" + 0.029*"data" + 0.021*"loss" + 0.021*"assistant" + 0.021*"upgrade" + 0.021*"digital" + 0.018*"one"')]

In [26]:
text_input_2 = """At the Siri International team within Apple we bring the Siri intelligent assistant to our customers worldwide in over 40 languages and dialects. Join us, and tackle some of the most challenging problems in natural language processing and large scale applied machine learning. You will build cutting edge natural language understanding technologies and deploy them on a global scale. Your work will advance and shape the future vision of our multi-lingual, multi-cultural Siri assistant, and Search applications used by millions across the world Key Qualifications
Extensive track record of scientific research in NLP and Machine Learning, or similar experience in developing language technologies for shipping products.
Strong coding and software engineering skills in a mainstream programming language, such as Python, Java, C/C++.
Familiarity with NLP/ML tools and packages like Caffe, pyTorch, TensorFlow, Weka, scikit-learn, nltk, etc.
Practical experience building production quality applications related to natural language processing and machine learning.
In-depth knowledge of machine learning algorithms and ability to apply them in data driven natural language processing systems.
Ability to quickly prototype ideas / solutions, perform critical analysis, and use creative approaches for solving complex problems.
Attention to detail and excellent communication skills.
Description
We are looking for a highly motivated technologist with a strong background in Natural Language Processing and Machine Learning research. The ideal candidate will have a strong track record of taking research ideas to real-world applications. In this position you will apply your problem solving skills to challenges and opportunities within Siri International, which involves development of large-scale language technologies for many natural languages worldwide. The primary responsibility of this role is to conduct research and develop innovative machine learning, artificial intelligence and NLP solutions for multi-lingual conversational agents. You will have the opportunity to investigate cutting edge research methods that will improve customer experience of our products and enable our engineers to scale these technologies across a variety of natural languages. You will also provide technical leadership and experiment-driven insights for engineering teams on their machine learning modeling and data decisions. You will play a central role in defining the future technical directions of Siri International through quick prototyping, critical analysis and development of core multi-lingual NLP technologies.
Education & Experience
* PhD in Machine Learning, Statistics, Computer Science, Mathematics or related field with specialization in natural language processing and/or machine learning, OR * Masters degree in a related field with a strong academic/industrial track record. * Hands-on research experience in an academic or industrial setting.""".split()

In [27]:
get_topics(id_to_word.doc2bow(text_input_2), k=10)

[(9,
  '0.070*"language" + 0.064*"natural" + 0.049*"may" + 0.042*"entity" + 0.033*"include" + 0.030*"communication" + 0.026*"device" + 0.023*"associated" + 0.019*"determining" + 0.016*"input"'),
 (0,
  '0.054*"machine" + 0.054*"learning" + 0.046*"classifier" + 0.036*"user" + 0.029*"data" + 0.021*"loss" + 0.021*"assistant" + 0.021*"upgrade" + 0.021*"digital" + 0.018*"one"'),
 (18,
  '0.037*"data" + 0.023*"distributed" + 0.021*"model" + 0.019*"cad" + 0.017*"memory" + 0.017*"structure" + 0.017*"state" + 0.015*"machine" + 0.015*"learning" + 0.015*"crowd"'),
 (5,
  '0.018*"based" + 0.017*"knowledge" + 0.017*"queries" + 0.017*"manager" + 0.017*"priority" + 0.017*"database" + 0.017*"scores" + 0.013*"information" + 0.012*"learning" + 0.012*"machine"'),
 (14,
  '0.023*"machine" + 0.023*"learning" + 0.023*"content" + 0.023*"value" + 0.023*"second" + 0.020*"first" + 0.020*"feature" + 0.018*"selected" + 0.018*"input" + 0.015*"using"'),
 (19,
  '0.036*"text" + 0.031*"free" + 0.021*"description" + 0

In [30]:
# print keywords in n topics
sorted(model_lda.show_topics(), key=lambda x: x[1])

[(10,
  '0.001*"language" + 0.001*"stub" + 0.001*"method" + 0.001*"natural" + 0.001*"candidate" + 0.001*"user" + 0.001*"system" + 0.001*"methods" + 0.001*"one" + 0.001*"grammar"'),
 (21,
  '0.001*"may" + 0.001*"include" + 0.001*"entity" + 0.001*"communication" + 0.001*"expert" + 0.001*"identifying" + 0.001*"determining" + 0.001*"whether" + 0.001*"user" + 0.001*"method"'),
 (12,
  '0.001*"precise" + 0.001*"positions" + 0.001*"resulting" + 0.001*"resolved" + 0.001*"research" + 0.001*"ranging" + 0.001*"properties" + 0.001*"slow" + 0.001*"power" + 0.001*"optical"'),
 (22,
  '0.001*"user" + 0.001*"message" + 0.001*"segments" + 0.001*"expert" + 0.001*"messages" + 0.001*"social" + 0.001*"posted" + 0.001*"media" + 0.001*"website" + 0.001*"may"'),
 (14,
  '0.023*"machine" + 0.023*"learning" + 0.023*"content" + 0.023*"value" + 0.023*"second" + 0.020*"first" + 0.020*"feature" + 0.018*"selected" + 0.018*"input" + 0.015*"using"'),
 (1,
  '0.024*"processing" + 0.015*"may" + 0.015*"one" + 0.015*"data

In [31]:
# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])

[(10,
  '0.001*"language" + 0.001*"stub" + 0.001*"method" + 0.001*"natural" + 0.001*"candidate" + 0.001*"user" + 0.001*"system" + 0.001*"methods" + 0.001*"one" + 0.001*"grammar"'),
 (21,
  '0.001*"may" + 0.001*"include" + 0.001*"entity" + 0.001*"communication" + 0.001*"expert" + 0.001*"identifying" + 0.001*"determining" + 0.001*"whether" + 0.001*"user" + 0.001*"method"'),
 (12,
  '0.001*"precise" + 0.001*"positions" + 0.001*"resulting" + 0.001*"resolved" + 0.001*"research" + 0.001*"ranging" + 0.001*"properties" + 0.001*"slow" + 0.001*"power" + 0.001*"optical"'),
 (22,
  '0.001*"user" + 0.001*"message" + 0.001*"segments" + 0.001*"expert" + 0.001*"messages" + 0.001*"social" + 0.001*"posted" + 0.001*"media" + 0.001*"website" + 0.001*"may"'),
 (5,
  '0.018*"based" + 0.017*"knowledge" + 0.017*"queries" + 0.017*"manager" + 0.017*"priority" + 0.017*"database" + 0.017*"scores" + 0.013*"information" + 0.012*"learning" + 0.012*"machine"'),
 (17,
  '0.020*"content" + 0.020*"real" + 0.019*"upgrade

In [32]:
# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])

[(10,
  '0.001*"language" + 0.001*"stub" + 0.001*"method" + 0.001*"natural" + 0.001*"candidate" + 0.001*"user" + 0.001*"system" + 0.001*"methods" + 0.001*"one" + 0.001*"grammar"'),
 (21,
  '0.001*"may" + 0.001*"include" + 0.001*"entity" + 0.001*"communication" + 0.001*"expert" + 0.001*"identifying" + 0.001*"determining" + 0.001*"whether" + 0.001*"user" + 0.001*"method"'),
 (12,
  '0.001*"precise" + 0.001*"positions" + 0.001*"resulting" + 0.001*"resolved" + 0.001*"research" + 0.001*"ranging" + 0.001*"properties" + 0.001*"slow" + 0.001*"power" + 0.001*"optical"'),
 (22,
  '0.001*"user" + 0.001*"message" + 0.001*"segments" + 0.001*"expert" + 0.001*"messages" + 0.001*"social" + 0.001*"posted" + 0.001*"media" + 0.001*"website" + 0.001*"may"'),
 (5,
  '0.018*"based" + 0.017*"knowledge" + 0.017*"queries" + 0.017*"manager" + 0.017*"priority" + 0.017*"database" + 0.017*"scores" + 0.013*"information" + 0.012*"learning" + 0.012*"machine"'),
 (17,
  '0.020*"content" + 0.020*"real" + 0.019*"upgrade

In [33]:
# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[0])

[(0,
  '0.054*"machine" + 0.054*"learning" + 0.046*"classifier" + 0.036*"user" + 0.029*"data" + 0.021*"loss" + 0.021*"assistant" + 0.021*"upgrade" + 0.021*"digital" + 0.018*"one"'),
 (1,
  '0.024*"processing" + 0.015*"may" + 0.015*"one" + 0.015*"data" + 0.015*"information" + 0.015*"including" + 0.015*"biological" + 0.012*"natural" + 0.012*"language" + 0.012*"gpu"'),
 (2,
  '0.035*"method" + 0.033*"query" + 0.031*"based" + 0.030*"knowledge" + 0.030*"provided" + 0.029*"methods" + 0.023*"systems" + 0.022*"language" + 0.020*"base" + 0.020*"computer"'),
 (4,
  '0.056*"endpoint" + 0.040*"system" + 0.035*"malware" + 0.034*"item" + 0.034*"linguistic" + 0.034*"subsystem" + 0.028*"dependent" + 0.023*"manner" + 0.023*"independent" + 0.022*"based"'),
 (5,
  '0.018*"based" + 0.017*"knowledge" + 0.017*"queries" + 0.017*"manager" + 0.017*"priority" + 0.017*"database" + 0.017*"scores" + 0.013*"information" + 0.012*"learning" + 0.012*"machine"'),
 (6,
  '0.114*"content" + 0.037*"items" + 0.033*"also" +

In [34]:
# show_topic() returns n most important/relevant words, and their weights, that comprise given topic
pprint(model_lda.show_topic(1, topn=10))

[('processing', 0.024000268),
 ('may', 0.015175109),
 ('one', 0.015088425),
 ('data', 0.015068584),
 ('information', 0.015025455),
 ('including', 0.015012451),
 ('biological', 0.014993144),
 ('natural', 0.0120664425),
 ('language', 0.012046633),
 ('gpu', 0.012042157)]


In [35]:
pprint(model_lda.show_topics(num_topics=5, num_words=10))

[(12,
  '0.001*"precise" + 0.001*"positions" + 0.001*"resulting" + 0.001*"resolved" '
  '+ 0.001*"research" + 0.001*"ranging" + 0.001*"properties" + 0.001*"slow" + '
  '0.001*"power" + 0.001*"optical"'),
 (22,
  '0.001*"user" + 0.001*"message" + 0.001*"segments" + 0.001*"expert" + '
  '0.001*"messages" + 0.001*"social" + 0.001*"posted" + 0.001*"media" + '
  '0.001*"website" + 0.001*"may"'),
 (9,
  '0.070*"language" + 0.064*"natural" + 0.049*"may" + 0.042*"entity" + '
  '0.033*"include" + 0.030*"communication" + 0.026*"device" + '
  '0.023*"associated" + 0.019*"determining" + 0.016*"input"'),
 (18,
  '0.037*"data" + 0.023*"distributed" + 0.021*"model" + 0.019*"cad" + '
  '0.017*"memory" + 0.017*"structure" + 0.017*"state" + 0.015*"machine" + '
  '0.015*"learning" + 0.015*"crowd"'),
 (16,
  '0.027*"documentation" + 0.024*"one" + 0.022*"method" + 0.019*"includes" + '
  '0.019*"auto" + 0.016*"data" + 0.016*"system" + 0.016*"network" + '
  '0.016*"engine" + 0.014*"embodiments"')]


### Evaluate - model #1

In [36]:
# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus)
print(perplexity)

-6.26519191441815


  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [37]:
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=processed_docs, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
# print(coherence_1)

In [38]:
# calculate coherence metric or each of the n topicss
coherence_1 = coherence.get_coherence_per_topic()
# print(coherence_1)

In [39]:
# explore topics
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_to_word)
# viz_topics_1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 2-  Mallet model

In [40]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update path
path_mallet = '/Users/lee/Documents/techniche/techniche/data/mallet-2.0.8/bin/mallet'

In [41]:
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=25, id2word=id_to_word)

In [42]:
# topics
# pprint(model_2.show_topics(formatted=False))

In [43]:
# TODO (Lee) - calculate coherence metric
coherence_2 = CoherenceModel(model=model_2, texts=data, dictionary=id_to_word, coherence='c_v')
coherence_2 = coherence_2.get_coherence()
# print(coherence_2)

  numerator = (co_occur_count / num_docs) + EPSILON
  denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
  co_doc_prob = co_occur_count / num_docs


In [44]:
# TODO (Lee)
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=num_topics, id2word=id2word)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

# model_list, coherence_values = compute_coherence_values(dictionary=id_to_word, corpus=corpus, texts=data, start=2, limit=40, step=6)

### Model 3 - Author topic model

In [45]:
# construct inventor-to-doc mapping as df from nested inventors column in json api response
# df_inventors = json_normalize(data, record_path=['inventors'], meta=['patent_number', 'patent_date'])
# df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
# df_inventors.sort_values(by=['patent_date'])
# df_inventors.head(3)

In [46]:
# quick visual index to patent number mapping
# for i in data:
#     print(data.index(i), i['patent_number'])

In [47]:
# TODO (Lee) review fix to pat_inv_map, in which "patent" in mapping is idx of pat, not pat_number from api
pat2inv = pat_inv_map(data)

#### Construct author-topic model

In [48]:
# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus,
                         doc2author=pat2inv,
                         id2word=id_to_word)

In [49]:
# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

[[(96, 0.9562434279413039)],
 [(84, 0.9811603291840066)],
 [],
 [(19, 0.9242604459093523)],
 [(67, 0.9311626980318045)],
 [(28, 0.9237732474038673)],
 [(35, 0.958989045415865)],
 [(31, 0.9607603155206019)],
 [(80, 0.9241599016778429)],
 [(46, 0.9256622536619278)],
 [(64, 0.9227156088237696)],
 [(20, 0.9268230696949888)],
 [(29, 0.9285350666026123)],
 [(38, 0.9263227932159022)],
 [(4, 0.9646821124814157)],
 [(61, 0.9647863869030915)],
 [(84, 0.9948704663212442)],
 [(39, 0.9642973038967201)],
 [(45, 0.9190041673381238)],
 [(57, 0.5404996212970037), (87, 0.4051276107312568)],
 [(58, 0.47842579951245956), (79, 0.49872246400572356)],
 [(25, 0.9647525965544802)],
 [(36, 0.965759139384946)],
 [(71, 0.9103313636926025)],
 [(58, 0.9710462884490945)],
 [(27, 0.13753523075008722), (97, 0.8337925001496213)],
 [(18, 0.8940817460045988), (85, 0.07761755973939068)],
 [(58, 0.8976486060777368)],
 [(26, 0.9038217956220631)],
 [(2, 0.29918393980772634), (19, 0.6777048337033028)],
 [(87, 0.94607931353011

In [50]:
# retrieve topic distribution for author using use model[name] syntax
# each topic has a probability of being expressed given the particular author, 
# but only the ones above a certain threshold are displayed

model_at['7788103-1']

[(40, 0.10136892059437706), (95, 0.888732089506633)]

In [51]:
# def show_author(name):
#     print('\n%s' % name)
#     print('Docs:', model.author2doc[name])
#     print('Topics:')
#     pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [52]:
# build mapping from inventor to patent
inv2pat = gensim.models.atmodel.construct_author2doc(pat2inv)

### Prediction

In [53]:
# prediction functions that take input of new text string, and predict topic distribution