## Techniche - Topic model

In [16]:
import pandas as pd
import numpy as np

import gensim
import gensim.corpora as corpora
from gensim.corpora import mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import requests
from bs4 import BeautifulSoup

from test_model import get_patent_fields_list, get_ml_patents, create_title_abstract_col # TODO (Lee) resolve

from smart_open import smart_open

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
np.random.seed(3)

#### Import data from PatentsView API

In [28]:
%%capture
# retrieve dataset of machine learning patents from PatentsView API 
data = get_ml_patents()

In [29]:
# inspect result
data[0]

{'detail_desc_length': '51618',
 'patent_abstract': "Tools and techniques for the rapid, continuous, invasive and/or noninvasive measurement, estimation, and/or prediction of a patient's physiological state. In an aspect, some tools and techniques can estimate predict the onset of conditions intracranial pressure, an amount of blood volume loss, cardiovascular collapse, and/or dehydration. Some tools can recommend (and, in some cases, administer) a therapeutic treatment for the patient's condition. In another aspect, some techniques employ high speed software technology that enables active, long term learning from extremely large, continually changing datasets. In some cases, this technology utilizes feature extraction, state-of-the-art machine learning and/or statistical methods to autonomously build and apply relevant models in real-time.",
 'patent_average_processing_time': None,
 'patent_date': '2019-03-12',
 'patent_firstnamed_assignee_city': 'Louisville',
 'patent_firstnamed_assi

In [None]:
# build a DataFrame from the raw API records; the "Subset dataframe" cell selects columns from `raw_df`
raw_df = pd.DataFrame(data)
raw_df.head()

#### Subset dataframe

In [None]:
# subset dataframe - comment/uncomment to include fields
# .copy() makes `df` an independent frame, so later column assignments on it
# do not trigger SettingWithCopyWarning against `raw_df`
df = raw_df[['patent_number', 'patent_date', 'patent_title',
             'patent_abstract', 'patent_firstnamed_assignee_id',
             'patent_firstnamed_assignee_location_id',
             'patent_firstnamed_assignee_latitude',
             'patent_firstnamed_assignee_longitude',
             'patent_firstnamed_assignee_city',
             'patent_firstnamed_assignee_state',
             'patent_firstnamed_assignee_country',
             'patent_firstnamed_inventor_id',
             'patent_firstnamed_inventor_location_id',
             'patent_firstnamed_inventor_latitude',
             'patent_firstnamed_inventor_longitude',
             'patent_firstnamed_inventor_city',
             'patent_firstnamed_inventor_state',
             'patent_firstnamed_inventor_country',
             'patent_year', 'patent_type', 'patent_kind',
             'inventors'
            ]].copy()

In [58]:
def trim_data(data, keys):
    """Return a new list of records, each restricted to the requested keys.

    Parameters
    ----------
    data : iterable of dict
        Source records.
    keys : iterable
        Keys to keep; a key that is absent from a record is skipped for that record.

    Returns
    -------
    list of dict
        One new dictionary per input record; the input records are not modified.
    """
    return [
        {field: record[field] for field in keys if field in record}
        for record in data
    ]

In [59]:
# fields to keep from each API record (each key listed once)
my_keys = ['patent_number', 'patent_date', 'patent_title', 'patent_abstract']
t = trim_data(data=data, keys=my_keys)

In [60]:
t

[{'patent_number': '10226194',
  'patent_date': '2019-03-12',
  'patent_title': "Statistical, noninvasive measurement of a patient's physiological state",
  'patent_abstract': "Tools and techniques for the rapid, continuous, invasive and/or noninvasive measurement, estimation, and/or prediction of a patient's physiological state. In an aspect, some tools and techniques can estimate predict the onset of conditions intracranial pressure, an amount of blood volume loss, cardiovascular collapse, and/or dehydration. Some tools can recommend (and, in some cases, administer) a therapeutic treatment for the patient's condition. In another aspect, some techniques employ high speed software technology that enables active, long term learning from extremely large, continually changing datasets. In some cases, this technology utilizes feature extraction, state-of-the-art machine learning and/or statistical methods to autonomously build and apply relevant models in real-time."},
 {'patent_number': '

In [43]:
data

[{'detail_desc_length': '51618',
  'patent_abstract': "Tools and techniques for the rapid, continuous, invasive and/or noninvasive measurement, estimation, and/or prediction of a patient's physiological state. In an aspect, some tools and techniques can estimate predict the onset of conditions intracranial pressure, an amount of blood volume loss, cardiovascular collapse, and/or dehydration. Some tools can recommend (and, in some cases, administer) a therapeutic treatment for the patient's condition. In another aspect, some techniques employ high speed software technology that enables active, long term learning from extremely large, continually changing datasets. In some cases, this technology utilizes feature extraction, state-of-the-art machine learning and/or statistical methods to autonomously build and apply relevant models in real-time.",
  'patent_average_processing_time': None,
  'patent_date': '2019-03-12',
  'patent_firstnamed_assignee_city': 'Louisville',
  'patent_firstname

#### Create new column

In [34]:
# create new key value pair from combined values of patent_title and patent_abstract keys
def create_title_abstract_col(data):
    """Add a combined title/abstract field to every patent record.

    For each dictionary in ``data`` the values stored under ``patent_title`` and
    ``patent_abstract`` are joined as ``"<title>. <abstract>"`` and written to the
    new key ``patent_title_abstract``. Records are updated in place.

    Parameters
    ----------
    data : list of dict
        Records that each contain string values for ``patent_title`` and
        ``patent_abstract``.

    Returns
    -------
    list of dict
        The same ``data`` object, returned so callers can assign the result
        (previously the function returned ``None``).

    Raises
    ------
    KeyError
        If a record lacks ``patent_title`` or ``patent_abstract``.
    """
    for record in data:
        record['patent_title_abstract'] = str(record['patent_title'] + '. ' + record['patent_abstract'])
    return data

In [35]:
t = create_title_abstract_col(data=data)

In [36]:
t

In [37]:
# NOTE(review): `data_1` is not defined in any visible cell — confirm which dataset this call should use
data_2 = create_title_abstract_col(data_1)

#### Partition data

In [None]:
# TODO (Lee) partition data
# The spaCy pipeline and the NLTK tokenizer below consume strings, so partition the
# combined title/abstract text of each record rather than the raw record dictionaries.
documents = [record['patent_title_abstract'] for record in data]

# first 80% of the documents for training, remaining 20% for testing
split_index = round(len(documents) * .8)
text_train = documents[:split_index]
text_test = documents[split_index:]
print(len(documents), len(text_train), len(text_test), len(text_train) + len(text_test))

In [None]:
# sort values in dataframe
df.sort_values(by=['patent_date'])

In [None]:
# convert dataframe to list
text_data = df.patent_title_abstract.tolist()
text_data

### Pre-process text data

In [None]:
# uncomment to download stop words from nltk and language package from spacy
# nltk.download('stopwords')
# nltk.download('punkt')
# !python -m spacy download en

In [None]:
# construct pipeline using Spacy Language object and associated pipeline/components
nlp = spacy.load("en")
pprint(nlp.pipeline)

In [None]:
processed_docs = []

# Build the stopword set inside this cell: the notebook's `stop_words` list is only
# created later, in the "Clean stopwords" section, so relying on it here fails on a
# fresh kernel. A set also gives constant-time membership tests.
nltk_stop_words = set(stopwords.words('english'))

# process patent documents in pipeline
for doc in nlp.pipe(text_train, n_threads=4, batch_size=100):

    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens and drop tokens spaCy flags as stopwords.
    # `doc` is deliberately not rebound, so it is still a spaCy Doc after the loop.
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from the NLTK stopword list.
    tokens = [token for token in tokens if token not in nltk_stop_words]

    # Add named entities, but only if they are a compound of more than one word.
    tokens.extend(str(entity) for entity in ents if len(entity) > 1)

    processed_docs.append(tokens)

processed_docs[0][:5]

In [None]:
nlp.

In [None]:
[token.text for token in doc]

In [None]:
labels = set([w.label_ for w in doc.ents]) 

In [None]:
# for each entity label, collect the de-duplicated entity strings carrying that label and print them
# NOTE(review): `cleanup` and `document` are not defined in any visible cell — confirm where they come from
for label in labels: 
    entities = [cleanup(e.string, lower=False) for e in document.ents if label==e.label_] 
    entities = list(set(entities)) 
    print(label,entities)

In [None]:
# Second pass of the spaCy pre-processing loop, this time without the extra stopword-list filter.
# NOTE(review): `docs` is not defined in any visible cell — confirm the intended input collection.
pre_processed_docs = []
for doc in nlp.pipe(docs, n_threads=4, batch_size=100):
    # Process document using Spacy NLP pipeline.
    
    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens and drop tokens spaCy flags as stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list (disabled).
    #doc = [token for token in doc if token not in STOPWORDS]

    # Add named entities, but only if they are a compound of more than one word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    
    pre_processed_docs.append(doc)

#### Tokenize

In [None]:
# tokenize documents

def tokenize_docs(docs):
    """Tokenize every document with NLTK's ``word_tokenize``.

    Returns a list with one token list per input document, in input order.
    """
    return [word_tokenize(text) for text in docs]

tokenized_docs = tokenize_docs(text_train)

#### Clean punctuation

In [None]:
# clean punctuation
def clean_docs(tokenized_docs):
    """Drop every token that is not purely alphabetic.

    Punctuation, numbers and mixed tokens such as ``"a1"`` are removed; the
    document order and the order of the remaining tokens are preserved.
    """
    return [
        [token for token in tokens if token.isalpha()]
        for tokens in tokenized_docs
    ]

In [None]:
cleaned_data = clean_docs(tokenized_docs)
cleaned_data[0]

#### Convert to lowercase

In [None]:
# convert to lowercase
def lower_words(docs):
    """Return the documents with every token converted to lowercase."""
    return [[token.lower() for token in tokens] for tokens in docs]

lowered_data = lower_words(cleaned_data)
lowered_data[0]

#### Clean stopwords

In [None]:
# clean stopwords

stop_words = stopwords.words('english')

In [None]:
def filter_stopwords(docs, stopword_list=None):
    """Remove stopwords from tokenized documents, ignoring case.

    Matching is case-insensitive, so capitalized stopwords such as "A", "An",
    "By" and "The" are removed as well. Surviving tokens keep their original case.

    Parameters
    ----------
    docs : iterable of list of str
        Tokenized documents.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    vocabulary = stop_words if stopword_list is None else stopword_list
    # lowercase once and store in a set for constant-time membership tests
    blocked = {word.lower() for word in vocabulary}
    return [[word for word in doc if word.lower() not in blocked] for doc in docs]

# remove stopwords
filtered_data = filter_stopwords(lowered_data)
filtered_data
# TODO (Lee) - resolve un-lowered stopwords "A" and "An", 'By', 'The'

#### Construct bigrams and trigrams

In [None]:
# train bigram phrases model
bigram_model = Phrases(filtered_data, min_count=1, threshold=1)

# train trigram phrases model
trigram_model = Phrases(bigram_model[filtered_data], threshold=100)  

In [None]:
# bigrams
def bigrams(docs):
    """Apply the notebook-level ``bigram_model`` to each tokenized document."""
    phrased_docs = []
    for tokens in docs:
        phrased_docs.append(bigram_model[tokens])
    return phrased_docs

In [None]:
# convert the trained Phrases models into Phraser objects (same names are rebound)
bigram_model = Phraser(bigram_model)
trigram_model = Phraser(trigram_model)

In [None]:
bigrams(filtered_data)[0]

In [None]:
def trigrams(docs):
    """Apply ``bigram_model`` and then ``trigram_model`` to each tokenized document."""
    phrased_docs = []
    for tokens in docs:
        with_bigrams = bigram_model[tokens]
        phrased_docs.append(trigram_model[with_bigrams])
    return phrased_docs

In [None]:
trigrams(filtered_data)[0]

#### Stem and Lemmatize

In [None]:
def lemmatize_docs(docs, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    docs : iterable
        Documents whose tokens expose ``lemma_`` and ``pos_`` attributes.
    allowed_postags : collection of str, optional
        Part-of-speech tags to keep. The default is an immutable tuple so the
        default value cannot be altered between calls.

    Returns
    -------
    list of list of str
        For each document, the lemmas of the tokens whose ``pos_`` is allowed.
    """
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in docs
    ]

In [None]:
# TODO (Lee)

lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# for doc in cleaned_data:
#     for token in doc:
#         token.lemma_

# uncomment to use
# download english model with "python -m spacy download en"

# for token in doc:
#     print(token, token.lemma, token.lemma_)

# TODO (Lee) - lemmatize_docs(cleaned_data)

#### Create corpus and dictionary

In [None]:
# using spacy pipeline components
# build dictionary
id_to_word = corpora.Dictionary(processed_docs)

# build corpus
texts = processed_docs

# apply term document frequency
# converts documents in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_to_word.doc2bow(doc) for doc in processed_docs]

In [None]:
 # build dictionary
id_to_word = corpora.Dictionary(filtered_data)

# build corpus
texts = filtered_data

# apply term document frequency
# converts documents in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_to_word.doc2bow(text) for text in texts]

In [None]:
# view formatted corpus (term-doc-frequency)
[[(id_to_word[id], freq) for id, freq in text] for text in corpus][:1]

### Model - model #1

In [None]:
# TODO (Lee) - deprecation warnings
# construct LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

In [None]:
# print keywords in n topics
pprint(model_lda.print_topics())

In [None]:
# print the top keywords, with their weights, that comprise the topic with index 24
# NOTE(review): the earlier wording referred to topic 0 while the call prints topic 24 — confirm which topic was intended
pprint(model_lda.print_topic(24))
# the output lists the most important keywords, and their respective weights, for that topic

In [None]:
# print top 10 keywords that comprise topic with index of 1
pprint(model_lda.print_topic(1))

In [None]:
# TODO (Lee) - infer topic from keywords?

### Evaluate - model #1

In [None]:
# compute the per-word likelihood bound of the model on the corpus it was trained on
# (log_perplexity returns this bound rather than the perplexity itself; perplexity = 2 ** (-bound))
perplexity = model_lda.log_perplexity(corpus)
perplexity

In [None]:
# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=processed_docs, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1

In [None]:
# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# calculate coherence metric on the stopword-filtered token lists
# (`filtered_data` is the notebook-level output of filter_stopwords; `filtered_docs`
# exists only as a local variable inside that function)
coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1

In [None]:
# calculate coherence metric for each of the n topics
# NOTE: this rebinds `coherence_1` from the aggregate coherence score to the per-topic scores
coherence_1 = coherence.get_coherence_per_topic()
coherence_1

In [None]:
# explore topics
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_to_word)
viz_topics_1
# TODO (Lee) - salient vs relevant terms in pyLDA ?

### Model 2 - Mallet model

In [None]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update this path
path_mallet = 'data/mallet-2.0.8/bin/mallet'

In [None]:
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=25, id2word=id_to_word)

In [None]:
# topics
pprint(model_2.show_topics(formatted=False))

In [None]:
# calculate coherence metric
# c_v coherence needs tokenized documents: pass `texts` (the token lists the dictionary
# and corpus were built from), not the raw API records in `data`
coherence_model_2 = CoherenceModel(model=model_2, texts=texts, dictionary=id_to_word, coherence='c_v')
coherence_model_2 = coherence_model_2.get_coherence()
coherence_model_2

In [None]:
# TODO (Lee)
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=num_topics, id2word=id2word)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

# model_list, coherence_values = compute_coherence_values(dictionary=id_to_word, corpus=corpus, texts=data, start=2, limit=40, step=6)

### Model 3 - Author topic model

In [None]:
# construct inventor-to-doc mapping as df from nested inventors column in json api response
# NOTE(review): `results` is not defined in any visible cell — presumably the raw PatentsView JSON response; confirm
df_inventors = json_normalize(results['patents'], record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
# sort_values returns a new frame, so assign the result for the sort to take effect;
# mergesort is stable, which keeps the original inventor order within each date
df_inventors = df_inventors.sort_values(by=['patent_date'], kind='mergesort')
df_inventors.head(3)

In [None]:
df.head(3)

In [None]:
# build a mapping from patent number to the row index of that patent in `df`
# `assign` returns a new frame, so `df` itself is left without the helper `idx` column
df_idx = df.assign(idx=df.index)
df_idx_1 = df_idx[['patent_number', 'idx', 'inventors']]
# index by patent number and keep only the `idx` column
df_idx_2 = df_idx_1.set_index('patent_number').drop(columns='inventors')
# {patent_number: idx}
df_pat_idx = df_idx_2['idx'].to_dict()
df_pat_idx

In [None]:
df_pat_idx = df_idx_2.T.to_dict('records')
for i in df_pat_idx:
    df_pat_idx = dict(i)
df_pat_idx

In [None]:
df_inv_test = json_normalize(results['patents'], record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inv_test.head(3)

In [None]:
df_idx_pat_inv_map = df[['patent_number', 'inventors']]
df_idx_pat_inv_map.head(3)

In [None]:
# TODO (Lee) - find out how to get list of patents_view_field names from API - I did it accidentally but need to replicate

In [None]:
df.patent_title_abstract[0]

In [None]:
df[:3]

In [None]:
df_inventors.set_index('inventor_id').T.to_dict('list')

In [None]:
# for k, v in pat2inv.items():
#     name_dict[new_key] = name_dict.pop(k)
#     time.sleep(4)

# pprint.pprint(name_dict)

# d = {'x':1, 'y':2, 'z':3}
# d1 = {'x':'a', 'y':'b', 'z':'c'}

# dict((d1[key], value) for (key, value) in d.items())
# {'a': 1, 'b': 2, 'c': 3}

In [None]:
patdf2inv = dict((df_pat_idx[key], value) for (key, value) in pat2inv.items())
patdf2inv

In [None]:
pat2inv = {k: list(v) for k,v in df_inventors.groupby("patent_number")["inventor_id"]}
pat2inv

In [None]:
# map each row index of `df` (as a string) to the patent number in that row
idx_pat_map = {
    str(row_idx): patent_number
    for row_idx, patent_number in df.patent_number.to_dict().items()
}
idx_pat_map

#### Construct author-topic model

In [None]:
# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus,
                         doc2author=patdf2inv,
                         id2word=id_to_word, 
                         num_topics=25)

In [None]:
# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

In [None]:
# retrieve the topic distribution for an author using use model[name] syntax
# each topic has a probability of being expressed given the particular author, but only the ones above a certain threshold are shown.

model_at['7788103-1']

In [None]:
# def show_author(name):
#     print('\n%s' % name)
#     print('Docs:', model.author2doc[name])
#     print('Topics:')
#     pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [None]:
# calculate per-word bound, which is a measure of the model's predictive performance (reconstruction error?)

# build doc2author dictionary
# `atmodel` is imported here because this cell uses it before any other cell imports it
from gensim.models import atmodel

# the author-topic model in this notebook is named `model_at`
doc2author = atmodel.construct_doc2author(model_at.corpus, model_at.author2doc)

In [None]:
from gensim.models import atmodel
# build the document -> authors mapping from the trained author-topic model (`model_at`)
doc2author = atmodel.construct_doc2author(model_at.corpus, model_at.author2doc)

In [None]:
# construct mapping from author IDs to document IDs.
#
# Parameters:  doc2author (dict of (int, list of str)) – Mapping of document id to authors.
# Returns:     Mapping of authors to document ids.
# Return type: dict of (str, list of int)
author2doc = gensim.models.atmodel.construct_author2doc(doc2author)
author2doc

In [None]:
# construct mapping from document IDs to author IDs
#
# Parameters:
#   corpus (iterable of list of (int, float)) – Corpus in BoW format.
#   author2doc (dict of (str, list of int)) – Mapping of authors to documents.
# Returns:
#   Document to Author mapping.
# Return type:
#   dict of (int, list of str)
#
# the author -> documents mapping is read from the trained author-topic model
gensim.models.atmodel.construct_doc2author(corpus, model_at.author2doc)