## Techniche - Topic Model

In [1]:
import pandas as pd
import numpy as np

import gensim
import gensim.corpora as corpora
from gensim.corpora import mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re

from topic_model import tokenize_docs

import os, re
from smart_open import smart_open

import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
from pprint import pprint



In [2]:
np.random.seed(3)

In [3]:
# uncomment to download stop words from nltk
# nltk.download('stopwords')
# nltk.download('punkt')

### Import Data

#### Import data from PatentsView API

In [132]:
# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 50)

# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

# build list of possible fields that endpoint request will return
df = pd.read_excel("data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()

# build query
query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# request and results
resp = requests.get(endpoint_url, params=params)
results = resp.json()

#### structure data

In [172]:
# extract metadata from response
print("status code:", resp.status_code,';', "reason:", resp.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

# extract data from response
data_resp = results['patents']
# data[0]
raw_df = pd.DataFrame(data_resp)
raw_df.head(3)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


Unnamed: 0,IPCs,application_citations,applications,assignees,cited_patents,citedby_patents,cpcs,detail_desc_length,examiners,foreign_priority,gov_interests,inventors,lawyers,nbers,patent_abstract,patent_average_processing_time,patent_date,patent_firstnamed_assignee_city,patent_firstnamed_assignee_country,patent_firstnamed_assignee_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_state,patent_firstnamed_inventor_city,patent_firstnamed_inventor_country,patent_firstnamed_inventor_id,patent_firstnamed_inventor_latitude,patent_firstnamed_inventor_location_id,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_state,patent_kind,patent_num_cited_by_us_patents,patent_num_claims,patent_num_combined_citations,patent_num_foreign_citations,patent_num_us_application_citations,patent_num_us_patent_citations,patent_number,patent_processing_time,patent_title,patent_type,patent_year,pct_data,rawinventors,uspcs,wipos
0,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020077823', 'ap...","[{'app_country': 'US', 'app_date': '2013-07-26...","[{'assignee_city': 'Burlington', 'assignee_cou...",[{'cited_patent_category': 'cited by examiner'...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",11570,"[{'examiner_first_name': 'Michael N', 'examine...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Newton', 'inventor_country...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...",Designing a natural language understanding (NL...,,2019-03-12,Burlington,US,org_ID497r4tFbCIaMBjGAST,42.5047,42.5047|-71.1961,-71.1961,MA,Newton,US,7788103-1,42.3369,42.3369|-71.2097,-71.2097,MA,B2,0,19,31,0,26,5,10229106,2055,Initializing a workspace for building a natura...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Jeffrey N.', 'raw...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
1,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020138265', 'ap...","[{'app_country': 'US', 'app_date': '2017-09-11...","[{'assignee_city': 'Mountain View', 'assignee_...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",28118,"[{'examiner_first_name': 'Shreyans A', 'examin...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Adliswil', 'inventor_count...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...","Methods, systems, and apparatus, including com...",,2019-03-12,Mountain View,US,org_p6ofWD2xFNSnyYkj6wpA,37.3861,37.3861|-122.0828,-122.083,CA,Adliswil,CH,8352247-1,47.3119,47.3119|8.5287,8.5287,,B1,0,20,15,0,7,8,10229109,547,Allowing spelling of arbitrary words,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Evgeny A.', 'rawi...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
2,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2001/20010029455', 'ap...","[{'app_country': 'US', 'app_date': '2016-09-28...","[{'assignee_city': 'Seattle', 'assignee_countr...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",119654,"[{'examiner_first_name': 'Jialong', 'examiner_...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Seattle', 'inventor_countr...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...",A content management system (CMS) and a transl...,,2019-03-12,Seattle,US,org_Vbc6obpnxWM42d0HjlXY,47.6064,47.6064|-122.3308,-122.331,WA,Seattle,US,9177341-1,47.6064,47.6064|-122.3308,-122.331,WA,B1,0,20,74,0,48,26,10229113,895,Leveraging content dimensions during the trans...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Thibault Pierre',...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."


#### Subset dataframe

In [165]:
# subset dataframe - comment/uncomment to include fields
df = raw_df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id',
         'patent_firstnamed_assignee_location_id',
         'patent_firstnamed_assignee_latitude',
         'patent_firstnamed_assignee_longitude',
         'patent_firstnamed_assignee_city',
         'patent_firstnamed_assignee_state',
         'patent_firstnamed_assignee_country', 
         'patent_firstnamed_inventor_id',
         'patent_firstnamed_inventor_location_id',
         'patent_firstnamed_inventor_latitude',
         'patent_firstnamed_inventor_longitude',
         'patent_firstnamed_inventor_city',
         'patent_firstnamed_inventor_state',
         'patent_firstnamed_inventor_country',
         'patent_year', 
         'patent_type', 
         'patent_kind',
         'inventors'
#          'patent_processing_time', 
#          'patent_num_us_application_citations', 
#          'patent_num_us_patent_citations', 
#          'patent_num_foreign_citations', 
#          'patent_num_combined_citations', 
#          'patent_num_claims', 
#          'patent_num_cited_by_us_patents',
#          'detail_desc_length'
            ]]

In [166]:
df.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_city,patent_firstnamed_assignee_state,patent_firstnamed_assignee_country,patent_firstnamed_inventor_id,patent_firstnamed_inventor_location_id,patent_firstnamed_inventor_latitude,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_city,patent_firstnamed_inventor_state,patent_firstnamed_inventor_country,patent_year,patent_type,patent_kind,inventors
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,42.5047|-71.1961,42.5047,-71.1961,Burlington,MA,US,7788103-1,42.3369|-71.2097,42.3369,-71.2097,Newton,MA,US,2019,utility,B2,"[{'inventor_city': 'Newton', 'inventor_country..."
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,37.3861|-122.0828,37.3861,-122.083,Mountain View,CA,US,8352247-1,47.3119|8.5287,47.3119,8.5287,Adliswil,,CH,2019,utility,B1,"[{'inventor_city': 'Adliswil', 'inventor_count..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,47.6064|-122.3308,47.6064,-122.331,Seattle,WA,US,9177341-1,47.6064|-122.3308,47.6064,-122.331,Seattle,WA,US,2019,utility,B1,"[{'inventor_city': 'Seattle', 'inventor_countr..."


In [173]:
# create new column that combines the patent title and the patent abstract columns into a single string
df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

df.patent_firstnamed_assignee_id.value_counts()[:10]

org_q9Bn28RHhpYrQjKvraAH    497
org_JZguWDMfFOBX2wBI9pnD    129
org_ID497r4tFbCIaMBjGAST     88
org_rDyHZBYWMcBEtnkHt05L     80
org_p6ofWD2xFNSnyYkj6wpA     57
org_EilEWQcC6UiqHcSGx9mb     56
org_ccMMcUijAIsKIxUqMTyP     49
org_Vbc6obpnxWM42d0HjlXY     41
org_9D8x1qL3IRASp6GG7Glu     29
org_2wAdIFKssfcLHpZq0u4H     26
Name: patent_firstnamed_assignee_id, dtype: int64

In [174]:
#### TODO (Lee) Partition data into train and test sets

### Pre-process text data

#### Tokenize

In [12]:
# tokenize documents

def tokenize_docs(docs):
    tokenized_docs = []
    for doc in docs:
        tokenized_docs.append(word_tokenize(doc))
    return tokenized_docs

tokenized_docs = tokenize_docs(data)

#### Clean punctuation

In [13]:
# clean punctuation
def clean_docs(tokenized_docs):
    clean_docs = []
    for doc in tokenized_docs:
       clean_docs.append([word for word in doc if word.isalpha()])  
    return clean_docs

In [15]:
data = clean_docs(tokenized_docs)
data[0]

['Translation',
 'between',
 'natural',
 'languages',
 'An',
 'input',
 'sentence',
 'described',
 'by',
 'a',
 'first',
 'natural',
 'language',
 'is',
 'sectioned',
 'into',
 'individual',
 'words',
 'Parts',
 'of',
 'speech',
 'corresponding',
 'to',
 'the',
 'individual',
 'words',
 'are',
 'retrieved',
 'from',
 'a',
 'lexical',
 'word',
 'storage',
 'whereby',
 'the',
 'input',
 'sentence',
 'is',
 'described',
 'by',
 'a',
 'corresponding',
 'string',
 'of',
 'the',
 'as',
 'retrieved',
 'A',
 'translation',
 'pattern',
 'table',
 'is',
 'previously',
 'prepared',
 'which',
 'defines',
 'correspondence',
 'between',
 'patterns',
 'of',
 'strings',
 'of',
 'for',
 'the',
 'first',
 'natural',
 'language',
 'and',
 'those',
 'for',
 'a',
 'second',
 'natural',
 'language',
 'by',
 'which',
 'corresponding',
 'output',
 'sentence',
 'is',
 'to',
 'be',
 'described',
 'By',
 'referring',
 'to',
 'the',
 'translation',
 'pattern',
 'table',
 'the',
 'string',
 'of',
 'the',
 'of',
 '

#### Convert to lowercase

In [17]:
# convert to lowercase
def lower_words(docs):
    lowered_words = []
    for doc in docs:
        lowered_words.append([word.lower() for word in doc])
    return lowered_words

lower_words(data)[0]

['translation',
 'between',
 'natural',
 'languages',
 'an',
 'input',
 'sentence',
 'described',
 'by',
 'a',
 'first',
 'natural',
 'language',
 'is',
 'sectioned',
 'into',
 'individual',
 'words',
 'parts',
 'of',
 'speech',
 'corresponding',
 'to',
 'the',
 'individual',
 'words',
 'are',
 'retrieved',
 'from',
 'a',
 'lexical',
 'word',
 'storage',
 'whereby',
 'the',
 'input',
 'sentence',
 'is',
 'described',
 'by',
 'a',
 'corresponding',
 'string',
 'of',
 'the',
 'as',
 'retrieved',
 'a',
 'translation',
 'pattern',
 'table',
 'is',
 'previously',
 'prepared',
 'which',
 'defines',
 'correspondence',
 'between',
 'patterns',
 'of',
 'strings',
 'of',
 'for',
 'the',
 'first',
 'natural',
 'language',
 'and',
 'those',
 'for',
 'a',
 'second',
 'natural',
 'language',
 'by',
 'which',
 'corresponding',
 'output',
 'sentence',
 'is',
 'to',
 'be',
 'described',
 'by',
 'referring',
 'to',
 'the',
 'translation',
 'pattern',
 'table',
 'the',
 'string',
 'of',
 'the',
 'of',
 '

#### Clean stopwords

In [19]:
# clean stopwords

stop_words = stopwords.words('english')

def remove_stopwords(clean_docs):
    filtered_docs = []
    for doc in clean_docs:
       filtered_docs.append([word for word in doc if word not in stop_words])
    return filtered_docs

# remove stopwords
cleaned_data = remove_stopwords(data)
cleaned_data[0]
# TODO (Lee) - resolve un-lowered stopwords "A" and "An", 'By', 'The'

['Translation',
 'natural',
 'languages',
 'An',
 'input',
 'sentence',
 'described',
 'first',
 'natural',
 'language',
 'sectioned',
 'individual',
 'words',
 'Parts',
 'speech',
 'corresponding',
 'individual',
 'words',
 'retrieved',
 'lexical',
 'word',
 'storage',
 'whereby',
 'input',
 'sentence',
 'described',
 'corresponding',
 'string',
 'retrieved',
 'A',
 'translation',
 'pattern',
 'table',
 'previously',
 'prepared',
 'defines',
 'correspondence',
 'patterns',
 'strings',
 'first',
 'natural',
 'language',
 'second',
 'natural',
 'language',
 'corresponding',
 'output',
 'sentence',
 'described',
 'By',
 'referring',
 'translation',
 'pattern',
 'table',
 'string',
 'input',
 'sentence',
 'transformed',
 'corresponding',
 'string',
 'second',
 'natural',
 'language',
 'The',
 'output',
 'sentence',
 'described',
 'second',
 'natural',
 'language',
 'generated',
 'sequencing',
 'target',
 'words',
 'accordance',
 'sequential',
 'order',
 'parts',
 'speech',
 'string',
 'pa

#### Create bigrams and trigrams

In [20]:
# train bigram phrases model
bigram_model = Phrases(data, min_count=1, threshold=1)

# train trigram phrases model
trigram_model = Phrases(bigram_model[data], threshold=100)  



In [21]:
# bigrams
def bigrams(docs):
    """create bigrams"""
    return [bigram_model[doc] for doc in docs]

In [22]:
# initialize bigram and trigram models
bigram_model = gensim.models.phrases.Phraser(bigram_model)
trigram_model = gensim.models.phrases.Phraser(trigram_model)

In [26]:
bigrams(cleaned_data)[0]

['Translation',
 'natural_languages',
 'An_input',
 'sentence_described',
 'first_natural',
 'language',
 'sectioned',
 'individual_words',
 'Parts',
 'speech',
 'corresponding',
 'individual_words',
 'retrieved',
 'lexical',
 'word',
 'storage',
 'whereby',
 'input_sentence',
 'described',
 'corresponding_string',
 'retrieved_A',
 'translation_pattern',
 'table',
 'previously',
 'prepared',
 'defines',
 'correspondence',
 'patterns',
 'strings',
 'first_natural',
 'language',
 'second_natural',
 'language',
 'corresponding',
 'output_sentence',
 'described',
 'By',
 'referring',
 'translation_pattern',
 'table',
 'string',
 'input_sentence',
 'transformed',
 'corresponding_string',
 'second_natural',
 'language',
 'The',
 'output_sentence',
 'described',
 'second_natural',
 'language',
 'generated',
 'sequencing',
 'target_words',
 'accordance',
 'sequential',
 'order',
 'parts',
 'speech',
 'string',
 'pattern',
 'obtained',
 'transformation']

In [24]:
def trigrams(docs):
    """create trigrams"""
    return [trigram_model[bigram_model[doc]] for doc in docs]

In [25]:
trigrams(cleaned_data)

[['Translation',
  'natural_languages',
  'An_input',
  'sentence_described',
  'first_natural',
  'language',
  'sectioned',
  'individual_words',
  'Parts',
  'speech',
  'corresponding',
  'individual_words',
  'retrieved',
  'lexical',
  'word',
  'storage',
  'whereby',
  'input_sentence',
  'described',
  'corresponding_string',
  'retrieved_A',
  'translation_pattern',
  'table',
  'previously',
  'prepared',
  'defines',
  'correspondence',
  'patterns',
  'strings',
  'first_natural',
  'language',
  'second_natural',
  'language',
  'corresponding',
  'output_sentence',
  'described',
  'By',
  'referring',
  'translation_pattern',
  'table',
  'string',
  'input_sentence',
  'transformed',
  'corresponding_string',
  'second_natural',
  'language',
  'The',
  'output_sentence',
  'described',
  'second_natural',
  'language',
  'generated',
  'sequencing',
  'target_words',
  'accordance',
  'sequential',
  'order',
  'parts',
  'speech',
  'string',
  'pattern',
  'obtained

#### Stem and Lemmatize

In [27]:
def lemmatize_docs(docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """lemmatize documents"""
    lemmatized_docs = []
    for doc in docs: 
        lemmatized_docs.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return lemmatized_docs

In [28]:
# TODO (Lee)

lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# for doc in cleaned_data:
#     for token in doc:
#         token.lemma_

# uncomment to use
# download english model with "python -m spacy download en"

# for token in doc:
#     print(token, token.lemma, token.lemma_)

# TODO (Lee) - lemmatize_docs(cleaned_data)

#### Create corpus and dictionary

In [159]:
 # build dictionary
id_word = corpora.Dictionary(cleaned_data)

# build corpus
texts = cleaned_data

# apply term document frequency
# converts documents in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_word.doc2bow(text) for text in texts]

In [34]:
# view formatted corpus (term-doc-frequency)
[[(id_word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('A', 1),
  ('An', 1),
  ('By', 1),
  ('Parts', 1),
  ('The', 1),
  ('Translation', 1),
  ('accordance', 1),
  ('correspondence', 1),
  ('corresponding', 4),
  ('defines', 1),
  ('described', 4),
  ('first', 2),
  ('generated', 1),
  ('individual', 2),
  ('input', 3),
  ('language', 5),
  ('languages', 1),
  ('lexical', 1),
  ('natural', 6),
  ('obtained', 1),
  ('order', 1),
  ('output', 2),
  ('parts', 1),
  ('pattern', 3),
  ('patterns', 1),
  ('prepared', 1),
  ('previously', 1),
  ('referring', 1),
  ('retrieved', 2),
  ('second', 3),
  ('sectioned', 1),
  ('sentence', 5),
  ('sequencing', 1),
  ('sequential', 1),
  ('speech', 2),
  ('storage', 1),
  ('string', 4),
  ('strings', 1),
  ('table', 2),
  ('target', 1),
  ('transformation', 1),
  ('transformed', 1),
  ('translation', 2),
  ('whereby', 1),
  ('word', 1),
  ('words', 3)]]

### Model 1

In [36]:
# TODO (Lee) - deprecation warnings
# construct LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [39]:
# print keywords in 10 topics
pprint(model_lda.print_topics())

[(7,
  '0.128*"social" + 0.088*"network" + 0.043*"parse" + 0.039*"concept" + '
  '0.035*"nodes" + 0.031*"The" + 0.022*"kernel" + 0.020*"hashtags" + '
  '0.017*"node" + 0.016*"outputs"'),
 (5,
  '0.089*"entity" + 0.087*"token" + 0.030*"quality" + 0.029*"disambiguation" + '
  '0.028*"Disclosed" + 0.027*"herein" + 0.019*"tasks" + 0.018*"state" + '
  '0.017*"signal" + 0.015*"manually"'),
 (12,
  '0.079*"NLU" + 0.043*"model" + 0.034*"used" + 0.031*"For" + 0.023*"likely" + '
  '0.022*"understanding" + 0.021*"scanned" + 0.019*"grammatical" + '
  '0.017*"using" + 0.017*"accurate"'),
 (23,
  '0.073*"document" + 0.030*"item" + 0.029*"form" + 0.029*"type" + '
  '0.023*"list" + 0.023*"statements" + 0.021*"items" + 0.021*"The" + '
  '0.019*"defined" + 0.018*"analysis"'),
 (0,
  '0.093*"message" + 0.072*"component" + 0.047*"messages" + 0.029*"string" + '
  '0.025*"metadata" + 0.022*"mode" + 0.021*"statement" + 0.017*"processors" + '
  '0.017*"generate" + 0.016*"character"'),
 (19,
  '0.080*"word" + 

In [45]:
# print top 10 keywords that comprise topic with index of 0
pprint(model_lda.print_topic(24))
# the most import keywords, and the respective weight, that form topic 0 are

('0.093*"question" + 0.060*"answer" + 0.041*"The" + 0.032*"user" + '
 '0.030*"approach" + 0.028*"system" + 0.027*"questions" + 0.023*"includes" + '
 '0.023*"based" + 0.020*"potential"')


In [46]:
# print top 10 keywords that comprise topic with index of 1
pprint(model_lda.print_topic(1))

('0.108*"content" + 0.059*"natural" + 0.058*"language" + 0.043*"model" + '
 '0.037*"using" + 0.026*"set" + 0.026*"processing" + 0.023*"The" + '
 '0.023*"information" + 0.021*"features"')


In [47]:
# TODO (Lee) - infer topic from keywords?

### Evaluate model

In [50]:
# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus)
perplexity

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

-6.800299187946384

In [48]:
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=data, dictionary=id_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1

0.38964237141847535

In [51]:
# calculate coherence metric or each of the n topicss
coherence_1 = coherence.get_coherence_per_topic()
coherence_1

[0.4106267971162076,
 0.31722446067803695,
 0.5478579066803337,
 0.4132573830140969,
 0.32067875597825285,
 0.5294790508063428,
 0.49700275912588465,
 0.4066938157686664,
 0.26671360789845366,
 0.25329437453386394,
 0.5161361938345662,
 0.5091895950376919,
 0.35429888480919225,
 0.28219213238278434,
 0.47898701912650177,
 0.36814798745852895,
 0.22946963311075047,
 0.3229132041847006,
 0.29459466613983126,
 0.40453055915491465,
 0.363053418841915,
 0.24193727209161947,
 0.6471154823395759,
 0.45047726778912206,
 0.31518705756004844]

In [54]:
# explore topics
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_word)
viz_topics_1
# TODO (Lee) - salient vs relevant terms in pyLDA ?

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 2-  Mallet model

In [55]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update this path
path_mallet = 'data/mallet-2.0.8/bin/mallet'

In [56]:
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=25, id2word=id_word)

In [57]:
# topics
pprint(model_2.show_topics(formatted=False))

[(16,
  [('natural', 0.06486254295532647),
   ('language', 0.05970790378006873),
   ('selected', 0.05197594501718213),
   ('module', 0.04252577319587629),
   ('portion', 0.03865979381443299),
   ('statistical', 0.03178694158075601),
   ('configured', 0.028350515463917526),
   ('engine', 0.027061855670103094),
   ('entry', 0.02620274914089347),
   ('classification', 0.01718213058419244)]),
 (10,
  [('speech', 0.12660485021398002),
   ('model', 0.09807417974322397),
   ('recognition', 0.08345221112696148),
   ('models', 0.043509272467902996),
   ('NLU', 0.03673323823109843),
   ('utterance', 0.03566333808844508),
   ('understanding', 0.029243937232524966),
   ('training', 0.026034236804564907),
   ('processing', 0.01818830242510699),
   ('ASR', 0.014978601997146932)]),
 (8,
  [('user', 0.0837173579109063),
   ('response', 0.051075268817204304),
   ('product', 0.039938556067588324),
   ('service', 0.029185867895545316),
   ('call', 0.024961597542242704),
   ('agent', 0.021121351766513058)

In [58]:
# calculate coherence metric
coherence_model_2 = CoherenceModel(model=model_2, texts=data, dictionary=id_word, coherence='c_v')
coherence_model_2 = coherence_model_2.get_coherence()
coherence_model_2

0.3398511614777415

In [59]:
# TODO (Lee)
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=num_topics, id2word=id2word)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

# model_list, coherence_values = compute_coherence_values(dictionary=id_word, corpus=corpus, texts=data, start=2, limit=40, step=6)

### Model 3 - Author topic model

In [145]:
# construct inventor-to-doc mapping as df from nested inventors column in json api response
df_inventors = json_normalize(results['patents'], record_path=['inventors'], meta=['patent_number'])
df_inventors = df_inventors[['inventor_id', 'patent_number']]
df_inventors.head(3)

Unnamed: 0,inventor_id,patent_number
0,7788103-1,10229106
1,8352247-1,10229109
2,8515750-2,10229109


In [151]:
len(df_inventors['inventor_id'].unique())

4039

In [78]:
# #df_inventors.set_index('inventor_id')
# inv_to_doc = dict(zip(df_inventors.inventor_id, df_inventors.patent_number))
# inv_to_doc

{'7788103-1': '8903712',
 '8352247-1': '9514743',
 '8515750-2': '9971758',
 '8849675-1': '9971758',
 '9514743-1': '9514743',
 '9971758-5': '9971758',
 '10223356-1': '10223356',
 '10223356-2': '10223356',
 '10223356-3': '10223356',
 '10223356-5': '10223356',
 '9177341-1': '10223356',
 '9959271-8': '10223356',
 '8281187-1': '10095736',
 '9442919-1': '10002124',
 '9442919-2': '9996526',
 '9442919-3': '10002124',
 '9442919-5': '9996526',
 '8560468-5': '9589060',
 '9037568-4': '9589060',
 '9292545-3': '9589060',
 '9589060-4': '9589060',
 '9589060-5': '9589060',
 '9589060-6': '9589060',
 '10229187-1': '10229189',
 '10229187-2': '10229189',
 '6718519-1': '10229189',
 '7945556-2': '10229189',
 '9280340-1': '10146858',
 '7398209-3': '7398209',
 '8493600-1': '10229678',
 '6804574-3': '9613020',
 '9319838-3': '9613020',
 '9606701-1': '9613020',
 '10229680-1': '10229680',
 '10229680-2': '10229680',
 '5712675-3': '10229687',
 '7475015-5': '9311298',
 '7233733-3': '10225227',
 '7558642-3': '10225227

In [122]:
df_inventors.set_index('inventor_id').T.to_dict('list')

  """Entry point for launching an IPython kernel.


{'7788103-1': ['8903712'],
 '8352247-1': ['9514743'],
 '8515750-2': ['9971758'],
 '8849675-1': ['9971758'],
 '9514743-1': ['9514743'],
 '9971758-5': ['9971758'],
 '10223356-1': ['10223356'],
 '10223356-2': ['10223356'],
 '10223356-3': ['10223356'],
 '10223356-5': ['10223356'],
 '9177341-1': ['10223356'],
 '9959271-8': ['10223356'],
 '8281187-1': ['10095736'],
 '9442919-1': ['10002124'],
 '9442919-2': ['9996526'],
 '9442919-3': ['10002124'],
 '9442919-5': ['9996526'],
 '8560468-5': ['9589060'],
 '9037568-4': ['9589060'],
 '9292545-3': ['9589060'],
 '9589060-4': ['9589060'],
 '9589060-5': ['9589060'],
 '9589060-6': ['9589060'],
 '10229187-1': ['10229189'],
 '10229187-2': ['10229189'],
 '6718519-1': ['10229189'],
 '7945556-2': ['10229189'],
 '9280340-1': ['10146858'],
 '7398209-3': ['7398209'],
 '8493600-1': ['10229678'],
 '6804574-3': ['9613020'],
 '9319838-3': ['9613020'],
 '9606701-1': ['9613020'],
 '10229680-1': ['10229680'],
 '10229680-2': ['10229680'],
 '5712675-3': ['10229687'],
 '

In [153]:
inv2doc = (df_inventors.groupby('inventor_id')
              .apply(lambda x: list(map(list, zip(x['patent_number']))))
              .to_dict())

In [157]:
inv2doc

{'10002607-1': [['10002607']],
 '10002607-3': [['10002607']],
 '10002607-4': [['10002607']],
 '10002607-5': [['10002607']],
 '10002607-6': [['10002607']],
 '10002607-7': [['10002607']],
 '10002607-8': [['10002607']],
 '10003683-1': [['10003683']],
 '10003683-2': [['10003683']],
 '10003683-3': [['10003683']],
 '10003683-4': [['10003683']],
 '10003683-5': [['10003683']],
 '10003683-6': [['10003683']],
 '10003683-7': [['10003683']],
 '10003683-8': [['10003683']],
 '10003683-9': [['10003683']],
 '10009462-1': [['10009462']],
 '10009462-2': [['10009462']],
 '10013266-1': [['10013266']],
 '10013266-2': [['10013266']],
 '10013416-1': [['10013416']],
 '10013416-10': [['10013416']],
 '10013416-11': [['10013416']],
 '10013416-12': [['10013416']],
 '10013416-2': [['10013416']],
 '10013416-3': [['10013416']],
 '10013416-4': [['10013416']],
 '10013416-6': [['10013416']],
 '10013416-7': [['10013416']],
 '10013416-8': [['10013416']],
 '10013416-9': [['10013416']],
 '10013450-2': [['10176163'], ['1016

In [156]:
# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus,
                         author2doc=inv2doc,
                         id2word=id_word, 
                         num_topics=10)

TypeError: can only concatenate list (not "int") to list

In [None]:
# gensim example
# mmcorpus.IndexedCorpus
# from gensim.models import AuthorTopicModel
# from gensim.corpora import mmcorpus
# from gensim.test.utils import common_dictionary, datapath, temporary_file

# gensim author2doc format example:
author2doc = {
     'john': [0, 1, 2, 3, 4, 5, 6],
     'jane': [2, 3, 4, 5, 6, 7, 8],
     'jack': [0, 2, 4, 6, 8]
 }
author2doc

# corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
# corpus[0]

In [None]:
# construct vectors for authors
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

#### Load data

In [None]:
#### Construct a mapping from authors to document IDs

In [None]:
filenames = [data_dir + 'idx/a' + yr + '.txt' for yr in yrs]  # Using the years defined in previous cell.