## Techniche - Topic Model

In [96]:
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize

import json
import requests
import re

from topic_model import tokenize_docs

import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
from pprint import pprint

In [2]:
np.random.seed(3)

In [3]:
# uncomment to download stop words from nltk
# nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### import data from PatentsView API

In [4]:
# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 50)

# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

# build list of possible fields that endpoint request will return
df = pd.read_excel("data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()

# build query
query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# request and results
resp = requests.get(endpoint_url, params=params)
results = resp.json()

#### structure data

In [5]:
# extract metadata from response
print("status code:", resp.status_code,';', "reason:", resp.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

# extract data from response
data = results['patents']
# data[0]
df = pd.DataFrame(data)
df.head(3)

# df.columns

df = df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id', 
         'patent_year', 
         'patent_type', 
         'patent_kind']]

df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

df.patent_firstnamed_assignee_id.value_counts()[:10]

# list of assignees with > 20 patents in df dataset
assignees_list = ['org_q9Bn28RHhpYrQjKvraAH', 'org_JZguWDMfFOBX2wBI9pnD', 'org_ID497r4tFbCIaMBjGAST', 
                  'org_rDyHZBYWMcBEtnkHt05L', 'org_p6ofWD2xFNSnyYkj6wpA', 'org_EilEWQcC6UiqHcSGx9mb',
                  'org_ccMMcUijAIsKIxUqMTyP', 'org_Vbc6obpnxWM42d0HjlXY', 'org_9D8x1qL3IRASp6GG7Glu',
                  'org_2wAdIFKssfcLHpZq0u4H', 'org_iwO2oOJ6VIBd9fAuP7G6', 'org_70D1lR89kQnFiCFdJ6s5',
                  'org_vojVnDkT9CamDETqbqJC']

df_20pats = df[df['patent_firstnamed_assignee_id'].isin(assignees_list) ]

df_20pats.sort_values(by=['patent_date'], inplace=True)

df_20pats[['patent_number','patent_title_abstract', 'patent_firstnamed_assignee_id']].head(1)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,patent_number,patent_title_abstract,patent_firstnamed_assignee_id
2479,4502128,Translation between natural languages An input...,org_70D1lR89kQnFiCFdJ6s5


#### Partition data into train and test sets

In [6]:
# partition data
train_20pats = df_20pats[:894]
len(train_20pats)

test_20pats = df_20pats[894:]
len(test_20pats)

# convert text column to list
data = train_20pats.patent_title_abstract.values.tolist()
len(data)

224

In [14]:
# tokenize documents

def tokenize_docs(docs):
    tokenized_docs = []
    for doc in docs:
        tokenized_docs.append(word_tokenize(doc))
    return tokenized_docs

tokenized_docs = tokenize_docs(data)

In [55]:
# clean punctuation

def clean_docs(tokenized_docs):
    clean_docs = []
    for doc in tokenized_docs:
       clean_docs.append([word for word in doc if word.isalpha()])  
    return clean_docs

data = clean_docs(tokenized_docs)

#### Clean stopwords

In [51]:
# clean stopwords

stop_words = stopwords.words('english')

def remove_stopwords(clean_docs):
    filtered_docs = []
    for doc in clean_docs:
       filtered_docs.append([word for word in doc if word not in stop_words])
    return filtered_docs

# remove stopwords
cleaned_data = remove_stopwords(data)

#### Create bigrams and trigrams

In [34]:
# train bigram phrases model
bigram_model = Phrases(data, min_count=1, threshold=1)

# train trigram phrases model
trigram_model = Phrases(bigram_model[data], threshold=100)  

In [81]:
# bigrams
def bigrams(docs):
    """create bigrams"""
    return [bigram_model[doc] for doc in docs]

In [71]:
# initiatialize bigram and trigram models
bigram_model = gensim.models.phrases.Phraser(bigram_model)
trigram_model = gensim.models.phrases.Phraser(trigram_model)

In [63]:
bigrams(cleaned_data)



[['Translation',
  'natural_languages',
  'An_input',
  'sentence_described',
  'first_natural',
  'language',
  'sectioned',
  'individual_words',
  'Parts',
  'speech',
  'corresponding',
  'individual_words',
  'retrieved',
  'lexical',
  'word',
  'storage',
  'whereby',
  'input_sentence',
  'described',
  'corresponding_string',
  'retrieved_A',
  'translation_pattern',
  'table',
  'previously',
  'prepared',
  'defines',
  'correspondence',
  'patterns',
  'strings',
  'first_natural',
  'language',
  'second_natural',
  'language',
  'corresponding',
  'output_sentence',
  'described',
  'By',
  'referring',
  'translation_pattern',
  'table',
  'string',
  'input_sentence',
  'transformed',
  'corresponding_string',
  'second_natural',
  'language',
  'The',
  'output_sentence',
  'described',
  'second_natural',
  'language',
  'generated',
  'sequencing',
  'target_words',
  'accordance',
  'sequential',
  'order',
  'parts',
  'speech',
  'string',
  'pattern',
  'obtained

In [82]:
def trigrams(docs):
    """create trigrams"""
    return [trigram_model[bigram_model[doc]] for doc in docs]

In [74]:
trigrams(cleaned_data)

[['Translation',
  'natural_languages',
  'An_input',
  'sentence_described',
  'first_natural',
  'language',
  'sectioned',
  'individual_words',
  'Parts',
  'speech',
  'corresponding',
  'individual_words',
  'retrieved',
  'lexical',
  'word',
  'storage',
  'whereby',
  'input_sentence',
  'described',
  'corresponding_string',
  'retrieved_A',
  'translation_pattern',
  'table',
  'previously',
  'prepared',
  'defines',
  'correspondence',
  'patterns',
  'strings',
  'first_natural',
  'language',
  'second_natural',
  'language',
  'corresponding',
  'output_sentence',
  'described',
  'By',
  'referring',
  'translation_pattern',
  'table',
  'string',
  'input_sentence',
  'transformed',
  'corresponding_string',
  'second_natural',
  'language',
  'The',
  'output_sentence',
  'described',
  'second_natural',
  'language',
  'generated',
  'sequencing',
  'target_words',
  'accordance',
  'sequential',
  'order',
  'parts',
  'speech',
  'string',
  'pattern',
  'obtained

#### Lemmatize

In [84]:
def lemmatize_docs(docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """lemmatize documents"""
    lemmatized_docs = []
    for doc in docs: 
        lemmatized_docs.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return lemmatized_docs

In [90]:
for doc in cleaned_data:
    for token in doc:
        token.lemma_

AttributeError: 'str' object has no attribute 'lemma_'

In [94]:
# make sure your downloaded the english model with "python -m spacy download en"

In [97]:
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

In [93]:
lemmas = lemmatizer(u"ducks", u"NOUN")

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [101]:
lemmatizer(['dog'])

TypeError: __call__() missing 1 required positional argument: 'univ_pos'

In [None]:
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

for token in doc:
    print(token, token.lemma, token.lemma_)

In [85]:
lemmatize_docs(cleaned_data)

AttributeError: 'str' object has no attribute 'pos_'