<a href="https://colab.research.google.com/github/deep1003/deep1003/blob/master/Week%2014.%20Patent%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#1 Import the necessary packages and modules
import os
import math
import time
import pickle
from tqdm import tqdm
import pandas as pd
tqdm.pandas()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from pprint import pprint
import re
import nltk
import gensim
from gensim.test.utils import datapath
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from random import seed, sample
import random
from ksvd import ApproximateKSVD
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import warnings
warnings.filterwarnings('ignore')
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')



In [None]:
#2 Data loading function, to convert the string in each cell of the dataframe to a list
def string_to_ls(text):
    """Convert the text form of a token list (as stored in a CSV cell) back into a list.

    The CSV files written by this notebook store each token list as its
    ``str()`` representation, e.g. ``"['optical', 'fiber']"``.

    Args:
        text: The cell content. Anything that is not a string (for example the
            NaN pandas produces for an empty cell) is treated as "no tokens".

    Returns:
        A list of token strings; an empty list for ``"[]"``, empty strings and
        non-string input.
    """
    if not isinstance(text, str):
        return []
    # Drop the surrounding brackets of the list representation.
    inner = text.strip('][')
    if not inner:
        # Without this guard "[]" would become [''] (a single empty token).
        return []
    # Each element is still wrapped in the quotes added by repr().
    return [w.strip("'\"") for w in inner.split(', ')]


In [None]:
#3 Load the raw patent data
#
# The patent abstracts have to be downloaded beforehand from
# https://patentsview.org/download/data-download-tables
# The tab-separated file is then read into memory.
patent_df = pd.read_csv("patent.tsv", sep="\t", low_memory=False)
print(patent_df.shape)
patent_df.head()

In [None]:
#4 Create the function to clean the text content in the abstract

def string_tokenize(text):
    """Tokenize an abstract and keep only lowercased, purely alphabetic tokens.

    Args:
        text: The raw abstract. Non-string values (e.g. NaN for a missing
            abstract) yield an empty token list.

    Returns:
        A list of lowercase tokens with numbers, punctuation and other
        non-alphabetic tokens removed. Always a list, never ``None``.
    """
    if not isinstance(text, str):
        # Missing abstracts: return an empty list so every row holds a list.
        return []
    tokens = nltk.word_tokenize(text)
    # Lowercase, and drop special characters and numbers.
    return [w.lower() for w in tokens if w.isalpha()]


In [None]:
#5 Tokenize every abstract, keep only the columns used downstream, and write the result to disk
patent_df["abstract_token"] = patent_df["abstract"].progress_apply(string_tokenize)
patent_df = patent_df.loc[:, ['id', 'type', 'number', 'date', 'title', 'abstract_token']]
patent_df.to_csv("patents_processed_tokens.csv", index=False)
print(patent_df.shape)
patent_df.head()


In [None]:
#6 Reload the tokenized abstracts written in the previous step
patent_df = pd.read_csv("patents_processed_tokens.csv", low_memory=False)
# Each token list was stored as text in the CSV; turn every cell back into a list
patent_df["abstract_token"] = patent_df["abstract_token"].progress_apply(string_to_ls)


In [None]:
#7 Build the training corpus: one token list per patent abstract
training_patent = patent_df["abstract_token"].tolist()


In [None]:
#8 Train the phrase (bigram) model with the default settings min_count=5, threshold=10.
# ENGLISH_CONNECTOR_WORDS is passed as `connector_words` (the keyword was previously misspelled,
# which makes Phrases raise a TypeError for an unexpected keyword argument).
phrase_model = Phrases(training_patent, connector_words=ENGLISH_CONNECTOR_WORDS)


In [None]:
#9 Run every tokenized abstract through the phrase model to obtain its bigram version
patent_df["abstract_token_bigram"] = patent_df["abstract_token"].progress_apply(
    lambda tokens: phrase_model[tokens]
)

In [None]:
#10 Keep only the necessary columns and save the dataframe
patent_df = patent_df[['id', 'type', 'number', 'date', 'title', 'abstract_token_bigram']]
# Write the file that every later "Load the processed data" cell reads back in.
patent_df.to_csv("patents_processed_bigrams.csv", index=False)

In [None]:
#11 Reload the bigram-processed abstracts from disk
patent_df = pd.read_csv("patents_processed_bigrams.csv")
# The CSV holds each list as text; convert every cell back into a real list
patent_df["abstract_token_bigram"] = patent_df["abstract_token_bigram"].progress_apply(string_to_ls)

In [None]:
#12 Turn the bigram column into a plain list of token lists, the input format of the training algorithm
trained_bigram = patent_df["abstract_token_bigram"].tolist()

In [None]:
#13 Set the model parameters and train the model by iterating through the abstracts in the abstract list

# Model parameters
VectorSize = 300  # number of dimensions each word is embedded into
Window = 5        # number of words before and after the focal word used for training
Epochs = 10       # number of passes the algorithm makes over the corpus
MinCount = 1      # words with a frequency lower than this are ignored
Workers = 6       # value handed to `workers`; the author's machine has 8 processors, 6 are used here
# CBOW is the default algorithm; passing sg=1 below would select skip-gram instead
start = time.time()
w2v_model = Word2Vec(
    sentences=trained_bigram,
    vector_size=VectorSize,
    window=Window,
    min_count=MinCount,
    workers=Workers,
    epochs=Epochs,
)
print("Minutes it took to train the model: ", (time.time() - start) / 60)
w2v_model.save('patentAbstractsW2V_300_10_5.model')

In [None]:
#14 With the trained model stored on disk, later sessions only need to load it again.
model = Word2Vec.load("patentAbstractsW2V_300_10_5.model")

In [None]:
#15 First validation step: does the local conceptual structure match our intuition?
# For a few focal concepts we print the ten most similar words together with their cosine similarity
# to the focal word. The closer the cosine is to 1, the more similar the word is to the focal word;
# a cosine of 0 indicates that the word is orthogonal.
for focal_position, focal_word in enumerate(('light', 'chemical', 'car')):
    if focal_position > 0:
        print("")  # blank line between the result lists
    print(model.wv.most_similar(focal_word))
# The output suggests that the embedding model makes sense at the local conceptual level.

In [None]:
#16 Next we check whether the more global structure of the conceptual space is respected as well:
# words that we know to be distant should also be distant within the embedding space.
for compared_word in ('biological', 'drug', 'food', 'engineering', 'software', 'car'):
    print(model.wv.similarity('chemical', compared_word))

In [None]:
#17 To check local and global structure together, we build a 2-dimensional visualization of the most
# similar terms of several different kinds of words. If both local and global distances are respected,
# the most similar terms should cluster together, while the clusters themselves should lie further
# apart in proportion to how dissimilar they are.
keys = ['computer', 'telephone', 'car', 'boat', 'drug', 'chemical']
embedding_clusters = []
word_clusters = []
for focal_key in keys:
    # names of the six nearest neighbours of the focal word, then their vectors
    neighbour_words = [neighbour for neighbour, _ in model.wv.most_similar(focal_key, topn=6)]
    embedding_clusters.append([model.wv[neighbour] for neighbour in neighbour_words])
    word_clusters.append(neighbour_words)
perp = 9
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=perp, n_components=2, init='pca', n_iter=50000, random_state=32)
# t-SNE runs on the flattened (n * m, k) matrix; the result is folded back into one group per key
embeddings_en_2d = np.array(
    tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
).reshape(n, m, 2)
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D embeddings, one colour per cluster, with every point annotated.

    Args:
        title: Title shown above the plot.
        labels: One legend label per cluster.
        embedding_clusters: One 2-D array per cluster; column 0 is used as x,
            column 1 as y.
        word_clusters: For each cluster, the words annotating its points, in
            the same order as the rows of the matching embedding array.
        a: Alpha (transparency) of the scatter markers.
        filename: If given, the figure is also saved there as a PNG before
            being shown.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` (not `c=`): a single RGBA row passed as `c` is ambiguous with
        # value mapping and makes matplotlib emit a warning for every cluster.
        plt.scatter(x, y, color=color, alpha=a, label=label, s=400)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 8),
                         textcoords='offset points', ha='right', va='bottom', size=16)
    plt.legend(loc=4, prop={'size': 16})
    plt.title(title, fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.grid(False)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
# NOTE(review): the output path below is specific to the author's machine; adjust it before running elsewhere.
outfile = "G:\\My Drive\\Projects\\0 Word Embeddings OS\\Embeddings Appendix\\TSNE.png"
tsne_plot_similar_words(
    title='Word Similarities from Patent Abstracts',
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=0.7,
    filename=outfile,
)
# The plot shows that local and global structure are both respected: the most similar words of each
# focal word sit within the same cluster, and related clusters lie near each other (e.g., the computer
# cluster is near the telephone cluster, the car cluster is near the boat cluster, and the drug cluster
# is near the chemical cluster).

In [None]:
#18 The graph above was induced from the data and the model, and it agrees with our intuition. We may
# also want to check whether the model output maps onto previously identified structure. Here we use
# a graph in Leydesdorff et al. 2014, composed of the mapping of patent categories through citations.
# As figure 2 of the paper shows, the model's output is in agreement here as well.
for patent_category in ('nanotechnology', 'plastics', 'yarn', 'spraying', 'water',
                        'printing', 'lighting', 'vehicles', 'veterinary'):
    print((- (model.wv.similarity('cement', patent_category) + 1) * 50) + 100)
# Note that the -1 to 1 similarity continuum has been rescaled so that values range between 0 and 100,
# with larger values equaling greater distances. Doing so can facilitate the interpretation of
# regression coefficients when our theoretical construct of interest is distance instead of similarity.


In [None]:
#19 Thus far, we have been validating our model against our intuition or other measures of structure.
# Yet, we also want to ensure that our model is capturing the objective regularities of the physical
# world. In the context of patents, we can check whether known distances in the physical world are
# also represented by the model by seeing whether the table of elements can be recreated
# (see figure XX in the paper).
for alkali_element in ('lithium', 'sodium', 'potassium', 'rubidium', 'caesium', 'francium'):
    print((- (model.wv.similarity('hydrogen', alkali_element) + 1) * 50) + 100)

In [None]:
#20 Another relevant set of knowledge we would want the embedding model of the patents space to represent is that having to do
# with social relations. To check this, we can see how certain concepts project onto relevant social dimensions.
# Here we will use gender as a relevant dimension to see whether concepts are correctly projecting onto the female/male
# continuum.
def normalize(vector):
    """Return *vector* divided by its norm as computed by ``np.linalg.norm``."""
    length = np.linalg.norm(vector)
    return vector / length
def dimension(model, positives, negatives):
    """Build a semantic axis: summed unit vectors of *positives* minus those of *negatives*.

    Each word is looked up in ``model.wv`` and normalized before summing.
    """
    positive_pole = sum([normalize(model.wv[term]) for term in positives])
    negative_pole = sum([normalize(model.wv[term]) for term in negatives])
    return positive_pole - negative_pole
# Gender axis: male-coded words form the positive pole, female-coded words the negative pole
Gender = dimension(
    model,
    ['man', 'him', 'he', 'male', 'men'],
    ['woman', 'her', 'she', 'female', 'women'],
)
# Concepts to be projected onto the gender axis
Concepts = [
    'dietitian', 'hygienist', 'lipstick', 'breastpump', 'tampon',
    'military', 'farming', 'police', 'hammer', 'fishing',
]
def makeDF(model, word_list):
    """Return a DataFrame with each word's cosine similarity to the global ``Gender`` axis.

    The frame is indexed by *word_list* and has a single column ``'gender'``.
    """
    projections = [
        sklearn.metrics.pairwise.cosine_similarity(
            model.wv[concept].reshape(1, -1), Gender.reshape(1, -1)
        )[0][0]
        for concept in word_list
    ]
    return pd.DataFrame({'gender': projections}, index=word_list)
# Project the concepts onto the gender axis and order them by their projection
df = makeDF(model, Concepts).sort_values(by=['gender'])
df



In [None]:
#21 We also know that embedding models embed sufficient relational information to solve analogical
# reasoning tasks. Therefore, if our model is well-trained, it should be able to do the same.
# Table 1 in the paper presents the output of the code below.
# Each entry is (positive words, negative words, rank of the printed result).
for analogy_positive, analogy_negative, analogy_rank in (
    (['tiny', 'large'], ['big'], 0),          # Synonym
    (['downward', 'liquid'], ['solid'], 0),   # Antonym
    (['car', 'animal'], ['cat'], 0),          # Type
    (['skate', 'airplane'], ['fly'], 1),      # Type (second-ranked result is printed)
    (['hammer', 'cut'], ['knife'], 0),        # Item / Purpose
    (['farmer', 'patient'], ['doctor'], 0),   # product / worker
    (['short', 'longest'], ['long'], 0),      # Distance
    (['spoon', 'pens'], ['pen'], 0),          # Count
):
    print(model.wv.most_similar_cosmul(positive=analogy_positive, negative=analogy_negative)[analogy_rank])


In [None]:
#22 Load the patent abstracts and the embedding model
# For the practical application examples only the first five abstracts are used
df = pd.read_csv("patents_processed_bigrams.csv").head()
df["abstract_token_bigram"] = df["abstract_token_bigram"].progress_apply(string_to_ls)
model = Word2Vec.load("patentAbstractsW2V_300_10_5.model")
df.head()

In [None]:
#23 Print every abstract (preceded by a blank line and its token count) so its content can be inspected
abstract_list = df['abstract_token_bigram'].tolist()
for abstract in abstract_list:
    print("", len(abstract), abstract, sep="\n")


In [None]:
#24 Measuring the conceptual breadth of sets of words
# The mean pairwise similarity is rescaled so that larger values mean greater breadth
l1 = ["chemistry", "biochemistry", "analytical_chemistry"]
l2 = ["chemistry", "oceanography", "computer"]
# similarities of the three word pairs (0,1), (0,2), (1,2) within the first set
sims1 = [model.wv.similarity(l1[a_idx], l1[b_idx]) for a_idx, b_idx in ((0, 1), (0, 2), (1, 2))]
sims1mean = (((sum(sims1) / float(len(sims1)) + 1) * -50) + 100)  # higher values equal broader distances
print(l1)
print(sims1mean)
print("")
# same three pairs for the second set
sims2 = [model.wv.similarity(l2[a_idx], l2[b_idx]) for a_idx, b_idx in ((0, 1), (0, 2), (1, 2))]
sims2mean = (((sum(sims2) / float(len(sims2)) + 1) * -50) + 100)  # higher values equal broader distances
print(l2)
print(sims2mean)



In [None]:
#25 This function samples 100 random word pairs from a document and returns their mean distance. This is useful in situations
# where there are many documents with many words, such that computing the similarity of all word pairs for all documents
# would become computationally expensive. When computation time is not a concern, researchers can instead simply
# compute the mean of the lower triangular matrix (below the diagonal) of the word-by-word cosine distances.
def conceptual_breadth(text, n_pairs=100):
    """Estimate the conceptual breadth of a document from random word pairs.

    Draws ``n_pairs`` random pairs of tokens (with replacement) from *text*,
    converts each pair's cosine similarity from the global ``model`` to a
    0-100 distance (larger means broader) and returns the mean distance.

    Args:
        text: Sequence of tokens of one document.
        n_pairs: Number of random pairs to sample. Defaults to 100.

    Returns:
        The mean distance as a float, or ``np.nan`` when *text* is empty or
        none of the sampled pairs could be scored.
    """
    numUnique = len(text)  # number of tokens in the abstract
    if numUnique == 0:
        return np.nan
    distances = []
    for _ in range(n_pairs):
        rand1 = text[random.randrange(numUnique)]
        rand2 = text[random.randrange(numUnique)]
        try:
            similarity = model.wv.similarity(rand1, rand2)
        except KeyError:
            # A token missing from the model's vocabulary: skip this pair.
            continue
        distances.append(((similarity + 1) * -50) + 100)  # higher values equal broader distances
    if not distances:
        return np.nan
    return float(sum(distances)) / float(len(distances))
# Score every abstract and store the result in a new column
df['abstract_breadth'] = df['abstract_token_bigram'].map(conceptual_breadth)
df.head()



In [None]:
#26 Load the patent abstracts and the embedding model, and keep a handle on the model's vocabulary
df = pd.read_csv("patents_processed_bigrams.csv")
df["abstract_token_bigram"] = df["abstract_token_bigram"].progress_apply(string_to_ls)
model = Word2Vec.load("patentAbstractsW2V_300_10_5.model")
model_vocab = model.wv.key_to_index
df.head()


In [None]:
#27 Function to get the centroid vector of an abstract text (average over all words/bigrams)
def get_centroid_vector(token_ls):
    """Average the embedding vectors of the tokens found in the vocabulary.

    Tokens absent from the global ``model_vocab`` are skipped. Returns
    ``np.nan`` when no token of *token_ls* is in the vocabulary.
    """
    known_vectors = [model.wv[tok] for tok in token_ls if tok in model_vocab]
    if len(known_vectors) > 0:
        return sum(known_vectors) / len(known_vectors)
    return np.nan
# A faster way to compute the cosine similarity of two vectors
def alt_cosine(x,y):
    """Return the cosine similarity of *x* and *y*: inner product over the product of their norms."""
    numerator = np.inner(x, y)
    denominator = np.sqrt(np.dot(x, x) * np.dot(y, y))
    return numerator / denominator


In [None]:
#28 Get the centroid vector for each abstract in table 2
# For our example, we will only calculate the centroid of the abstracts we want
# .copy() gives an independent frame, so the new column below is not assigned onto a slice of the full data
df = df.loc[df['id'].isin([10000000, 8406638, 10000142])].copy()
df['centroid_vector'] = df['abstract_token_bigram'].progress_apply(get_centroid_vector)
df.head()


In [None]:
#29 Compare the pairwise centroid similarities of the three patent abstracts in table 2
v1, v2, v3 = (
    df.loc[df['id'] == sample_id, 'centroid_vector'].iloc[0]
    for sample_id in (10000000, 8406638, 10000142)
)
for left_vector, right_vector in ((v1, v2), (v1, v3), (v3, v2)):
    print(alt_cosine(left_vector, right_vector))



In [None]:
#30 Reload the bigram-processed abstracts, keeping only the token column and the patent id (in that order)
df = pd.read_csv("patents_processed_bigrams.csv").loc[:, ["abstract_token_bigram", "id"]]
# The CSV holds each list as text; convert every cell back into a real list
df["abstract_token_bigram"] = df["abstract_token_bigram"].progress_apply(string_to_ls)


In [None]:
#31 Train the doc2vec embedding model
# Model parameters
VectorSize = 300 # the number of dimensions into which each word will be embedded.
Window = 5 # The number of words before and after the focal word used to train the embedding.
Epochs = 10 # The number of iterations through the corpus that the algorithm will perform to train the model.
MinCount = 1 # Tells the algorithm to ignore words with a frequency lower than this.
Workers = 6 # My machine has 8 processors, so I am assigning six of these to the task of training the model.
start = time.time()
# Each row is a (tokens, id) tuple because of the column order of df.
trained_bigram = list(df.itertuples(index=False, name=None))
# Tag every abstract with its patent id so its document vector can be looked up by id later.
docs = [TaggedDocument(tokens, [patent_id]) for tokens, patent_id in trained_bigram]
# MinCount and Workers are now actually passed on; before, they were declared above but the
# library defaults were silently used instead (matching the word2vec training call).
modeldv = Doc2Vec(docs, vector_size=VectorSize, window=Window, min_count=MinCount,
                  workers=Workers, epochs=Epochs)
print("Minutes it took to train the model: ", ((time.time() - start) / 60))
modeldv.save('patentAbstractsD2V_300_10_5.model')


In [None]:
#32 Load the doc2vec model that was saved to disk after training
modeldv = Doc2Vec.load("patentAbstractsD2V_300_10_5.model")


In [None]:
#33 Compare the similarities for the three patent abstracts in table 2
# `dv` is the gensim 4 name of the document-vector store; `docvecs` is the deprecated gensim 3 alias.
print(modeldv.dv.similarity(10000000, 8406638))
print(modeldv.dv.similarity(10000000, 10000142))
print(modeldv.dv.similarity(10000142, 8406638))


In [None]:
#34 Load the dataframe with the patent abstracts and load the embedding model
df = pd.read_csv("patents_processed_bigrams.csv")
# For our example, we will only calculate the centroid of the abstracts we want
# (.copy() so that the column assignments below act on an independent frame, not on a slice)
df = df.loc[df['id'].isin([10000000, 8406638, 10000142])].copy()
df["abstract_token_bigram"] = df["abstract_token_bigram"].apply(string_to_ls)
model = Word2Vec.load('patentAbstractsW2V_300_10_5.model')
model_vocab = model.wv.key_to_index
# The freshly loaded frame has no 'centroid_vector' column, yet the archetype comparison further
# down reads it; recompute the centroids here with get_centroid_vector.
df['centroid_vector'] = df['abstract_token_bigram'].apply(get_centroid_vector)
df.head()


In [None]:
#35 Use the centroid vector function in #27 to calculate the centroid vector for the time and geometry archetypes
# The word list is called `time_terms` rather than `time` so that it does not shadow the imported
# `time` module, which the training cells rely on (`time.time()`) when they are re-run.
time_terms = ['interval','time','period','preceding','time_interval','start','timing','immediately_preceding','nanosecond','seconds',
              'minutes','hours','synchronize','synchronized','instant','continuation','duration']
time_vector = get_centroid_vector(time_terms)
geometry = ['plane','ellipse','parabola','straight_line','bisector','arc_circle','tangent','hyperbolic','tangents','curvature',
            'angle','circular_arc','sagittal','axis_symmetry','meridian','ellipsoid','paraboloid','regular_polygon','dihedral',
            'intersection','geometry','geometric']
geometry_vector = get_centroid_vector(geometry)



In [None]:
#36 Measure the similarity between the time and geometry archetype vectors and our sample patents in table 2
# Centroid vectors of our patents
v1, v2, v3 = (
    df.loc[df['id'] == sample_id, 'centroid_vector'].iloc[0]
    for sample_id in (10000000, 8406638, 10000142)
)
# Compare the patents (10000000, 8406638 and 10000142) to the time archetype
for patent_centroid, patent_description in (
    (v1, "coherent light receiver"),
    (v2, "coherent light receiver"),
    (v3, "head and neck restraint"),
):
    print("Similarity between the time archetype and a " + patent_description + ": ",
          alt_cosine(patent_centroid, time_vector))
print("")
# Compare the same patents to the geometry archetype
for patent_centroid, patent_description in (
    (v1, "coherent light receiver"),
    (v2, "coherent light receiver"),
    (v3, "head and neck restraint"),
):
    print("Similarity between the geometry archetype and a " + patent_description + ": ",
          alt_cosine(patent_centroid, geometry_vector))



In [None]:
#37 Terms most associated with different brain regions. Note that these associations are only provided
# for the model trained on all patents. In practice, one would first train separate embedding models
# based on the cleavages of interest.
for region_position, brain_region in enumerate(('hippocampus', 'subthalamic_nucleus', 'nucleus_basalis')):
    if region_position > 0:
        print("")  # blank line between regions
    print(brain_region + ": ", model.wv.most_similar(brain_region, topn=10))
