In [86]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from ast import literal_eval
import networkx as nx
from tqdm import tqdm
from gensim.models import Word2Vec

# -1.0 Build Graph

In [88]:
import pandas as pd
import numpy as np

# Load the patent table and the citation table (one CITING -> CITED pair per row).
patent_df = pd.read_csv("data/apat63_99.txt", dtype={"PATENT": np.int64})
patent_df = patent_df.fillna("")


cite_df = pd.read_csv("data/cite75_99.txt")
cite_df = cite_df.fillna("")

patent_num_to_ids = {}   # patent number -> consecutive node id (0, 1, 2, ...)
edge_rows = set()        # never filled in this cell; kept so the name stays defined
id_num = 0               # next node id to hand out
cap = 300000             # stop growing once this many node ids have been assigned
adjacency_dict = {}      # citing node id -> list of cited node ids

# Seed the graph with a single citation; everything else grows outwards from it.
seed_connection = cite_df.iloc[1000000]
citing = seed_connection["CITING"]
cited = seed_connection["CITED"]
patent_num_to_ids[citing] = id_num
id_num += 1
patent_num_to_ids[cited] = id_num
id_num += 1

# Only the two endpoint columns are needed; itertuples streams plain tuples
# and avoids building one Series per row as iterrows does.
edge_columns = cite_df[["CITING", "CITED"]]

# Growth phase: sweep the citation table repeatedly, pulling in any patent that
# shares a citation with an already-known patent, until `cap` ids are assigned.
while id_num < cap:
    ids_before_pass = id_num
    for citing, cited in edge_columns.itertuples(index=False, name=None):
        if id_num >= cap:
            break
        citing_known = citing in patent_num_to_ids
        cited_known = cited in patent_num_to_ids
        if citing_known == cited_known:
            # Both unknown: not connected to the seeded graph (yet).
            # Both known: no new patent to register.
            continue
        newcomer = cited if citing_known else citing
        patent_num_to_ids[newcomer] = id_num
        id_num += 1
    if id_num == ids_before_pass:
        # A full sweep added nothing: the seeded component is exhausted.
        # Without this check the loop would spin forever below `cap`.
        break

# Edge phase: a single sweep records every citation whose two endpoints were
# both selected. Edges are collected only here so no edge is appended twice
# (every edge seen while growing also satisfies this condition).
for citing, cited in edge_columns.itertuples(index=False, name=None):
    if citing not in patent_num_to_ids or cited not in patent_num_to_ids:
        # Only take connections that are fully connected to the seeded graph
        continue
    adjacency_dict.setdefault(patent_num_to_ids[citing], []).append(patent_num_to_ids[cited])

# NOTE: "data/patent_edges.csv", which later cells read, is not written by
# this cell; `edge_rows` above is never populated.

print("Filling in ID numbers...")
# Patents that did not make it into the graph get the sentinel id -1.
patent_df['id'] = patent_df['PATENT'].apply(lambda x: patent_num_to_ids.get(x, -1))

# Written under data/ because the data-prep cell reads
# "data/patent_data_with_ids.csv".
patent_df.to_csv("data/patent_data_with_ids.csv", index=False)

Filling in ID numbers...


# 0. Compute Embeddings (GraphVite)

In [5]:
# import graphvite as gv
# import graphvite.application as gap

In [9]:
# Read the saved edge list and show its first five rows.
edge_list = pd.read_csv(filepath_or_buffer="data/patent_edges.csv")
print(edge_list.head(n=5))

   node_1  node_2
0    2783    2620
1  288671  128075
2   58918   61033
3  200952  188200
4   58582   36429


In [10]:
# Convert the integer node ids to strings; the random-walk and Word2Vec cells
# address nodes by string label (e.g. '2').
# astype(str) replaces the deprecated DataFrame.applymap(str).
edge_list = edge_list.astype(str)
# create undirected graph from the edgelist
G = nx.from_pandas_edgelist(edge_list, source='node_1', target='node_2', create_using=nx.Graph())
# check the basic properties of the graph
# (nx.info() was removed in NetworkX 3.0; str(G) gives the node/edge summary)
str(G)

'Graph with 300000 nodes and 795677 edges'

In [67]:
# Restore the pickled embeddings and the name -> node-id lookup from disk.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by a trusted source.
with open("data/patent_embeddings.pkl", 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

with open('data/names_to_node_ids.pkl', 'rb') as mapping_file:
    name_to_id_map = dict(pickle.load(mapping_file))

# Reverse lookup (node id -> name); if two names share an id, the later one wins.
id_to_name_map = {node_id: name for name, node_id in name_to_id_map.items()}

In [11]:
def get_random_walk(node, walk_length):
    """Generate one self-avoiding random walk over the global graph ``G``.

    Starting at ``node``, repeatedly step to a uniformly chosen neighbor that
    has not been visited yet in this walk.

    Parameters
    ----------
    node : hashable
        Start node; must be a node of ``G``.
    walk_length : int
        Maximum number of nodes in the walk, start node included.

    Returns
    -------
    list
        The visited nodes in order, beginning with ``node``. The list is
        shorter than ``walk_length`` when the walk reaches a node whose
        neighbors have all been visited already.
    """
    walk = [node]
    visited = {node}

    for _ in range(walk_length - 1):
        # Filter in the graph's own neighbor order instead of round-tripping
        # through a set: the candidate order is then stable, so a seeded
        # `random` reproduces the same walk from run to run.
        candidates = [nbr for nbr in G.neighbors(node) if nbr not in visited]
        if not candidates:
            # Dead end: every neighbor is already part of the walk.
            break
        node = random.choice(candidates)
        walk.append(node)
        visited.add(node)

    return walk

In [12]:
# Sanity check: one walk of at most 10 nodes starting from node '2'.
get_random_walk(node='2', walk_length=10)

['2', '9', '6', '0', '1', '6981', '1310', '1314', '837', '72734']

In [17]:
# Sample a fixed number of walks (each at most 10 nodes long) from every node
# of the graph; tqdm reports progress per start node.
all_nodes = list(G.nodes())
number_of_random_walks = 5
random_walks = [
    get_random_walk(start_node, 10)
    for start_node in tqdm(all_nodes)
    for _ in range(number_of_random_walks)
]

100%|██████████| 300000/300000 [00:50<00:00, 5892.90it/s]


In [20]:
# Configure a skip-gram Word2Vec model (sg=1) that learns with negative
# sampling (hs=0, 10 negative samples) and a decaying learning rate.
word2vec_settings = {
    "window": 4,
    "sg": 1,
    "hs": 0,
    "negative": 10,
    "alpha": 0.03,
    "min_alpha": 0.0007,
}
model = Word2Vec(**word2vec_settings)

# Treat every random walk as a sentence and every node label as a word.
model.build_vocab(random_walks, progress_per=2)

In [21]:
# Run 20 training epochs over the sampled walks.
model.train(
    random_walks,
    epochs=20,
    total_examples=model.corpus_count,
    report_delay=1,
)

(263576440, 263576440)

In [25]:
# Order the vectors by integer node id so that embeddings[i] is the vector of
# node i. The data-prep cell looks vectors up as embeddings[id]; listing them
# in graph insertion order (list(G.nodes)) would pair ids with the wrong rows.
# Assumes node labels are the ids 0..N-1 (as strings) — TODO confirm.
numbers = sorted(G.nodes, key=int)
embeddings = list(model.wv[numbers])

In [68]:
# Label columns available in the patent table:
# year = GYEAR
# category = CAT
# country = COUNTRY
# assignee type code = ASSCODE
# The assignee type code is a one-character code with the following meaning:

#      1    = unassigned
#      2    = assigned to a U.S. nongovernment organization
#      3    = assigned to a non-U.S., nongovernment organization
#      4    = assigned to a U.S. individual
#      5    = assigned to a non-U.S. individual
#      6    = assigned to the U.S. (Federal) Government
#      7    = assigned to a non-U.S. government
#      8,9  = assigned to a U.S. non-Federal Government agency (do not appear in the dataset)

# 0.1 Node2Vec Embeddings

In [70]:
import networkx as nx
import pandas as pd
from karateclub import DeepWalk, Node2Vec
import pickle

# Load the edge list with integer node ids and report how many edges it holds.
df = pd.read_csv("data/patent_edges.csv", dtype={'node_1': int, 'node_2': int})
print(df.shape[0])

# Directed graph: one edge node_1 -> node_2 per row; report the node count.
G = nx.from_pandas_edgelist(df, source='node_1', target='node_2', create_using=nx.DiGraph())
print(G.number_of_nodes())

# Fit Node2Vec with its default hyperparameters and pull out the embedding.
# (The recorded run of this cell was interrupted during fitting.)
print("Fitting Node2Vec model...")
model = Node2Vec()
model.fit(G)
embeddings = model.get_embedding()

795677
300000
Fitting Node2Vec model...


KeyboardInterrupt: 

# 0.2 DeepWalk Embeddings

In [89]:
import networkx as nx
import pandas as pd
from karateclub import DeepWalk, Node2Vec
import pickle

# Build the directed graph straight from the in-memory adjacency lists
# (integer node ids) rather than re-reading "data/patent_edges.csv".
G = nx.DiGraph(adjacency_dict)
print(G.is_directed())
print(G.number_of_nodes())

# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'tuple' object is not callable"; the cause is not visible here.
print("Fitting Deepwalk model...")
deepwalk_settings = {"walk_length": 10, "dimensions": 32, "window_size": 4}
model = DeepWalk(**deepwalk_settings)
model.fit(G)
embeddings = model.get_embedding()

True
300000
Fitting Deepwalk model...


TypeError: 'tuple' object is not callable

# 1.0 Data Prep

In [90]:
# Load the patent table, keep only patents that received a graph id
# (id != -1) and order them by that id; then report how many remain.
df = (
    pd.read_csv("data/patent_data_with_ids.csv", index_col=False)
    .loc[lambda frame: frame['id'] != -1]
    .sort_values(by=['id'])
)
print(len(df))

299610


In [91]:
# Build the feature and label lists column-wise; this yields the same lists as
# a row-by-row iterrows() loop without creating one Series per patent.

# One embedding per patent, looked up by the patent's graph id, in df row order.
embedding_list = [embeddings[node_id] for node_id in df['id'].tolist()]

category_list = df['CAT'].tolist()
country_list = df['COUNTRY'].tolist()
# Binary target: 0 for 'US', 1 for every other country value.
binary_country_list = [0 if country == 'US' else 1 for country in country_list]
assignee_list = df['ASSCODE'].tolist()
year_list = df['GYEAR'].tolist()

In [92]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the category labels: a single-column array goes in, a dense
# (n_samples, n_categories) indicator matrix comes out.
label_array = np.array(category_list).reshape(-1, 1)
enc = OneHotEncoder()
encoded = enc.fit_transform(label_array).toarray()
print(encoded.shape)
print(encoded)

(299610, 6)
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]


In [93]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [94]:
# Hold out 20% of the patents for testing (fixed seed for a repeatable split).
# The target is the binary country label; pass `encoded` instead of
# `binary_country_list` to predict the one-hot category.
x_train, x_test, y_train, y_test = train_test_split(
    embedding_list,
    binary_country_list,
    test_size=0.2,
    random_state=21,
)

In [95]:
# Standardize the features using statistics learned from the training split
# only, then apply the same transform to both splits.
# Re-running this cell rescales the already-scaled arrays.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [96]:
# Preview the first ten standardized training rows.
print(x_train[0:10])

[[-1.58568351e-02 -2.70832748e+00 -3.16555554e-01  3.16182426e-01
  -2.70953569e+00  5.96758685e-02  4.93076326e-01  1.24301492e+00
   2.39017681e-01 -1.67994700e+00  7.27793832e-01  7.68271630e-01
   3.65124288e-01 -1.11328929e+00 -1.09892861e+00 -6.65144417e-01
   3.24715300e-02  7.76537506e-01 -2.09135628e-02  6.45926129e-01
   1.89796660e-01  1.46688595e-01 -3.67241159e-01 -9.68897761e-01
  -3.54491382e-01  6.11716515e-01 -1.19033051e-01 -3.60793962e-01
   7.12082817e-01 -7.96215519e-01  2.69770611e-01  4.80311114e-01]
 [ 5.82952577e-02  4.93403373e-01  7.57725372e-02  1.13790132e-01
  -4.75421946e-01  4.42620530e-01 -2.11507146e-01  8.35724587e-01
  -4.73989128e-01 -1.03572686e-01 -2.48149861e-01  2.01248737e-01
  -1.11791134e-02 -3.88569913e-01  1.91792081e-01 -2.88217486e-01
  -1.78046860e-01 -1.00213386e-01 -3.33667303e-02 -3.69168139e-01
   5.40124102e-01 -2.43781920e-01  2.38695905e-01  1.88946582e-01
  -1.79509103e-01  4.97302039e-01 -1.52231223e-01 -3.88140550e-01
   4.9854

In [97]:
# Small random forest (10 trees, entropy split criterion, fixed seed) trained
# on the standardized embeddings.
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=42,
)
classifier.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
# Predict the label of every held-out sample.
y_pred = classifier.predict(X=x_test)

In [83]:
# Count the held-out predictions that match their true label.
num_correct = sum(1 for predicted, actual in zip(y_pred, y_test) if predicted == actual)

# First line: accuracy on the test split.
# Second line: share of label 1 (non-US) over the full label list.
print(num_correct / len(y_pred))
print(sum(binary_country_list)/len(binary_country_list))

0.5338773739194286
0.43444477821167515


# 1.1 Logistic Regression

In [98]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the standardized embeddings; the very high
# iteration limit leaves the solver room to converge.
log_model = LogisticRegression(max_iter=1000000).fit(x_train, y_train)

# Mean accuracy on the training split and on the held-out split.
train_score = log_model.score(x_train, y_train)
test_score = log_model.score(x_test, y_test)
print(train_score, test_score)

0.5658981676179032 0.5641834384700111
