In [2]:
! pip install cdindex



In [15]:
import urllib.request
import cdindex
import zipfile
import requests, zipfile, io
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

## CDIndex on Latest PatentView Data

### Test

In [4]:
# dummy vertices for python module tests
pyvertices= [{"name": "0Z", "time": datetime(1992, 1, 1)},
				{"name": "1Z", "time": datetime(1992, 1, 1)},
				{"name": "2Z", "time": datetime(1993, 1, 1)},
				{"name": "3Z", "time": datetime(1993, 1, 1)},
				{"name": "4Z", "time": datetime(1995, 1, 1)},
				{"name": "5Z", "time": datetime(1997, 1, 1)},
				{"name": "6Z", "time": datetime(1998, 1, 1)},
				{"name": "7Z", "time": datetime(1999, 1, 1)},
				{"name": "8Z", "time": datetime(1999, 1, 1)},
				{"name": "9Z", "time": datetime(1998, 1, 1)},
				{"name": "10Z", "time": datetime(1997, 1, 1)}]

# dummy edges for python module tests
pyedges = [{"source": "4Z", "target": "2Z"},
				{"source": "4Z", "target": "0Z"},
				{"source": "4Z", "target": "1Z"},
				{"source": "4Z", "target": "3Z"},
				{"source": "5Z", "target": "2Z"},
				{"source": "6Z", "target": "2Z"},
				{"source": "6Z", "target": "4Z"},
				{"source": "7Z", "target": "4Z"},
				{"source": "8Z", "target": "4Z"},
				{"source": "9Z", "target": "4Z"},
				{"source": "9Z", "target": "1Z"},
				{"source": "9Z", "target": "3Z"},
				{"source": "10Z", "target": "4Z"}]


# create graph
graph = cdindex.Graph()

# add vertices
for vertex in pyvertices:
	graph.add_vertex(vertex["name"], cdindex.timestamp_from_datetime(vertex["time"]))

# add edges
for edge in pyedges:
	graph.add_edge(edge["source"], edge["target"])

In [5]:
print (graph.cdindex("4Z", int(timedelta(days=1825).total_seconds())))
print (graph.mcdindex("4Z", int(timedelta(days=1825).total_seconds())))

0.16666666666666666
0.8333333333333333


### Download Datasets

In [17]:
PATENT_ZIP_FILE = 'https://s3.amazonaws.com/data.patentsview.org/download/patent.tsv.zip'
CITATIONS_ZIP_FILE = 'https://s3.amazonaws.com/data.patentsview.org/download/uspatentcitation.tsv.zip'

def downloadAndExtract(url):
  r = requests.get(url)
  z = zipfile.ZipFile(io.BytesIO(r.content))
  z.extractall("HOME_DIR")

In [11]:
downloadAndExtract (PATENT_ZIP_FILE)

In [12]:
downloadAndExtract (CITATIONS_ZIP_FILE)

In [9]:
HOME_DIR = '//Users/Sanjay.K.Arora/data/web_of_innovation/'
PAT_FILE = HOME_DIR + 'patent.tsv'
CIT_FILE = HOME_DIR + 'uspatentcitation.tsv'

pat_df = pd.read_csv(PAT_FILE, sep='\t', dtype=str)
# cit_df = pd.read_csv(CIT_FILE, sep='\t', dtype=str)

In [10]:
# print dataframe contents
pat_df.head()

Unnamed: 0,id,type,number,country,date,abstract,title,kind,num_claims,filename,withdrawn
0,10000000,utility,10000000,US,2018-06-19,A frequency modulated (coherent) laser detecti...,Coherent LADAR using intra-pixel quadrature de...,B2,20,ipg180619.xml,0
1,10000001,utility,10000001,US,2018-06-19,The injection molding machine includes a fixed...,Injection molding machine and mold thickness c...,B2,12,ipg180619.xml,0
2,10000002,utility,10000002,US,2018-06-19,The present invention relates to: a method for...,Method for manufacturing polymer film and co-e...,B2,9,ipg180619.xml,0
3,10000003,utility,10000003,US,2018-06-19,The invention relates to a method for producin...,Method for producing a container from a thermo...,B2,18,ipg180619.xml,0
4,10000004,utility,10000004,US,2018-06-19,The present invention relates to provides a do...,"Process of obtaining a double-oriented film, c...",B2,6,ipg180619.xml,0


In [20]:
# describe date
res = [datetime.strptime(dt, "%Y-%m-%d").strftime("%d-%m-%Y") for dt in pat_df.date]

In [28]:
pd.Series(res).max()

'31-12-2019'

In [0]:
cit_df.tail()

Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence
117189467,zzzzwloy340w4xrnpngahtr6p,6289398,5412661,1995-05-01,Hao et al.,,US,,19
117189468,zzzzws5a59zyn06u1aotvayxj,9694946,3465908,1969-09-01,Acton,A,US,cited by applicant,16
117189469,zzzzycnb8175jpa3poze4fi4t,10835247,10064621,2018-09-01,Kerr et al.,B2,US,cited by applicant,4415
117189470,zzzzywzbtsvlits7lwa41ii63,8997525,3627504,1971-12-01,Johnson et al.,A,US,cited by applicant,46
117189471,zzzzzm8wuh09bvu6b069vlv8x,8789741,7336184,2008-02-01,Smith et al.,B2,US,cited by applicant,1173


In [0]:
# print patent ids max length
print (pat_df.id.str.len().max())

In [0]:
# print patent ids max length
print(cit_df.patent_id.str.len().max())
print(cit_df.citation_id.str.len().max())

In [0]:
# length of citation id looks long. let's take a look
lengths = cit_df.citation_id.str.len()
argmax = np.where(lengths == lengths.max())[0]
cit_df.iloc[argmax]

Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence
20014958,658pl0odzsfgfm4mche26h43l,4015716,341047419681100,,217 53,,US,,4
61766402,iz0o18syv90w8jnti139cqoqh,6674859,PCT/US 96/16348,1996-11-01,,B1,US,cited by other,1
75419156,n5zjqrtlu9p4jjfp2sdxasqly,6674859,PCT/US 93/08069,1991-06-01,,B1,US,cited by other,0
75861054,navmooilzku8ceoqp3p94w6ce,6826645,PCT/US 01/43638,2001-11-01,,A2,US,cited by other,17
106334846,wnydaf4cocs96nxhht0snw7s2,6572230,US-2001 0016695,2001-08-01,,B1,US,cited by other,36


### Filter
Only retain those patents that are in the focal web of innovation panel or those that cite or were cited by the paeel 

### Load Graph

In [0]:
# create graph
graph = cdindex.Graph()

In [0]:
# create vertices
pat_vertices =[{"name": x, "time": datetime.fromisoformat(y)} for x, y in zip(pat_df['id'], pat_df['date'])]

# test out
pat_vertices[0:5]

In [0]:
# add vertices to graph
for vertex in pat_vertices:
	graph.add_vertex(vertex["name"], cdindex.timestamp_from_datetime(vertex["time"]))

In [0]:
# create edges
cit_edges = [{"source": x, "target": y} for x, y in zip(cit_df['patent_id'], cit_df['citation_id'])]

In [0]:
list(filter(lambda pat_vertices: pat_vertices['name'] == 'D809697', pat_vertices))

In [0]:
'D809697' in list(graph.vertices())

In [0]:
graph_vertices = list(graph.vertices())
# add edges
for edge in tqdm(cit_edges):
  if (edge["source"] not in graph_vertices):
    print ('source not in vertex list')
    print (edge, flush=True)
    continue 
  if (edge["target"] not in graph_vertices):
    print ('target not in vertex list')
    print (edge, flush=True)
    continue 
  graph.add_edge(edge["source"], edge["target"])

In [0]:
# confirm that some patent ids are not in the main list of patents for some reason 
pat_df[pat_df.id=='1963218']

### Compute scores

In [0]:
cd_index_5 = {}
for patent_id in graph_vertices:
  cd_index_5[patent_id] = graph.cdindex(patent_id, int(datetime.timedelta(days=1825)))