## AI patent Dynamic Graph Generation

In [59]:
# Read global AI patents from local files (2002-2022)
import pandas as pd
import json
import time

# Containers filled by the loading loop below:
#   all_patents      - one dict per patent record
#   pubdate_dict     - 'YYYY-MM-DD' -> list of pubids published on that day
#   citation_triples - pubid -> list of entries from the 'citedby' column
#   cpc_triples      - pubid -> list of CPC codes (only patents that have any)
all_patents = []
pubdate_dict = {}
citation_triples = {}
cpc_triples = {}
raw_file_path = '/Users/zongchang/OneDrive/1-知识计算引擎-2030项目/10-数据集/Patent-AI/'
# Spreadsheet column header -> short key used by the rest of the notebook.
patent_columns = {
    '公开（公告）号': 'pubid',
    '被引证专利': 'citedby',
    'CPC': 'cpc',
    '公开（公告）日': 'pubdate',
    '摘要': 'abstract',
    '摘要（翻译）': 'abstract_trans',
    '标题': 'title',
}
for i in range(137):
    # Input workbooks are named 1.xls ... 137.xls.
    df = pd.read_excel(raw_file_path+str(i+1)+'.xls')
    # Select and rename in one expression instead of renaming a sliced frame in place.
    df = df[list(patent_columns)].rename(columns=patent_columns)
    # Round-trip through JSON: missing cells come back as None, which the
    # truthiness checks below rely on.
    df_json = df.to_json(orient ='records', force_ascii=False)
    patent_json = json.loads(df_json)
    for item in patent_json:
        pubid = item['pubid']
        # The serialized date is treated as epoch milliseconds (pandas' default
        # for datetime columns); everything downstream works in seconds.
        pubdate_seconds = item['pubdate']/1000
        # Multi-valued cells are '; '-separated; a missing cell becomes [].
        cited_by = item['citedby'].split('; ') if item['citedby'] else []
        cpc_codes = item['cpc'].split('; ') if item['cpc'] else []
        tmp = {
            'pubid': pubid,
            'pubdate': pubdate_seconds,
            'abstract': item['abstract'],
            'abstract_trans': item['abstract_trans'],
            'title': item['title'],
            'citedby': cited_by,
            'cpc': cpc_codes,
        }
        all_patents.append(tmp)
        # process publication date
        # Use the value in *seconds* (the raw millisecond value produced
        # far-future dates) and gmtime, so the date key does not depend on
        # the timezone of the machine running the notebook.
        public_date = time.strftime("%Y-%m-%d", time.gmtime(pubdate_seconds))
        pubdate_dict.setdefault(public_date, []).append(pubid)
        # process citation relations (copies keep the lookup tables independent
        # of the lists stored on the patent record)
        if cited_by:
            citation_triples[pubid] = list(cited_by)
        # process cpc relations
        if cpc_codes:
            cpc_triples[pubid] = list(cpc_codes)
    print('processed file ' + str(i+1) + '.xls')

processed file 1.xls
processed file 2.xls
processed file 3.xls
processed file 4.xls
processed file 5.xls
processed file 6.xls
processed file 7.xls
processed file 8.xls
processed file 9.xls
processed file 10.xls
processed file 11.xls
processed file 12.xls
processed file 13.xls
processed file 14.xls
processed file 15.xls
processed file 16.xls
processed file 17.xls
processed file 18.xls
processed file 19.xls
processed file 20.xls
processed file 21.xls
processed file 22.xls
processed file 23.xls
processed file 24.xls
processed file 25.xls
processed file 26.xls
processed file 27.xls
processed file 28.xls
processed file 29.xls
processed file 30.xls
processed file 31.xls
processed file 32.xls
processed file 33.xls
processed file 34.xls
processed file 35.xls
processed file 36.xls
processed file 37.xls
processed file 38.xls
processed file 39.xls
processed file 40.xls
processed file 41.xls
processed file 42.xls
processed file 43.xls
processed file 44.xls
processed file 45.xls
processed file 46.x

In [67]:
# Keep (pubid, pubdate) for every patent published at or after the cutoff.
# 1640966400 is 2022-01-01 00:00:00 at UTC+8 (2021-12-31 16:00:00 UTC).
patent_target = [
    (record['pubid'], record['pubdate'])
    for record in all_patents
    if record['pubdate'] >= 1640966400
]

In [69]:
# Number of (pubid, pubdate) tuples that passed the publication-date cutoff.
len(patent_target)

100000

In [70]:
# generate all cpc co-occurrence edges after time t
import itertools

# edges_dict maps (cpc_a, cpc_b, pubdate) -> number of times that pair was seen
# with that timestamp. A pair is treated as undirected: if the reversed key
# already exists, its counter is bumped instead of adding a second key.
edges_dict = {}
for patent_date in patent_target:
    pubid, pubdate = patent_date
    # cpc_triples only has entries for patents that list at least one CPC code,
    # so fall back to an empty list instead of raising KeyError.
    # only consider top 3 cpc code to limit the edge size
    top_codes = cpc_triples.get(pubid, [])[:3]
    for pair in itertools.combinations(top_codes, 2):
        tmp1 = (pair[0], pair[1], pubdate)
        tmp2 = (pair[1], pair[0], pubdate)
        if tmp1 in edges_dict:
            edges_dict[tmp1] += 1
        elif tmp2 in edges_dict:
            edges_dict[tmp2] += 1
        else:
            edges_dict[tmp1] = 1
# Flatten to (cpc_a, cpc_b, weight, pubdate) tuples.
edges = []
for k, v in edges_dict.items():
    edges.append((k[0], k[1], v, k[2]))

In [71]:
# Number of distinct (cpc, cpc, pubdate) co-occurrence edges built above.
len(edges)

249252

In [72]:
# Assign consecutive integer ids (starting at 1) to CPC codes, in the order
# they are first encountered while scanning both endpoints of every edge.
cpc_ids = {}
node_id = 1
for edge in edges:
    for code in edge[:2]:
        if code in cpc_ids:
            continue
        cpc_ids[code] = node_id
        node_id += 1

In [52]:
# Dump the edge list, one edge per line, as four space-separated fields:
# node id of the first code, node id of the second code, weight, timestamp.
with open('../dataset/raw/ai-patent', 'w') as f:
    for first, second, weight, stamp in edges:
        fields = (cpc_ids[first], cpc_ids[second], weight, stamp)
        f.write(' '.join(map(str, fields)) + '\n')

In [53]:
# Number of distinct CPC codes that received a node id.
len(cpc_ids)

18498

In [93]:
# Reverse lookup: print the CPC code whose assigned node id is 1052.
for code, assigned_id in cpc_ids.items():
    if assigned_id != 1052:
        continue
    print(code)

G06F16/3329


## NIH-NCI Dynamic Graph Generation

In [94]:
# Read NCI projects from local files (2002-2021)
import pandas as pd
import json
import time

raw_file_path = '/Users/zongchang/OneDrive/1-知识计算引擎-2030项目/10-数据集/Funding-NIH/'
# all_projects collects one dict per project row across all yearly workbooks.
all_projects = []
for i in range(20):
    # One workbook per year: NIH-NCI-2002.xlsx ... NIH-NCI-2021.xlsx.
    df = pd.read_excel(raw_file_path+'NIH-NCI-'+str(2002+i)+'.xlsx')
    df = df[['Project Terms', 'Project Title', 'Award Notice Date', 'Application ID', 'Organization Name', 'Organization Country', 'Total Cost', 'Project Abstract']]
    # Round-trip through JSON so missing cells come back as None, which the
    # truthiness checks below rely on.
    df_json = df.to_json(orient ='records', force_ascii=False)
    project_json = json.loads(df_json)
    for item in project_json:
        tmp = {'title': item['Project Title'], 'app_id': item['Application ID'], 'org_name': item['Organization Name'], 'country': item['Organization Country'], 'cost': item['Total Cost'], 'abstract':item['Project Abstract']}
        # 'keywords' is only set when the row has project terms.
        if item['Project Terms']:
            keywords = [k.strip().lower() for k in item['Project Terms'].split(';')]
            # Drop empty fragments (e.g. from a trailing or doubled ';') so an
            # empty string never becomes a keyword node.
            tmp['keywords'] = [k for k in keywords if k]
        # 'notice_date' is only set when the row has an award notice date.
        if item['Award Notice Date']:
            timeArray = time.strptime(item['Award Notice Date'], "%m/%d/%Y")
            # NOTE: time.mktime interprets the parsed date as local time, so the
            # resulting epoch seconds depend on the machine's timezone.
            tmp['notice_date'] = int(time.mktime(timeArray))
        all_projects.append(tmp)
    # Report the number of files finished so far (i is zero-based).
    print('processed '+str(i+1)+' files')
        

processed 0 files
processed 1 files
processed 2 files
processed 3 files
processed 4 files
processed 5 files
processed 6 files
processed 7 files
processed 8 files
processed 9 files
processed 10 files
processed 11 files
processed 12 files
processed 13 files
processed 14 files
processed 15 files
processed 16 files
processed 17 files
processed 18 files
processed 19 files


In [95]:
# Inspect the last ten parsed project records.
all_projects[-10:]

[{'title': 'Operations support for the NCI at Frederick Campus within the fence at Fort Detrick',
  'app_id': 10329812,
  'org_name': 'NATIONAL CANCER INSTITUTE',
  'country': 'UNITED STATES',
  'cost': 6660581.0,
  'abstract': 'This IAA funds costs to operate the NCI at Frederick Campus within the fence at Fort Detrick.',
  'keywords': ['funding',
   'maintenance',
   'poaceae',
   'training',
   'water',
   'cost',
   'operation']},
 {'title': 'NCI Early Detection Research Network: Biomarker Reference and Resource Center (BRRC)',
  'app_id': 10506456,
  'org_name': 'NATIONAL CANCER INSTITUTE',
  'country': 'UNITED STATES',
  'cost': 800000.0,
  'abstract': 'The purpose of this IAA is to obtain diagnostic services from Pacific Northwest National Laboratory (PNNL), Department of Energy (DOE) to support the Early Detection Research Network’s (EDRN) efforts to prioritize, verify and validate candidate biomarkers for cancer early detection, diagnosis and early prognosis. The laboratory, h

In [96]:
# Build [keywords, notice_date] entries for projects that have both fields and
# whose notice date is at or after the cutoff; at most the first five keywords
# of a project are kept.
# 1577808000 is 2020-01-01 00:00:00 at UTC+8 (2019-12-31 16:00:00 UTC).
keyword_date = []
for project in all_projects:
    if not ('keywords' in project and 'notice_date' in project):
        continue
    awarded_at = project['notice_date']
    if not awarded_at >= 1577808000:
        continue
    terms = project['keywords']
    kept_terms = terms[:5] if len(terms) > 5 else terms
    keyword_date.append([kept_terms, awarded_at])

In [97]:
# Number of projects kept by the keyword/notice-date filter.
len(keyword_date)

21542

In [98]:
# Assign consecutive integer ids (starting at 1) to keywords in first-seen order.
keyword_dict = {}
idx = 1
for entry in keyword_date:
    for term in entry[0]:
        if term in keyword_dict:
            continue
        keyword_dict[term] = idx
        idx += 1

In [99]:
# Number of distinct keywords that received an id.
len(keyword_dict)

2339

In [100]:
# generate all keyword co-occurrence edges after time t
import itertools

# edges_dict maps (keyword_id_a, keyword_id_b, notice_date) -> co-occurrence
# count. Pairs are undirected: when the reversed key is already present its
# counter is incremented rather than creating a second key.
edges_dict = {}
for entry in keyword_date:
    notice_date = entry[1]
    for first, second in itertools.combinations(entry[0], 2):
        forward = (keyword_dict[first], keyword_dict[second], notice_date)
        backward = (forward[1], forward[0], notice_date)
        if forward in edges_dict:
            edges_dict[forward] += 1
        elif backward in edges_dict:
            edges_dict[backward] += 1
        else:
            edges_dict[forward] = 1
# Flatten to (id_a, id_b, weight, notice_date) tuples.
edges = [
    (key[0], key[1], count, key[2])
    for key, count in edges_dict.items()
]

In [101]:
# Number of distinct (keyword id, keyword id, notice date) co-occurrence edges.
len(edges)

185717

In [126]:
# Dump the keyword edge list, one edge per line, as four space-separated
# fields: first keyword id, second keyword id, weight, timestamp.
with open('../dataset/raw/nci-project', 'w') as f:
    for first_id, second_id, weight, stamp in edges:
        fields = (first_id, second_id, weight, stamp)
        f.write(' '.join(map(str, fields)) + '\n')

In [124]:
# Reverse lookup: print the keyword whose assigned id is 120.
for term, assigned_id in keyword_dict.items():
    if assigned_id != 120:
        continue
    print(term)

award


In [125]:
# Reverse lookup: print the keyword whose assigned id is 1177.
for term, assigned_id in keyword_dict.items():
    if assigned_id != 1177:
        continue
    print(term)

breast epithelial cells


In [56]:
# Counting nodes and edges number for each dataset
# Both containers are dicts used as sets (value is always 1); they stay dicts
# because later cells call .keys() on them.
nodes = {}
edges = {}
with open('../dataset/raw/uci', 'r') as f:
    # The first two lines of the file are skipped (presumably header lines).
    for line in f.readlines()[2:]:
        # split() with no argument also strips the trailing newline, so the last
        # field compares equal whether or not the final line ends with '\n'.
        items = line.split()
        if not items:
            # Ignore blank lines instead of failing with IndexError.
            continue
        nodes.setdefault(items[0], 1)
        nodes.setdefault(items[1], 1)
        # An edge is identified by the first, second and fourth columns.
        edges.setdefault((items[0], items[1], items[3]), 1)

In [57]:
# Number of distinct node ids seen in the first two columns.
len(nodes)

1899

In [58]:
# Number of distinct (first column, second column, fourth column) triples read from the file.
len(edges)

59798