## Network visualization

This project builds a collaboration (co-patenting) network among patent holders with more than 500 patents, using the OECD HAN database and the European Patent Office's PATSTAT database.

Link to the GitHub repository:
https://github.com/UOA-MEDSCI-736/dvasques83-medsci736

- Obs: include invalid input statements


In [1]:
#import necessary modules
import networkx as nx
from networkx.algorithms import bipartite
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import xlwt
import csv
from itertools import izip_longest
import collections
from collections import defaultdict
import psycopg2 as pg
import time
import pandas as pd
import json
import operator

In [2]:
#function to get the data and return array with data
def query_han_patents():
    """Read final201602.pat_name_ctry_ipc_year and return it as a 2-D array.

    All six columns are fetched with a single SELECT so that the values in
    one output row are guaranteed to come from the same table row.  Running
    one query per column and stacking the results side by side only works if
    PostgreSQL happens to return the rows in the same order every time, which
    it does not promise without an ORDER BY.

    Returns:
        numpy array with one row per table row and six columns, in the order
        patent_number, han_id, clean_name, ctry_code, ipc, app_year.  An
        empty table yields an array of shape (0, 6).

    Side effects:
        Opens (and always closes) a connection to the local PostgreSQL
        server and prints the elapsed time.
    """
    t0 = time.time()

    #CONNECT TO POSTGRESQL
    connection_string = "host= 'localhost' dbname='postgres' user='postgres' password=''"
    conn = pg.connect(connection_string)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "SELECT patent_number, han_id, clean_name, ctry_code, ipc, app_year "
                "FROM final201602.pat_name_ctry_ipc_year"
            )
            rows = cur.fetchall()
        finally:
            cur.close()
        conn.commit()
    finally:
        #release the connection even when the query fails
        conn.close()

    #convert column by column so each column is turned into an array on its
    #own before stacking; six empty columns still stack to shape (0, 6)
    columns = [[row[position] for row in rows] for position in range(6)]
    appln_pathan_array = np.column_stack(columns)

    print('Running time to query: ' + str(time.time() - t0))

    return appln_pathan_array

In [3]:
def unique_array():
    """Return the queried patent records with exact duplicate rows removed.

    The rows come from query_han_patents(); they are loaded into a pandas
    DataFrame, de-duplicated with drop_duplicates(), and stacked back into a
    2-D numpy array whose columns keep the order
    patent number, han_id, name, country, ipc code, year.
    """
    column_labels = ['patent number','han_id','name','country','ipc code','year']

    raw_records = query_han_patents()
    frame = pd.DataFrame(data=raw_records, columns=column_labels)
    deduplicated = frame.drop_duplicates()

    #rebuild the array one column at a time, in the original column order
    data = np.column_stack(
        [deduplicated[label].values.tolist() for label in column_labels]
    )

    return data

In [1]:
def create_network(appln_pathan_array):
    """Build the bipartite patent/applicant graph from the record array.

    Each record contributes a node for record[0] (bipartite=0, with its
    year taken from record[5]), a node for record[1] (bipartite=1, with name
    and ctry taken from record[2] and record[3]) and an edge joining the two.
    Re-adding a node that already exists just overwrites those attributes.

    NOTE: a later cell of this notebook defines another function with the
    same name; whichever cell was executed last is the one in effect.

    Returns:
        the networkx Graph holding both node sets.
    """
    started = time.time()

    #CREATE NETWORK
    graph = nx.Graph()
    for record in appln_pathan_array:
        patent_id = record[0]
        applicant_id = record[1]
        graph.add_node(patent_id, bipartite=0, year=record[5])
        graph.add_node(
            applicant_id,
            bipartite=1,
            name=str(record[2]),
            ctry=str(record[3]),
        )
        graph.add_edge(patent_id, applicant_id)

    elapsed = time.time() - started
    print('Running time to create bipartite innovation network: ' + str(elapsed))

    return graph

In [11]:
def more500_bipnetwork(B, min_patents=500):
    """Keep only the bottom nodes that have more than `min_patents` edges.

    Args:
        B: bipartite graph whose nodes carry a 'bipartite' attribute; nodes
            with value 0 form the top set, every other node the bottom set.
        min_patents: bottom nodes whose degree is not strictly greater than
            this value are dropped.  The default of 500 reproduces the
            original fixed cut-off.

    Returns:
        (B500, top_nodes, bottom_nodes): a filtered copy of B (B itself is
        not modified), the set of top nodes, and the set of bottom nodes that
        survived the filter.  Top nodes are never removed, so a top node
        whose neighbours were all dropped stays in B500 without edges.
    """
    t0 = time.time()

    top_nodes = set(node for node, d in B.nodes(data=True) if d['bipartite'] == 0) #dlist
    bottom_nodes = set(B) - top_nodes #klist

    #collect the bottom nodes that fail the threshold, then drop them in one go
    too_small = [node for node in bottom_nodes if B.degree(node) <= min_patents]

    B500 = B.copy()
    B500.remove_nodes_from(too_small)

    #only bottom nodes were removed, so top_nodes is still valid for B500
    bottom_nodes = set(B500) - top_nodes #klist

    print('Running time to create bipartite innovation network - 500: ' + str(time.time() - t0))

    return B500, top_nodes, bottom_nodes

In [7]:
#projected network of companies with more than 500 patents

def more500_copatenting(B500):
    """Project the filtered bipartite graph onto its bottom nodes.

    Args:
        B500: bipartite graph whose nodes carry a 'bipartite' attribute;
            nodes with value 0 form the top set, all others the bottom set.

    Returns:
        the weighted projection of B500 onto the bottom nodes, as produced by
        bipartite.weighted_projected_graph.

    Only the B500 argument is used; the function does not read the
    notebook-level graph B.
    """
    t0 = time.time()

    #BIPARTITE NETWORK
    top_nodes = set(node for node, d in B500.nodes(data=True) if d['bipartite'] == 0) #dlist
    bottom_nodes = set(B500) - top_nodes #klist

    #WEIGHTED PROJECTED NETWORK
    G_w = bipartite.weighted_projected_graph(B500, bottom_nodes)

    print('Running time to create projected collaboration network - 500: ' + str(time.time() - t0))

    return G_w

In [None]:
#create bipartite network with patents as top nodes (code 0) and applicants as bottom nodes (code 1)
#patents have attributes year and ipc
#applicants have attributes name, country, year and ipc

def create_network(appln_pathan_array):
    """Build the bipartite graph, accumulating attributes across records.

    For every record:
      * record[0] becomes (or updates) a node with bipartite=0, a `year`
        taken from the first record seen for it, and an `ipc` list that
        grows by record[4] on every further record;
      * record[1] becomes (or updates) a node with bipartite=1, `name` and
        `ctry` from record[2] / record[3], and `year` / `ipc` lists that grow
        by record[5] / record[4] on every further record;
      * an edge joins the two nodes.

    Membership is tested with `x in B`, which is a dictionary lookup; testing
    against `B.nodes()` scans a list on networkx 1.x and makes the whole
    build quadratic in the number of nodes.

    NOTE: this notebook also defines an earlier, simpler function with the
    same name; whichever cell was executed last is the one in effect.

    Returns:
        the networkx Graph holding both node sets.
    """
    t0 = time.time()

    #CREATE NETWORK
    B = nx.Graph()

    for t, record in enumerate(appln_pathan_array, 1):
        patent, applicant = record[0], record[1]
        ipc, year = record[4], record[5]

        if patent in B:
            B.node[patent]['ipc'].append(ipc)
        else:
            B.add_node(patent, bipartite=0, year=year, ipc=[ipc])

        if applicant in B:
            B.node[applicant]['year'].append(year)
            B.node[applicant]['ipc'].append(ipc)
        else:
            B.add_node(applicant, bipartite=1, name=str(record[2]), ctry=str(record[3]), year=[year], ipc=[ipc])

        #add_edge is a no-op for an edge that already exists
        B.add_edge(patent, applicant)

        #progress indicator: print the running record count every 1000 records
        if t % 1000 == 0:
            print(t)

    print('Running time to create network: ' + str(time.time() - t0))

    return B

In [5]:
#load the de-duplicated records, then build the bipartite graph B from them
data = unique_array()
B = create_network(data)

Running time to query: 64.7629959583
Running time to create bipartite innovation network: 62.7017679214


In [12]:
#filter B with more500_bipnetwork; B2 is the filtered copy, and top_nodes / bottom_nodes are its two node sets
B2, top_nodes, bottom_nodes = more500_bipnetwork(B)

Running time to create bipartite innovation network - 500: 107.528404951


In [13]:
#weighted projection of the filtered graph B2 onto its bottom nodes
G_w = more500_copatenting(B2)

Running time to create projected collaboration network - 500: 24.7157599926


### Create a json file with node and link information from pandas unique dataframe

In [14]:
#jsonfile = open("co-patenting500.json", "wb")

#create nodes
#print >> jsonfile, "{"
#print >> jsonfile, '"nodes": ['


#one entry per bottom node: its name, country, degree in the bipartite
#graph B2 and degree in the projected graph G_w (every value is stringified)
nodes = [
    {
        "id": str(B2.node[node]['name']),
        "group": str(B2.node[node]['ctry']),
        "bipdegree": str(B2.degree(node)),
        "projdegree": str(G_w.degree(node))
    }
    for node in bottom_nodes
]

#one entry per projected edge, identified by the names of its two endpoints
links = [
    {
        "source": str(B2.node[source]['name']),
        "target": str(B2.node[target]['name']),
        "value": str(attributes['weight'])
    }
    for source, target, attributes in G_w.edges(data=True)
]

output = {"nodes": nodes, "links": links}

#write both lists to a single JSON document
with open('co-patenting500.json', 'wb') as jsonfile:
    json.dump(output, jsonfile, indent=2)
    
#json_string = json.dumps(output)

#print >> jsonfile, json_string

In [None]:
#show which container type G_w.edges(data=True) returns
print(type(G_w.edges(data=True)))

In [None]:

#Disabled code: everything below sits inside a triple-quoted string, so none
#of it is executed. It is a hand-written JSON exporter that prints each node
#and link to a `jsonfile` handle (which is not opened inside this string) and
#reads attributes from B rather than B2. Every entry it emits ends in '},'
#and every array in '],', i.e. with a trailing comma.
'''
for node in bottom_nodes:
    print >> jsonfile, '{"id": '+'"'+str(B.node[node]['name'])+'"'+', '+\
                        '"group": '+'"'+str(B.node[node]['ctry'])+'"'+', '+\
                        '"bipdegree": '+str(B.degree(node))+', '\
                        '"projdegree": '+str(G_w.degree(node))+'},'

print >> jsonfile, '],'

#create links
print >> jsonfile, '"links": ['

for link in G_w.edges(data=True):
    print >> jsonfile, '{"source": '+'"'+str(B.node[link[0]]['name'])+'"'+', '+\
                        '"target": '+'"'+str(B.node[link[1]]['name'])+'"'+', '+\
                        '"value": '+str(link[2]['weight'])+'},'
            
print >> jsonfile, '],'
print >> jsonfile, '}'
jsonfile.close()             
    
#jsonfile.write('\n,')
'''

## Testing area

In [None]:
#split the full graph B into its two node sets (bipartite == 0 versus the
#rest) and get the degree mapping of each set
top_nodes = {node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 0} #dlist
bottom_nodes = set(B).difference(top_nodes) #klist
deg_top, deg_bottom = bipartite.degrees(B, bottom_nodes)

In [None]:
#rank the bottom nodes by degree, ascending, as (node, degree) pairs
sorted_deg_bottom = sorted(deg_bottom.items(), key=lambda pair: pair[1])
#the 30 highest-degree entries are the tail of the ascending list
top30_applicants = sorted_deg_bottom[-30:]
#reversed() gives a one-shot iterator running from highest to lowest degree
sortedtop30 = reversed(top30_applicants)

In [None]:
#reversed() returns an iterator, so it has to be materialised with list()
#for the entries themselves (not the iterator object) to be printed
#NOTE(review): sorted_deg_bottom is ascending, so [:100] selects the 100
#lowest-degree entries; use [-100:] if the top 100 were intended - confirm
print(list(reversed(sorted_deg_bottom[:100])))

In [None]:
#degrees of the bottom nodes, ordered from largest to smallest
degrees = deg_bottom.values()
sort = list(degrees)
sort.sort(reverse=True)

In [None]:
#the 100 largest degrees
print(sort[:100])
#their total
print(sum(sort[:100]))
#that total as a fraction of the sum of all degrees
print(sum(sort[:100])/float(sum(sort)))