In [1]:
import base64
import datetime
import pandas as pd
import json
from google.cloud import bigquery

# BigQuery client bound to the US location. No project is passed, so the
# client's default project is used and echoed below. The BigQuery cells further
# down call `client.get_dataset`, `client.load_table_from_dataframe`,
# `client.query` and `client.get_table` on this object.
client = bigquery.Client(location="US")
print("Client created using default project: {}".format(client.project))

Client creating using default project: research-311404


In [2]:
## Load data directly from GCP (Cloud Storage) and then process from here.
## Open question: granted data vs. pre-grant data. Granted data is used for now
## (consistency consideration).
from google.cloud import storage

# Use a dedicated name for the Storage client. Rebinding `client` here would
# replace the BigQuery client created in the first cell, and the BigQuery cells
# below call `client.get_dataset(...)` / `client.query(...)`, which a Storage
# client does not provide, so a top-to-bottom run would fail.
storage_client = storage.Client()
print("Client created using default project: {}".format(storage_client.project))
bucket = storage_client.get_bucket('rawdataupload')
blob = bucket.get_blob('Inventor/g_assignee_disambiguated.tsv')
if blob is None:
    # get_blob returns None when the object does not exist; fail with a clear
    # message instead of an AttributeError on the attribute reads below.
    raise FileNotFoundError(
        "Inventor/g_assignee_disambiguated.tsv not found in bucket rawdataupload"
    )

# Basic metadata of the object about to be downloaded.
print("Name: {}".format(blob.id))
print("Size: {} bytes".format(blob.size))
print("Content type: {}".format(blob.content_type))
print("Public URL: {}".format(blob.public_url))

Client created using default project: research-311404
Name: rawdataupload/Inventor/g_assignee_disambiguated.tsv/1700305825794328
Size: 992341318 bytes
Content type: application/octet-stream
Public URL: https://storage.googleapis.com/rawdataupload/Inventor/g_assignee_disambiguated.tsv


In [3]:
# Local file name for the assignee extract; used for both the download target
# and the confirmation message so the two cannot drift apart.
output_file_name = "g_assignee_disambiguated.tsv"

# `blob` is the assignee object fetched from the bucket in the previous cell.
blob.download_to_filename(output_file_name)

print("Downloaded blob {} to {}.".format(blob.name, output_file_name))

Downloaded blob Inventor/g_assignee_disambiguated.tsv to g_assignee_disambiguated.tsv.


In [4]:
# Local file name for the inventor extract; used for both the download target
# and the confirmation message.
output_file_name = "g_inventor_disambiguated.tsv"

# Re-point `blob` at the inventor object in the same bucket.
blob = bucket.get_blob('Inventor/g_inventor_disambiguated.tsv')

# Basic metadata of the object about to be downloaded.
print("Name: {}".format(blob.id))
print("Size: {} bytes".format(blob.size))
print("Content type: {}".format(blob.content_type))
print("Public URL: {}".format(blob.public_url))

blob.download_to_filename(output_file_name)

print("Downloaded blob {} to {}.".format(blob.name, output_file_name))

Name: rawdataupload/Inventor/g_inventor_disambiguated.tsv/1700305916842029
Size: 2025092416 bytes
Content type: application/octet-stream
Public URL: https://storage.googleapis.com/rawdataupload/Inventor/g_inventor_disambiguated.tsv
Downloaded blob Inventor/g_inventor_disambiguated.tsv to g_inventor_disambiguated.tsv.


In [2]:
## Start with inventor dataset
import pandas as pd
import numpy as np

# patent_id mixes purely numeric ids (e.g. 6584128) with prefixed ones
# (e.g. D474886). Read it explicitly as text so pandas does not infer the type
# of this column on its own for such a large file, and so it stays a
# consistent string key for the patent_id joins done later in BigQuery.
inventor = pd.read_csv(
    'g_inventor_disambiguated.tsv',
    sep='\t',
    dtype={'patent_id': str},
)
inventor.head()

Unnamed: 0,patent_id,inventor_sequence,inventor_id,disambig_inventor_name_first,disambig_inventor_name_last,gender_code,location_id
0,6584128,0,fl:ri_ln:kroeger-1,Richard,Kroeger,M,
1,4789863,0,fl:th_ln:bush-1,Thomas A.,Bush,M,
2,11161990,1,fl:ma_ln:boudreaux-4,Matthew F.,Boudreaux,M,04726932-16c8-11ed-9b5f-1234bde3cd05
3,6795487,1,fl:ge_ln:whitworth-1,Gerald,Whitworth,M,
4,D474886,0,fl:th_ln:fleming-4,Thomas W.,Fleming,M,1893009c-16c8-11ed-9b5f-1234bde3cd05


In [16]:
# Numeric coding of gender_code: M -> 1, F -> 2, U -> 3, missing (NaN) -> 4.
mapping = {'M': 1, 'F': 2, 'U': 3, np.nan: 4}

# Build the recoded frame in one expression. `.assign` returns a new frame, so
# no column is assigned on a slice of `inventor` (which triggers pandas'
# SettingWithCopyWarning) and `inventor` itself is left untouched.
inventor_gender = inventor[['inventor_id', 'gender_code']].assign(
    gender_code=lambda frame: frame['gender_code'].replace(mapping)
)

In [19]:
# Gender code: 1. Male, 2. Female, 3. Unknown, 4. Missing (perhaps not an individual)
dataset = client.get_dataset('research-311404.Inventor_Consolidated')  # API request
table_ref = dataset.table('inventor_gender')

# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Truncate on write to keep the cell idempotent.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    inventor_gender, table_ref, job_config=job_config, location="US"
)
job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))
### 3,959,261 unique inventors. Assume they do not change over time.
### NOTE(review): inventor_gender is taken from `inventor` without
### de-duplication, so the loaded table has one row per source row rather than
### one per inventor - confirm this is intended.

Loaded dataframe to /projects/research-311404/datasets/Inventor_Consolidated/tables/inventor_gender


In [7]:
# Stage the (inventor, sequence, patent) columns in the temp dataset; the next
# cell derives the solo-inventor flag from inventor_sequence per patent.
inventor_solo = inventor[['inventor_id','inventor_sequence','patent_id']]
dataset = client.get_dataset('research-311404.temp')  # API request
table_ref = dataset.table('inventor_solo')

# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Truncate on write to keep the cell idempotent.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    inventor_solo, table_ref, job_config=job_config, location="US"
)
job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

Loaded dataframe to /projects/research-311404/datasets/temp/tables/inventor_solo


In [12]:
# Destination table; defined once and reused for the DDL and the row count.
table_id = "research-311404.Inventor_Consolidated.inventor_solo_full_data"

# CREATE OR REPLACE (instead of plain CREATE TABLE) keeps the cell re-runnable:
# a plain CREATE fails as soon as the table already exists.
# is_solo_inventor is 1 when the highest inventor_sequence on a patent is 0,
# otherwise 0. The LEFT JOIN followed by the IS NOT NULL filter drops records
# whose patent_id has no match in temp.inventor_solo.
sql = f"""
        CREATE OR REPLACE TABLE `{table_id}`
        AS
        SELECT t1.patent_id,year,inventor_id,is_solo_inventor FROM
        ((SELECT patent_id,year,inventor_id FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patent_id, CASE WHEN MAX(inventor_sequence) = 0 THEN 1 ELSE 0 END AS is_solo_inventor FROM research-311404.temp.inventor_solo GROUP BY patent_id)t2
        ON t1.patent_id = t2.patent_id)
        WHERE is_solo_inventor IS NOT NULL
        """

job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Inventor_Consolidated.inventor_solo_full_data now contains 130702105 rows


In [13]:
## 130,702,105 rows with inventor-sequence information
## This should be in accordance with AI2P Construction
## Rolling 3-year windows: for each fyear in 1982..2017 (36 windows), average
## is_solo_inventor per inventor over records with year in [fyear - 2, fyear].
for end_year in range(1982, 2018):
    start_year = end_year - 2
    # CREATE OR REPLACE keeps the loop re-runnable; with plain CREATE TABLE a
    # re-run (or a retry after a partial run) fails on the first existing table.
    sql = f"""
        CREATE OR REPLACE TABLE research-311404.temp.inventor_solo_{end_year}
        AS SELECT inventor_id, {end_year} AS fyear,  AVG(is_solo_inventor) AS is_solo_inventor FROM `research-311404.Inventor_Consolidated.inventor_solo_full_data`
        WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
        """

    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [14]:
# Consolidate the per-year tables into a single table.
# The wildcard reads every table in `temp` whose name starts with
# `inventor_solo_`, so any leftover table with that prefix is included too.
table_id = "research-311404.AI2Patent_Tree_Consolidated.inventor_solo"

# CREATE OR REPLACE keeps the cell re-runnable; a plain CREATE TABLE fails once
# the table exists.
sql = f"""
        CREATE OR REPLACE TABLE `{table_id}`
        AS SELECT * FROM `research-311404.temp.inventor_solo_*`
        """

job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))
## 20,183,061 inventor-year level observations

Table research-311404.AI2Patent_Tree_Consolidated.inventor_solo now contains 20183061 rows


In [15]:
## Process the assignee dataset
import pandas as pd
import numpy as np

# patent_id mixes purely numeric ids (e.g. 4488683) with prefixed ones
# (e.g. D397841). Read it explicitly as text so pandas does not infer the type
# of this column on its own for such a large file, and so it stays a
# consistent string key for the patent_id joins done later in BigQuery.
assignee = pd.read_csv(
    'g_assignee_disambiguated.tsv',
    sep='\t',
    dtype={'patent_id': str},
)
assignee.head()

Unnamed: 0,patent_id,assignee_sequence,assignee_id,disambig_assignee_individual_name_first,disambig_assignee_individual_name_last,disambig_assignee_organization,assignee_type,location_id
0,4488683,0,b942050d-150f-42e4-83d4-a7cde0870f82,,,Metal Works Ramat David,3.0,50dc5d46-16c8-11ed-9b5f-1234bde3cd05
1,5856666,0,dd8cd1db-a0f0-4b3b-ba4a-25038814e332,,,U.S. Philips Corporation,2.0,92237ca2-16c8-11ed-9b5f-1234bde3cd05
2,5204210,0,8d18afca-9c87-4e93-acd9-db539730dc3b,,,Xerox Corporation,2.0,0cd1998f-16c8-11ed-9b5f-1234bde3cd05
3,5302149,1,add68f05-55da-4cd8-9c3a-85d2cf1e1798,,,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,3.0,4d36742f-16c8-11ed-9b5f-1234bde3cd05
4,D397841,0,a8173451-b1cb-4071-bf4f-b844df2703a7,,,adidas AG,3.0,280c9de4-16c8-11ed-9b5f-1234bde3cd05


In [16]:
# Keep only the columns needed downstream: the patent key and the organization
# name (the next cell uses its NULL-ness to derive with_organization).
assignee = assignee[['patent_id','disambig_assignee_organization']]
dataset = client.get_dataset('research-311404.temp')  # API request
table_ref = dataset.table('assignee')

# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Truncate on write to keep the cell idempotent.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    assignee, table_ref, job_config=job_config, location="US"
)
job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

Loaded dataframe to /projects/research-311404/datasets/temp/tables/assignee


In [17]:
# Destination table; defined once and reused for the DDL and the row count.
table_id = "research-311404.Inventor_Consolidated.assignee_full_data"

# CREATE OR REPLACE (instead of plain CREATE TABLE) keeps the cell re-runnable:
# a plain CREATE fails as soon as the table already exists.
# with_organization is 1 when at least one assignee row of the patent has a
# non-NULL disambig_assignee_organization, otherwise 0. The LEFT JOIN followed
# by the IS NOT NULL filter drops records whose patent_id has no match in
# temp.assignee.
sql = f"""
        CREATE OR REPLACE TABLE `{table_id}`
        AS
        SELECT t1.patent_id,year,inventor_id,with_organization FROM
        ((SELECT patent_id,year,inventor_id FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patent_id, MAX(CASE WHEN disambig_assignee_organization IS NULL THEN 0 ELSE 1 END) AS with_organization FROM research-311404.temp.assignee GROUP BY patent_id)t2
        ON t1.patent_id = t2.patent_id)
        WHERE with_organization IS NOT NULL
        """

job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Inventor_Consolidated.assignee_full_data now contains 126407113 rows


In [18]:
## 126,407,113 rows with inventor-assignee information
## This should be in accordance with AI2P Construction
## Rolling 3-year windows: for each fyear in 1982..2017 (36 windows), average
## with_organization per inventor over records with year in [fyear - 2, fyear].
for end_year in range(1982, 2018):
    start_year = end_year - 2
    # CREATE OR REPLACE keeps the loop re-runnable; with plain CREATE TABLE a
    # re-run (or a retry after a partial run) fails on the first existing table.
    sql = f"""
        CREATE OR REPLACE TABLE research-311404.temp.assignee_{end_year}
        AS SELECT inventor_id, {end_year} AS fyear,  AVG(with_organization) AS with_organization FROM `research-311404.Inventor_Consolidated.assignee_full_data`
        WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
        """

    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [19]:
# Consolidate the per-year tables into a single table.
# The wildcard reads every table in `temp` whose name starts with `assignee_`,
# so any leftover table with that prefix is included too.
table_id = "research-311404.AI2Patent_Tree_Consolidated.assignee"

# CREATE OR REPLACE keeps the cell re-runnable; a plain CREATE TABLE fails once
# the table exists.
sql = f"""
        CREATE OR REPLACE TABLE `{table_id}`
        AS SELECT * FROM `research-311404.temp.assignee_*`
        """

job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))
## 18,437,036 inventor-year level observations

Table research-311404.AI2Patent_Tree_Consolidated.assignee now contains 18437036 rows
