# This is a consolidated version of all the data


In [1]:
import base64
import datetime
import pandas as pd
import json
from google.cloud import bigquery

# Shared BigQuery client for every query in this notebook; jobs are routed to
# the "US" location.
client = bigquery.Client(location="US")
# Report which project the client resolved to.
print(f"Client creating using default project: {client.project}")

Client creating using default project: research-311404


**AI2P data is supposed to be 'good'**


 So check a three-year inventory on the D data
 This time, put the adjusted D data in there as well

In [24]:
# Collapse the full AI2P tree (with appended paper data) into the table
# AI2Patent_Tree_Consolidated.AI2P_patent_level_min_full.
# Source: research-311404.AI2Patent_TreeConstruction_202308.AI2P_w_paper_full_inversed
# Output grain: one row per (patentid, year, inverse_D) group.
# Each D_* column is SUM(D_x) divided by the SUM of its companion column;
# NULLIF(..., 0) turns a zero denominator into NULL so the ratio is NULL
# rather than a division-by-zero error.
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.AI2P_patent_level_min_full
        AS SELECT patentid,year,  inverse_D, (SUM(D_Cf)/NULLIF(SUM(C_f),0)) AS D_Cf, (SUM(D_Hit)/NULLIF(SUM(Hit_10pct),0)) AS D_Hit, (SUM(D_Disruption)/NULLIF(SUM(Disruption),0)) AS D_Disruption, (SUM(D_Exposure)/NULLIF(SUM(Exposure),0)) AS D_Exposure, (SUM(D_time)/NULLIF(SUM(Timelag_inv),0)) AS D_time FROM  `research-311404.AI2Patent_TreeConstruction_202308.AI2P_w_paper_full_inversed`
        group by patentid,year,inverse_D
        """

job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.
# Author's recorded size: 44,899,250 rows. Because rows are keyed by
# (patentid, year, inverse_D), a patentid may appear in more than one row.

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fcd56dd42e0>

In [32]:
# Build the inventor-patent level table that every later cell reads.
#   t1: (patent_id, inventor_id) pairs from PatentsView patent_inventor.
#   t2: application year = first four characters of the application date.
#   t3: the patent-level D measures built in the previous cell.
# Both joins are LEFT JOINs, so every t1 pair is kept; D measures missing on
# the t3 side are replaced by 0 through IFNULL.
# t3 is grouped by (patentid, year, inverse_D), so a patent can match several
# t3 rows and this table can hold more than one row per patent-inventor pair.
table_id = "research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full
        AS 
        SELECT t1.patent_id,inventor_id,CAST(application_year AS INT64) AS year,  IFNULL(inverse_D,0) AS inverse_D,  IFNULL(D_Cf,0) AS D_Cf,  IFNULL(D_Hit,0) AS D_Hit,  IFNULL(D_Disruption,0) AS D_Disruption,  IFNULL(D_Exposure,0) AS D_Exposure,  IFNULL(D_time,0) AS D_time FROM
        (SELECT patent_id,inventor_id FROM `patents-public-data.patentsview.patent_inventor`)t1 
        LEFT JOIN
        (SELECT patent_id,LEFT(CAST(date AS STRING),4) AS application_year FROM `patents-public-data.patentsview.application` )t2
        ON t1.patent_id = t2.patent_id
        LEFT JOIN
        (SELECT patentid,year,  inverse_D, D_Cf, D_Hit,D_Disruption, D_Exposure, D_time FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_patent_level_min_full`)t3
        ON t1.patent_id = t3.patentid
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until the table exists.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")
# Author's recorded size: 130,702,105 rows. These are rows of the joined
# table, not unique patents.

Table research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full now contains 130702105 rows


In [33]:
# AI2P score, one temp table per focal year.
# The AI2P data covers 1980-2021; focal years (fyear) run from 1982 to 2017 so
# that both the backward and the forward windows stay inside the data.
# For focal year `end_year`, each D measure is the plain AVG over all of the
# inventor's rows whose application year lies in [end_year - 2, end_year],
# i.e. the inventor's three-year patent inventory up to the focal year.
for t in range(36):
    start_year, end_year = 1980 + t, 1982 + t
    sql = f"""
        CREATE TABLE research-311404.temp.AI2P_{end_year}
        AS SELECT inventor_id, {end_year} AS fyear,  AVG(inverse_D) AS inverse_D, AVG(D_Cf) AS D_Cf, AVG(D_Hit) AS D_Hit, AVG(D_Disruption) AS D_Disruption, AVG(D_Exposure) AS D_Exposure ,AVG(D_time)  AS  D_time FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`
        WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
        """

    job = client.query(sql)  # Submit this year's CTAS.
    job.result()  # Finish it before starting the next year.


In [34]:
# Stack every per-year temp table matching temp.AI2P_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.AI2P"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.AI2P
        AS SELECT * FROM `research-311404.temp.AI2P_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")
# Author's recorded size: 20,183,061 inventor-year observations.

Table research-311404.AI2Patent_Tree_Consolidated.AI2P now contains 20183061 rows


In [35]:
# Examiner toughness, one temp table per focal year (1982-2017), averaged over
# the inventor's patents applied for in [fyear - 2, fyear].
# Author's notes on coverage (limited by the OCE PAIR dataset):
#   9,817,693 application_number-examiner pairs
#   9,739,630 with non-NULL examiner names
#   7,051,680 patents
# Details are in ExaminerAmendments.ipynb.
# NOTE(review): the original note named
# research-311404.Examiner_toughness_by_date.examiner_table_fillna as the
# source, but the query reads
# Examiner_Toughness_Amendments_Consolidated.examiner_scaled_toughness.
# Confirm which table is intended.
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair (130,702,105 rows in total), so averaging over
# its raw rows weights each patent by its number of duplicate rows.
# SELECT DISTINCT makes every patent count once per inventor.
for t in range(36):
    start_year = t + 1980
    end_year = t + 1982
    sql = f"""
        CREATE TABLE research-311404.temp.toughness_{end_year}
        AS SELECT inventor_id, {end_year} AS fyear, AVG(scaled_toughness) AS scaled_toughness  FROM
        ((SELECT DISTINCT patent_id,year,inventor_id FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patent_number,scaled_toughness FROM `research-311404.Examiner_Toughness_Amendments_Consolidated.examiner_scaled_toughness`)t3
        ON t1.patent_id = t3.patent_number
        )WHERE year >= {start_year} and year <= {end_year} AND scaled_toughness IS NOT NULL
        group by inventor_id
        """

    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.


In [36]:
# Stack every per-year temp table matching temp.toughness_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.Examiner_toughness"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.Examiner_toughness
        AS SELECT * FROM `research-311404.temp.toughness_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

# Author's recorded size: 19,325,293 inventor-year rows.

Table research-311404.AI2Patent_Tree_Consolidated.Examiner_toughness now contains 19325293 rows


In [38]:
# Constructing dependent variables: each one looks three years forward,
# over t+1 .. t+3.
# Here `fyear` is the focal year t itself. The window is
# [start_year, end_year] = [fyear + 1, fyear + 3], and each temp table is
# named after end_year (t + 3), not after fyear.
# 1. PatMV: average xi_real (from KPSS_2022) over the inventor's patents
#    applied for in the forward window.
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair, so averaging over its raw rows weights each
# patent by its number of duplicate rows. SELECT DISTINCT makes every patent
# count once per inventor.
# The LEFT JOIN keeps inventors with no matching KPSS patent; AVG ignores
# NULLs, so those inventors get a NULL xi_real.
for t in range(36):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
        CREATE TABLE research-311404.temp.PatMV_{end_year}
        AS SELECT inventor_id, {fyear} AS fyear,  AVG(xi_real) AS xi_real FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT CAST(patent_num AS STRING) AS patid,xi_real FROM `research-311404.KPSS_2022.KPSS_2022`)t3
        ON t1.patent_id = t3.patid
        )WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
        """

    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.


In [39]:
# Stack every per-year temp table matching temp.PatMV_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.PatMV"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.PatMV
        AS SELECT * FROM `research-311404.temp.PatMV_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.PatMV now contains 22020950 rows


PatTV： 每个专利的TV为该专利在授权后3年内的引用数目/该专利申请当年同一类专利在未来3年的引用量的平均<br>
专利类别用的是cpc 分类的group id  -- 会有专利没有类别记录<br>
计算步骤（steps）： <br>
1. 确定专利的第一个cpc类别作为它的类别 
2. 确定在1980-2021年内申请的专利以及他们的类别 
3. 确定在1980-2021年申请的专利授权后36个月内的引用关系
4. 确定每个类别在1980-2021年申请的专利中平均每个的3年引用数量
5. 计算1980-2021申请的专利中每个的 scaled_TV
6. 汇总到inventor层面的PatTV（注：下方代码对inventor在 t+1 至 t+3 年内申请的专利取平均值 AVG，而不是当年的总和）

In [21]:
# PatTV is forward looking. The data ends in 2021, so the last application
# year with a usable PatTV is taken to be 2018, which in turn caps the focal
# year t at 2015 (so that t+1 .. t+3 is fully covered).
# Step 1: pick one CPC record per patent as its class.
# Only rows with category = "inventional" are considered. ROW_NUMBER() orders
# each patent's rows by `sequence`, and the row numbered 1 (lowest sequence)
# is kept.
# NOTE(review): the inner "order by patent_id,sequence" is not what selects
# the first row (the window's own ORDER BY does) -- presumably safe to drop.
sql = """
CREATE TABLE
  `research-311404.PatTV_Construction.first_cpc_category` AS
SELECT * from(
SELECT *, ROW_NUMBER() OVER(PARTITION BY patent_id ORDER BY sequence) AS row_number
    from (
SELECT patent_id,subsection_id, group_id,sequence 
FROM `patents-public-data.patentsview.cpc_current` 
where category = "inventional"
order by patent_id,sequence)) 
WHERE row_number = 1
"""
job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fed812035b0>

In [22]:
# Step 2: US applications filed 1980-2021, with their CPC class attached.
#   application_year  = first four characters of the application date, as INT64.
#   application_month = a DATE built from the date's year and month with the
#                       day fixed to 1 (DATETIME(y, m, 1, 1, 1, 1) -> DATE).
# The LEFT JOIN keeps applications that have no row in first_cpc_category;
# their subsection_id / group_id are NULL.
# NOTE(review): the month expression uses substr(date, 0, 4) while
# application_year uses substr(date, 1, 4) -- confirm both yield the year.
# NOTE(review): ORDER BY in a CREATE TABLE ... AS SELECT presumably does not
# give the stored table a guaranteed row order; confirm it is needed.
sql = """
CREATE TABLE
  `research-311404.PatTV_Construction.application` AS
select * from (SELECT t1.patent_id,cast (substr(date,1,4) as int64) as application_year, 
extract(date from DATETIME(cast(substr(t1.date, 0,4) as int64) , cast(substr(t1.date, 6,2) as int64),1,1,1,1)) as application_month, 
subsection_id, group_id,  
FROM `patents-public-data.patentsview.application` as t1
left join `research-311404.PatTV_Construction.first_cpc_category` as t2
on t1.patent_id=t2.patent_id
where country = "US") where application_year<=2021 and application_year>=1980
order by application_year
"""
job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fed82238af0>

In [24]:
# Step 3: citation pairs that fall inside the 36-month window.
# Some patents have no CPC class, so their group_id is NULL here.
#   t1: the focal (cited) patents from PatTV_Construction.application.
#   t2: uspatentcitation rows whose citation_id equals the focal patent;
#       t2.date is used as the focal patent's grant month.
#   t3: the application record of the citing patent (t2.patent_id);
#       t3.date is used as the citing patent's application month.
# month_diff = months from patent_grant_month to citation_application_month;
# only pairs with 0 <= month_diff <= 36 are kept.
# The "year >= 1000" filters presumably drop malformed dates -- TODO confirm.
# Both joins are inner joins, so patents with no citing patent do not appear.
sql = """
CREATE TABLE
  `research-311404.PatTV_Construction.3_year_uspatentcitation` AS
select * from (select *, 
date_diff(citation_application_month,patent_grant_month, month) as month_diff
 from (select t1.patent_id, subsection_id, group_id, application_month,t3.patent_id as citation_id,
extract(date from DATETIME(cast(substr(t2.date, 0,4) as int64) , cast(substr(t2.date, 6,2) as int64),1,1,1,1)) as patent_grant_month,
extract(date from DATETIME(cast(substr(t3.date, 0,4) as int64) , cast(substr(t3.date, 6,2) as int64),1,1,1,1)) as citation_application_month,
from `research-311404.PatTV_Construction.application` as t1
join (select * from `patents-public-data.patentsview.uspatentcitation` where cast(substr(date,1,4) as int64) >= 1000) as t2
on t1.patent_id=t2.citation_id
join (select * from `patents-public-data.patentsview.application` where cast(substr(date,1,4) as int64) >= 1000) as t3
on t2.patent_id = t3.patent_id))
where month_diff <=3*12 and month_diff>=0
order by patent_id
"""
job = client.query(sql)  # API request.
job.result()  # Waits for the query to finish.

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fed81201420>

In [25]:
# Step 4: average three-year citation count per (CPC group, application year).
# Patents without a CPC class share a NULL group_id, so GROUP BY puts them all
# in a single group per year. Patents with no row in 3_year_uspatentcitation
# get citation_num = 0 and are included in the average.
table_id = "research-311404.PatTV_Construction.avg_3_year_citation_for_each_class_year"
sql = """
CREATE TABLE
  `research-311404.PatTV_Construction.avg_3_year_citation_for_each_class_year` AS
select group_id, application_year,avg(citation_num) as avg_citation from
(select t1.patent_id, t1.subsection_id, t1.group_id,t1.application_year,
case when citation_num is null then 0 else citation_num end as citation_num
from `research-311404.PatTV_Construction.application` as t1
left join  
(select patent_id, count(citation_id) as citation_num from
`research-311404.PatTV_Construction.3_year_uspatentcitation`
group by patent_id) as t2
on t1.patent_id=t2.patent_id) group by group_id,application_year
order by application_year
"""
job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.

table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.PatTV_Construction.avg_3_year_citation_for_each_class_year now contains 25146 rows


In [40]:
# Step 5 is done in two parts. Some patents have no CPC class, so their
# group_id cannot be matched with an equality join. This first part computes
# the scaled citation for patents that do have a CPC class; the patents
# without one are inserted into the same table afterwards.
# scaled_citation = citation_count / avg_citation of the patent's
# (group_id, application year) cell, or 0 when that average is 0.
# Patents with no citing patent in the window get citation_count = 0.
table_id = "research-311404.PatTV_Construction.scaled_citation"
sql = """
CREATE TABLE
  `research-311404.PatTV_Construction.scaled_citation` AS
select A.*,avg_citation, case when avg_citation = 0 then 0 else citation_count/avg_citation end as scaled_citation from (
select inventor_id, t1.patent_id, t1.year,t2.group_id,
case when citation_count is null then 0 else citation_count end as citation_count
from (SELECT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`) as t1
left join `research-311404.PatTV_Construction.first_cpc_category` as t2
on t1.patent_id =t2.patent_id 
left join
(SELECT patent_id,count(*) as citation_count FROM `research-311404.PatTV_Construction.3_year_uspatentcitation` 
  group by patent_id) as t3
on t1.patent_id = t3.patent_id where group_id is not null) as A 
left join (select application_year,avg_citation, group_id from `research-311404.PatTV_Construction.avg_3_year_citation_for_each_class_year` where group_id is not null) as B
on A.year =B.application_year and A.group_id = B.group_id
order by inventor_ID,year
"""
job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.

table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.PatTV_Construction.scaled_citation now contains 128943824 rows


In [41]:
# Second part of step 5: append the patents that have no CPC class
# (group_id IS NULL) to scaled_citation. Their benchmark is the avg_citation
# of the NULL-group row for the same application year, so the join to B is on
# year only.
table_id = "research-311404.PatTV_Construction.scaled_citation"
sql = """
Insert `research-311404.PatTV_Construction.scaled_citation`  
select A.*,avg_citation, case when avg_citation = 0 then 0 else citation_count/avg_citation end as scaled_citation from (
select  inventor_id, t1.patent_id, t1.year,t2.group_id,
case when citation_count is null then 0 else citation_count end as citation_count
from (SELECT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`) as t1
left join`research-311404.PatTV_Construction.first_cpc_category` as t2
on t1.patent_id =t2.patent_id 
left join
(SELECT patent_id,count(*) as citation_count 
  FROM `research-311404.PatTV_Construction.3_year_uspatentcitation` 
  group by patent_id) as t3
  on t1.patent_id = t3.patent_id
 where group_id is  null) as A 
left join (select application_year,avg_citation, group_id from `research-311404.PatTV_Construction.avg_3_year_citation_for_each_class_year` where group_id is null) as B
on A.year =B.application_year 
order by avg_citation
"""
job = client.query(sql)  # Submit the DML job.
job.result()  # Block until the insert completes.

table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.PatTV_Construction.scaled_citation now contains 130702105 rows


In [43]:
# De-duplicate scaled_citation: grouping by every selected column keeps one
# copy of each distinct row.
table_id = "research-311404.PatTV_Construction.scaled_citation_unique"
sql = """
CREATE TABLE
  `research-311404.PatTV_Construction.scaled_citation_unique` AS 
select inventor_id, patent_id, year, group_id, citation_count, avg_citation, scaled_citation
from `research-311404.PatTV_Construction.scaled_citation`  
GROUP BY inventor_id, patent_id, year, group_id, citation_count, avg_citation, scaled_citation
"""
job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.

table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.PatTV_Construction.scaled_citation_unique now contains 19376954 rows


In [45]:
# 2. PatTV: average citation_count and scaled_citation over the inventor's
#    patents applied for in the forward window [fyear + 1, fyear + 3].
# The focal year fyear runs from 1982 to 2015; temp tables are named after
# end_year (fyear + 3).
#
# Fixes:
#  * scaled_citation_unique has one row per (inventor_id, patent_id, ...), so
#    joining on patent_id alone matched each t1 row with the rows of every
#    co-inventor and weighted patents by team size. The join now also matches
#    on the inventor. The t3 column is aliased so the unqualified inventor_id
#    still resolves to t1.
#  * AI2P_inventor_patent_level_min_full can hold several rows for the same
#    (patent_id, inventor_id) pair; SELECT DISTINCT makes every patent count
#    once per inventor.
for t in range(34):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
        CREATE TABLE research-311404.temp.PatTV_{end_year}
        AS SELECT inventor_id, {fyear} AS fyear,  AVG(citation_count) AS citation_count, AVG(scaled_citation) AS scaled_citation FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patent_id,inventor_id AS tv_inventor_id,citation_count,scaled_citation FROM `research-311404.PatTV_Construction.scaled_citation_unique`)t3
        ON t1.patent_id = t3.patent_id AND t1.inventor_id = t3.tv_inventor_id
        )WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
        """

    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.


In [46]:
# Stack every per-year temp table matching temp.PatTV_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.PatTV"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.PatTV
        AS SELECT * FROM `research-311404.temp.PatTV_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.PatTV now contains 20502022 rows


In [49]:
# 3. PatNum: number of patents the inventor applied for in the forward window
#    [fyear + 1, fyear + 3]. The focal year fyear runs from 1982 to 2018; temp
#    tables are named after end_year (fyear + 3).
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair (130,702,105 rows in total), so
# COUNT(patent_id) counted rows rather than patents. COUNT(DISTINCT patent_id)
# counts each patent once.
# The former IFNULL(..., 0) wrapper is dropped because COUNT never returns
# NULL.
for t in range(37):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
    CREATE TABLE
      research-311404.temp.PatNum_{end_year} AS
        SELECT inventor_id, {fyear} AS fyear,  COUNT(DISTINCT patent_id) AS PatNum FROM `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`
            WHERE year >= {start_year} and year <= {end_year}
            group by inventor_id
    """
    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [50]:
# Stack every per-year temp table matching temp.PatNum_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.PatNum"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.PatNum
        AS SELECT * FROM `research-311404.temp.PatNum_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.PatNum now contains 22403385 rows


In [4]:
## 4. Exploration patents
# The look-back is set to three years so that it lines up with the t+1 .. t+3
# outcome window.
# For each (inventor_id, group_id) partition ordered by application_year, a
# patent is flagged exploration = 0 when the previous row in the partition is
# at most three years older, and 1 otherwise. The first row of a partition has
# no previous row, so the comparison is NULL and the ELSE branch gives 1.
# Rows with a NULL inventor_id or group_id are excluded.
table_id = "research-311404.database.Exploration_all"
sql = """
        CREATE TABLE research-311404.database.Exploration_all
        AS SELECT 
            patentid, 
            inventor_id, 
            application_year, 
            group_id,
            CASE 
                WHEN application_year - LAG(application_year, 1) OVER(PARTITION BY inventor_id, group_id ORDER BY application_year) <= 3 THEN 0
                ELSE 1
            END AS exploration
        FROM 
           `research-311404.database.EE_rawdata_inventor_after_1980`
        WHERE inventor_id IS NOT NULL AND group_id IS NOT NULL
        ORDER BY 
            inventor_id, 
            group_id, 
            application_year;
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")


Table research-311404.database.Exploration_all now contains 10710423 rows


In [51]:
## Exploration: three-year aggregation
# Exploration flags are usable when t+1 >= 1983 and t+3 <= 2019, so the focal
# year fyear runs from 1982 to 2016. Each temp table holds the inventor's
# average exploration flag over patents applied for in [fyear + 1, fyear + 3]
# and is named after end_year (fyear + 3).
#
# Fixes:
#  * Exploration_all is keyed by (patentid, inventor_id) and the flag is
#    computed per inventor. Joining on patentid alone attached the flags of
#    every co-inventor to each inventor's average. The join now also matches
#    on the inventor. The t3 column is aliased so the unqualified inventor_id
#    still resolves to t1.
#  * AI2P_inventor_patent_level_min_full can hold several rows for the same
#    (patent_id, inventor_id) pair; SELECT DISTINCT makes every patent count
#    once per inventor.
for t in range(35):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
    CREATE TABLE research-311404.temp.Exploration_{end_year}
        AS SELECT inventor_id, {fyear} AS fyear,  AVG(exploration) AS exploration FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patentid,inventor_id AS expl_inventor_id,exploration FROM `research-311404.database.Exploration_all`)t3
        ON t1.patent_id = t3.patentid AND t1.inventor_id = t3.expl_inventor_id
        )WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
    """
    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [52]:
# Stack every per-year temp table matching temp.Exploration_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.Exploration"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.Exploration
        AS SELECT * FROM `research-311404.temp.Exploration_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.Exploration now contains 21365081 rows


In [53]:
## 5. CD Index
## Author's notes: the original CD Index data covers 5,894,529 patents and the
## measure used here is mcd_5. The data only runs from 1980 to 2013, so the
## focal year fyear is limited to 1982-2010.
# Each temp table holds the inventor's average mcd_5 over patents applied for
# in [fyear + 1, fyear + 3] and is named after end_year (fyear + 3).
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair, so averaging over its raw rows weights each
# patent by its number of duplicate rows. SELECT DISTINCT makes every patent
# count once per inventor.
for t in range(29):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
    CREATE TABLE research-311404.temp.CDIndex_{end_year}
        AS SELECT inventor_id, {fyear} AS fyear,  AVG(mcd_5) AS mcd_5 FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT CAST(patent_number AS STRING) AS patent_number,mcd_5 FROM `research-311404.CDIndex.cdindex_cross` )t3
        ON t1.patent_id = t3.patent_number
        )WHERE year >= {start_year} and year <= {end_year}
        group by inventor_id
    """
    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.


In [54]:
# Stack every per-year temp table matching temp.CDIndex_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.CDIndex"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.CDIndex
        AS SELECT * FROM `research-311404.temp.CDIndex_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.CDIndex now contains 15709654 rows


In [55]:
## 6. RETech & Breadth
## Author's notes: 9,575,245 rows are available in the original RETECH file
## and its time range is 1980-2017, so the focal year fyear is limited to
## 1982-2014.
# Each temp table holds the inventor's average RETech and Breadth over patents
# applied for in [fyear + 1, fyear + 3] and is named after end_year
# (fyear + 3). Rows where either measure is NULL are dropped before
# averaging, so inventors with no matched patent get no row.
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair, so averaging over its raw rows weights each
# patent by its number of duplicate rows. SELECT DISTINCT makes every patent
# count once per inventor.
for t in range(33):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
    CREATE TABLE research-311404.temp.RETech_{end_year}
        AS
        SELECT inventor_id,{fyear} AS fyear, AVG(RETech) AS RETech,AVG(Breadth) AS Breadth  FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT CAST(pnum AS STRING) AS patent_number,RETech,Breadth FROM `research-311404.TechScopeBreadth.raw` )t3
        ON t1.patent_id = t3.patent_number
        )
        WHERE RETech IS NOT NULL AND Breadth IS NOT NULL AND year >= {start_year} and year <= {end_year}
        GROUP BY inventor_id
    """
    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [56]:
# Stack every per-year temp table matching temp.RETech_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.RETech"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.RETech
        AS SELECT * FROM `research-311404.temp.RETech_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.RETech now contains 18580737 rows


In [64]:
## 7. Generality & Originality
## Author's notes: 6,330,939 rows on originality (backward citations) and
## 5,123,492 rows on generality (forward citations).
## Originality comes first. It covers 1980-2021, so with a full t+1 .. t+3
## window the focal year fyear runs from 1982 to 2018.
# Each temp table holds the inventor's average originality over patents
# applied for in [fyear + 1, fyear + 3] and is named after end_year
# (fyear + 3). Rows with NULL originality are dropped before averaging, so
# inventors with no matched patent get no row.
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair, so averaging over its raw rows weights each
# patent by its number of duplicate rows. SELECT DISTINCT makes every patent
# count once per inventor.
for t in range(37):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
    CREATE TABLE research-311404.temp.originality_{end_year}
        AS SELECT inventor_id,{fyear} AS fyear, AVG(originality) AS originality FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patent_id,originality FROM `research-311404.Generality_Consolidated.originality` )t3
        ON t1.patent_id = t3.patent_id
        )
        WHERE originality IS NOT NULL AND year >= {start_year} and year <= {end_year}
        GROUP BY inventor_id
    """
    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [65]:
# Stack every per-year temp table matching temp.originality_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.originality"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.originality
        AS SELECT * FROM `research-311404.temp.originality_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.originality now contains 20117073 rows


In [66]:
## Generality: the focal year fyear runs from 1982 to 2015.
# NOTE(review): the original note says generality covers 1980-2021 yet caps t
# at 2015 -- presumably because forward citations need extra years to accrue.
# Confirm the intended cut-off.
# Each temp table holds the inventor's average generality over patents applied
# for in [fyear + 1, fyear + 3] and is named after end_year (fyear + 3). Rows
# with NULL generality are dropped before averaging, so inventors with no
# matched patent get no row.
#
# Fix: AI2P_inventor_patent_level_min_full can hold several rows for the same
# (patent_id, inventor_id) pair, so averaging over its raw rows weights each
# patent by its number of duplicate rows. SELECT DISTINCT makes every patent
# count once per inventor.
for t in range(34):
    start_year = t + 1983
    fyear = t + 1982
    end_year = t + 1985
    sql = f"""
    CREATE TABLE research-311404.temp.generality_{end_year}
        AS SELECT inventor_id,{fyear} AS fyear, AVG(generality) AS generality FROM
        ((SELECT DISTINCT patent_id,inventor_id,year FROM  `research-311404.AI2Patent_Tree_Consolidated.AI2P_inventor_patent_level_min_full`)t1
        LEFT JOIN
        (SELECT patent_id,generality FROM `research-311404.Generality_Consolidated.generality` )t3
        ON t1.patent_id = t3.patent_id
        )
        WHERE generality IS NOT NULL AND year >= {start_year} and year <= {end_year}
        GROUP BY inventor_id
    """
    job = client.query(sql)  # API request.
    job.result()  # Waits for the query to finish.

In [67]:
# Stack every per-year temp table matching temp.generality_* into one
# inventor-year table by selecting from the wildcard table.
table_id = "research-311404.AI2Patent_Tree_Consolidated.generality"
sql = """
        CREATE TABLE research-311404.AI2Patent_Tree_Consolidated.generality
        AS SELECT * FROM `research-311404.temp.generality_*`
        """

job = client.query(sql)  # Submit the DDL job.
job.result()  # Block until it completes.
table = client.get_table(table_id)
print(f"Table {table_id} now contains {table.num_rows} rows")

Table research-311404.AI2Patent_Tree_Consolidated.generality now contains 15425602 rows
