In [1]:
import base64
import datetime
import pandas as pd
import json
from google.cloud import bigquery

client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))

Client creating using default project: research-311404


In [4]:
#step 1
# 每个专利的TV为该专利在授权后3年内的引用数目/该专利申请当年同一类专利在未来3年的引用量的平均
sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.first_cpc_category` AS
SELECT * from(
SELECT *, ROW_NUMBER() OVER(PARTITION BY patent_id ORDER BY sequence) AS row_number
    from (
SELECT patent_id,subsection_id, group_id,sequence 
FROM `patents-public-data.patentsview.cpc_current` 
where category = "inventional"
order by patent_id,sequence)) 
WHERE row_number = 1
"""
job = client.query(sql)  # API request.
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7eff33c8e950>

In [7]:
#step 2
sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.application` AS
select * from (SELECT t1.patent_id,cast (substr(date,1,4) as int64) as application_year, 
extract(date from DATETIME(cast(substr(t1.date, 0,4) as int64) , cast(substr(t1.date, 6,2) as int64),1,1,1,1)) as application_month, 
subsection_id, group_id,  
FROM `patents-public-data.patentsview.application` as t1
left join `research-311404.Patent_Litigation_Patent_Variables.first_cpc_category` as t2
on t1.patent_id=t2.patent_id
where country = "US") where application_year<=2024 and application_year>=1970
order by application_year
"""
job = client.query(sql)  # API request.
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7eff3311e2f0>

In [11]:
#step 3 -->部分专利没有cpc类别，group_id为空
sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.3_year_uspatentcitation` AS
select * from (select *, 
date_diff(citation_application_month,patent_grant_month, month) as month_diff
 from (select t1.patent_id, t1.application_year, subsection_id, group_id, application_month,t3.patent_id as citation_id,
extract(date from DATETIME(cast(substr(t2.date, 0,4) as int64) , cast(substr(t2.date, 6,2) as int64),1,1,1,1)) as patent_grant_month,
extract(date from DATETIME(cast(substr(t3.date, 0,4) as int64) , cast(substr(t3.date, 6,2) as int64),1,1,1,1)) as citation_application_month
from `research-311404.Patent_Litigation_Patent_Variables.application` as t1
join (select * from `patents-public-data.patentsview.uspatentcitation` where cast(substr(date,1,4) as int64) >= 1000) as t2
on t1.patent_id=t2.citation_id
join (select * from `patents-public-data.patentsview.application` where cast(substr(date,1,4) as int64) >= 1000) as t3
on t2.patent_id = t3.patent_id))
where month_diff <=3*12 and month_diff>=0
order by patent_id
"""
job = client.query(sql)  # API request.
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7eff32a216f0>

In [13]:
#step 4 -->把空的cpc都归为了一类，计算平均citation数量
sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.avg_3_year_citation_for_each_class_year` AS
select group_id, application_year,avg(citation_num) as avg_citation from
(select t1.patent_id, t1.subsection_id, t1.group_id,t1.application_year,
case when citation_num is null then 0 else citation_num end as citation_num
from `research-311404.Patent_Litigation_Patent_Variables.application` as t1
left join  
(select patent_id, count(citation_id) as citation_num from
`research-311404.Patent_Litigation_Patent_Variables.3_year_uspatentcitation`
group by patent_id) as t2
on t1.patent_id=t2.patent_id) group by group_id,application_year
order by application_year
"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.avg_3_year_citation_for_each_class_year"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.avg_3_year_citation_for_each_class_year now contains 30121 rows


In [25]:
# Construct forward citation: include scaled and non-scaled

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.scaled_citation_general` AS
select A.*,avg_citation, case when avg_citation = 0 then 0 else citation_count/avg_citation end as scaled_citation from (
select t1.*,t2.group_id,t3.application_year,
case when citation_count is null then 0 else citation_count end as citation_count
from (SELECT * FROM  `research-311404.Patent_Litigation.dataset_general_litigation_dataset`) as t1
left join `research-311404.Patent_Litigation_Patent_Variables.first_cpc_category` as t2
on t1.patent_id =t2.patent_id 
left join
(SELECT patent_id,application_year,count(*) as citation_count FROM `research-311404.Patent_Litigation_Patent_Variables.3_year_uspatentcitation` 
  group by patent_id,application_year) as t3
on t1.patent_id = t3.patent_id) as A 
left join (select application_year,avg_citation, group_id from `research-311404.Patent_Litigation_Patent_Variables.avg_3_year_citation_for_each_class_year` where group_id is not null) as B
on A.application_year =B.application_year and A.group_id = B.group_id

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.scaled_citation_general"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.scaled_citation_general now contains 4109 rows


In [26]:
# Construct forward citation: include scaled and non-scaled

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PAE` AS
select A.*,avg_citation, case when avg_citation = 0 then 0 else citation_count/avg_citation end as scaled_citation from (
select t1.*,t2.group_id,t3.application_year,
case when citation_count is null then 0 else citation_count end as citation_count
from (SELECT * FROM  `research-311404.Patent_Litigation.dataset_litigation_PAE`) as t1
left join `research-311404.Patent_Litigation_Patent_Variables.first_cpc_category` as t2
on t1.patent_id =t2.patent_id 
left join
(SELECT patent_id,application_year,count(*) as citation_count FROM `research-311404.Patent_Litigation_Patent_Variables.3_year_uspatentcitation` 
  group by patent_id,application_year) as t3
on t1.patent_id = t3.patent_id) as A 
left join (select application_year,avg_citation, group_id from `research-311404.Patent_Litigation_Patent_Variables.avg_3_year_citation_for_each_class_year` where group_id is not null) as B
on A.application_year =B.application_year and A.group_id = B.group_id

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PAE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PAE now contains 2101 rows


In [27]:
# Construct forward citation: include scaled and non-scaled

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PE` AS
select A.*,avg_citation, case when avg_citation = 0 then 0 else citation_count/avg_citation end as scaled_citation from (
select t1.*,t2.group_id,t3.application_year,
case when citation_count is null then 0 else citation_count end as citation_count
from (SELECT * FROM  `research-311404.Patent_Litigation.dataset_litigation_output_PE`) as t1
left join `research-311404.Patent_Litigation_Patent_Variables.first_cpc_category` as t2
on t1.patent_id =t2.patent_id 
left join
(SELECT patent_id,application_year,count(*) as citation_count FROM `research-311404.Patent_Litigation_Patent_Variables.3_year_uspatentcitation` 
  group by patent_id,application_year) as t3
on t1.patent_id = t3.patent_id) as A 
left join (select application_year,avg_citation, group_id from `research-311404.Patent_Litigation_Patent_Variables.avg_3_year_citation_for_each_class_year` where group_id is not null) as B
on A.application_year =B.application_year and A.group_id = B.group_id

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PE now contains 2530 rows


In [30]:
# Backward Citation
# Paperwise

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_general` AS
select t1.*,
case when backward_paper_citation is null then 0 else backward_paper_citation end as backward_paper_citation
from (SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.scaled_citation_general`) as t1
left join (SELECT PatentID,COUNT(PaperID) AS backward_paper_citation FROM `research-311404.SciSciNet.paper_patent` 
GROUP BY PatentID) as t2 
on t1.patent_id =t2.PatentID


"""
job = client.query(sql)  # API request.
job.result()


table_id = "research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_general"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))


sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PAE` AS
select t1.*,
case when backward_paper_citation is null then 0 else backward_paper_citation end as backward_paper_citation
from (SELECT * FROM `research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PAE`) as t1
left join (SELECT PatentID,COUNT(PaperID) AS backward_paper_citation FROM `research-311404.SciSciNet.paper_patent` 
GROUP BY PatentID) as t2 
on t1.patent_id =t2.PatentID


"""
job = client.query(sql)  # API request.
job.result()


table_id = "research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PAE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PE` AS
select t1.*,
case when backward_paper_citation is null then 0 else backward_paper_citation end as backward_paper_citation
from (SELECT * FROM `research-311404.Patent_Litigation_Patent_Variables.scaled_citation_PAE`) as t1
left join (SELECT PatentID,COUNT(PaperID) AS backward_paper_citation FROM `research-311404.SciSciNet.paper_patent` 
GROUP BY PatentID) as t2 
on t1.patent_id =t2.PatentID


"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_general now contains 4109 rows
Table research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PAE now contains 2101 rows
Table research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PE now contains 2101 rows


In [32]:
# Backward Citation
# patents

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_general` AS
select t1.*,
case when backward_patent_citation is null then 0 else backward_patent_citation end as backward_patent_citation
from (SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_general`) as t1
left join (SELECT patent_id,COUNT(citation_id) AS backward_patent_citation  FROM `patents-public-data.patentsview.uspatentcitation` 
GROUP BY patent_id) as t2 
on t1.patent_id =t2.patent_id


"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_general"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PAE` AS
select t1.*,
case when backward_patent_citation is null then 0 else backward_patent_citation end as backward_patent_citation
from (SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PAE`) as t1
left join (SELECT patent_id,COUNT(citation_id) AS backward_patent_citation  FROM `patents-public-data.patentsview.uspatentcitation` 
GROUP BY patent_id) as t2 
on t1.patent_id =t2.patent_id


"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PAE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PE` AS
select t1.*,
case when backward_patent_citation is null then 0 else backward_patent_citation end as backward_patent_citation
from (SELECT * FROM `research-311404.Patent_Litigation_Patent_Variables.backward_paper_citation_PE`) as t1
left join (SELECT patent_id,COUNT(citation_id) AS backward_patent_citation  FROM `patents-public-data.patentsview.uspatentcitation` 
GROUP BY patent_id) as t2 
on t1.patent_id =t2.patent_id


"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_general now contains 4109 rows
Table research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PAE now contains 2101 rows
Table research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PE now contains 2101 rows


In [33]:
## CD Index
## Original CD Index data has information on 5,894,529 patents
## We build this based on mcd_5
## This information only goes from 1980 to 2013
sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.CDIndex_general` AS
select t1.*,mcd_5 AS CD_Index
from ((SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_general`) as t1
left join (SELECT CAST(patent_number AS STRING) AS patent_number,mcd_5 FROM `research-311404.CDIndex.cdindex_cross`) as t2 
on t1.patent_id =t2.patent_number
)

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.CDIndex_general"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.CDIndex_PAE` AS
select t1.*,mcd_5 AS CD_Index
from ((SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PAE`) as t1
left join (SELECT CAST(patent_number AS STRING) AS patent_number,mcd_5 FROM `research-311404.CDIndex.cdindex_cross`) as t2 
on t1.patent_id =t2.patent_number
)

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.CDIndex_PAE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.CDIndex_PE` AS
select t1.*,mcd_5 AS CD_Index
from ((SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PE`) as t1
left join (SELECT CAST(patent_number AS STRING) AS patent_number,mcd_5 FROM `research-311404.CDIndex.cdindex_cross`) as t2 
on t1.patent_id =t2.patent_number
)


"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.CDIndex_PE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

Table research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_general now contains 4109 rows
Table research-311404.Patent_Litigation_Patent_Variables.backward_patent_citation_PAE now contains 2101 rows
Table research-311404.Patent_Litigation_Patent_Variables.CDIndex_PE now contains 2101 rows


In [34]:
#orginality & generality

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.Final_general` AS
select t1.*,originality,generality
from ((SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.CDIndex_general`) as t1
left join (SELECT patent_id,originality FROM `research-311404.Generality_Consolidated.originality`) as t2 
on t1.patent_id =t2.patent_id
left join (SELECT patent_id,generality FROM `research-311404.Generality_Consolidated.generality`) as t3 
on t1.patent_id =t3.patent_id
)

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.Final_general"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.Final_PAE` AS
select t1.*,originality,generality
from ((SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.CDIndex_PAE`) as t1
left join (SELECT patent_id,originality FROM `research-311404.Generality_Consolidated.originality`) as t2 
on t1.patent_id =t2.patent_id
left join (SELECT patent_id,generality FROM `research-311404.Generality_Consolidated.generality`) as t3 
on t1.patent_id =t3.patent_id
)

"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.Final_PAE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))

sql = """
CREATE TABLE
  `research-311404.Patent_Litigation_Patent_Variables.Final_PE` AS
select t1.*,originality,generality
from ((SELECT * FROM  `research-311404.Patent_Litigation_Patent_Variables.CDIndex_PE`) as t1
left join (SELECT patent_id,originality FROM `research-311404.Generality_Consolidated.originality`) as t2 
on t1.patent_id =t2.patent_id
left join (SELECT patent_id,generality FROM `research-311404.Generality_Consolidated.generality`) as t3 
on t1.patent_id =t3.patent_id
)


"""
job = client.query(sql)  # API request.
job.result()

table_id = "research-311404.Patent_Litigation_Patent_Variables.Final_PE"
table = client.get_table(table_id)
print("Table {} now contains {} rows".format(table_id, table.num_rows))




Table research-311404.Patent_Litigation_Patent_Variables.Final_general now contains 4109 rows
Table research-311404.Patent_Litigation_Patent_Variables.Final_PAE now contains 2101 rows
Table research-311404.Patent_Litigation_Patent_Variables.Final_PE now contains 2101 rows
