### Step 2: check patent data against WIPO sources

Since the patent application counts don't quite match Pan et al. (2019), check them against a static download of the WIPO data (which has no easily accessible API).

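Also, list the years where World Bank (WB) data is missing, so that it can be supplemented with WIPO data where necessary (the merge itself is still commented out in the last cell).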

In [1]:
import glob
import os
import pandas as pd
import pycountry
from io import StringIO

# Locate the directory holding the data files. `_dh` is IPython's directory
# history; its first entry is presumably the directory the notebook was
# started in -- the assertion guards that assumption.
data_path = _dh[0]
_expected_suffix = os.path.join('em-2020', 'data')
assert data_path.endswith(_expected_suffix)

The WIPO file format is awkward: the column header is split across two rows that follow several preamble lines, so it needs a custom loader:

In [2]:
# Load the static WIPO export. It cannot be handed to pandas directly because
# its column header is spread over two physical lines, so a single-header CSV
# is first reassembled in memory.
wipo_file = 'patent_1 - Total patent applications (direct and PCT national phase entries)_Total count by filing office_1980_2018.csv'

# Read inside a context manager so the file handle is closed deterministically
# (the bare open(...).readlines() left it open until garbage collection).
# NOTE(review): no encoding is passed, so the platform default is used.
with open(os.path.join(data_path, wipo_file)) as wipo_handle:
    wipo_raw = wipo_handle.readlines()

wipo_clean = StringIO()
# Header, part 1: the line at index 7 supplies the leading column names
# (Office, Office (Code), Origin in the preview of the parsed frame).
wipo_clean.write(wipo_raw[7].strip())
wipo_clean.write(',')
# Header, part 2: the line at index 6 supplies the year labels; its first
# three characters are dropped (presumably the empty cells above the three
# name columns -- verify against the raw file).
wipo_clean.write(wipo_raw[6][3:])
# Everything from index 8 onwards is copied through unchanged as data rows;
# the lines before index 6 are ignored.
wipo_clean.writelines(wipo_raw[8:])
wipo_clean.seek(0)

wipo_data = pd.read_csv(wipo_clean, index_col=False)
# Preview the top-left corner of the parsed table.
wipo_data.iloc[:5,:5]

Unnamed: 0,Office,Office (Code),Origin,1980,1981
0,Bangladesh,BD,Total,136,172
1,United Kingdom,GB,Total,41612,39214


In [3]:
def _alpha2_to_alpha3(code):
    """Translate a two-letter office code to its ISO 3166-1 alpha-3 code.

    Raises:
        LookupError: if pycountry has no country for ``code``. Depending on
            the pycountry version the lookup either raises ``KeyError`` itself
            (a ``LookupError`` subclass) or returns ``None``; the latter used
            to surface as an opaque ``AttributeError`` on ``.alpha_3``.
    """
    country = pycountry.countries.get(alpha_2=code)
    if country is None:
        raise LookupError(
            'no ISO 3166-1 country found for office code {!r}'.format(code))
    return country.alpha_3


# Re-key the WIPO table by ISO alpha-3 code; the comparison cell looks up
# columns via wipo_data[country] using the code taken from the WB file name.
iso_alpha3 = wipo_data['Office (Code)'].map(_alpha2_to_alpha3)
# Build the index from the bare values so that it carries no name. Passing
# name=False did not mean "no name": it made the literal False the index label.
wipo_data.index = pd.Index(iso_alpha3.values)
# Drop the descriptive columns and transpose: rows become the remaining
# (year) column labels, columns become countries.
wipo_data = wipo_data.drop(columns=['Office', 'Office (Code)', 'Origin']).T
# The year labels come from the CSV header; cast them to int so they can be
# aligned with the 'year' index of the WB files.
wipo_data.index = wipo_data.index.astype(int)

In [12]:
# Cross-check the patent series ('TI') of every WB_*.csv file against the
# WIPO counts for the same country, stopping at the first disagreement.
for wb_file in glob.glob(os.path.join(data_path, 'WB_*.csv')):
    # The second '_'-separated token of the file name selects the WIPO column.
    country = os.path.basename(wb_file).split('_')[1]
    wb_data = pd.read_csv(wb_file).set_index('year')
    wb_patents = wb_data.TI
    wipo_patents = wipo_data[country]
    # Building the frame from two Series aligns them on year; years present
    # in only one source end up as NaN in the other column.
    compare_df = pd.DataFrame({'WB': wb_patents, 'WIPO': wipo_patents})

    # A mismatch needs a value from both sources AND the two values differing.
    both_present = compare_df.notnull().all(axis=1)
    mismatch = both_present & compare_df['WB'].ne(compare_df['WIPO'])
    if mismatch.any():
        display(wb_file)
        display(compare_df.loc[mismatch])
        raise ValueError

    # Years where exactly one of the two sources has a value.
    missing = compare_df['WB'].isnull() ^ compare_df['WIPO'].isnull()
    display(os.path.basename(wb_file))
    display(compare_df.loc[missing])
# TODO: gap filling is not enabled yet; the disabled draft was:
#     compare_df.TI = wb_patents.combine_first(wipo_patents)

'WB_GBR_current.csv'

Unnamed: 0,WB,WIPO


'WB_BGD_current.csv'

Unnamed: 0,WB,WIPO


'WB_GBR_201910.csv'

Unnamed: 0,WB,WIPO
2018,,20941.0


'WB_BGD_201910.csv'

Unnamed: 0,WB,WIPO
2018,,368.0


'WB_BGD_201805.csv'

Unnamed: 0,WB,WIPO
1974,245.0,
1976,154.0,
1977,119.0,
1978,149.0,
1979,131.0,
2017,,302.0
2018,,368.0
