### Process raw WIPO patent data

In [1]:
import codecs
import glob
import os
import pandas as pd
import pycountry
from io import StringIO

# Find the target path for data files, assuming the notebook is in the right place.
# `_dh` is IPython's directory history; its first entry is taken as the data directory.
# os.fspath() normalises that entry to a plain str: newer IPython releases are
# reported to store pathlib.Path objects there, which have no .endswith().
data_path = os.fspath(_dh[0])
assert data_path.endswith(os.path.join('em-2020', 'data')), (
    "expected the notebook to be run from .../em-2020/data, got %r" % data_path)

# The reshaped table is written next to the raw input files.
output_path = os.path.join(data_path, 'WIPO_reshaped.csv')

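The WIPO export is not a plain CSV: its column headers are split across two rows that sit below several leading lines the loader discards, so it needs a custom loader: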

In [2]:
# Load the raw WIPO export (ISO-8859-1 encoded) and rebuild it as a regular CSV
# in memory before handing it to pandas.
wipo_file = 'patent_1 - Total patent applications (direct and PCT national phase entries)_Total count by filing office_1980_2018.csv'
# Read inside a context manager so the file handle is closed as soon as the
# lines are in memory.
with codecs.open(os.path.join(data_path, wipo_file), encoding='ISO-8859-1') as raw_file:
    wipo_raw = raw_file.readlines()

# Stitch one header row together from two raw lines: line 7 supplies the leading
# column names and line 6 supplies the remaining labels. Lines 0-5 are not used.
wipo_clean = StringIO()
wipo_clean.write(wipo_raw[7].strip())
wipo_clean.write(',')
# Skip the first three characters of line 6 (presumably the empty leading
# cells above the name columns; TODO confirm against the raw file).
wipo_clean.write(wipo_raw[6][3:])
# Everything from line 8 onwards is data and is copied through unchanged.
wipo_clean.writelines(wipo_raw[8:])
wipo_clean.seek(0)

# Only empty cells count as missing; with the default NA strings disabled, a
# literal "NA" (which is also a valid two-letter country code) stays as text.
wipo_data = pd.read_csv(wipo_clean, index_col=False, na_values='', keep_default_na=False)
wipo_data.iloc[-5:,:5]

Unnamed: 0,Office,Office (Code),Origin,1980,1981
184,Yemen,YE,Total,,
185,Yugoslavia,YU,Total,3358.0,3156.0
186,Zaire,ZR,Total,103.0,82.0
187,Zambia,ZM,Total,115.0,108.0
188,Zimbabwe,ZW,Total,320.0,309.0


In [3]:
def map_iso_alpha2(x):
    """Translate an ISO 3166-1 alpha-2 country code into its alpha-3 code.

    Parameters
    ----------
    x : str
        Two-letter code to look up via ``pycountry.countries``.

    Returns
    -------
    str or None
        The matching three-letter code, or ``None`` when ``x`` is not a
        string (e.g. a missing value) or the lookup finds no country.
    """
    # Series.map passes missing cells through as NaN/None; pycountry only
    # accepts strings, so treat anything else as "no match".
    if not isinstance(x, str):
        return None
    try:
        country = pycountry.countries.get(alpha_2=x)
    except LookupError:
        # Some pycountry releases raise (KeyError is a LookupError) for an
        # unknown code instead of returning None; handle both the same way.
        return None
    return None if country is None else country.alpha_3

# Translate each office code to ISO alpha-3. Codes the lookup cannot resolve
# come back as None, and those rows are left out of the reshaped table.
iso_alpha3 = wipo_data['Office (Code)'].map(map_iso_alpha2)
valid = iso_alpha3.notna()

# Wide -> long: index the rows by alpha-3 code, drop the descriptive columns so
# only the year columns remain, then stack the years into a second index level.
wipo_reshaped = (
    wipo_data.loc[valid]
    .set_index(iso_alpha3.loc[valid])
    .drop(columns=['Office', 'Office (Code)', 'Origin'])
    .stack()
    # Drop missing counts explicitly, so the result does not depend on whether
    # the installed pandas version drops NaN inside stack().
    .dropna()
    .to_frame('WIPO patents')
)
wipo_reshaped.index.names = ['Country Code', 'Year']
# The year labels come from the CSV header as strings; convert that level to int.
# set_levels returns a new index (the inplace= option was removed in pandas 2.0),
# so assign the result back.
wipo_reshaped.index = wipo_reshaped.index.set_levels(
    wipo_reshaped.index.levels[-1].astype(int), level=-1)
wipo_reshaped.iloc[:5, :5]

Unnamed: 0_level_0,Unnamed: 1_level_0,WIPO patents
Country Code,Year,Unnamed: 2_level_1
ALB,1993,16.0
ALB,1994,8.0
ALB,1995,5.0
ALB,1996,5.0
ALB,1997,9.0


In [4]:
# Write the reshaped table to output_path as CSV. to_csv writes the index by
# default, so both index levels end up as the leading columns of the file.
wipo_reshaped.to_csv(output_path)