### Data Extraction

- Used the PATENTSCOPE Artificial Intelligence Index to filter patents: https://www.wipo.int/tech_trends/en/artificial_intelligence/patentscope.html

- Downloaded data 10,000 at a time by changing adjusting Application Dates.

- Retrieved all patents till 2020.

In [1]:
#Import Libraries

import pandas as pd
from tqdm import tqdm
import time
import numpy as np
import plotly.express as px

import warnings
warnings.simplefilter('ignore')

import re
tqdm.pandas()
# pd.options.plotting.backend = "plotly"
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import spacy
from p_tqdm import p_map

### Data Cleaning

- Duplicate 'Patent Number' were dropped keeping the first entry.
- HTML tags in the text columns were removed using regex.
- 'Application Year' and 'Publication Year' columns were added.

In [2]:
dfs=[]
for i in tqdm(range(1,26)):
    df=pd.read_excel(f'/Users/cotraak/Desktop/Research/ResultLists-WIPO/resultList-{i}.xls', skiprows=6)
    dfs.append(df)
dtf=pd.concat(dfs, ignore_index=True)
print(dtf.shape)
dtf['Application Year']=dtf['Application Date'].progress_apply(lambda x: str(x).split('.')[2])
dtf=dtf.sort_values(by='Application Year')
dtf.drop_duplicates(inplace=True)
dtf.head()

In [3]:
dtf.to_excel('/Users/cotraak/Desktop/Research/WIPO Process/Data/total_wipo_ai.xlsx', index=False)

In [4]:
def app_pro(x):
    res=''
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    temp=str(x).lower().split(';')
    res=[text_pro(x) for x in temp]
    res1=[x.title() for x in res]
    return res1

In [5]:
def striphtml(data): #function for removing html tags
    p = re.compile(r'<.*?>')
    return p.sub('',data.lower())

In [6]:
def text_pro(data): #function for removing html tags and punctuations
    res=''
    temp=striphtml(data)
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for ch in temp:
        if ch not in punctuations:
            res+=ch
    return res

In [7]:
def inv_pro(x):
    return ' '.join(sorted(x.split()))

In [8]:
df=pd.read_excel('/Users/cotraak/Desktop/Research/WIPO Process/Data/wipo_ai_raw.xlsx')
df=df[(df['Application Year']<2019) & (df['Application Year']>=2000)]
df=df.reset_index(drop=True)
print(df.shape[0], 'records')

104738 records


In [9]:
df=df[(~df['Abstract'].isnull()) & (~df['Title'].isnull())]
df=df[(~df['Applicants'].isnull()) & (~df['Inventors'].isnull())]
df=df.reset_index(drop=True)
print(df.shape[0], 'records')

102579 records


In [10]:
def is_entity(x):
    for word in companies:
        if word in x:
            return True
    for word in companies_ew:
        if word.endswith(x):
            return True
    for word in universities:
        if word in x:
            return True
    for word in research:
        if word in x:
            return True
    for word in government:
        if word in x:
            return True
    return False

In [11]:
def first_entity(x):
    for comp in x:
        if is_entity(comp):
            return comp
    return x[0]

In [12]:
companies=['Ltd', 'Llc', 'Technologies', 'Corporation', 'Corp', 'Properties', 'Philips', 'Ericsson', 'Novozymes', 'Novartis', 'Alcatel', 
           'Telecom', 'Industry', 'International', 'Loreal', 'Lucent', 'Bayer', 'Basf Se', 'Holdings', 'Merck & Co', 
           ' Company', 'Limited', 'Pharma', 'Solutions', 'Therapeutics','Kabushiki Kaisha','Intellectual Property',
           'Aktiengesellschaft', 'Gmbh', 'Ecole', 'Bv', 'Biologicals', 'Licensing', 'Astrazeneca', 'Nordisk','Enterprise', 
           'Electronics', 'electronics', 'Ag ', 'Thales', 'Holding', 'As ', 'Plc', 'Spa ', 'Systems', 'Genetics', 
           ' Nv', 'Industrie', 'Association', 'Products', 'Schlumberger', 'Tyre', 'Nokia','Malaysian', ' Trust', 
           'Roche', 'Technology', 'Group', 'Compagnie', 'Arkema', 'Testing Service', 'Clinic', 'City Of Hope', 'Operations', 
           'Center', 'Services', 'Hospital', 'Aktiebolaget', 'Sas ', 'Norsk', 'Geophysical', 'Procter', 'Centre', 'Health', 
           'Laboratories',' Spa', 'Largan','Basell', 'Wells Fargo', 'Labs', 'Denko', 'Dupont', 'Coltd', 
           'Biolipox Ab', 'Bank', 'bank', 'Volvo', 'Microelektronica', 'Industeel', 'factory', 'Trucks', 
           'Systemes', 'Automotive', 'Automation', 'Engines', 'Teknologian', 'Merck Patent', 'Devices', 
           'Helicopters', 'Laboratoires', 'Biosciences', 'Nv ', 'Electricite', 'Global', 'Metabolic', 'Laboratory', 
           'Exxonmobil', 'Toshiba', 'Incorporated', 'Qualcomm', 'Instruments', 'Unilever', 'Energies','Participations',
          'Hewlett Packard', 'Motor', 'Lockheed Martin', 'American Express', 'Sun Microsystems', 'Controls',
           'Du Pont', 'Electric ', 'Corning','3D', 'Incorporated', 'Dow ', 'Precision', 'Haas And Rohm', '&', 'Co ',
          'Leap Motion', 'Nuance Communications', 'Document', 'Smithkline', 'Opticals', ' Chemical']

companies_ew=[' As',' Se', 'Skf', ' Ag', ' Ab', ' Lp', ' Sa', ' Srl', 'Lllp', 'Llp',' Mbh', ' Sl', ' Sprl', ' Oy',
             'Infirmary', ' Lc', ' Aps', ' Na', ' Ev', ' Ai', ' Inc', 'Electric', ' Dow', ' Sas', ' Co']

universities=['Universi', 'College', 'School', 'Educational', 'Regents', 'Institute Of Technology']

research=['Research','Institut', 'Recherche', 'Energies Ifp Nouvelles', 'Foerderung', 'Scientifique', 'Istitut', 
          'Academia', 'Foundation ', 'Academy', 'Förderung', 
          'Nederlandse Organisatie Voor Toegepastnatuurwetenschappelijk Onderzoek Tno',
         'Anonyme', 'Consejo Superior De Investigaciones Científicas Csic', 'Instituut',
         'Crf Societa Consortile Per Azioni']

government=['Commissariat','Foundation', 'National', 'Commonwealth', 'Nederlandse ', 'Secretary Of State',
            'Investigaciones']

In [13]:
df['type']=['Individual']*df.shape[0]

df['Full Text']=df['Title']+'. '+df['Abstract']
df['Full Text']=df['Full Text'].progress_apply(lambda x: text_pro(str(x)))

df['Application Year']=df['Application Date'].progress_apply(lambda x: str(x).split('.')[2])
df['Publication Year']=df['Publication Date'].progress_apply(lambda x: str(x).split('.')[2])

df=df.drop(df[(df['Applicants'].astype(str).apply(lambda x: x.lower())=='nan') | (df['Inventors'].astype(str).apply(lambda x: x.lower())=='nan')].index)

df['Inventors']=df['Inventors'].progress_apply(lambda x: [' '.join(sorted(str(y).strip().replace(',','').split())).title() for y in x.split(';')])
df['Inventor']=df['Inventors'].progress_apply(lambda x: x[0])

df['Applicants']=p_map(app_pro, df['Applicants'].astype(str).tolist())
df['Applicant'] = df['Applicants'].progress_apply(lambda x: first_entity(x))

df['I P C']=df['I P C'].progress_apply(lambda x: str(x).split(';'))
df['IPC main']=df['I P C'].progress_apply(lambda x: sorted(list(set(y.strip().split()[0][0] for y in x))))
df['IPC subclass']=df['I P C'].progress_apply(lambda x: sorted(list(set(y.strip().split()[0] for y in x))))

df.shape

100%|██████████| 102579/102579 [00:10<00:00, 9781.61it/s]
100%|██████████| 102579/102579 [00:00<00:00, 605364.25it/s]
100%|██████████| 102579/102579 [00:00<00:00, 609634.20it/s]
100%|██████████| 102579/102579 [00:00<00:00, 156777.23it/s]
100%|██████████| 102579/102579 [00:00<00:00, 793997.71it/s]


HBox(children=(FloatProgress(value=0.0, max=102579.0), HTML(value='')))

  9%|▉         | 9520/102579 [00:00<00:00, 95199.18it/s]




100%|██████████| 102579/102579 [00:01<00:00, 86225.19it/s]
100%|██████████| 102579/102579 [00:00<00:00, 282596.12it/s]
100%|██████████| 102579/102579 [00:00<00:00, 239175.48it/s]
100%|██████████| 102579/102579 [00:00<00:00, 162477.97it/s]


(102579, 19)

In [14]:
maps_sw=[('Google', 'Google'),
         ('Amazon', 'Amazon'),
      ('Microsoft','Microsoft'),
         ('Visa', 'Visa'),
         ('Ge ','General Electric'),
         ('Electric General', 'General Electric'),
         ('Nuance ','Nuance Communications'),
         ('Johnson Controls','Johnson Controls'),
         ('Lsi ','Lsi Corporation'),
         ('Intezyne ','Intezyne Technologies'),
         ('Toray','Toray Industries'),
         ('Symbol ','Symbol Technologies'),
         ('Indiana University','Indiana University'),
         ('Elwha ','Elwha'),
         ('Council Of Scientific  Industrial Research','Council Of Scientific Industrial Research'),
         ('Albemarle','Albemarle'),
         ('Hong Fu Jin Precision','Hong Fu Jin Precision Industry Shenzhen Co Ltd'),
         ('Zhejiang Sunny','Zhejiang Sunny Opticals'),
         ('Oki ','Oki'),
         ('National Semiconductor','National Semiconductor Corp'),
         ('Sandia','Sandia'),
         ('Isis ','Isis Pharmaceuticals'),
         ('Cumminsallison','Cumminsallison Corp'),
         ('Deka','Deka Products'),
         ('Baker Hughes','Baker Hughes'),
         ('Chevron','Chevron'),
         ('Resmed','Resmed Ltd'),
         ('Uab','Uab Foundation'),
         ('Cornell ','Cornell University'),
         ('University Of Colorado','University Of Colorado'),
         ('Wyeth','Wyeth'),
         ('Iceutica','Iceutica'),
         ('National Research Council Of Canada','National Research Council Of Canada'),
         ('Tellabs ','Tellabs'),
         ('Agere ','Agere Systems'),
         ('Verizon','Verizon'),
         ('The Washington University','Washington University'),
         ('Rohm And Haas','Rohm And Haas'),
         ('Boe','Boe Technology Group'),
         ('Smithkline','Smithkline Beecham'),
         ('Sap ','Sap'),
         ('Kyocera Document','Kyocera Document'),
         ('Nissan ','Nissan'),
         ('Medtronic','Medtronic'),
         ('Janssen Pharmac','Janssen Pharmaceuticals'),
         ('Sumitomo ','Sumitomo'),
         ('Yamaha','Yamaha'),
         ('Seagate','Seagate Technology'),
         ('Actelion','Actelion Pharmaceuticals'),
         ('Furuno ','Furuno Electric'),
         ('Leap Motion','Leap Motion'),
         ('Biogen ','Biogen'),
         ('Tencent ','Tencent Technology'),
         ('Sas','Sas Inc'),
         ('Malaysian Palm Oil','Malaysian Palm Oil Board'),
         ('Hyundai','Hyundai'),
         ('Takeda','Takeda'),
         ('Esperion ','Esperion'),
         ('Taiwan Semiconductor','Taiwan Semiconductor Manufacturing Company'),
         ('Zte','Zte'),
         ('Sanyo ','Sanyo'),
         ('Sabic','Sabic'),
         ('Rutgers','Rutgers University'),
         ('Shionogi','Shionogi Co Ltd'),
         ('Tata Consultancy Services','Tata Consultancy Services'),
         ('France Telecom','France Telecom'),
         ('Agilent ','Agilent Technologies'),
         ('British Telecommunications','British Telecommunications'),
         ('Nitto ','Nitto Corp'),
         ('Boehringer Ingelheim','Boehringer Ingelheim'),
         ('Stmicroelectronics','Stmicroelectronics'),
         ('University Of Washington','University Of Washington'),
         ('Synta ','Synta Pharmaceuticals'),
         ('University Of Utah','University Of Utah'),
         ('Basf ','Basf Se'),
         ('University Of Tennessee','University Of Tennessee'),
         ('Unilever','Unilever'),
         ('Pasteur','Pasteur Institute'),
         ('Henkel ','Henkel'),
         ('Novo Nordisk','Novo Nordisk'),
         ('Georgia Tech','Georgia Tech Research Corp'),
         ('Klatencor','Klatencor'),
         ('Baylor','Baylor University'),
         ('Millennium ','Millennium Pharma'),
         ('Commonwealth Scientific','Commonwealth Scientific And Industrial Research'),
         ('Sun Microsystems','Sun Microsystems'),
         ('Basell','Basell Polyolefine'),
         ('Mediatek','Mediatek'),
         ('Jfe ','Jfe Corp'),
         ('Lockheed','Lockheed Martin'),
         ('Mastercard ','Mastercard'),
         ('Salesforce','Salesforce'),
         ('Yahoo','Yahoo'),
         ('Corning','Corning Corporation'),
         ('American Express','American Express'),
         ('Ciena','Ciena Corp'),
         ('Yeda','Yeda Research'),
         ('Nvidia','Nvidia Corporation'),
         ('Danisco','Danisco'),
         ('Alcatel','Alcatel-Lucent'),
         ('Tokyo Electron','Tokyo Electron Ltd'),
         ('Angiotech','Angiotech'),
         ('Hello','Hello'),
         ('Omron','Omron'),
         ('Exxonmobil','Exxonmobil'),
         ('Facebook','Facebook'),
         ('Hand Held','Hand Held Products'),
         ('Eastman Chemical','Eastman Chemical'),
         ('Konica','Konica Corporation'),
         ('Automotive Technologies','Automotive Technologies'),
      ('Schneider Electric','Schneider Electric'),
      ('Largan Precision','Largan Precision'),
         ('Olympus','Olympus'),
         ('Danafarber Cancer','Danafarber Cancer'),
         ('Hon Hai Precision','Hon Hai Precision'),
     ('Samsung','Samsung'),
         ('Marvell ','Marvell'),
         ('Purdue','Purdue'),
         ('Denso ','Denso Corp'),
    ('Att ','At&t'),
         ('Palo Alto','Palo Alto Research'),
         ('Motorola','Motorola'),
         ('Monsanto','Monsanto'),
         ('Micron ','Micron Technology'),
    ('Intel ','Intel'),
         ('Hitachi ','Hitachi'),
         ('Seimens ','Seimens'),
         ('Glaxosmithkline','Glaxosmithkline'),
         ('Nokia ','Nokia'),
         ('Halliburton Energy','Halliburton Energy Services'),
         ('Panasonic','Panasonic'),
         ('Apple','Apple'),
         ('Infineon Technologies','Infineon Technologies'),
         ('Asml','Asml'),
         ('Mitsubishi','Mitsubishi'),
         ('Pioneer Hibred','Pioneer Hibred'),
         ('Borealis','Borealis'),
         ('Gilead','Gilead'),
         ('Thomson Licensing','Thomson Licensing'),
         ('Sharp ','Sharp'),
         ('Accenture','Accenture'),
         ('Rockwell Automat','Rockwell Automation'),
         ('Oracle','Oracle'),
         ('Shinetsu Chemical','Shinetsu Chemical'),
         ('Astrazeneca','Astrazeneca'),
         ('Mondobiotech ','Mondobiotech Labs'),
         ('Fisherrosemount','Fisherrosemount'),
         ('Fuji ','Fuji'),
         ('Sumitomo Chemical','Sumitomo Chemical'),
         ('Fujinon','Fujinon Coporation'),
         ('Dolby','Dolby'),
         ('Interdigital','Interdigital'),
         ('Colgatepalmolive','Colgatepalmolive'),
         ('Adobe','Adobe'),
         ('Glaxo','Glaxosmithkline'),
         ('Ricoh','Ricoh Company'),
         ('Renesas','Renesas'),
         ('Ntt','NTT'),
         ('Kodak','Kodak'),
         ('Ciba ','Ciba Speciality'),
          ('Merck','Merck & Co')         
     ]
maps_con=[(' Business Machine', 'IBM'),
          ('The University Of California','The University Of California'),
          ('University Of Georgia','University Of Georgia Research Foundation'),
          ('Princeton University','Princeton University'),
          ('University Of British Columbia',' The University Of British Columbia'),
          ('The University Of Texas','The University Of Texas'),
          ('University Of Southern California','University Of Southern California'),
         ('Fujifilm ','Fujifilm'),
          ('Michigan State University','Michigan State University'),
         ('Canon ', 'Canon'),
          ('University Of Chicago','The University Of Chicago'),
          ('Hoffmannla','Roche'),
          ('Roche','Roche'),
          ('Toshiba','Toshiba'),
          ('Ecole','Ecole'),
          ('Dow Chemical', 'Dow Chemical'),
          ('Dow Corning', 'Dow Corning'),
          ('Dow Silicones','Dow Corning'),
          ('Lucent','Alcatel-Lucent'),
         ('Fujitsu','Fujitsu'),
          ('Northwestern University','Northwestern University'),
          ('Brigham And Womens','Brigham And Womens Hospital'),
          ('Timken','Timken & Co'),
          ('Nippon ','Nippon'),
         ('Sony ','Sony'),
          ('Smithkline','Smithkline'),
          ('United Tech','United Technologies Corp'),
          ('Largan Precision','Largan Precision'),
          ('Qualcomm','Qualcomm'),
          ('Novozymes','Novozymes'),
          ('Childrens Medical Center','Childrens Medical Center Corporation'),
          ('Philips ','Philips'),
          ('Broadcom ','Broadcom'),
          ('Applied Materials','Applied Materials'),
          ('Lg ','LG'),
          ('Baidu ','Baidu'),
          ('Ford Motor','Ford'),
          ('Louisiana State','Louisiana State University'),
          ('Ford Global','Ford'),
          ('Novartis','Novartis'),
          ('Broad ','Broad Institute Inc'),
          ('Hewlett Packard','Hewlett Packard'),
          ('Hewlettpackard','Hewlett Packard'),
          ('Lm Ericsson','Lm Ericsson'),
          ('Syngenta Participations','Syngenta Participations'),
          ('General Electric','General Electric'),
          ('Epson','Epson'),
          ('Comcast','Comcast'),
          ('Bosch','Bosch'),
          ('Huawei','Huawei'),
          ('Kia Silverbrook','Silverbrook Kia'),
          ('Silverbrook Re', 'Silverbrook Research'),
          ('Du Pont','Du Pont'),
          ('3M Innovative','3M Innovative Properties'),
          ('Nec ','NEC Corporation'),
          ('Xerox','Xerox Corporation'),
          ('Cisco','Cisco'),
          ('Bristolmyers Squibb','Bristolmyers Squibb Company'),
          ('L M Ericsson','Lm Ericsson'),
          ('Texas Instruments','Texas Instruments'),
          ('Toyota','Toyota'),
          ('Johns Hopkins','The John Hopkins University'),
          ('John Hopkins','The John Hopkins University'),
          ('Honeywell','Honeywell'),
          ('Schlumberger','Schlumberger'),
          ('Nortel ','Nortel'),
          ('Nikon','Nikon'),
          ('Matsushita Electric','Matsushita Electric'),
          ('Bayer','Bayer'),
          ('National Instruments','National Instruments'),
          ('Loreal','Loreal'),
          ('Pfizer','Pfizer'),
          ('Alnylam Pharmaceuticals','Alnylam Pharmaceuticals'),
          ('Honda Motor','Honda'),
          ('Uab', 'Uab Foundation')
          ('Boeing','The Boeing Company'),
          ('University Of Massachusetts','University Of Massachusetts'),
          ('Procter ','Procter & Gamble'),
         ('Cambridge','Cambridge Ltd')
          
         ]

for comp in tqdm(maps_con):
    df.loc[df['Applicant'].str.contains(comp[0]), 'Applicant']=comp[1]
for comp in tqdm(maps_sw):
    df.loc[df['Applicant'].str.startswith(comp[0]), 'Applicant']=comp[1]

100%|██████████| 77/77 [00:04<00:00, 17.34it/s]
100%|██████████| 161/161 [00:08<00:00, 18.90it/s]


In [15]:
for comp in tqdm(maps_con):
    df['Applicants']=df['Applicants'].apply(lambda x: [comp[1] if comp[0] in y else y.upper() for y in x])
for comp in tqdm(maps_sw):
    df['Applicants']=df['Applicants'].apply(lambda x: [comp[1] if (y.startswith(comp[0])) else y.upper() for y in x])

100%|██████████| 77/77 [00:17<00:00,  4.42it/s]
100%|██████████| 161/161 [00:38<00:00,  4.14it/s]


In [16]:
for kw in tqdm(government, desc='Gov'):
    df.loc[df['Applicant'].str.contains(kw), 'type']='Government'

for kw in tqdm(research, desc='Research Institutes'):
    df.loc[df['Applicant'].str.contains(kw), 'type']='Research Institute'
    
for kw in tqdm(universities, desc='Universities'):
    df.loc[df['Applicant'].str.contains(kw), 'type']='University'

for kw in tqdm(companies, desc='Companies'):
    df.loc[df['Applicant'].str.contains(kw), 'type']='Company'

for kw in tqdm(companies_ew, desc='Companies Endswith'):
    df.loc[df['Applicant'].str.endswith(kw), 'type']='Company'
    
df.loc[df['Applicant'].str.split().str.len()==1, 'type']='Company'

df.loc[df['type']=='Individual', 'Applicant']=df[df['type']=='Individual']['Applicant'].progress_apply(lambda x: inv_pro(x))

Gov: 100%|██████████| 7/7 [00:00<00:00, 17.56it/s]
Research Institutes: 100%|██████████| 16/16 [00:00<00:00, 18.12it/s]
Universities: 100%|██████████| 6/6 [00:00<00:00, 17.82it/s]
Companies: 100%|██████████| 127/127 [00:06<00:00, 18.36it/s]
Companies Endswith: 100%|██████████| 24/24 [00:01<00:00, 22.18it/s]
100%|██████████| 13676/13676 [00:00<00:00, 505323.59it/s]


In [17]:
df['type'].value_counts()

Company               82632
Individual            13676
University             4327
Research Institute     1749
Government              195
Name: type, dtype: int64

In [18]:
df[['Application Id', 'Application Year', 'Full Text', 'Applicant', 'type']].to_excel('/Users/cotraak/Desktop/Research/WIPO Process/Data/wipo_ai_processed2.xlsx', index=False)
df[['Application Id','Application Year','Full Text','IPC main','IPC subclass']].to_excel('/Users/cotraak/Desktop/Research/WIPO Process/Data/wipo_ai_ipcs.xlsx',index=False)

In [19]:
dtf=df[['Application Id', 'Application Year', 'Full Text', 'Applicant', 'type']]

In [20]:
dtf.head()

Unnamed: 0,Application Id,Application Year,Full Text,Applicant,type
0,US42597370,2000,bacterial strains genes and enzymes for contro...,Institute Of Molecular Agrobiology,Research Institute
1,US39458207,2000,reversible rolling method and reversible rolli...,Hitachi,Company
2,WO2001024205,2000,composite magnetic sheet and method of produci...,Tokin Corporation,Company
3,US39488862,2000,compounding assembly for nutritional fluids an...,Baxter International Inc,Company
4,US39579616,2000,hydrotreating catalyst for hydrotreating hydro...,Tonen Corporation,Company
