# Scraping OECD's Glossary of Statistical Terms and writing the results to the Content Database

### https://stats.oecd.org/glossary/

In [1]:
import urllib.request as urllib2 
from urllib.request import urlopen

import bs4
from bs4 import BeautifulSoup

import re
import pandas as pd

import requests

import numpy as np
from operator import itemgetter

### Scraping

* Because there are gaps in the ID sequence of the Glossary articles, we scan every ID up to a sufficiently large number.
* The section headings are not standardized across articles. All heading names encountered are used as delimiters to split the page text.

In [2]:
base_url = "https://stats.oecd.org/glossary/detail.asp?ID={}"

## Headings used to cut the page text into fields. They are passed to re.search
## as patterns, so any entry added here must stay free of regex metacharacters.
## NOTE(review): the bare 'Definition' entry presumably matches the page heading and
## therefore only serves as the delimiter that ends the term - confirm on a live page.
sections = ['Definition', 'Definition:', 'Statistical Theme:', 'Context:',
            'Last updated on', 'Created on', 'Cross References:', 'Source Publication:',
            'French Equivalent:', 'Hyperlink:', 'Glossary Output Segments:',
            'Version Indicator:', 'Classification Indicator:', 'French Definition:']


num_articles = 8000
request_timeout = 30 ## seconds; a stalled connection must not block the whole scan


def _clean_page_text(raw_text):
    """Collapse whitespace in the page text and strip the site-wide boilerplate.

    The substitutions are order-sensitive: runs of blanks are squeezed again
    after '\\r' has been turned into a blank.
    """
    text = re.sub(r'(\n)+', ' ', raw_text)
    text = re.sub(r'( )+', ' ', text)
    text = text.replace('\xa0', '')
    text = text.replace('\r', ' ')
    text = re.sub(r'( )+', ' ', text)
    text = text.replace('OECD Glossary of Statistical Terms -', "")
    text = text.replace('Glossary Home About Contact Us Downloadable Version Advanced Filter Web Service OECD Statistics', "")
    return text


def _split_sections(text, section_names):
    """Cut `text` at the first occurrence of every heading in `section_names`.

    Returns a pair (term, fields):
      * term   - the text before the earliest heading, or None if no heading occurs;
      * fields - dict {heading: text between this heading and the next one}.
    The earliest heading only delimits the term; its own content is not collected.
    """
    matches = []
    for section in section_names:
        found = re.search(section, text)
        if found is not None:
            matches.append((found.start(), found.end(), section))
    ## stable sort: headings starting at the same position keep their order in `section_names`
    matches.sort(key=itemgetter(0))
    if not matches:
        return None, {}

    term = text[:matches[0][0]].strip()
    fields = {}
    for k in range(1, len(matches)):
        end, column = matches[k][1], matches[k][2]
        ## the content runs up to the next heading, or to the end of the text for the last one
        next_start = matches[k+1][0] if k+1 < len(matches) else len(text)
        fields[column] = text[end:next_start].strip()
    return term, fields


records = []     ## one dict per scanned ID; turned into a DataFrame in one go below
failed_ids = []  ## IDs whose request raised, kept for inspection or a later re-run

with requests.Session() as session:  ## one session so connections are reused across requests
    for n in range(1,num_articles+1):
        if n % 100 ==0: print(n)
        scrape_url = base_url.format(n)
        record = {'ID': n, 'URL': scrape_url}
        records.append(record)
        try:
            res = session.get(scrape_url, timeout=request_timeout)
        except requests.RequestException as err:
            ## keep the row with only ID/URL and carry on with the next ID
            failed_ids.append(n)
            print('ID = ', n, ': request failed: ', err)
            continue
        soup = bs4.BeautifulSoup(res.content,'lxml')

        ## every link of the form detail.asp?ID=<number> found on the page, as an absolute URL
        hrefs = " ".join(str(anchor.get('href')) for anchor in soup.find_all('a'))
        cross_ref_urls = ['https://stats.oecd.org/glossary/'+lnk
                          for lnk in re.findall(r'detail\.asp\?ID=\d+', hrefs)]

        term, fields = _split_sections(_clean_page_text(soup.get_text()), sections)
        if term is None:
            continue ## no heading found: the row keeps only ID and URL
        record['Term'] = term
        record['URL:Cross References'] = ','.join(cross_ref_urls)
        record.update(fields)

if failed_ids:
    print('failed requests for IDs: ', failed_ids)

## index labels equal the article IDs (1..num_articles); missing fields become NaN
OECD_df = pd.DataFrame(records, index=range(1, num_articles+1))

OECD_df_keep =  OECD_df.copy(deep=True) ## just a copy to be able to change things without re-running this time-consuming part

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000


In [14]:
## Start again from the untouched scrape result, keep only the rows that have
## both a term and a definition, and renumber them from 0.
OECD_df = (
    OECD_df_keep
    .copy(deep=True)
    .dropna(subset=['Term', 'Definition:'])
    .reset_index(drop=True)
)
OECD_df['ID'] = OECD_df['ID'].astype('int32')

OECD_df

Unnamed: 0,ID,URL,Term,URL:Cross References,French Equivalent:,Definition:,Cross References:,Statistical Theme:,Created on,Last updated on,Source Publication:,Context:,Hyperlink:,Glossary Output Segments:,Classification Indicator:,Version Indicator:,French Definition:
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,Réduction,See Pollution abatement.,Pollution abatement,Environmental statistics,"Tuesday, September 25, 2001","Thursday, March 14, 2002",,,,,,,
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,,Absence from work due to illness refers to the...,,Health statistics,"Tuesday, September 25, 2001","Thursday, November 22, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,,Functional limitation-free life expectancy is ...,,Health statistics,"Tuesday, September 25, 2001","Wednesday, October 31, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care is one in which the principal inten...,Acute care beds Acute care hospital staff rati...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",OECD Health Data 2001: A Comparative Analysis ...,,,,,,
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care beds are beds accommodating patient...,Acute care Long-term care beds in hospitals,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",2001 Data Collection on Education Systems: Def...,Acute care beds have alternatively been define...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6931,7352,https://stats.oecd.org/glossary/detail.asp?ID=...,European Agricultural Fund for Rural Developme...,https://stats.oecd.org/glossary/detail.asp?ID=...,,The Common Agricultural Policy (CAP) is financ...,Common Agricultural Policy (CAP) European Agri...,,"Wednesday, April 3, 2013","Wednesday, April 3, 2013","European Commission, Agriculture and Rural Dev...",,http://ec.europa.eu/agriculture/cap-funding/in...,,,,
6932,7354,https://stats.oecd.org/glossary/detail.asp?ID=...,Carbon market,https://stats.oecd.org/glossary/detail.asp?ID=...,,A popular (but misleading) term for a trading ...,Greenhouse gases,,"Thursday, April 4, 2013","Thursday, April 4, 2013",United Nations Framework Convention on Climate...,,http://unfccc.int/essential_background/glossar...,,,,
6933,7355,https://stats.oecd.org/glossary/detail.asp?ID=...,Classification structure,https://stats.oecd.org/glossary/detail.asp?ID=350,,Refers to how the categories of a classificati...,Classification,,"Tuesday, April 9, 2013","Tuesday, April 9, 2013","United Nations Statistics Division, n.d. UN Gl...",,http://unstats.un.org/unsd/class/family/glossa...,,,,
6934,7356,https://stats.oecd.org/glossary/detail.asp?ID=...,United Nation Framework Convention on Climate ...,https://stats.oecd.org/glossary/detail.asp?ID=...,,The United Nations Framework Convention on Cli...,United Nations Conference on Environment and D...,,"Tuesday, April 9, 2013","Friday, April 26, 2013",United Nations Framework Convention on Climate...,"The other “Rio Conventions”, also negotiated a...",http://unfccc.int/2860.php,,,,


### Check cross-references

* Reason: some cross-references point to IDs that do not exist.
* Add a column 'Cross_References_2' holding the terms of the cross-referenced articles, separated by semicolons.

In [15]:
## Look-up table article ID -> term, built once instead of scanning the ID column
## for every single reference. setdefault keeps the first row if an ID were repeated.
term_by_id = {}
for article_id, term in zip(OECD_df['ID'], OECD_df['Term']):
    term_by_id.setdefault(int(article_id), term)

related_terms = []
for url_list in OECD_df['URL:Cross References']:
    if url_list.strip() == '':
        related_terms.append(np.nan) ## no cross-references: leave the cell empty
        continue
    titles = []
    for link in url_list.split(','):
        id_match = re.search(r'\d+$', link) ## the article ID is the trailing number of the URL
        if id_match is None:
            continue
        term = term_by_id.get(int(id_match.group()))
        ## references to IDs that are not in the frame (None) or have no term (NaN) are skipped
        if pd.notna(term):
            titles.append(term)
    related_terms.append(';'.join(titles))

## assigned as a whole column, so it exists even when no row has any cross-reference
OECD_df['Cross_References_2'] = related_terms

OECD_df

Unnamed: 0,ID,URL,Term,URL:Cross References,French Equivalent:,Definition:,Cross References:,Statistical Theme:,Created on,Last updated on,Source Publication:,Context:,Hyperlink:,Glossary Output Segments:,Classification Indicator:,Version Indicator:,French Definition:,Cross_References_2
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,Réduction,See Pollution abatement.,Pollution abatement,Environmental statistics,"Tuesday, September 25, 2001","Thursday, March 14, 2002",,,,,,,,Pollution abatement
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,,Absence from work due to illness refers to the...,,Health statistics,"Tuesday, September 25, 2001","Thursday, November 22, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,,Functional limitation-free life expectancy is ...,,Health statistics,"Tuesday, September 25, 2001","Wednesday, October 31, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care is one in which the principal inten...,Acute care beds Acute care hospital staff rati...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",OECD Health Data 2001: A Comparative Analysis ...,,,,,,,Acute care beds;Acute care hospital staff rati...
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care beds are beds accommodating patient...,Acute care Long-term care beds in hospitals,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",2001 Data Collection on Education Systems: Def...,Acute care beds have alternatively been define...,,,,,,Acute care;Long-term care beds in hospitals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6931,7352,https://stats.oecd.org/glossary/detail.asp?ID=...,European Agricultural Fund for Rural Developme...,https://stats.oecd.org/glossary/detail.asp?ID=...,,The Common Agricultural Policy (CAP) is financ...,Common Agricultural Policy (CAP) European Agri...,,"Wednesday, April 3, 2013","Wednesday, April 3, 2013","European Commission, Agriculture and Rural Dev...",,http://ec.europa.eu/agriculture/cap-funding/in...,,,,,Common Agricultural Policy (CAP);European Agri...
6932,7354,https://stats.oecd.org/glossary/detail.asp?ID=...,Carbon market,https://stats.oecd.org/glossary/detail.asp?ID=...,,A popular (but misleading) term for a trading ...,Greenhouse gases,,"Thursday, April 4, 2013","Thursday, April 4, 2013",United Nations Framework Convention on Climate...,,http://unfccc.int/essential_background/glossar...,,,,,Greenhouse gases
6933,7355,https://stats.oecd.org/glossary/detail.asp?ID=...,Classification structure,https://stats.oecd.org/glossary/detail.asp?ID=350,,Refers to how the categories of a classificati...,Classification,,"Tuesday, April 9, 2013","Tuesday, April 9, 2013","United Nations Statistics Division, n.d. UN Gl...",,http://unstats.un.org/unsd/class/family/glossa...,,,,,Classification
6934,7356,https://stats.oecd.org/glossary/detail.asp?ID=...,United Nation Framework Convention on Climate ...,https://stats.oecd.org/glossary/detail.asp?ID=...,,The United Nations Framework Convention on Cli...,United Nations Conference on Environment and D...,,"Tuesday, April 9, 2013","Friday, April 26, 2013",United Nations Framework Convention on Climate...,"The other “Rio Conventions”, also negotiated a...",http://unfccc.int/2860.php,,,,,United Nations Conference on Environment and D...


### Some cleaning of the data

In [16]:
import unicodedata

## Columns that are not carried over to the database table.
## TODO: 'French Definition:' could be dropped here as well.
unused_columns = ['French Equivalent:',
                  'Glossary Output Segments:', 'Classification Indicator:', 'Version Indicator:',
                  'Created on', 'Source Publication:', 'Hyperlink:']

## Scraped heading -> name used from here on.
new_names = {'Term': 'term',
             'Definition:': 'definition',
             'Statistical Theme:': 'theme',
             'Cross_References_2': 'related',
             'Context:': 'context',
             'URL:Cross References': 'related_URL',
             'Last updated on': 'last_update'}

## Missing values become empty strings; the printed counts show what is left.
OECD_df = (
    OECD_df
    .drop(columns=unused_columns)
    .rename(columns=new_names)
    .fillna(value='')
)
print(OECD_df.isnull().sum())

## NFKD-normalize the terms (for the dashes in terms); 'related' is left as scraped.
OECD_df['term'] = OECD_df['term'].map(lambda term: unicodedata.normalize('NFKD', term))

OECD_df

ID                    0
URL                   0
term                  0
related_URL           0
definition            0
Cross References:     0
theme                 0
last_update           0
context               0
French Definition:    0
related               0
dtype: int64


Unnamed: 0,ID,URL,term,related_URL,definition,Cross References:,theme,last_update,context,French Definition:,related
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Pollution abatement,Environmental statistics,"Thursday, March 14, 2002",,,Pollution abatement
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,,Health statistics,"Thursday, November 22, 2001",,,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,,Health statistics,"Wednesday, October 31, 2001",,,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Acute care beds Acute care hospital staff rati...,Health statistics,"Thursday, April 25, 2013",,,Acute care beds;Acute care hospital staff rati...
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Acute care Long-term care beds in hospitals,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,,Acute care;Long-term care beds in hospitals
...,...,...,...,...,...,...,...,...,...,...,...
6931,7352,https://stats.oecd.org/glossary/detail.asp?ID=...,European Agricultural Fund for Rural Developme...,https://stats.oecd.org/glossary/detail.asp?ID=...,The Common Agricultural Policy (CAP) is financ...,Common Agricultural Policy (CAP) European Agri...,,"Wednesday, April 3, 2013",,,Common Agricultural Policy (CAP);European Agri...
6932,7354,https://stats.oecd.org/glossary/detail.asp?ID=...,Carbon market,https://stats.oecd.org/glossary/detail.asp?ID=...,A popular (but misleading) term for a trading ...,Greenhouse gases,,"Thursday, April 4, 2013",,,Greenhouse gases
6933,7355,https://stats.oecd.org/glossary/detail.asp?ID=...,Classification structure,https://stats.oecd.org/glossary/detail.asp?ID=350,Refers to how the categories of a classificati...,Classification,,"Tuesday, April 9, 2013",,,Classification
6934,7356,https://stats.oecd.org/glossary/detail.asp?ID=...,United Nation Framework Convention on Climate ...,https://stats.oecd.org/glossary/detail.asp?ID=...,The United Nations Framework Convention on Cli...,United Nations Conference on Environment and D...,,"Friday, April 26, 2013","The other “Rio Conventions”, also negotiated a...",,United Nations Conference on Environment and D...


### Local file for inspection

In [17]:
import datetime

## The file name carries month, day, hour and minute of the export (not zero-padded).
current_time = datetime.datetime.now()
outfile = 'OECD_final_results_2_{}_{}_{}_{}.xlsx'.format(
    current_time.month, current_time.day, current_time.hour, current_time.minute)

OECD_df.to_excel(outfile)


### Write to the database

In [19]:
import getpass
import os

import pyodbc

## the definition of the table
##create table "ESTAT"."V1"."OECD_Glossary" 
##( 
##  "id" INTEGER, 
##  "article_id" INTEGER, <- this is OECD's id 
##  "term" VARCHAR, 
##  "url" VARCHAR, 
##  "definition" LONG VARCHAR, 
##  "context" LONG VARCHAR, 
##  "theme" VARCHAR, 
##  "related" VARCHAR, 
##  "related_url" VARCHAR, 
##  "last_update" VARCHAR, 
##  PRIMARY KEY ("id") 
##); 

## Credentials are read from the environment instead of being stored in the notebook.
## ESTAT_DB_UID is required; the password is prompted for if ESTAT_DB_PWD is not set.
## NOTE(review): a password was previously hard-coded in this cell - it should be rotated.
db_uid = os.environ['ESTAT_DB_UID']
db_pwd = os.environ.get('ESTAT_DB_PWD') or getpass.getpass('Database password for {}: '.format(db_uid))

sql = """INSERT INTO ESTAT.V1.OECD_Glossary (id,article_id,term,url,definition,context,theme,related,related_url,last_update)
         VALUES (?,?,?,?,?,?,?,?,?,?)"""

## One parameter tuple per row, in the column order of the INSERT statement.
## "id" is the 1-based row position; "article_id" is converted to a plain Python int.
insert_columns = ['ID', 'term', 'URL', 'definition', 'context', 'theme',
                  'related', 'related_URL', 'last_update']
rows = [(row_id, int(values[0])) + tuple(values[1:])
        for row_id, values in enumerate(
            OECD_df[insert_columns].itertuples(index=False, name=None), start=1)]

c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID={};PWD={}'.format(db_uid, db_pwd))
try:
    cursor = c.cursor()
    try:
        if rows: ## executemany rejects an empty parameter list
            cursor.executemany(sql, rows)
        c.commit()
    except Exception:
        c.rollback() ## do not leave a partially inserted glossary behind
        raise
    finally:
        cursor.close()
finally:
    c.close()