# Scraping OECD's Glossary of Statistical Terms and writing the results to the Content Database

### Revised (January 2022) to correct an error

### https://stats.oecd.org/glossary/

In [1]:
import urllib.request as urllib2 
from urllib.request import urlopen

import bs4
from bs4 import BeautifulSoup

import re
import pandas as pd

import requests

import numpy as np
from operator import itemgetter

### Scraping

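* Because the IDs of the Glossary articles are not contiguous (some IDs are unused), we scan every ID up to a sufficiently large upper bound.
* The section headings are not standardised across articles. All the heading names encountered are listed in `sections` and used to delimit the text of each field.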

In [2]:
## Scrape every Glossary article page, ID by ID, and store one row per ID in OECD_df.
## For each page the visible text is split on the section headings listed in 'sections':
## the text before the earliest heading becomes 'Term', and the text following each later
## heading is stored in a column named after that heading.

base_url = "https://stats.oecd.org/glossary/detail.asp?ID={}"

## NOTE: the earliest heading found in a page only marks the end of the Term; the text between
## it and the next heading is not stored. When two headings start at the same position the one
## listed first here is treated as the earlier one (the sort below is stable), so keep the order.
sections = ['Definition','Definition:','Statistical Theme:','Context:',
            'Last updated on','Created on','Cross References:','Source Publication:','French Equivalent:','Hyperlink:','Glossary Output Segments:','Version Indicator:','Classification Indicator:','French Definition:']


num_articles = 8000
request_timeout = 30 ## seconds; by default requests never times out, so one stalled page would hang the run

## Index the frame with the same 1..num_articles labels the loop writes to
## (an index starting at 0 leaves a permanently empty first row).
OECD_df=pd.DataFrame(index=range(1,num_articles+1))

for n in range(1,num_articles+1):
    if n % 100 ==0: print(n)
    scrape_url = base_url.format(n)
    OECD_df.loc[n,'ID'] = n
    OECD_df.loc[n,'URL'] = scrape_url
    try:
        res = requests.get(scrape_url,timeout=request_timeout)
    except requests.exceptions.RequestException as err:
        ## report and move on: the row keeps only ID and URL, like a page that did not return 200
        print(n,'request failed:',err)
        continue
    if res.status_code != 200: continue
    soup = bs4.BeautifulSoup(res.content,'lxml')

    ## absolute URLs of all links on the page that point to another Glossary article
    links = soup.find_all('a',href=re.compile(r'detail\.asp\?ID=\d+'))
    p = ['https://stats.oecd.org/glossary/'+l.get('href') for l in links]

    ## flatten the page text: collapse newlines and repeated blanks, drop non-breaking spaces
    ## and remove the page title prefix and the navigation bar
    text=soup.get_text()
    text=re.sub(r'(\n)+',' ',text)
    text=re.sub(r'( )+',' ',text)
    text=text.replace('\xa0','')
    text=text.replace('\r',' ')
    text=re.sub(r'( )+',' ',text)
    text = text.replace('OECD Glossary of Statistical Terms -', "")
    text = text.replace('Glossary Home About Contact Us Downloadable Version Advanced Filter Web Service OECD Statistics', "")

    ## first occurrence of every heading present in the text, as (start, end, heading);
    ## the headings are plain literals, so a single str.find per heading is enough
    matches = []
    for section in sections:
        start = text.find(section)
        if start != -1:
            matches.append((start,start+len(section),section))
    matches.sort(key=itemgetter(0))

    for k,(s1,s2,column) in enumerate(matches):
        ## the text of a section ends where the next heading starts (or at the end of 'text' for the last one)
        s1_next = matches[k+1][0] if k+1 < len(matches) else len(text)

        if k == 0: ## first to be collected: Term (everything before the earliest heading)
            cand_term = text[:s1].strip()
            test = re.search('"> ',cand_term) ## an anomaly in https://stats.oecd.org/glossary/detail.asp?ID=2216
            if test is not None:
                _,sb = test.span(0)
                cand_term = cand_term[(sb+1):].strip()
            OECD_df.loc[n,'Term'] = cand_term
            print(n,OECD_df.loc[n,'Term'])
            ## stored once per article, and only for pages where at least one heading was found
            OECD_df.loc[n,'URL:Cross References']=','.join(p)
        else:
            OECD_df.loc[n,column] = text[s2:s1_next].strip()

OECD_df_keep =  OECD_df.copy(deep=True) ## just a copy to be able to change things without re-running this time-consuming part

1 Abatement
2 Absence from work due to illness
3 Activity restriction - free expectancy
4 Acute care
5 Acute care beds
6 Administrative data collection
7 Administrative source
8 Abnormal obsolescence
10 Academic staff
11 Academic support for students
12 Accessibility (as a statistical data quality dimension)
13 Account
14 Accountability (environmental)
16 Accrual accounting
18 Accrued interest
19 Accumulation
20 Accumulation accounts
21 Accuracy
22 Acid rain
23 Aquaculture
24 Acquisitions
25 Acquisition price
26 Acquisition, time of
28 Active labour market programmes
29 Active metainformation system
30 Activity
32 Activity classification
33 Actual final consumption of general government
34 Actual final consumption of households
35 Actual final consumption of non-profit institutions serving households
36 Actual individual consumption
37 Actual social contributions
40 Acute health conditions
41 Adaptation
42 Adapted products
43 ADB
44 Add-factor
45 Additivity - PPI
46 Adjusted disposable

351 Classification of environment protection activities (CEPA)
352 Classification of individual consumption by purpose (COICOP)
353 Classification of outlays of producers by purpose (COPP)
354 Classification of the functions of government (COFOG)
355 Classification of the purposes of non-profit institutions (COPNI)
356 Classification unit
357 Classification of visitors
358 Classification scheme
359 Climate
360 Climate change
362 Climate Convention
363 Climate protection
364 Closed ecological system
365 Closed-end investment company
366 Closed season
367 Cluster samples
368 Coal, oil and natural gas reserves
369 Coarse grains
371 Codex Alimentarius
372 Codex Alimentarius Commission
373 Coefficient table
374 COFOG (classification of the functions of government)
375 Coherence
376 Cohort
377 COICOP (classification of individual consumption by purpose)
378 Collective consumption service
379 Collective households
380 Collective living quarters
382 Co-management
383 Command-and-control policy

682 Due-for-payment recording
683 Dump
684 Dumping (of waste)
685 Dumping at sea
686 Durability of a building
687 Durable good
688 Duration of a fixed income instrument
689 Duration of marriage
690 Duration of stay
691 Duration of unemployment
692 Dust
693 Dust arrester
694 Dwelling - SNA
695 Dwelling – UN
696 Dwelling construction (activity)
697 Dwelling occupancy status
698 Dynamic Asian Countries
699 Dystrophic water
700
700 EAGGF
701 Early foetal death
702 Early retirement for labour market reasons
704 Earnings (wages and salaries) – ILO
705 Earnings (wages and salaries) – UN
706 Earnings per share
707 Earth Summit
708 Earthwatch
709 ECB
710 Eco-development
711 Eco-domestic product
712 Ecological amplitude
713 Ecological balance
714 Ecological dominance
715 Ecological equilibrium
716 Ecological ethics
717 Ecological footprint
718 Ecological impact
719 Ecological statistics
720 Ecology
721 Economic assets
723 Economic instruments (environmental protection policy)
724 Economic intere

1000 Flag of convenience countries
1001 Flag state (for fishing vessel)
1002 Flexible inputs
1004 Floor area
1005 Flow series / data
1006 Economic flows
1007 Flows in real terms
1008 Fluorocarbon
1009 Fob price
1011 Foetal death rate
1012 Follow-up survey (or multi-round survey or multi-phase survey)
1013 Food and Agriculture Organisation (FAO)
1014 Food balance sheet
1015 Food chain
1016 Food web
1017 Foot and mouth disease
1018 Foreign affiliate
1019 Foreign assets
1020 Foreign bank
1021 Foreign border workers
1022 Foreign-born population of a country
1023 Foreign business travellers
1024 Foreign controlled corporations (non- financial and financial)
1026 Foreign currency transactions (banking)
1027 Foreign diplomatic and consular personnel
1028 Foreign direct investment
1031 Foreign direct investment enterprise
1032 Foreign exchange companies
1033 Foreign exchange rate
1034 Foreign exchange reserves
1035 Foreign exchange swap
1036 Financial guarantee corporations
1037 Foreigners adm

1293 Implicit price index based on constant exchange rates (or constant international prices) of period t0
1294 Implicit price index based on constant exchange rates (or constant international prices) of period t
1295 Import coverage ratio
1296 Import duties
1297 Import quotas
1298 Import subsidies
1299 Imports of goods and services – SNA
1300
1300 Imports of goods and services – UN
1302 Import price index
1303 Improvised housing units
1305 Imputed expenditure
1306 Imputed social contributions
1307 Inbound tourism
1308 Inbound tourism consumption
1309 Incineration
1310 Incineration at sea
1311 Incineration with recovery of energy
1312 Incinerator
1313 Income – SNA
1314 Income – Eurostat
1315 Net income from abroad
1317 Income on debt (interest accrued)
1318 Income on equity
1319 Incomes of health and social workers (average)
1320 Incomes of physicians, general practitioners, dentists
1321 Incorporated enterprise
1322 Incremental product innovation
1324 Laspeyres index number
1325 Index

1588 Manufacturing or industrial milk
1589 Margin (trade)
1590 Margin (transport)
1591 Marginal land
1592 Marginal settlements
1593 Margin (financial)
1594 Marine Mammal Protection Act (MMPA)
1595 Marine park
1596 Marine pollution
1597 Marital status
1599 Market access
1600
1600 Market Price Support (MPS)
1602 Market transfers
1603 Marketable securities
1604 Market establishments
1605 Market non-profit institutions serving businesses
1606 Market output – SNA
1607 Market output – ESA
1608 Market prices – SNA
1609 Market prices – BPM
1610 Market price equivalents
1612 Market producers – SNA
1613 Market producers – ESA
1615 Market services – ISIC
1616 Market services – NACE
1617 Market valuation
1620 Marketing agency (or board)
1621 Marketing Assistance Loan Programme
1623 Marketing loan (United States)
1624 Marketing orders (United States)
1625 Oil meal marketing year
1626 Oilseed oil marketing year
1627 Marriage
1628 Marriage order
1629 Materials and energy balances
1630 Maternal mortal

1893 Official development finance (ODF)
1894 Official reserves
1895 Offshore banking centres
1896 Offshore enterprises
1897 Offstream fish farming
1898 Offstream use of water
1899 Oilmeal
1900
1900 Oil prices (IEA)
1901 Oilseeds
1902 Oil spill
1903 Old age cash benefits
1904 One-hoss shay
1906 Original maturity of an instrument
1907 Open burning
1908 Open-ended investment company
1909 Open land
1910 Open market operation
1911 Operating lease
1912 Operating surplus
1914 Opportunity cost
1915 Optimum yield (in fish harvest)
1916 Options – SNA
1917 Options – BPM
1919 Organic compounds
1920 Organic farming
1921 Organic fertilizers
1922 Organism
1923 Organophosphates
1924 Oriented basic research
1925 Other accounts receivable / payable
1926 Other accumulation entries
1927 Other buildings and structures
1928 Other buildings
1929 Other capital
1930 Other capital taxes n.e.c.
1931 Other capital transfers
1932 Other changes in assets account
1933 Other changes in the volume of assets account
19

2200 Public utilities
2201 Purchased goodwill
2202 Purchaser’s prices – SNA
2203 Purchaser’s prices – ESA
2204 Purchasing power parity (PPPs) – SNA
2205 Purchasing power parities (PPPs) – OECD
2206 Pure basic research
2207 Purpose
2208 Purpose of aid
2209 Purpose of visit
2210 PVA
2211 Qualifier
2212 Qualifier term
2214 Qualitative errors
2215 Quality – Eurostat
2216 Quality – IMF
2217 Quality - National
2218 Quality of life
2219 Quantitative data
2221 Quantity index
2222 Quantity relative
2223 Quantum index
2225 Quasi-corporations
2227 Question
2228 Questionnaire
2230 Questionnaire design
2231 Radiation therapy equipment
2232 Radioactive waste
2233 Rainforest
2234 Range management
2235 Rare species
2236 Rates of change
2237 Six month rate of change (OECD CLIs)
2238 Twelve month rate of change (OECD composite leading indicators)
2239 Rate of natural increase
2241 Ratio to trend
2242 Raw sewerage
2243 Real effective exchange rates
2244 Real gross domestic income (real GDI)
2245 Real hol

2520 Specialised surveys
2521 Species
2522 Specific-rate tariff
2523 Spot price
2524 Spot rate
2525 Spread
2526 SPS Agreement
2528 Stabilisation funds (Canada)
2529 Stabilisation payment
2530 Stability (of ecosystem)
2531 Stamp taxes
2532 Standard
2533 Standard efficiency units
2534 Standardised data element
2536 Standby credit
2537 State Agricultural Intervention Fund – Czech Republic (SAIF)
2538 State government
2539 State indicator
2540 Stateless persons
2541 State trading enterprise (or body)
2542 Statistical concept
2543 Statistical data
2544 Statistical data collection
2545 Data editing
2546 Statistical ecology
2547 Statistical indicator
2551 Statistical metadata repository
2554 Statistical production
2555 Statistical territory – UN
2556 Statistical territory of the European Union – Eurostat
2557 Statistical unit – ISIC
2558 Statistical unit – Eurostat
2559 Stockholm Declaration
2560 Stock series / data
2561 Stock (of fish)
2562 Stocks – SNA
2563 Stocks – ESA
2564 Stocks (distrib

2839 Value added, gross
2840 Value added, net
2841 Value added - basic prices – SNA
2842 Value added - basic prices – NACE
2845 Value added function
2846 Value added tax (VAT) – SNA
2847 Value added tax (VAT) – ESA
2848 Tourism value added
2849 Value domain
2850 Value of work put in place (construction)
2851 Values at current international prices (at current PPPs)
2852 Values at current exchange rates (at current USD)
2853 Values at constant international prices of period t0 (at PPPs of period t0)
2854 Values at constant exchange rates of period t0
2855 Values at constant international prices of period t-1 (at PPPs of period t-1)
2856 Values at constant exchange rates of period t-1
2857 Variable
2858 Variable rate
2859 Value added tax (VAT), deductible
2860 Value added tax (VAT), invoiced
2861 Value added tax (VAT), non-deductible
2862 Vegetable oil
2863 Vehicles
2864 Version
2865 Version identifier
2866 Vertical integration (of a fishery)
2867 Vertically integrated enterprise
2868 Vis

3161 Combination
3162 Common cartel
3163 Competition
3165 Concentration
3169 Concerted action or practice
3170 Conglomerate
3171 Conglomerate merger
3172 Conscious parallelism
3173 Consolidation (of firms)
3174 Conspiracy
3175 Constant returns to scale
3176 Consumers' surplus
3177 Consumer welfare
3178 Contestability
3179 Constable markets
3180 Control of enterprises
3181 Costs
3182 Countervailing power
3183 Cournot (Nash) Equilibrium
3184 Crisis cartel
3185 Cross price elasticity of demand
3186 Cut-throat competition
3187 Dead-weight welfare loss
3188 Deconcentration
3189 Deep pockets
3190 Delivered pricing
3191 Demonopolisation
3193 Destructive competition
3194 Discrimination
3195 Diseconomies of scale
3196 Distributor's mark
3197 Diversification
3198 Divestiture
3199 Dominant firm
3200
3200 Dominant market position
3201 Dumping (of products overseas)
3202 Duopoly
3203 Economies of scale
3204 Economies of scope
3206 Elasticity of demand, price
3207 Business enterprise
3208 Entropy
32

3520 Validity error
3521 Winsorisation
3522 European Conference of Ministers of Transport (ECMT)
3523 ECMT
3524 Passenger car
3525 Unrelated diversification
3526 Related diversification
3527 Product specific economies of scale
3528 Plant specific economies of scale
3529 IUCN
3530 Functional labour markets
3531 Labour markets
3532 Spatial labour markets
3533 Local labour markets
3534 Employment protection
3535 Employment protection legislation (EPL)
3536 Job turnover
3537 Economy-wide job turnover rate
3538 Labour turnover
3539 Underemployment
3540 Visible underemployment
3541 Invisible underemployment
3542 Time related underemployment
3544 Profit sharing
3545 Maternity leave
3546 Paternity leave
3547 Leave for family reasons
3548 Parental leave
3549 International labour standards
3550 ILO international labour standards
3552 Working-time arrangements
3553 Collective bargaining
3554 Collective bargaining coverage
3555 Collective agreement
3556 Job loss
3557 Job losers
3558 Displaced work

3887 Variate transformation
3888 Weight bias
3889 Weighted index number
3890 Weighting coefficient
3891 Zero sum game
3892 Decile
3893 Quantiles
3894 Percentiles
3895 Octiles
3896 Quintiles
3897 Lower quartile
3898 Net correlation
3899 Non-random sample
3900
3900 Overall sampling fraction
3901 Partial replacement
3902 Purposive sample
3903 Replacement
3904 Significance
3905 Inverse correlation
3906 Noise (Statistical)
3907 Link relative
3908 Chain relative
3909 Sample unit
3910 Tolerance (Statistical)
3911 Weights - ISI
3912 Railway
3913 Railway network
3914 Track (railway)
3915 Track gauge
3916 Rail loading gauge
3917 Running track
3918 Electrified track
3919 Sidings (railway)
3920 Private siding (railway)
3921 Line (railway)
3922 Average length of line operated throughout the year (for rail transport)
3923 Electrified line
3924 Maximum operating speed
3925 Tractive vehicle
3926 Locomotive
3927 Steam locomotive
3928 Electric locomotive
3929 Diesel locomotive
3930 Railcar
3931 Passenge

4153 Turnover (of inland waterways transport enterprise)
4154 Revenues (of inland waterways transport enterprises)
4155 Costs (of inland waterways transport enterprises)
4156 Types of costs (of inland waterways transport enterprises)
4157 Value added (of inland waterways transport enterprises)
4158 Tangible investment (of inland waterways transport enterprises)
4159 Investment expenditure on infrastructure (for inland waterways transport)
4160 Investment expenditure on vessels (for inland waterways transport)
4161 Maintenance expenditure on infrastructure (for inland waterways transport)
4162 Maintenance expenditure on vessels (for inland waterways transport)
4163 Inland waterways traffic
4164 Inland waterways traffic on national territory
4165 Unladen inland waterways traffic
4166 Inland waterways journey
4167 Vessel-kilometre (for inland waterways transport)
4168 Inland waterways convoy
4169 Vehicle-kilometre (for inland waterways transport)
4170 Tonne-kilometre offered (for inland w

4372 ALGOL
4373 Analysis of variance (ANOVA)
4374 American National Standards Institute (ANSI)
4375 A programme language (APL)
4376 American Standard Code for Information Interchange (ASCII)
4377 Asynchronous Communication (ASYNC)
4378 BASIC
4379 Baud rate
4380 Binary coded decimal (BCD)
4381 Binary digit (BIT)
4382 Benchmark (in context of quality improvement)
4383 Byte
4384 C
4385 Computer Assisted Design / Computer Assisted Manufacturing (CAD-CAM)
4386 Computer Assisted Personal Interviewing (CAPI)
4387 Computer Aided Software Engineering (CASE)
4388 Computer Assisted Telephone Interviewing (CATI)
4389 Computer Based Training (CBT)
4390 Compact Disk - Read Only Memory (CD-ROM)
4391 COBOL
4392 Coverage improvement
4393 Characters per inch (CPI)
4394 Critical path method (CPM)
4395 Coefficient of variation
4396 Data base management system (DBMS)
4397 Data processing (DP)
4398 Dots per inch (DPI)
4399 Electronic data processing (EDP)
4400
4400 Facsimile devices (FAX)
4401 FORTRAN
4402 

4726 Calling opportunities
4728 Code division multiple access (CDMA)
4729 Dual band
4730 Dual mode
4731 Equivalent mobile operators
4732 Global system for mobile communications (GSM)
4733 Personal communications services (PCS)
4734 Pre-paid
4735 PSTN
4736 Roaming
4738 Subscriber identity module (SIM)
4739 Smart card
4740 Short Messaging Service (SMS)
4741 Spam
4742 Termination charges
4743 Tromboning
4744 International Mobile Telecommunications (IMT) 2000
4745 On-line access agreement
4746 Web site notices and disclaimers
4747 Electronic software distribution agreement
4748 Certification authority subscriber application agreement
4749 Web wrap agreement
4750 Internet advertising sponsorship agreement
4751 Web link agreement
4752 E-government
4753 Ex ante control
4754 Ex poste control
4755 A priori audit
4756 A posteriori audit
4757 Accountability (in management theory)
4758 Control / controls (in management and administration)
4759 Accounting controls
4760 Accruals account
4761 Adminis

5057 Cut-off threshold
5058 Statistical error
5059 Estimate
5060 Expected value
5061 Relative standard deviation
5062 Relative standard error
5064 Quality control survey
5065 Misclassification
5066 Non-probability sampling
5067 Statistical characteristics
5068 Statistical measure
5069 Under-coverage
5070 Arbitrage
5071 Auditability
5072 Automated teller machine (ATM)
5073 ATM
5074 Call money
5075 Cryptography
5076 Cipher text
5077 Closed network
5078 Credit card
5079 Encryption
5080 Firewall
5081 Money laundering
5082 Overnight money
5083 Day-to-day money
5084 Point of sale
5085 Electronic funds transfer at the point of sale (EFTPOS)
5086 EFTPOS
5087 Velocity (of money)
5088 Dublin Core Metadata Initiative (DCMI)
5089 DCMI
5090 Data Quality Reference Site (DQRS)
5091 DQRS
5092 DQAF
5093 Fundamental Principles of Official Statistics
5094 United Nations Statistical Commission (UNSC)
5095 UNSC
5096 Cyber money
5097 E-banking
5098 Electronic funds transfer (EFT)
5099 EFT
5100
5100 Extranet

5385 Lower secondary education (ISCED 2)
5386 Maintenance and operations personnel
5387 Management/Quality control/Administration
5388 Mathematical literacy
5389 Mode of study
5390 Native students
5391 Net graduation rates
5392 New entrants to a level of education - OECD
5393 Non-compulsory curriculum
5394 Non-instructional educational institutions - OECD
5395 Non-native students
5396 Part-time student
5397 Part-time teacher
5398 PISA index of achievement press
5399 PISA index of comfort with and perceived ability to use computers
5400
5400 PISA index of disciplinary climate
5401 PISA index of economic, social and cultural status (ESCS)
5402 PISA index of interest in computers
5403 PISA index of teacher support
5404 PISA index of the use of school resources
5405 PISA International Socio-Economic Index of Occupational Status (ISEI)
5406 PISA mean score
5407 PISA population
5408 Post-secondary non-tertiary level of education (ISCED 4)
5409 Pre-primary education (ISCED 0)
5410 Pre-vocatio

5732 Producer's index
5733 Test approach
5734 Unequivocal price index
5735 Value updating
5736 Chaining
5737 Logarithmic Laspeyres price index
5738 Logarithmic Paasche price index
5739 Collective consumption
5740 Conditional cost of living index
5741 Consumers
5742 Cost-of-living bias
5743 Comparison period
5744 Democratic index
5745 Explicit quality adjustment
5746 Implicit quality adjustment
5747 Household budget surveys
5748 Household consumption expenditure surveys
5749 Household expenditure surveys
5750 Hybrid values or expenditures
5751 Hybrid weights
5752 Owner occupied housing
5753 Rental equivalence
5754 User cost
5755 Scanner data
5756 Substitute
5757 Substitution
5758 Substitution effect
5760 Uses approach
5761 Young index
5763 ABO
5764 Net accumulation
5765 Gross accumulation
5766 ALPS
5767 Dumping margin
5768 ASEAN
5769 Non-wage benefits
5770 Accounting edit
5771 Asphalt
5772 Multiplier
5773 CPP
5774 Carry trade
5775 Incomplete census
5776 London certificates of deposit
57

6031 Upper-middle-income countries
6032 Write-off
6033 Cover
6034 Enhanced structural adjustment facility (ESAF)
6035 Structural adjustment facility (SAF)
6036 Poverty reduction and growth facility (PRGF)
6037 Houston terms
6038 Extended fund facility (EFF)
6039 Flow rescheduling
6040 Debt default
6041 London Club
6042 Nominal value (of a debt instrument)
6043 Official development assistance (ODA)
6044 Concessionality level - IMF
6045 Net present value (NPV) of debt
6046 Short-term debt
6047 Long-term external debt
6048 Final maturity date
6049 American depository receipt (ADR)
6050 Balances on nostro and vostro accounts
6051 Bank deposits
6052 Bearer depository receipts (BDR)
6053 Collateralised debt obligations (CDOs)
6054 Commercial paper
6055 Commodity linked bonds
6056 Commodity linked derivatives
6057 Convertible bonds
6058 Credit linked note
6059 Currency linked bonds
6060 Currency pool loans
6061 Depository receipts
6062 Deposits in mutual associations
6063 Equity linked bond
6

6345 Accounting treatment of terminal costs
6346 Acidifying potential (AP)
6347 Additions to stock levels
6348 Aggregation in physical accounts
6349 Agricultural land and surface water (SEEA)
6350 Allocation of emissions to final demand
6351 Amenity functions of the natural capital
6352 Annual fellings
6353 Annual removals
6354 Appropriation method
6355 Avoidance
6356 Benefit transfer
6357 Bequest benefit
6358 Best available technology
6359 Biological resources
6360 Capital approach to sustainable development
6361 Capital expenditure for environmental protection
6362 Capital services method
6363 Capture fisheries
6364 CEPA 2000
6365 Changes in land cover by categories of changes
6366 Changes in the rate of extraction
6367 Changes in the unit resource rent
6368 Cleaner technologies and products group (of the environment industry)
6369 Competition of functions
6370 Compliance criterion
6371 Conjoint analysis approaches
6372 Connected products
6374 Consumption value method (for standing t

6580 Stumpage price
6581 Stumpage value method (for standing timber valuation)
6582 Supply and use table for environmental protection
6583 Supply and use table for natural resources
6584 Supply and use table for residuals
6585 Sustainable catch
6586 Sustainable development indicator
6587 Sustainable national income
6588 Sustainable yield
6589 Symmetric treatment of defensive expenditure
6590 Terminal costs
6591 Three pillar approach to sustainable development
6592 Total (domestic) Material input
6593 Total actual renewable water resources
6594 Total Domestic Output (TDO)
6595 Total material consumption (TMC)
6596 Total Material Output (TMO)
6597 Total material requirement (TMR)
6598 Total natural renewable water
6599 Total non–renewable water
6600
6600 Travel cost method
6601 Unused extraction
6602 Use benefits
6603 Use tables for natural resources
6604 Valuation of the degradation of land and soil
6605 Valuing natural resources
6606 Virtual population analysis
6607 Volume
6608 Water q

6869 New products (in context of innovation)
6870 Process innovation
6871 Marketing innovation
6872 Innovations in pricing
6873 Organisational innovation
6874 Innovative active firm
6875 Acquisition of technology and knowledge
6876 Open information sources
6877 Innovation co-operation
6878 Knowledge management
6879 Ambiguity rule
6880 Analysis server
6881 Attribution
6882 Anonymised data
6883 Anonymised record
6884 Approximate disclosure
6885 Argus
6886 Attribute disclosure
6887 Barnardisation
6888 Blurring
6889 Bounds
6890 Calculated interval
6891 Cell suppression
6892 Complementary suppression
6893 Complete disclosure
6894 Concentration rule
6895 Confidentiality edit
6896 Controlled rounding
6897 Controlled Tabular Adjustment (CTA)
6898 Conventional rounding
6899 Data divergence
6900
6900 Data intruder
6901 Data intrusion detection
6902 Data Intrusion Simulation (DIS)
6903 Data protection
6904 Data swapping
6905 Data utility
6906 Deterministic rounding
6907 Direct identification
6908

7210 EMAS
7211 Environmental R&D
7212 Environment-commercial 'win-wins'
7213 Flexible policy measures
7214 Market-based instruments
7215 Performance-based standard
7216 Pollution prevention
7217 Prescriptive policy measures
7218 Technology-based standards
7219 Total suspended solids (TSS)
7220 Biobank
7221 Human genetic research database(s) (HGRDs)
7222 Biological sample
7223 Coded samples
7224 Consent
7225 Identified samples
7226 Unidentified samples
7227 Unlinked samples
7228 Benchmark
7229 Capacity assessment
7230 Capacity development
7231 Civil society organisations
7232 Country assistance strategies / plans
7233 Development Policy Lending
7234 Direct Budget Support (DBS)
7235 Fragile States
7236 Governance
7237 Good governance
7238 National ownership
7239 Policy reform
7240 Sector wide approach
7241 Sectoral strategy
7242 Strategic conflict assessment
7243 Structural Adjustment Programmes
7244 Tiering
7245 Arm’s length principle
7246 Due care
7247 Heightened managerial care
7248 H

In [18]:
## Derive the working frame from the scraped backup without touching the backup itself:
## keep only the rows that have both a term and a definition, renumber the rows from 0
## and store the article IDs as 32-bit integers.
OECD_df = (
    OECD_df_keep
    .dropna(subset=['Term', 'Definition:'])
    .reset_index(drop=True)
    .assign(ID=lambda frame: frame['ID'].astype('int32'))
)

## Save the filtered table to an Excel file and preview the first rows
OECD_df.to_excel('OECD_df.xlsx')
OECD_df.head()

Unnamed: 0,ID,URL,Term,URL:Cross References,French Equivalent:,Definition:,Cross References:,Statistical Theme:,Created on,Last updated on,Source Publication:,Context:,Hyperlink:,Glossary Output Segments:,Classification Indicator:,Version Indicator:,French Definition:
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,Réduction,See Pollution abatement.,Pollution abatement,Environmental statistics,"Tuesday, September 25, 2001","Thursday, March 14, 2002",,,,,,,
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,,Absence from work due to illness refers to the...,,Health statistics,"Tuesday, September 25, 2001","Thursday, November 22, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,,Functional limitation-free life expectancy is ...,,Health statistics,"Tuesday, September 25, 2001","Wednesday, October 31, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care is one in which the principal inten...,Acute care beds Acute care hospital staff rati...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",OECD Health Data 2001: A Comparative Analysis ...,,,,,,
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care beds are beds accommodating patient...,Acute care Long-term care beds in hospitals,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",2001 Data Collection on Education Systems: Def...,Acute care beds have alternatively been define...,,,,,


### Check cross-references

* Reason: there are cases where the cross-references lead to non-existing IDs.
* Add column 'Cross_References_2' containing the terms of the cross-referenced articles, separated by semicolons.

In [19]:
## Resolve the cross-reference URLs of every article into the terms of the referenced articles
## and store them, separated by semicolons, in the new column 'Cross_References_2'.
## Rows without any cross-reference URL get NaN in that column.

## Look-up table ID -> Term built once from the articles kept in OECD_df
## (if an ID were repeated, the first occurrence wins).
term_by_id = {}
for article_id, term in zip(OECD_df['ID'].tolist(), OECD_df['Term'].tolist()):
    term_by_id.setdefault(article_id, term)

cross_ref_terms = []
for url_field in OECD_df['URL:Cross References'].tolist():

    ## nothing to resolve: missing value or blank string
    if not isinstance(url_field, str) or url_field.strip()=='':
        cross_ref_terms.append(np.nan)
        continue

    titles = []
    for link in url_field.split(','):
        m = re.search(r'\d+$',link)  ## the article ID is the trailing number of the URL
        if m is None:
            continue  ## URL does not end with an ID: cannot be resolved
        title = term_by_id.get(int(m.group()))
        ## skip references to IDs that are not in OECD_df (non-existing or filtered-out articles)
        ## and references whose term is missing
        if title is None or pd.isna(title):
            continue
        titles.append(title)
    cross_ref_terms.append(';'.join(titles))

OECD_df['Cross_References_2'] = cross_ref_terms

OECD_df.head()

Unnamed: 0,ID,URL,Term,URL:Cross References,French Equivalent:,Definition:,Cross References:,Statistical Theme:,Created on,Last updated on,Source Publication:,Context:,Hyperlink:,Glossary Output Segments:,Classification Indicator:,Version Indicator:,French Definition:,Cross_References_2
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,Réduction,See Pollution abatement.,Pollution abatement,Environmental statistics,"Tuesday, September 25, 2001","Thursday, March 14, 2002",,,,,,,,Pollution abatement
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,,Absence from work due to illness refers to the...,,Health statistics,"Tuesday, September 25, 2001","Thursday, November 22, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,,Functional limitation-free life expectancy is ...,,Health statistics,"Tuesday, September 25, 2001","Wednesday, October 31, 2001",OECD Health Data 2001: A Comparative Analysis ...,,,,,,,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care is one in which the principal inten...,Acute care beds Acute care hospital staff rati...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",OECD Health Data 2001: A Comparative Analysis ...,,,,,,,Acute care beds;Acute care hospital staff rati...
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,,Acute care beds are beds accommodating patient...,Acute care Long-term care beds in hospitals,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",2001 Data Collection on Education Systems: Def...,Acute care beds have alternatively been define...,,,,,,Acute care;Long-term care beds in hospitals


### Some cleaning of the data

In [20]:
import unicodedata

# Scraped sections that are not kept in the cleaned table.
# TODO (original author's note): 'French Definition:' should be added to this list as well.
columns_to_drop = ['French Equivalent:',
                   'Glossary Output Segments:', 'Classification Indicator:', 'Version Indicator:',
                   'Created on', 'Hyperlink:']

# Scraped section headings -> short column names used from here on.
column_names = {'Term': 'term', 'Definition:': 'definition',
                'Statistical Theme:': 'theme', 'Cross_References_2': 'related', 'Context:': 'context',
                'URL:Cross References': 'related_URL', 'Last updated on': 'last_update'}

# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError. rename() already
# ignores missing keys, and fillna() / NFKD normalisation are idempotent.
OECD_df = (
    OECD_df
    .drop(columns=columns_to_drop, errors='ignore')
    .rename(columns=column_names)
    .fillna(value='')
)

# Sanity check: every count should be 0 after the fillna above.
print(OECD_df.isnull().sum())


OECD_df['term'] = OECD_df['term'].apply(lambda x: unicodedata.normalize('NFKD', x)) ## for the dashes in terms
# The same normalisation for 'related' is currently disabled:
#OECD_df['related']= OECD_df['related'].apply(lambda x: unicodedata.normalize('NFKD',x)) ## for the dashes in terms


OECD_df.head()

ID                    0
URL                   0
term                  0
related_URL           0
definition            0
Cross References:     0
theme                 0
last_update           0
context               0
French Definition:    0
related               0
dtype: int64


Unnamed: 0,ID,URL,term,related_URL,definition,Cross References:,theme,last_update,context,French Definition:,related
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Pollution abatement,Environmental statistics,"Thursday, March 14, 2002",,,Pollution abatement
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,,Health statistics,"Thursday, November 22, 2001",,,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,,Health statistics,"Wednesday, October 31, 2001",,,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Acute care beds Acute care hospital staff rati...,Health statistics,"Thursday, April 25, 2013",,,Acute care beds;Acute care hospital staff rati...
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Acute care Long-term care beds in hospitals,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,,Acute care;Long-term care beds in hospitals


### Local file for inspection

In [21]:
import datetime

# The file name embeds the current month, day, hour and minute (not zero-padded),
# joined by underscores, e.g. OECD_final_results_2_1_25_9_5.xlsx
current_time = datetime.datetime.now()
stamp_parts = (current_time.month, current_time.day, current_time.hour, current_time.minute)
outfile = 'OECD_final_results_2_' + '_'.join(str(part) for part in stamp_parts) + '.xlsx'

# Write the cleaned table (index included) to a local workbook for manual inspection.
OECD_df.to_excel(outfile)


### Write to the database

In [16]:
%%script false --no-raise-error

# The cell magic above pipes this cell to the `false` program, so the cell is
# skipped on "Run All"; remove that first line to actually write to the database.

import pyodbc

## the definition of the table
##create table "ESTAT"."V1"."OECD_Glossary" 
##( 
##  "id" INTEGER, 
##  "article_id" INTEGER, <- this is OECD's id 
##  "term" VARCHAR, 
##  "url" VARCHAR, 
##  "definition" LONG VARCHAR, 
##  "context" LONG VARCHAR, 
##  "theme" VARCHAR, 
##  "related" VARCHAR, 
##  "related_url" VARCHAR, 
##  "last_update" VARCHAR, 
##  "source_publ" VARCHAR,
##  PRIMARY KEY ("id") 
##); 

# NOTE(review): UID/PWD are redacted placeholders; supply real credentials from
# outside the notebook (environment / prompt) rather than committing them here.
c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID=xxxxx;PWD=xxxxx')

sql = """INSERT INTO ESTAT.V1.OECD_Glossary (id,article_id,term,url,definition,context,theme,related,related_url,last_update,source_publ)
         VALUES (?,?,?,?,?,?,?,?,?,?,?)"""

# DataFrame columns in the order of the INSERT placeholders that follow id and article_id.
value_columns = ['term', 'URL', 'definition', 'context', 'theme',
                 'related', 'related_URL', 'last_update', 'Source Publication:']

# Build the parameter tuples positionally, so the result does not depend on
# OECD_df having a default 0..n-1 index (label-based .loc[i, ...] would).
# "id" is a running number starting at 1; "article_id" is OECD's own ID.
rows = [
    (row_id, int(article_id), *values)
    for row_id, (article_id, values) in enumerate(
        zip(OECD_df['ID'], OECD_df[value_columns].itertuples(index=False, name=None)),
        start=1)
]

cursor = c.cursor()

try:
    # executemany() rejects an empty parameter list, hence the guard.
    if rows:
        cursor.executemany(sql, rows)
    c.commit()
except Exception:
    # Do not leave a half-inserted batch pending in the open transaction.
    c.rollback()
    raise