# Testing the Named Entity Recognition (NER) engine of spaCy

### Step 1. Loading Spacy models
***

On the first run we download spaCy's language model; afterwards the download command can be commented out. Note that we load spaCy's "medium" English model (`en_core_web_md`).


In [1]:
import re
import pandas as pd
import numpy as np
import spacy
import sys
from collections import Counter
import re


## Download the spaCy model into the environment of the running kernel.
## Only needed on the first run; comment out the active download line afterwards.
## (IPython shell escape: `!` runs a shell command, {sys.executable} is the kernel's Python.)
## !{sys.executable} -m spacy download en
!{sys.executable} -m spacy download en_core_web_md

## Load the medium English pipeline; `nlp` is the object used later to run NER.
nlp = spacy.load('en_core_web_md')
print('Finished loading.')

## Silence DeprecationWarning messages from here on.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')
Finished loading.


### Step 2. Pre-processing
***
#### Load Glossary articles from the database 

* Definitions from dat_glossary.
* Titles from dat_link_info (with resource_information_id=1, i.e. Eurostat, see ESTAT.V1.mod_resource_information).
* Match above on id.

In [2]:
import pyodbc

## Glossary definitions (dat_glossary) joined on id with their titles
## (dat_link_info), restricted to resource_information_id=1.
SQLCommand = """SELECT T2.title,T1.definition  
                FROM ESTAT.V1.dat_glossary as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.id=T2.id 
                WHERE T2.resource_information_id=1 """

## Open the ODBC connection and keep a cursor on it.
c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID=xxxxx;PWD=xxxxx')
cursor = c.cursor()

## Run the query and load the result set into a dataframe.
GL_df = pd.read_sql(sql=SQLCommand, con=c)

GL_df

Unnamed: 0,title,definition
0,Accident at work,An accident at work in the framework ...
1,Fatal accident at work,A fatal accident at work refers to an...
2,Non-fatal accident at work,A non-fatal accident at work is...
3,Aggregate demand,Aggregate demand is the total amount of ...
4,Goods and services account,The goods and services account shows ...
...,...,...
1309,Actual individual consumption (AIC),"Actual individual consumption , abbrevia..."
1310,Activity rate,Activity rate is the percentage of a...
1311,Activation policies,The activation policies are policies ...
1312,Active enterprises - FRIBS,"<Brief user-oriented definition, one or a fe..."


#### Delete records with empty definitions and carry out data cleansing


In [3]:
## Treat empty strings as missing values, then discard every record that
## lacks a title or a definition.
GL_df = GL_df.replace('', np.nan).dropna(axis=0, how='any')

## Per-column count of the missing values that remain.
print(GL_df.isnull().sum())

## Renumber the rows from 0 after the deletions.
GL_df = GL_df.reset_index(drop=True)

#import unicodedata as ud

def clean(x, quotes=True):
    """Normalise a glossary text (title or definition).

    Steps, in order: strip the text, turn mis-encoded apostrophes that show
    up as question marks into quotes, remove comma/space thousands
    separators inside numbers, collapse repeated spaces, trim every line,
    remove the space before commas and full stops, and replace the
    three-character sequence starting with 'â' by a single quote.

    Parameters
    ----------
    x : str or missing value
        Text to clean. Missing values (as detected by ``pd.isnull``) are
        returned unchanged.
    quotes : bool, default True
        When True, also rewrite ``letter?letter`` as ``letter' letter``.
        Pass False for texts containing URLs, where a question mark between
        letters is legitimate.

    Returns
    -------
    str or missing value
        The cleaned text, or ``x`` itself when it is missing.
    """
    if pd.isnull(x):
        return x
    x = x.strip()

    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])', '\\1\' \\2', x)

    ## make letter-question mark-space-lower case letter -> letter-quote-space-letter
    x = re.sub(r'([A-Za-z])\? ([a-z])', '\\1\' \\2', x)

    ## delete ,000 commas in numbers
    ## The whole run of comma-separated digit groups is matched at once so that
    ## every separator is removed ("1,000,000" -> "1000000"). Matching one pair
    ## of groups at a time leaves every second separator in place, because
    ## re.sub only replaces non-overlapping matches.
    x = re.sub(r'\b\d+(?:,\d+)+\b', lambda m: m.group(0).replace(',', ''), x)

    ## delete  000 spaces in numbers (same approach as for commas)
    x = re.sub(r'\b\d+(?: \d+)+\b', lambda m: m.group(0).replace(' ', ''), x)

    ## remove more than one spaces
    x = re.sub(r' +', ' ', x)

    ## remove start and end spaces of every line
    x = re.sub(r'^ +| +$', '', x, flags=re.MULTILINE)

    ## space-comma -> comma
    x = re.sub(r' \,', ',', x)

    ## space-dot -> dot
    x = re.sub(r' \.', '.', x)

    ## single quotes are read as: âXX (mis-decoded UTF-8).
    ## NOTE: the pattern matches 'â' followed by ANY two characters, so a
    ## genuine 'â' in the text is replaced too, together with the two
    ## characters after it.
    x = re.sub(r'â.{2}', "'", x)

    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    #x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()

    return x


## Run the common cleansing on both text columns.
for text_column in ('title', 'definition'):
    GL_df[text_column] = GL_df[text_column].apply(clean)

## In titles, any question mark that is still present becomes a dash.
GL_df['title'] = GL_df['title'].apply(lambda title: re.sub(r'\?', '-', title))

GL_df

title         0
definition    0
dtype: int64


Unnamed: 0,title,definition
0,Accident at work,An accident at work in the framework of the ad...
1,Fatal accident at work,A fatal accident at work refers to an accident...
2,Non-fatal accident at work,A non-fatal accident at work is an accident wh...
3,Aggregate demand,Aggregate demand is the total amount of goods ...
4,Goods and services account,The goods and services account shows the balan...
...,...,...
1307,Actual individual consumption (AIC),"Actual individual consumption, abbreviated as ..."
1308,Activity rate,Activity rate is the percentage of active pers...
1309,Activation policies,The activation policies are policies designed ...
1310,Active enterprises - FRIBS,"<Brief user-oriented definition, one or a few ..."


#### Delete "special" records

* i.e. redirections.



In [4]:
## Remove the records whose definition starts with "The revision #" or with
## "Redirect to"; the affected row labels are printed before each removal.
for special_prefix in ('The revision #', 'Redirect to'):
    idx = GL_df.index[GL_df['definition'].str.startswith(special_prefix)]
    print(idx)
    GL_df.drop(idx, inplace=True)

## Renumber the remaining rows from 0.
GL_df.reset_index(drop=True, inplace=True)
GL_df

Int64Index([ 230,  291,  383,  385,  432,  435,  437,  438,  503,  518,  529,
             556,  587,  728,  741,  774,  825,  888,  890,  911,  959,  960,
             968, 1001, 1005, 1131, 1142, 1180, 1229],
           dtype='int64')
Int64Index([], dtype='int64')


Unnamed: 0,title,definition
0,Accident at work,An accident at work in the framework of the ad...
1,Fatal accident at work,A fatal accident at work refers to an accident...
2,Non-fatal accident at work,A non-fatal accident at work is an accident wh...
3,Aggregate demand,Aggregate demand is the total amount of goods ...
4,Goods and services account,The goods and services account shows the balan...
...,...,...
1278,Actual individual consumption (AIC),"Actual individual consumption, abbreviated as ..."
1279,Activity rate,Activity rate is the percentage of active pers...
1280,Activation policies,The activation policies are policies designed ...
1281,Active enterprises - FRIBS,"<Brief user-oriented definition, one or a few ..."


#### Add the title to the column "definition"

In [5]:
## Prefix every definition with its title, separated by ". ", so that the
## title is part of the text that is analysed.
GL_df['definition'] = GL_df['title'].str.cat(GL_df['definition'], sep='. ')
#GL_df['source'] = 'GL'
GL_df

Unnamed: 0,title,definition
0,Accident at work,Accident at work. An accident at work in the f...
1,Fatal accident at work,Fatal accident at work. A fatal accident at wo...
2,Non-fatal accident at work,Non-fatal accident at work. A non-fatal accide...
3,Aggregate demand,Aggregate demand. Aggregate demand is the tota...
4,Goods and services account,Goods and services account. The goods and serv...
...,...,...
1278,Actual individual consumption (AIC),Actual individual consumption (AIC). Actual in...
1279,Activity rate,Activity rate. Activity rate is the percentage...
1280,Activation policies,Activation policies. The activation policies a...
1281,Active enterprises - FRIBS,Active enterprises - FRIBS. <Brief user-orient...


### Step 3. Apply the NER engine
***

Create columns ORG, GPE, NORP, LOCATION which will hold dictionaries with entities recognized as:
* Organizations;
* Countries, cities, states;
* Nationalities or religious or political groups;
* Non-GPE locations, mountain ranges, bodies of water, respectively. 

In each record, every dictionary maps an entity (its upper-cased text) to a two-element list: first the list of `(start, end)` token-index pairs of the entity's occurrences (the token span's *start* and *stop* positions), then the number of occurrences in the text of the glossary article (title plus definition).

In [6]:
nlp.max_length = 1500000

## spaCy entity label -> dataframe column that stores the entities.
## NOTE: spaCy's English pipelines tag non-GPE locations as 'LOC'. Testing for
## 'LOCATION' never matches, which would leave the LOCATION column empty.
ner_columns = {'ORG': 'ORG', 'GPE': 'GPE', 'NORP': 'NORP', 'LOC': 'LOCATION'}

n_records = len(GL_df)

## One list per column; element k is the entity dictionary of record k:
##   {ENTITY TEXT IN UPPER CASE: [[(start, end), ...], count]}
ner_results = {column: [] for column in ner_columns.values()}

for i, definition in enumerate(GL_df['definition']):
    if i % 100 == 0: print('i = ',i,' of ',n_records)

    found = {column: {} for column in ner_columns.values()}
    for ent in nlp(definition).ents:
        column = ner_columns.get(ent.label_)
        if column is None:
            continue  ## entity type that is not collected
        ## Create the [spans, count] entry on first sight, then update it.
        spans_and_count = found[column].setdefault(ent.text.upper(), [[], 0])
        spans_and_count[0].append((ent.start, ent.end))
        spans_and_count[1] += 1

    for column, entities in found.items():
        ner_results[column].append(entities)

## Positional assignment: does not depend on the row labels of GL_df.
for column, values in ner_results.items():
    GL_df[column] = values

## Entity labels used by the spaCy English pipelines:
#PERSON People, including fictional
#NORP Nationalities or religious or political groups
#FAC Buildings, airports, highways, bridges, etc.
#ORG Companies, agencies, institutions, etc.
#GPE Countries, cities, states
#LOC Non-GPE locations, mountain ranges, bodies of water
#PRODUCT Vehicles, weapons, foods, etc. (Not services)
#EVENT Named hurricanes, battles, wars, sports events, etc.
#WORK_OF_ART Titles of books, songs, etc.
#LAW Named documents made into laws
#LANGUAGE Any named language
#The following values are also annotated in a style similar to names:
#DATE Absolute or relative dates or periods
#TIME Times smaller than a day
#PERCENT Percentage (including “%”)
#MONEY Monetary values, including unit
#QUANTITY Measurements, as of weight or distance
#ORDINAL “first”, “second”
#CARDINAL Numerals that do not fall under another type

GL_df

i =  0  of  1283
i =  100  of  1283
i =  200  of  1283
i =  300  of  1283
i =  400  of  1283
i =  500  of  1283
i =  600  of  1283
i =  700  of  1283
i =  800  of  1283
i =  900  of  1283
i =  1000  of  1283
i =  1100  of  1283
i =  1200  of  1283


Unnamed: 0,title,definition,ORG,GPE,NORP,LOCATION
0,Accident at work,Accident at work. An accident at work in the f...,"{'EUROPEAN STATISTICS': [[(17, 19)], 1], 'THE ...",{},{},{}
1,Fatal accident at work,Fatal accident at work. A fatal accident at wo...,{},{},{},{}
2,Non-fatal accident at work,Non-fatal accident at work. A non-fatal accide...,"{'THE LABOUR FORCE SURVEY': [[(59, 63)], 1], '...",{},{},{}
3,Aggregate demand,Aggregate demand. Aggregate demand is the tota...,{},{},{},{}
4,Goods and services account,Goods and services account. The goods and serv...,{},{},{},{}
...,...,...,...,...,...,...
1278,Actual individual consumption (AIC),Actual individual consumption (AIC). Actual in...,"{'AIC': [[(4, 5), (13, 14), (138, 139), (151, ...",{},{},{}
1279,Activity rate,Activity rate. Activity rate is the percentage...,{},{},{},{}
1280,Activation policies,Activation policies. The activation policies a...,"{'LMP': [[(40, 41), (149, 150), (163, 164)], 3]}",{},{},{}
1281,Active enterprises - FRIBS,Active enterprises - FRIBS. <Brief user-orient...,{},{},{},{}


### Step 4. Gathering the most common entities: example with ORG entities
***

We can see a few errors and repetitions. These require some further cleansing steps and fine-tuning of the NER engine (not yet carried out). In the run shown below, 991 distinct terms were identified as named entities of type organization.



In [7]:
from itertools import chain

## Every ORG key of every article, in alphabetical order. A term appears once
## per article in which it was found.
org_list = sorted(chain.from_iterable(found.keys() for found in GL_df['ORG']))
org_counter = Counter(org_list)

## Alphabetical list of the distinct terms.
org_all_freqs = sorted(org_counter)
print('Total terms identified as ORG: ',len(org_all_freqs))

print('\n100 most common:\n')
## (term, count) pairs for the 100 terms with the highest counts, and the
## same terms as an alphabetical list.
org_common_freqs = org_counter.most_common(100)
org_common = sorted(term for term, _ in org_common_freqs)
print(org_common_freqs)

Total terms identified as ORG:  991

100 most common:

[('EU', 218), ('THE EUROPEAN UNION', 140), ('EUROSTAT', 53), ('EUROPEAN UNION', 47), ('EC', 39), ('NACE', 33), ('STATE', 31), ('THE EUROPEAN COMMISSION', 18), ('EFTA', 17), ('ICT', 17), ('THE EUROPEAN PARLIAMENT', 14), ('THE UNITED NATIONS', 14), ('PLURAL', 13), ('UN', 13), ('COMMUNITY', 11), ('OECD', 11), ('THE EUROPEAN COMMUNITY', 11), ('THE EUROPEAN COUNCIL', 11), ('EEA', 10), ('FSS', 10), ('INTRA-EU', 10), ('COMMISSION REGULATION', 8), ('ESA 2010', 8), ('HRST', 8), ('NL', 8), ('COUNCIL', 7), ('COUNCIL REGULATION', 7), ('CY', 7), ('DK', 7), ('LABOUR', 7), ('NUTS', 7), ('R & D', 7), ('THE EUROPEAN STATISTICAL SYSTEM', 7), ('COMMISSION', 6), ('EHIS', 6), ('ESA', 6), ('FDI', 6), ('LFS', 6), ('REGULATION', 6), ('SNA', 6), ('THE EUROPEAN CENTRAL BANK', 6), ('THE INTERNATIONAL MONETARY FUND', 6), ('UNITED NATIONS', 6), ('VAT', 6), ('ALL EUROPEAN UNION', 5), ('BMI', 5), ('CIS', 5), ('DMC', 5), ('EEC', 5), ('FARM', 5), ('NUAA', 5), ('PP

### Step 5. Storing information on these most common entities per article: example with ORG entities
***

This is one way of storing, in a single Pandas dataframe, both the full set of entities with their counts and the subset restricted to the most common ones.


In [8]:
## For each article keep only the ORG entries whose term is in org_common.
GL_df['ORG_COMMON_100'] = GL_df['ORG'].apply(
    lambda found: {term: info for term, info in found.items() if term in org_common}
)
GL_df


Unnamed: 0,title,definition,ORG,GPE,NORP,LOCATION,ORG_COMMON_100
0,Accident at work,Accident at work. An accident at work in the f...,"{'EUROPEAN STATISTICS': [[(17, 19)], 1], 'THE ...",{},{},{},"{'EU': [[(119, 120)], 1]}"
1,Fatal accident at work,Fatal accident at work. A fatal accident at wo...,{},{},{},{},{}
2,Non-fatal accident at work,Non-fatal accident at work. A non-fatal accide...,"{'THE LABOUR FORCE SURVEY': [[(59, 63)], 1], '...",{},{},{},{}
3,Aggregate demand,Aggregate demand. Aggregate demand is the tota...,{},{},{},{},{}
4,Goods and services account,Goods and services account. The goods and serv...,{},{},{},{},{}
...,...,...,...,...,...,...,...
1278,Actual individual consumption (AIC),Actual individual consumption (AIC). Actual in...,"{'AIC': [[(4, 5), (13, 14), (138, 139), (151, ...",{},{},{},{}
1279,Activity rate,Activity rate. Activity rate is the percentage...,{},{},{},{},{}
1280,Activation policies,Activation policies. The activation policies a...,"{'LMP': [[(40, 41), (149, 150), (163, 164)], 3]}",{},{},{},{}
1281,Active enterprises - FRIBS,Active enterprises - FRIBS. <Brief user-orient...,{},{},{},{},{}


### Step 6. Exporting the dataframe to Excel
***
This is useful for manual inspection and for designing rules to fine-tune the NER engine. The output can then be imported directly into the database.


In [9]:
## Write the full dataframe to an Excel workbook in the working directory.
excel_path = 'GL_df.xlsx'
GL_df.to_excel(excel_path)