# Statistics Explained articles: noun phrases and matching with Eurostat's Concepts and Definitions Database



In [1]:
import re
import pandas as pd
import sys


In [2]:
from datetime import datetime

def file_name(pre,ext):
    """Build a timestamped file name ``<pre>_<month>_<day>_<hour>_<minute>.<ext>``.

    The timestamp parts come from ``datetime.now()`` and are not zero-padded.

    Parameters
    ----------
    pre : str
        Prefix placed before the timestamp.
    ext : str
        File extension, without the leading dot.

    Returns
    -------
    str
        The assembled file name.
    """
    now = datetime.now()
    stamp = [now.month, now.day, now.hour, now.minute]
    stem = '_'.join([pre] + [str(part) for part in stamp])
    return stem + '.' + ext

In [3]:
import os

import pyodbc

# Credentials must never be stored in the notebook: they are read from the
# environment variables VIRTUOSO_UID and VIRTUOSO_PWD (a KeyError is raised if
# either is missing). NOTE(review): the password that used to be hard-coded
# here has been committed in clear text and should be rotated.
c = pyodbc.connect(
    'DSN=Virtuoso All;DBA=ESTAT;UID={uid};PWD={pwd}'.format(
        uid=os.environ['VIRTUOSO_UID'],
        pwd=os.environ['VIRTUOSO_PWD'],
    )
)
cursor = c.cursor()

In [4]:
#import unicodedata as ud

def clean(x, quotes=True):
    """Normalise a raw text value read from the database.

    Steps, in order:

    1. Null values (as detected by ``pd.isnull``) are returned unchanged.
    2. Leading/trailing whitespace is stripped.
    3. ``letter?letter`` becomes ``letter' letter`` (only when ``quotes`` is
       true), and ``letter? lowercase-letter`` becomes
       ``letter' lowercase-letter`` (always).
    4. Commas and single spaces between whole digit groups are removed, so
       ``1,000,000`` and ``1 000 000`` both become ``1000000``.
    5. Runs of spaces are collapsed, spaces at the start/end of every line
       are removed, and a space before a comma or a dot is removed.
    6. ``â`` followed by any two characters is replaced by a single quote.

    Parameters
    ----------
    x : str or null
        The text to clean.
    quotes : bool, default True
        Whether to rewrite a question mark enclosed by letters (step 3,
        first rule). The original comment says this must be disabled for
        lists of URLs.

    Returns
    -------
    str or null
        The cleaned text, or ``x`` itself when it is null.
    """
    if pd.isnull(x):
        return x
    x = x.strip()

    ## letter-question mark-letter -> letter-quote-space-letter (not for URL lists)
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])', '\\1\' \\2', x)

    ## letter-question mark-space-lower case letter -> letter-quote-space-letter
    x = re.sub(r'([A-Za-z])\? ([a-z])', '\\1\' \\2', x)

    ## delete commas between digit groups. The whole grouped number is matched
    ## at once: a pairwise pattern would only merge the first two groups,
    ## because matches cannot overlap ("1,000,000" -> "1000,000").
    x = re.sub(r'\b\d+(?:,\d+)+\b', lambda m: m.group(0).replace(',', ''), x)

    ## delete single spaces between digit groups (same reasoning as above)
    x = re.sub(r'\b\d+(?: \d+)+\b', lambda m: m.group(0).replace(' ', ''), x)

    ## collapse runs of spaces
    x = re.sub(r' +', ' ', x)

    ## remove spaces at the start and end of every line
    x = re.sub(r'^ +| +$', '', x, flags=re.MULTILINE)

    ## space-comma -> comma
    x = re.sub(r' \,', ',', x)

    ## space-dot -> dot
    x = re.sub(r' \.', '.', x)

    ## single quotes arrive mis-decoded as "â" plus two characters
    x = re.sub(r'â.{2}', "'", x)

    return x

### Statistics explained articles

* IDs and titles from dat_link_info, with resource_information_id=1, i.e. Eurostat (see ESTAT.V1.mod_resource_information) and matching IDs from dat_article.
* Carry out data cleansing on titles and URLs.

In [5]:
# IDs and titles of the link records with resource_information_id=1 whose id
# also exists in dat_article.
SQLCommand = """SELECT id, title 
                FROM ESTAT.V1.dat_link_info 
                WHERE resource_information_id=1 AND id IN (SELECT id FROM ESTAT.V1.dat_article) """

# Load the rows and clean the titles in one step.
SE_df = pd.read_sql(sql=SQLCommand, con=c).assign(
    title=lambda frame: frame['title'].apply(clean)
)
SE_df.head(5)


Unnamed: 0,id,title
0,7,Accidents at work statistics
1,13,National accounts and GDP
2,16,Railway safety statistics in the EU
3,17,Railway freight transport statistics
4,18,Railway passenger transport statistics - quart...


### Add paragraphs titles and contents

* From dat_article_paragraph with abstract=0 (i.e. "no").
* Match article_id from dat_article_paragraph with id from dat_article.
* Carry out data cleansing on titles and paragraph contents.

In [6]:
# Non-abstract paragraphs (abstract=0) of the articles present in dat_article.
SQLCommand = """SELECT article_id, title, content 
                FROM ESTAT.V1.dat_article_paragraph
                WHERE abstract=0 AND article_id IN (SELECT id FROM ESTAT.V1.dat_article) """

add_content = pd.read_sql(sql=SQLCommand, con=c)

# Apply the same text cleaning to both free-text columns.
for text_column in ('title', 'content'):
    add_content[text_column] = add_content[text_column].apply(clean)
add_content

Unnamed: 0,article_id,title,content
0,2905,Absences from work sharply increase in first h...,Absences from work recorded unprecedented high...
1,2905,Absences: 9.5 % of employment in Q4 2019 and 1...,The article's next figure (Figure 4) compares ...
2,2905,Higher share of absences from work among women...,"Considering all four quarters of 2020, the sha..."
3,2905,Absences from work due to own illness or disab...,"From Q4 2019 to Q4 2020, the number of people ..."
4,2905,Absences from work due to holidays,"Expressed as a share of employed people, absen..."
...,...,...,...
3854,10539,General presentation and definition,Scope of asylum statistics and Dublin statisti...
3855,10539,Methodological aspects in asylum statistics,Annual aggregate of the number of asylum appli...
3856,10539,Methodological aspects in Dublin statistics,Asymmetries For most of the collected Dublin s...
3857,10539,What questions can or cannot be answered with ...,How many asylum seekers are entering EU Member...


### Aggregate the above paragraph titles and contents from SE article paragraphs by article id

* Create a column _raw content_ which gathers all paragraph titles and contents in one text per article.

In [7]:
# Build one text per paragraph (' ' + title + ' ' + content) and concatenate
# the paragraph texts of each article, in their original row order. This
# yields the same strings as appending paragraph by paragraph, but in a single
# vectorised pass instead of one .loc write per paragraph, and it also works
# on an empty frame.
# Note: a missing title or content makes the paragraph text NaN and the join
# then raises a TypeError, so null paragraphs must be handled upstream.
paragraph_text = ' ' + add_content['title'] + ' ' + add_content['content']

add_content_grouped = (
    paragraph_text
    .groupby(add_content['article_id'])   # groups are sorted by article_id
    .agg(''.join)
    .rename('raw content')
    .reset_index()                        # columns: article_id, raw content
)

add_content_grouped

Unnamed: 0,article_id,raw content
0,7,"Number of accidents In 2018, there were 3.1 m..."
1,13,Developments for GDP in the EU-27: growth sin...
2,16,Fall in the number of railway accidents 9 % f...
3,17,Downturn for EU transport performance in 2019...
4,18,Rail passenger transport performance continue...
...,...,...
860,10456,Problem After successfully identifying and jo...
861,10470,"Problem In France, there was significant room..."
862,10506,General overview Nine PEEIs concern short-ter...
863,10531,What are administrative sources? The term 'ad...


### Merge raw content of SE articles with main file

* Also, add title to raw content.

In [8]:
# Keep only the articles that have paragraph text (inner join), drop the
# duplicated key column and prefix the raw content with the article title.
SE_df = (
    SE_df
    .merge(add_content_grouped, how='inner', left_on='id', right_on='article_id')
    .drop(columns=['article_id'])
    .assign(**{'raw content': lambda frame: frame['title'] + '. ' + frame['raw content']})
)

SE_df.head(5)

Unnamed: 0,id,title,raw content
0,7,Accidents at work statistics,Accidents at work statistics. Number of accid...
1,13,National accounts and GDP,National accounts and GDP. Developments for G...
2,16,Railway safety statistics in the EU,Railway safety statistics in the EU. Fall in ...
3,17,Railway freight transport statistics,Railway freight transport statistics. Downtur...
4,18,Railway passenger transport statistics - quart...,Railway passenger transport statistics - quart...


### Check for missing information

In [9]:
import numpy as np

# Treat empty strings as missing, then report the number of missing values per column.
SE_df = SE_df.replace('', np.nan)
missing_per_column = SE_df.isnull().sum()
print(missing_per_column)

id             0
title          0
raw content    0
dtype: int64


### Collecting information on noun phrases


In [10]:
import nltk
import re
import pprint
from nltk import Tree

# Chunk grammar passed to nltk.RegexpParser. All four rules produce a chunk
# labelled "NP"; they are written over part-of-speech tags (the tags produced
# by nltk.pos_tag in prepare_text):
#   1. <DT><WP>, optional <VBP> and <RB> tags, then <VBN><IN><NN>
#   2. a noun, optional <IN> tags, then one or more nouns
#   3. optional <JJ> tags, a noun, optional <CC> tags, then one or more nouns
#   4. optional <JJ> tags followed by one or more nouns
# where "noun" stands for any of the tags NN, NNS, NNP, NNPS.
new_patterns = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS>+}
           
    """

# Chunker shared by prepare_text and return_a_list_of_NPs.
new_NPChunker = nltk.RegexpParser(new_patterns)

def prepare_text(input):
    """Split a text into sentences and NP-chunk each of them.

    Each sentence found by ``nltk.sent_tokenize`` is word-tokenized,
    POS-tagged and parsed with ``new_NPChunker``.

    Parameters
    ----------
    input : str
        The text to process.

    Returns
    -------
    list
        One ``new_NPChunker.parse`` result per sentence, in sentence order.
    """
    chunked_sentences = []
    for sentence in nltk.sent_tokenize(input):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        chunked_sentences.append(new_NPChunker.parse(tagged_tokens))
    return chunked_sentences


def return_a_list_of_NPs(sentences):
    """Collect the text of every NP chunk found in the given sentences.

    Each sentence is parsed with ``new_NPChunker``; for every subtree labelled
    ``'NP'`` the words of its leaves are joined with single spaces.

    Parameters
    ----------
    sentences : iterable
        Sentences in a form accepted by ``new_NPChunker.parse`` (e.g. the
        output of ``prepare_text``).

    Returns
    -------
    list of str
        The noun phrases, in the order they are encountered.
    """
    return [
        ' '.join(token for token, _pos in chunk.leaves())
        for sentence in sentences
        for chunk in new_NPChunker.parse(sentence).subtrees()
        if chunk.label() == 'NP'
    ]


In [11]:
# Accumulates (article id, noun phrase) pairs for all articles.
d = []

for row in range(len(SE_df)):
    doc_id = SE_df.loc[row, 'id']
    chunked_sentences = prepare_text(SE_df.loc[row, 'raw content'])
    d.extend((doc_id, phrase) for phrase in return_a_list_of_NPs(chunked_sentences))

In [12]:
# One row per extracted noun phrase, keyed by the id of the article it came from.
nphrases_df = pd.DataFrame(data=d, columns=['doc_id', 'noun_phrase'])
nphrases_df

Unnamed: 0,doc_id,noun_phrase
0,7,Accidents at work statistics
1,7,Number of accidents
2,7,non-fatal accidents
3,7,calendar days
4,7,absence from work
...,...,...
418461,10539,stateless person
418462,10539,EURODAC Regulation
418463,10539,access
418464,10539,EU fingerprint database record


### Merge with the file with the SE titles.

In [13]:
# Attach the article title to every noun phrase; 'id' duplicates 'doc_id'
# after the join, so it is dropped.
nphrases_df2 = (
    SE_df[['id', 'title']]
    .merge(nphrases_df, left_on='id', right_on='doc_id')
    .drop(columns=['id'])
)
nphrases_df2

Unnamed: 0,title,doc_id,noun_phrase
0,Accidents at work statistics,7,Accidents at work statistics
1,Accidents at work statistics,7,Number of accidents
2,Accidents at work statistics,7,non-fatal accidents
3,Accidents at work statistics,7,calendar days
4,Accidents at work statistics,7,absence from work
...,...,...,...
418461,Asylum statistics introduced,10539,stateless person
418462,Asylum statistics introduced,10539,EURODAC Regulation
418463,Asylum statistics introduced,10539,access
418464,Asylum statistics introduced,10539,EU fingerprint database record


### Lemmatize noun phrases
* NLTK seems to lemmatize better than spaCy. Convert to lower case first.
* Keep only words with alphanumeric characters and drop stop-words.

In [14]:
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Shared helpers for lemmatize_text: English stop-word list, WordNet
# lemmatizer and a whitespace tokenizer.
stop = stopwords.words('english')

lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

  

In [15]:
def lemmatize_text(text):
    """Lemmatize the whitespace-separated tokens of ``text``.

    Tokens that are not purely alphanumeric (``str.isalnum``) or that appear
    in the ``stop`` word list are dropped; the remaining ones are passed
    through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    list of str
        The lemmas of the kept tokens, in their original order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        if not token.isalnum() or token in stop:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Normalise every noun phrase: lower-case -> lemmatize (dropping stop-words and
# non-alphanumeric tokens) -> join the lemmas with spaces -> upper-case.
nphrases_df2['normalized_noun_phrase'] = (
    nphrases_df2['noun_phrase']
    .apply(str.lower)
    .apply(lemmatize_text)
    .apply(lambda lemmas: ' '.join(map(str, lemmas)))
    .apply(str.upper)
)
nphrases_df2



Unnamed: 0,title,doc_id,noun_phrase,normalized_noun_phrase
0,Accidents at work statistics,7,Accidents at work statistics,ACCIDENT WORK STATISTIC
1,Accidents at work statistics,7,Number of accidents,NUMBER ACCIDENT
2,Accidents at work statistics,7,non-fatal accidents,ACCIDENT
3,Accidents at work statistics,7,calendar days,CALENDAR DAY
4,Accidents at work statistics,7,absence from work,ABSENCE WORK
...,...,...,...,...
418461,Asylum statistics introduced,10539,stateless person,STATELESS PERSON
418462,Asylum statistics introduced,10539,EURODAC Regulation,EURODAC REGULATION
418463,Asylum statistics introduced,10539,access,ACCESS
418464,Asylum statistics introduced,10539,EU fingerprint database record,EU FINGERPRINT DATABASE RECORD


In [16]:
# Phrases whose tokens were all filtered out are now empty strings: mark them
# as missing and drop the rows without a normalized phrase.
nphrases_df2 = (
    nphrases_df2
    .replace('', np.nan)
    .dropna(subset=['normalized_noun_phrase'])
)
nphrases_df2

Unnamed: 0,title,doc_id,noun_phrase,normalized_noun_phrase
0,Accidents at work statistics,7,Accidents at work statistics,ACCIDENT WORK STATISTIC
1,Accidents at work statistics,7,Number of accidents,NUMBER ACCIDENT
2,Accidents at work statistics,7,non-fatal accidents,ACCIDENT
3,Accidents at work statistics,7,calendar days,CALENDAR DAY
4,Accidents at work statistics,7,absence from work,ABSENCE WORK
...,...,...,...,...
418461,Asylum statistics introduced,10539,stateless person,STATELESS PERSON
418462,Asylum statistics introduced,10539,EURODAC Regulation,EURODAC REGULATION
418463,Asylum statistics introduced,10539,access,ACCESS
418464,Asylum statistics introduced,10539,EU fingerprint database record,EU FINGERPRINT DATABASE RECORD


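* Further processing: strip leftover punctuation, mis-encoded characters and digits from the normalized noun phrases.
* Drop noun phrases that consist of only one word.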

In [17]:
def _strip_noise(phrase):
    """Remove punctuation, digits and mis-encoded 'Â' from a normalized phrase.

    * 'Â' is replaced by 'A'.
    * The characters ( ) % ] / + - and all digit runs are deleted. The earlier
      patterns '/]', '\\+]' and '\\-]' carried a stray ']' and therefore only
      matched the two-character sequences "/]", "+]" and "-]".
    * Whitespace is collapsed and trimmed afterwards: deleting digits would
      otherwise leave double or leading spaces, which makes identical phrases
      compare as different and yields empty tokens when splitting on ' '.
    """
    phrase = phrase.replace('Â', 'A')
    phrase = re.sub(r'[()%\]/+\-]|\d+', '', phrase)
    return ' '.join(phrase.split())


nphrases_df2['normalized_noun_phrase'] = nphrases_df2['normalized_noun_phrase'].apply(_strip_noise)

# Number of words per normalized phrase (temporary helper column).
nphrases_df2['normalized_noun_phrase_count'] = nphrases_df2['normalized_noun_phrase'].apply(
    lambda x: len(x.replace(',', ' ').split())
)

# Rows with at most one word are dropped; print their index for inspection.
idx = nphrases_df2[nphrases_df2['normalized_noun_phrase_count'] <= 1].index
print(idx)

nphrases_df2 = nphrases_df2.drop(idx)

# Sanity check: no single-word phrase may remain (prints an empty index).
idx = nphrases_df2[nphrases_df2['normalized_noun_phrase_count'] <= 1].index
print(idx)

nphrases_df2 = nphrases_df2.drop(columns=['normalized_noun_phrase_count'])

Int64Index([     2,      7,      8,      9,     11,     15,     19,     20,
                22,     23,
            ...
            418434, 418437, 418439, 418440, 418443, 418445, 418446, 418452,
            418458, 418463],
           dtype='int64', length=200908)
Int64Index([], dtype='int64')


* Collect overall frequencies.

In [18]:
# Number of occurrences of each normalized phrase over all articles ...
tmp = (
    nphrases_df2
    .groupby(by='normalized_noun_phrase')
    .size()
    .to_frame('Overall_Frequencies')
)

# ... attached to every occurrence of that phrase.
nphrases_df2 = nphrases_df2.merge(tmp, on='normalized_noun_phrase')
nphrases_df2

Unnamed: 0,title,doc_id,noun_phrase,normalized_noun_phrase,Overall_Frequencies
0,Accidents at work statistics,7,Accidents at work statistics,ACCIDENT WORK STATISTIC,1
1,Accidents at work statistics,7,Number of accidents,NUMBER ACCIDENT,14
2,Accidents at work statistics,7,number of accidents,NUMBER ACCIDENT,14
3,Accidents at work statistics,7,number of accidents,NUMBER ACCIDENT,14
4,Accidents at work statistics,7,number of accidents,NUMBER ACCIDENT,14
...,...,...,...,...,...
182535,Asylum statistics introduced,10539,references Regulation,REFERENCE REGULATION,1
182536,Asylum statistics introduced,10539,Asylum Procedures Directive,ASYLUM PROCEDURE DIRECTIVE,1
182537,Asylum statistics introduced,10539,Reception Conditions Directive,RECEPTION CONDITION DIRECTIVE,1
182538,Asylum statistics introduced,10539,EURODAC Regulation,EURODAC REGULATION,1


* Collect frequencies per document and drop duplicates.

In [19]:
# Number of occurrences of each normalized phrase inside each article.
nphrases_df2['Frequencies_per_doc'] = (
    nphrases_df2
    .groupby(['doc_id', 'normalized_noun_phrase'])['normalized_noun_phrase']
    .transform('count')
)

# Keep one row per (article, phrase). Duplicates are identified by doc_id, the
# same key the frequency above is computed on; de-duplicating on the title
# would silently discard rows if two articles shared a title.
nphrases_df2 = (
    nphrases_df2
    .drop(columns=['noun_phrase'])
    .drop_duplicates(subset=['doc_id', 'normalized_noun_phrase'], ignore_index=False)
)

nphrases_df2

Unnamed: 0,title,doc_id,normalized_noun_phrase,Overall_Frequencies,Frequencies_per_doc
0,Accidents at work statistics,7,ACCIDENT WORK STATISTIC,1,1
1,Accidents at work statistics,7,NUMBER ACCIDENT,14,7
8,Railway safety statistics in the EU,16,NUMBER ACCIDENT,14,2
10,Accidents at work ? statistics on causes and c...,2947,NUMBER ACCIDENT,14,1
11,Road safety statistics ? characteristics at na...,7156,NUMBER ACCIDENT,14,2
...,...,...,...,...,...
182535,Asylum statistics introduced,10539,REFERENCE REGULATION,1,1
182536,Asylum statistics introduced,10539,ASYLUM PROCEDURE DIRECTIVE,1,1
182537,Asylum statistics introduced,10539,RECEPTION CONDITION DIRECTIVE,1,1
182538,Asylum statistics introduced,10539,EURODAC REGULATION,1,1


### Unique noun phrases in SE articles

In [20]:
# Unique normalized noun phrases, sorted by the groupby; the group sizes are
# only a by-product and are discarded.
res = (
    nphrases_df2
    .groupby(['normalized_noun_phrase'])
    .size()
    .reset_index(name='size')
    .drop(columns=['size'])
)
res

Unnamed: 0,normalized_noun_phrase
0,A LEVEL
1,A SINGLE PERSON
2,AASTERN EUROPEAN COUNTRY
3,ABBREVIATED NEET
4,ABBREVIATION ESA
...,...
57053,ZOOM BUTTON
57054,Ã LAND
57055,Ã LAND ISLAND
57056,Ã RDAL


### Eurostat's Concepts and Definitions Database


In [21]:
# All glossary entries: row id, code id and term.
SQLCommand = """SELECT id, code_id, term
                FROM ESTAT.V1.dat_estat_glossary 
             """

concepts_df = pd.read_sql(sql=SQLCommand, con=c)
concepts_df

Unnamed: 0,id,code_id,term
0,1,12789,"(n,k) rule"
1,2,12799,"(p,q) rule"
2,3,19247,Âµ-ARGUS
3,4,5545,Abandoned wine-growing area
4,5,20003,Abduction by a legal guardian
...,...,...,...
11215,11216,4277,Value adjustments on investments
11216,11217,4284,Value of transactions between affiliated enter...
11217,11218,4285,Value re-adjustments on investments
11218,11219,19099,Widened agricultural census


### Lemmatize terms

* Convert to lower-case first.

In [22]:
# Same normalisation as for the noun phrases: lower-case -> lemmatize
# (dropping stop-words and non-alphanumeric tokens) -> join -> upper-case.
concepts_df['lemmatized_term'] = (
    concepts_df['term']
    .apply(str.lower)
    .apply(lemmatize_text)
    .apply(lambda lemmas: ' '.join(map(str, lemmas)))
    .apply(str.upper)
)
concepts_df

Unnamed: 0,id,code_id,term,lemmatized_term
0,1,12789,"(n,k) rule",RULE
1,2,12799,"(p,q) rule",RULE
2,3,19247,Âµ-ARGUS,
3,4,5545,Abandoned wine-growing area,ABANDONED AREA
4,5,20003,Abduction by a legal guardian,ABDUCTION LEGAL GUARDIAN
...,...,...,...,...
11215,11216,4277,Value adjustments on investments,VALUE ADJUSTMENT INVESTMENT
11216,11217,4284,Value of transactions between affiliated enter...,VALUE TRANSACTION AFFILIATED ENTERPRISE
11217,11218,4285,Value re-adjustments on investments,VALUE INVESTMENT
11218,11219,19099,Widened agricultural census,WIDENED AGRICULTURAL CENSUS


### Unique noun phrases in SE articles

In [23]:
# Unique normalized noun phrases, sorted by the groupby; the group sizes are
# only a by-product and are discarded.
res = (
    nphrases_df2
    .groupby(['normalized_noun_phrase'])
    .size()
    .reset_index(name='size')
    .drop(columns=['size'])
)
res

Unnamed: 0,normalized_noun_phrase
0,A LEVEL
1,A SINGLE PERSON
2,AASTERN EUROPEAN COUNTRY
3,ABBREVIATED NEET
4,ABBREVIATION ESA
...,...
57053,ZOOM BUTTON
57054,Ã LAND
57055,Ã LAND ISLAND
57056,Ã RDAL


### Find matches per unique noun phrase

* Column 'Common' has a list with tuples per record/unique noun phrase from the SE articles: (id of the term in Concepts & Definitions with the match, entire Concepts & Definitions normalized term, the part that matches).
* Columns 'len_intersect' and 'len_union' contain the corresponding lengths of the intersection and the union of terms for the calculation of Jaccard similarities.
* Column Jaccard has lists with the corresponding Jaccard similarities. 


In [24]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop_words = set(stop)  # set membership instead of scanning the list per token

PROGRESS_EVERY = 100  # print a progress line every this many noun phrases

# Token lists of the glossary terms. Splitting on any whitespace (rather than
# on ' ') means an empty lemmatized term gives no tokens at all instead of the
# single empty-string token [''], which used to match every phrase containing
# a double space.
search_in = concepts_df['lemmatized_term'].apply(lambda x: x.split())
concept_token_sets = [set(tokens) for tokens in search_in]  # built once, not per phrase
concept_code_ids = concepts_df['code_id'].tolist()
concept_terms = concepts_df['lemmatized_term'].tolist()

# Inverted index: token -> positions of the glossary terms that contain it, so
# that each phrase is only compared with terms sharing at least one token.
token_to_concepts = {}
for j, tokens in enumerate(concept_token_sets):
    for token in tokens:
        token_to_concepts.setdefault(token, []).append(j)

common_col, intersect_col, union_col, jaccard_col = [], [], [], []

for i, phrase in enumerate(res['normalized_noun_phrase']):
    # Tokens of the phrase, excluding individual words which are stop-words.
    # (Deliberately not called `np`: that name is the numpy alias used by
    # other cells and must not be overwritten.)
    phrase_tokens = {el for el in phrase.split() if el.lower() not in stop_words}
    if (i+1) % PROGRESS_EVERY == 0:
        print(i+1,' of ',len(res),' unique noun phrases: set of terms: ',phrase_tokens)

    # Glossary terms with a non-empty intersection, in glossary row order.
    candidates = sorted({j for token in phrase_tokens for j in token_to_concepts.get(token, ())})

    matches, n_common, n_union, jaccard = [], [], [], []
    for j in candidates:
        common = phrase_tokens & concept_token_sets[j]
        uni = phrase_tokens | concept_token_sets[j]
        matches.append([concept_code_ids[j], concept_terms[j], common])
        n_common.append(len(common))
        n_union.append(len(uni))
        jaccard.append(len(common)/len(uni))

    common_col.append(matches)
    intersect_col.append(n_common)
    union_col.append(n_union)
    jaccard_col.append(jaccard)

# One list per unique noun phrase in each result column.
res['Common'] = pd.Series(common_col, index=res.index, dtype=object)
res['len_intersect'] = pd.Series(intersect_col, index=res.index, dtype=object)
res['len_union'] = pd.Series(union_col, index=res.index, dtype=object)
res['Jaccard'] = pd.Series(jaccard_col, index=res.index, dtype=object)

res

100  of  57058  unique noun phrases: set of terms:  {'YEAR', 'ACADEMIC'}
200  of  57058  unique noun phrases: set of terms:  {'ASSISTANCE', 'ACCOMMODATION'}
300  of  57058  unique noun phrases: set of terms:  {'ACCOUNT', 'INDUSTRY'}
400  of  57058  unique noun phrases: set of terms:  {'PRINCIPLE', 'ACCOUNTING'}
500  of  57058  unique noun phrases: set of terms:  {'WORKFORCE', 'ACTION', 'HEALTH', 'PLANNING'}
600  of  57058  unique noun phrases: set of terms:  {'ACCOUNT', 'ACTIVITY', 'HEALTH', 'EXPERT'}
700  of  57058  unique noun phrases: set of terms:  {'ACTIVITY', 'ACTUAL'}
800  of  57058  unique noun phrases: set of terms:  {'DATA', 'ADDITIONAL', 'ADMINISTRATIVE'}
900  of  57058  unique noun phrases: set of terms:  {'CHANGE', 'ADDRESS'}
1000  of  57058  unique noun phrases: set of terms:  {'MUNICIPALITY', 'LAND', 'ADMINISTRATION', 'FUND'}
1100  of  57058  unique noun phrases: set of terms:  {'INSTRUCTION', 'ADULT'}
1200  of  57058  unique noun phrases: set of terms:  {'PHYSICAL', 'AC

10400  of  57058  unique noun phrases: set of terms:  {'CROATIA', 'CYPRUS'}
10500  of  57058  unique noun phrases: set of terms:  {'DAIRY', 'PROCESS'}
10600  of  57058  unique noun phrases: set of terms:  {'DATA', 'PURPOSE', 'BUSINESS', 'STATISTIC'}
10700  of  57058  unique noun phrases: set of terms:  {'DATA', 'PRACTICE', 'DISSEMINATION'}
10800  of  57058  unique noun phrases: set of terms:  {'DATA', 'HUB', 'SOLUTION'}
10900  of  57058  unique noun phrases: set of terms:  {'DATA', 'NUT', 'REGION'}
11000  of  57058  unique noun phrases: set of terms:  {'DATA', 'SECURITY'}
11100  of  57058  unique noun phrases: set of terms:  {'DATABASE', 'EUROSTAT'}
11200  of  57058  unique noun phrases: set of terms:  {'DAY', 'MALTA'}
11300  of  57058  unique noun phrases: set of terms:  {'DEBT', 'INSTRUMENT'}
11400  of  57058  unique noun phrases: set of terms:  {'GREECE', 'DECLINE'}
11500  of  57058  unique noun phrases: set of terms:  {'LSU', 'DECREASE'}
11600  of  57058  unique noun phrases: set o

20600  of  57058  unique noun phrases: set of terms:  {'POTENTIAL', 'FULL'}
20700  of  57058  unique noun phrases: set of terms:  {'PREPARATION', 'PROJECT', 'FUNDING'}
20800  of  57058  unique noun phrases: set of terms:  {'SPAIN', 'GALICIA'}
20900  of  57058  unique noun phrases: set of terms:  {'GASEOUS', 'MATERIAL'}
21000  of  57058  unique noun phrases: set of terms:  {'GENDER', 'DIFFERENCE'}
21100  of  57058  unique noun phrases: set of terms:  {'REGULATION', 'IMPLEMENTING', 'GENERAL'}
21200  of  57058  unique noun phrases: set of terms:  {'GAP', 'GENERATIONAL'}
21300  of  57058  unique noun phrases: set of terms:  {'STRATIFICATION', 'GEOGRAPHICAL'}
21400  of  57058  unique noun phrases: set of terms:  {'REGION', 'GERMAN'}
21500  of  57058  unique noun phrases: set of terms:  {'GLACIAL', 'RETREAT'}
21600  of  57058  unique noun phrases: set of terms:  {'WORK', 'GMINAS'}
21700  of  57058  unique noun phrases: set of terms:  {'EXPORT', 'GOOD'}
21800  of  57058  unique noun phrases: 

30800  of  57058  unique noun phrases: set of terms:  {'SYSTEM', 'MANAGEMENT', 'MANURE'}
30900  of  57058  unique noun phrases: set of terms:  {'MANY', 'REGION', 'MOUNTAINOUS'}
31000  of  57058  unique noun phrases: set of terms:  {'MARGINAL', 'CHANGE'}
31100  of  57058  unique noun phrases: set of terms:  {'MARKET', 'ENTRY'}
31200  of  57058  unique noun phrases: set of terms:  {'ISLAND', 'MARSHALL'}
31300  of  57058  unique noun phrases: set of terms:  {'RATE', 'MATH'}
31400  of  57058  unique noun phrases: set of terms:  {'MEAN', 'INCREASE'}
31500  of  57058  unique noun phrases: set of terms:  {'MEASUREMENT', 'DISABILITY'}
31600  of  57058  unique noun phrases: set of terms:  {'GOOD', 'MEDICAL'}
31700  of  57058  unique noun phrases: set of terms:  {'MEETING', 'PARIS'}
31800  of  57058  unique noun phrases: set of terms:  {'MENTAL', 'HEALTHCARE', 'SERVICE'}
31900  of  57058  unique noun phrases: set of terms:  {'METHOD', 'SOIL', 'SAMPLING'}
32000  of  57058  unique noun phrases: se

41200  of  57058  unique noun phrases: set of terms:  {'CONTINUOUS', 'SURVEY', 'QUARTERLY'}
41300  of  57058  unique noun phrases: set of terms:  {'AREA', 'RAIL'}
41400  of  57058  unique noun phrases: set of terms:  {'EU', 'AREA', 'POLICY', 'RANGE'}
41500  of  57058  unique noun phrases: set of terms:  {'RAPID', 'ACCELERATION'}
41600  of  57058  unique noun phrases: set of terms:  {'GERMANY', 'RATE'}
41700  of  57058  unique noun phrases: set of terms:  {'RATIO', 'AIR', 'PASSENGER'}
41800  of  57058  unique noun phrases: set of terms:  {'RATIO', 'TAX'}
41900  of  57058  unique noun phrases: set of terms:  {'REASON', 'COMPARABILITY'}
42000  of  57058  unique noun phrases: set of terms:  {'PERIOD', 'RECENT'}
42100  of  57058  unique noun phrases: set of terms:  {'RECORD', 'HEIGHT'}
42200  of  57058  unique noun phrases: set of terms:  {'RECYCLED', 'AMOUNT'}
42300  of  57058  unique noun phrases: set of terms:  {'REDUCTION', 'OVERTIME'}
42400  of  57058  unique noun phrases: set of terms

51500  of  57058  unique noun phrases: set of terms:  {'FIBRE', 'TEXTILE'}
51600  of  57058  unique noun phrases: set of terms:  {'THIRD', 'CONFERENCE', 'INTERNATIONAL'}
51700  of  57058  unique noun phrases: set of terms:  {'BUDGET', 'TIGHT'}
51800  of  57058  unique noun phrases: set of terms:  {'PER', 'TIME', 'YEAR'}
51900  of  57058  unique noun phrases: set of terms:  {'CONSUMPTION', 'TOBACCO'}
52000  of  57058  unique noun phrases: set of terms:  {'BEET', 'SUGAR', 'TONNE'}
52100  of  57058  unique noun phrases: set of terms:  {'ARABLE', 'LAND', 'TOTAL'}
52200  of  57058  unique noun phrases: set of terms:  {'FRESH', 'FRUIT', 'TOTAL'}
52300  of  57058  unique noun phrases: set of terms:  {'OUTLAY', 'TOTAL'}
52400  of  57058  unique noun phrases: set of terms:  {'GENERATION', 'TOTALITY', 'ELECTRICITY'}
52500  of  57058  unique noun phrases: set of terms:  {'TOWN', 'VILLAGE'}
52600  of  57058  unique noun phrases: set of terms:  {'TRADE', 'STATE', 'MEMBER'}
52700  of  57058  unique 

Unnamed: 0,normalized_noun_phrase,Common,len_intersect,len_union,Jaccard
0,A LEVEL,"[[220, ATTACHMENT LEVEL, {LEVEL}], [273, BASIC...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3, 3, 4, 2, 2, 4, 2, 2, 5, 2, 4, 3, 1, 1, ...","[0.5, 0.3333333333333333, 0.3333333333333333, ..."
1,A SINGLE PERSON,"[[13778, ACCIDENT PERSON CAUSED ROLLING STOCK ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7, 7, 6, 4, 3, 4, 3, 5, 3, 3, 3, 3, 5, 5, 4, ...","[0.14285714285714285, 0.14285714285714285, 0.1..."
2,AASTERN EUROPEAN COUNTRY,"[[8557, BRIDGE COUNTRY, {COUNTRY}], [17029, CO...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4, 7, 6, 4, 4, 10, 4, 4, 4, 4, 4, 5, 6, 4, 4,...","[0.25, 0.14285714285714285, 0.1666666666666666..."
3,ABBREVIATED NEET,[],[],[],[]
4,ABBREVIATION ESA,[],[],[],[]
...,...,...,...,...,...
57053,ZOOM BUTTON,[],[],[],[]
57054,Ã LAND,"[[43, ACQUISITION LE DISPOSAL LAND TANGIBLE AS...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7, 3, 3, 3, 3, 5, 3, 7, 3, 5, 5, 4, 3, 3, 5, ...","[0.14285714285714285, 0.3333333333333333, 0.33..."
57055,Ã LAND ISLAND,"[[43, ACQUISITION LE DISPOSAL LAND TANGIBLE AS...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[8, 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 5, 4, 4, 6, ...","[0.125, 0.25, 0.25, 0.25, 0.25, 0.166666666666..."
57056,Ã RDAL,[],[],[],[]


In [25]:
# Write the matches to an Excel file whose name carries the current timestamp.
outfile = file_name(pre='SE_vs_Eurostat_Noun_Phrases', ext='xlsx')
res.to_excel(excel_writer=outfile)