In [49]:
import pandas as pd
# Load the flair NER output for the OECD corpus. The preview of this frame shows the
# columns entity, entity_type, sentence, span, docid and model.
# NOTE(review): read_csv's default NA parsing turns strings such as "na", "null" or
# "nan" into NaN, which could swallow lower-cased entity text -- confirm this is acceptable.
df = pd.read_csv('data-files/full-flair-ner-list-oecd-corpus.csv')

In [50]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,entity,entity_type,sentence,span,docid,model
0,oecd,ORG,preface as part of the oecd programme on water...,23:27,39,flair - FLERT and XML embeddings
1,peru,GPE,this report is the result of a two-year policy...,113:117,39,flair - FLERT and XML embeddings
2,peru,GPE,2021 coincides with peru’s 200 years of indepe...,20:24,39,flair - FLERT and XML embeddings
3,peruvians,NORP,two-thirds of peruvians live in the pacific hy...,14:23,39,flair - FLERT and XML embeddings
4,peru,GPE,the recovery from covid-19 presents a unique w...,71:75,39,flair - FLERT and XML embeddings


In [51]:
# How many distinct entity strings occur in the corpus, all entity types pooled?
n = len(df['entity'].unique())
print("no. of unique named entities : ", n)

# Uncomment to eyeball the first 100 distinct entities:
# for item in df['entity'].unique()[:100]:
#     print(item)

no. of unique named entities :  20465


In [52]:
# Distinct entity strings per entity type. Despite the `_df` suffix the result is a
# Series indexed by entity_type.
entity_counts_by_type_df = (
    df.groupby('entity_type')['entity']
    .nunique()
)
entity_counts_by_type_df

entity_type
FAC        298
GPE       3372
LOC       2380
NORP       270
ORG       9970
PERSON    5930
Name: entity, dtype: int64

In [53]:
# One subset of `df` per entity type, unpacked in the same order as the labels below.
fac_df, gpe_df, loc_df, norp_df, org_df, per_df = (
    df.loc[df['entity_type'].eq(label)]
    for label in ('FAC', 'GPE', 'LOC', 'NORP', 'ORG', 'PERSON')
)

In [54]:
# Preview the first rows of the ORG-only subset.
org_df.head()

Unnamed: 0,entity,entity_type,sentence,span,docid,model
0,oecd,ORG,preface as part of the oecd programme on water...,23:27,39,flair - FLERT and XML embeddings
7,oecd,ORG,"the oecd stands ready to support peru design, ...",4:8,39,flair - FLERT and XML embeddings
10,oecd,ORG,ángel gurría oecd secretary-general gabriel qu...,13:17,39,flair - FLERT and XML embeddings
14,oecd,ORG,ángel gurría oecd secretary-general gabriel qu...,224:228,39,flair - FLERT and XML embeddings
15,oecd,ORG,this report expands the global outreach of oec...,43:47,39,flair - FLERT and XML embeddings


In [55]:
# Number of mentions of each distinct ORG entity string, most frequent first.
org_df['entity'].value_counts()

oecd                                                     12390
ana                                                        800
eu                                                         786
conagua                                                    469
world bank                                                 456
                                                         ...  
committee of water and sanitation users of grenoble          1
, food and the environment                                   1
robvq                                                        1
the commission for the protection of the danube river        1
rbmps                                                        1
Name: entity, Length: 9970, dtype: int64

In [56]:
# Case-insensitive search for ORG entities that mention "Rockefeller".
# na=False counts missing entity values as non-matches so the mask stays strictly
# boolean; without it a single NaN in the column makes the .loc lookup raise.
org_df.loc[org_df['entity'].str.contains('Rockefeller', case=False, na=False)]

Unnamed: 0,entity,entity_type,sentence,span,docid,model
40464,the rockefeller foundation,ORG,initiated by the presidential hurricane sandy ...,131:157,13,flair - FLERT and XML embeddings
41609,the rockefeller foundation ’s,ORG,"in 2012, circle of blue received the rockefell...",33:61,13,flair - FLERT and XML embeddings
70024,rockefeller foundation,ORG,the city resilience index (box 3.2) provides a...,211:233,43,flair - FLERT and XML embeddings
70028,the rockefeller foundation,ORG,source: the rockefeller foundation/arup (2016)...,8:34,43,flair - FLERT and XML embeddings
70136,the rockefeller foundation,ORG,"the rockefeller foundation/arup (2016), city r...",0:26,43,flair - FLERT and XML embeddings
102543,rockefeller foundation,ORG,water quality trading be effective to tackle w...,180:202,17,flair - FLERT and XML embeddings
102933,rockefeller foundation,ORG,"rockefeller foundation (2015), “incentive-base...",0:22,17,flair - FLERT and XML embeddings
102937,the rockefeller foundation,ORG,"rockefeller foundation (2015), “incentive-base...",216:242,17,flair - FLERT and XML embeddings


In [57]:
# Conagua - Comisión Nacional del Agua (National Water Commission, Mexico)
# Ana - national water agency of Brazil

In [58]:
from pandas.api.types import is_categorical_dtype

# Show the 20 most frequent values of every ORG-subset column except 'sentence' and 'span'.
for col in org_df.columns:
    if col in ('sentence', 'span'):
        continue
    values = org_df[col]
    # Inspect the column's dtype, not its label: a label string passed to a dtype
    # check never reports "categorical", so the conversion below would never run.
    if isinstance(values.dtype, pd.CategoricalDtype):
        # Counting on the string form keeps unobserved categories out of the table.
        values = values.astype('str')
    display(pd.DataFrame(values.value_counts().sort_values(ascending=False).head(20)))

Unnamed: 0,entity
oecd,12390
ana,800
eu,786
conagua,469
world bank,456
european commission,213
the european union,186
fao,153
the world bank,144
the,137


Unnamed: 0,entity_type
ORG,42268


Unnamed: 0,docid
26,2011
30,1881
5,1838
39,1710
10,1465
46,1458
52,1271
13,1252
53,1223
32,1184


Unnamed: 0,model
flair - FLERT and XML embeddings,42268


In [59]:
# Show the 20 least frequent values of every ORG-subset column except 'sentence' and 'span'.
for col in org_df.columns:
    if col in ('sentence', 'span'):
        continue
    values = org_df[col]
    # Inspect the column's dtype, not its label: a label string passed to a dtype
    # check never reports "categorical", so the conversion below would never run.
    if isinstance(values.dtype, pd.CategoricalDtype):
        # Counting on the string form keeps unobserved categories out of the table.
        values = values.astype('str')
    display(pd.DataFrame(values.value_counts().sort_values(ascending=True).head(20)))

Unnamed: 0,entity
institute of the national fund for workers’ housing,1
icrc,1
public services regulatory commission of the republic of armenia,1
water regulatory authority of albania,1
the greater bilbao water partnership,1
the water management association of the,1
“ metro vancouver ”,1
metro vancouver,1
barcelona metropolitan area,1
conseil communautaire,1


Unnamed: 0,entity_type
ORG,42268


Unnamed: 0,docid
29,241
35,250
1,263
41,266
0,277
45,290
6,344
31,374
14,376
51,409


Unnamed: 0,model
flair - FLERT and XML embeddings,42268


In [60]:
# Show the 20 most frequent values of every PERSON-subset column except 'sentence' and 'span'.
for col in per_df.columns:
    if col in ('sentence', 'span'):
        continue
    values = per_df[col]
    # Inspect the column's dtype, not its label: a label string passed to a dtype
    # check never reports "categorical", so the conversion below would never run.
    if isinstance(values.dtype, pd.CategoricalDtype):
        # Counting on the string form keeps unobserved categories out of the table.
        values = values.astype('str')
    display(pd.DataFrame(values.value_counts().sort_values(ascending=False).head(20)))

Unnamed: 0,entity
fao,54
ana,53
margat,43
shah,40
van der gun,39
xavier leflaive,39
j.,37
van,36
hutton,36
kim,32


Unnamed: 0,entity_type
PERSON,10753


Unnamed: 0,docid
49,823
30,752
46,568
52,557
28,455
7,444
39,435
26,416
16,385
10,360


Unnamed: 0,model
flair - FLERT and XML embeddings,10753


In [61]:
# Show the 20 least frequent values of every PERSON-subset column except 'sentence' and 'span'.
for col in per_df.columns:
    if col in ('sentence', 'span'):
        continue
    values = per_df[col]
    # Inspect the column's dtype, not its label: a label string passed to a dtype
    # check never reports "categorical", so the conversion below would never run.
    if isinstance(values.dtype, pd.CategoricalDtype):
        # Counting on the string form keeps unobserved categories out of the table.
        values = values.astype('str')
    display(pd.DataFrame(values.value_counts().sort_values(ascending=True).head(20)))

Unnamed: 0,entity
allain-dupré,1
ben cheikh,1
pegram,1
arthur pigou,1
burdon,1
atkins,1
georgiou,1
goffe,1
schuhmann,1
reed,1


Unnamed: 0,entity_type
PERSON,10753


Unnamed: 0,docid
14,12
35,17
22,19
18,21
45,22
15,29
55,34
27,41
31,44
25,48


Unnamed: 0,model
flair - FLERT and XML embeddings,10753
