In [1]:
import pandas as pd

import re

import nltk
from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

## Definitions

In [2]:
# extracting words with labels: person, gpe, location and organisation
def extract_ne_from_tree(tree):
    """Collect the named entities of interest from a chunked parse tree.

    Walks every subtree of ``tree`` and keeps those labelled PERSON, GPE,
    LOCATION or ORGANIZATION.

    Args:
        tree: object exposing ``subtrees()``; each subtree must provide
            ``label()`` and ``leaves()``, and every leaf must be indexable
            with the token text as its first item.

    Returns:
        List of ``(label, text)`` tuples in traversal order, where ``text``
        is the subtree's leaf tokens joined by single spaces.
    """
    wanted_labels = ('PERSON', 'GPE', 'LOCATION', 'ORGANIZATION')
    entities = []
    for subtree in tree.subtrees():
        tag = subtree.label()
        if tag not in wanted_labels:
            continue
        words = [leaf[0] for leaf in subtree.leaves()]
        entities.append((tag, ' '.join(words)))
    return entities

In [3]:
#tokenise, pos tagging, extracting words with labels above & place them into a list
def ner_info_extraction(content ,content_list):
    """Run the NLTK NER pipeline over each text and collect the results.

    Every text in ``content`` is split into sentences; each sentence is
    tokenised, POS-tagged and NE-chunked, and its entities are pulled out
    with ``extract_ne_from_tree``.

    Args:
        content: iterable of texts (strings).
        content_list: list that receives one new entry per text; each entry
            is a list with one item per sentence, holding the
            ``(label, text)`` list returned by ``extract_ne_from_tree``.

    Returns:
        None. Results are delivered through ``content_list``.
    """
    for article in content:
        per_sentence = []
        for sentence in nltk.sent_tokenize(article):
            tagged = nltk.pos_tag(word_tokenize(sentence))
            chunked = nltk.ne_chunk(tagged)
            per_sentence.append(extract_ne_from_tree(chunked))
        content_list.append(per_sentence)

In [4]:
# Since cnn is present at the start of all articles, it creates noise in the entities extracted
def ner_sorting(content ,location, gpe, person, org):
    """Group NLTK entities by label, de-duplicated per article.

    Args:
        content: iterable of articles; each article is an iterable of
            sentences, and each sentence is a sized sequence of
            ``(label, text)`` pairs.
        location: output list for LOCATION entities.
        gpe: output list for GPE entities.
        person: output list for PERSON entities.
        org: output list for ORGANIZATION entities.

    Returns:
        None. For every article one list of unique entity texts (in
        first-seen order) is appended to each output list. Entities whose
        text is exactly 'CNN' are dropped; pairs carrying any other label
        are ignored.
    """
    for article in content:
        unique = {"LOCATION": [], "GPE": [], "PERSON": [], "ORGANIZATION": []}
        for sentence in article:
            if len(sentence) == 0:
                continue
            for pair in sentence:
                text = pair[1]
                if text == 'CNN':
                    continue
                bucket = unique.get(pair[0])
                if bucket is not None and text not in bucket:
                    bucket.append(text)
        location.append(unique["LOCATION"])
        gpe.append(unique["GPE"])
        person.append(unique["PERSON"])
        org.append(unique["ORGANIZATION"])

In [5]:
# Extraction of entities using spaCy
def spacy_ner(content):
    """Extract ORG, PERSON, GPE and LOC entities from each text with spaCy.

    Args:
        content: iterable of texts; each one is run through the
            module-level ``nlp`` pipeline.

    Returns:
        List with one entry per text; each entry is a list of
        ``[entity_text, entity_label]`` pairs in document order.
    """
    kept_labels = ("ORG", "PERSON", "GPE", "LOC")
    return [
        [[ent.text, ent.label_] for ent in nlp(text).ents if ent.label_ in kept_labels]
        for text in content
    ]

In [6]:
# Since cnn is present at the start of all articles, it creates noise in the entities extracted
def spacy_sorting(content):
    """Group spaCy entities by label, de-duplicated per article.

    Args:
        content: iterable of articles; each article is a sized sequence of
            ``[text, label]`` pairs.

    Returns:
        ``[org, gpe, person, location]``: four lists with one entry per
        article, each entry holding the unique entity texts (in first-seen
        order) labelled ORG, GPE, PERSON and LOC respectively. Entities
        whose text is exactly "CNN" are dropped; other labels are ignored.
    """
    label_order = ("ORG", "GPE", "PERSON", "LOC")
    grouped = {label: [] for label in label_order}
    for article in content:
        unique = {label: [] for label in label_order}
        if len(article) > 0:
            for pair in article:
                text = pair[0]
                if text == "CNN":
                    continue
                bucket = unique.get(pair[1])
                if bucket is not None and text not in bucket:
                    bucket.append(text)
        for label in label_order:
            grouped[label].append(unique[label])
    return [grouped[label] for label in label_order]

## CNA news

In [7]:
import glob
import os

# Location of the preprocessed CNA articles -- adjust to your own checkout.
path = r'../2.text_preprocessing/cna/cna_FINAL.csv'

# Load the news articles, keeping only the article URL and the body text.
cna = pd.read_csv(path).loc[:, ['url', 'text']]
print(cna.shape)
cna

(139, 2)


Unnamed: 0,url,text
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH :Bayern Munich forward Eric-Maxim Choup...
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...
...,...,...
134,https://www.channelnewsasia.com/sport/fa-cup-q...,LONDON : FA Cup quarter-final draw made on Wed...
135,https://www.channelnewsasia.com/sport/man-unit...,"MANCHESTER, England :Manchester United fought ..."
136,https://www.channelnewsasia.com/sport/formula-...,Statistics for Sunday's season-opening Bahrain...
137,https://www.channelnewsasia.com/sport/prop-hao...,Prop Mohamed Haouas will miss France's remaini...


NLTK extraction

In [8]:
# Run the NLTK pipeline over every article body. The results accumulate in
# cna_content: one entry per article, holding one entity list per sentence.
cna_content = list()
ner_info_extraction(cna.loc[:, 'text'], cna_content)

print(cna_content)

[[[('ORGANIZATION', 'AC Milan'), ('ORGANIZATION', 'Champions League'), ('ORGANIZATION', 'Tottenham Hotspur')], [('ORGANIZATION', 'Italians'), ('ORGANIZATION', 'San Siro'), ('GPE', 'Tottenham')], [('PERSON', 'Tottenham'), ('PERSON', 'Antonio Conte'), ('PERSON', 'Cristian Romero')], [('PERSON', 'Milan')], [('PERSON', 'Stefano Pioli'), ('PERSON', 'Milan'), ('PERSON', 'Tottenham'), ('ORGANIZATION', 'Sheffield United'), ('PERSON', 'Wolverhampton Wanderers')], [('PERSON', 'Milan'), ('GPE', 'European'), ('PERSON', 'Pioli')], [], [], [('PERSON', 'Pioli')], [], [('PERSON', 'Tottenham')], [], [('GPE', 'Tottenham'), ('PERSON', 'Harry Kane'), ('PERSON', 'Son Heung-min'), ('PERSON', 'Dejan Kulusevski'), ('PERSON', 'Milan'), ('PERSON', 'Junior Messias')], [('ORGANIZATION', 'Italians'), ('PERSON', 'Brahim Diaz'), ('PERSON', 'Fraser Forster')], [('PERSON', 'Tottenham'), ('PERSON', 'Milan'), ('PERSON', 'Mike Maignan')], [('PERSON', 'Tottenham'), ('PERSON', 'Rafael Leao')], [('PERSON', 'Kane'), ('PERSON

In [9]:
# Split the NLTK entities by label; each list receives one inner list of
# unique entity texts per article.
location_with_brac, gpe_with_brac, person_with_brac, org_with_brac = [], [], [], []

ner_sorting(
    cna_content,
    location_with_brac,
    gpe_with_brac,
    person_with_brac,
    org_with_brac,
)

# Show the four groups, separated by blank lines.
print(
    location_with_brac,
    gpe_with_brac,
    person_with_brac,
    org_with_brac,
    sep="\n\n",
)

[[], ['Bayern'], [], [], [], [], [], ['Northern Irishman'], [], ['Southern California'], [], [], [], [], [], ['East Brabant'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United'], [], ['South Africa', 'Caribbean'], [], [], [], ['West'], [], [], [], [], [], [], [], [], ['West', 'West Indies'], [], [], [], [], [], [], [], [], ['South'], [], [], [], [], [], ['Southeast'], ['South Korean'], [], [], [], [], ['West Ham United'], [], [], ['West Germany', 'West German'], [], [], [], [], [], [], [], ['Sebastian Munoz'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United'], [], ['South Africa', 'Caribbean'], [], [], [], [], ['West Ham United', 'West Bromwich'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Germany', 'West German'], ['West'], [], ['Western Force'], [], [], [], [], [], [], ['West Ham'], [], [], []]

[['Tottenham', 'European'], ['MUNICH', 'Germans', 'Paris', 'Bayern', 'Europe

In [10]:
# Store each article's NLTK entities as one delimited string per entity type.
# NOTE: locations are joined with ',' while the other types use ', '.
cna['nltk_location'] = [','.join(map(str, ents)) for ents in location_with_brac]
cna['nltk_gpe'] = [', '.join(map(str, ents)) for ents in gpe_with_brac]
cna['nltk_person'] = [', '.join(map(str, ents)) for ents in person_with_brac]
cna['nltk_org'] = [', '.join(map(str, ents)) for ents in org_with_brac]

cna.head()

Unnamed: 0,url,text,nltk_location,nltk_gpe,nltk_person,nltk_org
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...,,"Tottenham, European","Tottenham, Antonio Conte, Cristian Romero, Mil...","AC Milan, Champions League, Tottenham Hotspur,..."
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH :Bayern Munich forward Eric-Maxim Choup...,Bayern,"MUNICH, Germans, Paris, Bayern, Europe, French...","Bayern Munich, Serge Gnabry, Paris St Germain,...","Champions League, PSG, VfB Stuttgart, Sommer, ..."
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...,,"Brighton, United, Chelsea, Norway, Blues, Women","Chelsea, Guro Reiten, Johanna Rytting Kaneryd,...","Super League, Jess Carter, Swede"
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...,,"London, Europe, European, Milan, Serie","Milan, Stefano Pioli, Tottenham Hotspur, Pioli...","Champions League, AS Roma"
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...,,"Tottenham, London, Italian, Italy","Antonio Conte, Conte, Juventus, Chelsea, Totte...","Champions League, Tottenham Hotspur, AC Milan,..."


spaCy extraction

In [11]:
# Run spaCy NER over every article body (one list of [text, label] pairs
# per article).
cont = cna['text'].to_list()
cna_list = spacy_ner(cont)

print(cna_list)

[[['LONDON', 'GPE'], ['AC Milan', 'ORG'], ['London', 'GPE'], ['Tottenham', 'GPE'], ['Tottenham', 'GPE'], ['Antonio Conte', 'PERSON'], ['Romero', 'PERSON'], ['Milan', 'GPE'], ["Stefano Pioli's", 'PERSON'], ['Milan', 'GPE'], ['Tottenham', 'GPE'], ['Sheffield United', 'GPE'], ['Wolverhampton Wanderers', 'PERSON'], ['Milan', 'GPE'], ['Pioli', 'GPE'], ['Tottenham', 'ORG'], ['Pioli', 'GPE'], ['GEAR', 'ORG'], ['Tottenham', 'GPE'], ['Tottenham', 'ORG'], ['Harry Kane', 'PERSON'], ['Son Heung-min', 'PERSON'], ['Dejan Kulusevski', 'PERSON'], ['Milan', 'GPE'], ['Brahim Diaz', 'PERSON'], ['Fraser Forster', 'GPE'], ['Tottenham', 'GPE'], ['Milan', 'GPE'], ['Mike Maignan', 'PERSON'], ['Pierre-Emile Hojbjerg', 'PERSON'], ['Tottenham', 'ORG'], ['Rafael Leao', 'PERSON'], ['Kane', 'PERSON'], ['Tottenham', 'GPE'], ['Richarlison', 'PERSON'], ['Emerson', 'ORG'], ['Conte', 'PERSON'], ['Kulusevski', 'PERSON'], ['Davinson Sanchez', 'PERSON'], ['Kane', 'PERSON'], ['Maignan', 'PERSON'], ['Milan', 'GPE'], ['Divock

In [12]:
# Group the spaCy entities by label. spacy_sorting returns
# [org, gpe, person, location], each with one list per article.
sort = spacy_sorting(cna_list)

# Show the four groups, separated by blank lines.
print(sort[0], sort[1], sort[2], sort[3], sep="\n\n")

[['AC Milan', 'Tottenham', 'GEAR', 'Emerson'], ['Choupo-Moting', 'Mbappe', 'Matthijs de Ligt', 'Bayern'], ['Chelsea', 'Brighton & Hove Albion', "Women's Super League", 'Arsenal', "the Women's League Cup"], ['AC Milan', 'the Champions League', 'Serie A', 'Champions League', 'Chelsea'], ['Champions League', 'AC Milan', 'Juventus', 'Inter Milan', 'Chelsea', 'the Premier League', 'Tottenham', 'the Europa Conference League'], ['the Champions League', 'Bayern', 'Mbappe'], [], ['the PGA Tour', 'a Hall of Fame', 'Sawgrass', 'PGA Tour Player', 'TRIUMPH\r\n\r\nMcIlroy', "the PGA Tour's", 'Scheffler', 'PGA Tour', 'LIV Golf', 'top-10', 'TPC Sawgrass', 'Anirban Lahiri'], ['Woods'], ['Indian Wells', 'Grand Slam'], ['Commonwealth Sport Canada', 'CSC', "Commonwealth Sport Canada's", 'the Alberta 2030 Commonwealth Games Corporation', 'the Commonwealth Games Federation'], ['WELLS', 'Netflix'], ['Juventus', "Juventus' Europa League last-16", 'Pogba', 'Manchester United', 'Serie A.'], ['Mercedes', 'Bahrai

In [13]:
# Store each article's spaCy entities as one delimited string per entity type.
# NOTE: organisations are joined with ',' while the other types use ', '.
cna['spacy_org'] = [','.join(map(str, ents)) for ents in sort[0]]
cna['spacy_gpe'] = [', '.join(map(str, ents)) for ents in sort[1]]
cna['spacy_person'] = [', '.join(map(str, ents)) for ents in sort[2]]
cna['spacy_location'] = [', '.join(map(str, ents)) for ents in sort[3]]

cna.head()

Unnamed: 0,url,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...,,"Tottenham, European","Tottenham, Antonio Conte, Cristian Romero, Mil...","AC Milan, Champions League, Tottenham Hotspur,...","AC Milan,Tottenham,GEAR,Emerson","LONDON, London, Tottenham, Milan, Sheffield Un...","Antonio Conte, Romero, Stefano Pioli's, Wolver...",
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH :Bayern Munich forward Eric-Maxim Choup...,Bayern,"MUNICH, Germans, Paris, Bayern, Europe, French...","Bayern Munich, Serge Gnabry, Paris St Germain,...","Champions League, PSG, VfB Stuttgart, Sommer, ...","Choupo-Moting,Mbappe,Matthijs de Ligt,Bayern","Paris, Neymar, Messi","Bayern Munich, Eric-Maxim Choupo-Moting, Serge...",Europe
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...,,"Brighton, United, Chelsea, Norway, Blues, Women","Chelsea, Guro Reiten, Johanna Rytting Kaneryd,...","Super League, Jess Carter, Swede","Chelsea,Brighton & Hove Albion,Women's Super L...","LONDON, Manchester United, Norway, Brighton, L...","Johanna Rytting Kaneryd, Sam Kerr, Jess Carter...",
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...,,"London, Europe, European, Milan, Serie","Milan, Stefano Pioli, Tottenham Hotspur, Pioli...","Champions League, AS Roma","AC Milan,the Champions League,Serie A,Champion...","LONDON, Milan, London, Pioli, Tottenham","Stefano Pioli, Bayern Munich, Benfica",Europe
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...,,"Tottenham, London, Italian, Italy","Antonio Conte, Conte, Juventus, Chelsea, Totte...","Champions League, Tottenham Hotspur, AC Milan,...","Champions League,AC Milan,Juventus,Inter Milan...","LONDON, London, Tottenham, Italy, Amazon Prime...","Antonio Conte's, Conte, Wolverhampton Wanderers",


In [14]:
# Save the articles together with their NLTK and spaCy entity columns.
cna.to_csv('NER_data/cna_NER.csv')

### Evaluation

In [15]:
# Location of the gold-truth file -- adjust to your own checkout.
path = r'../1.raw_data/golden_truth/golden_truth_cna.csv'

# Load the gold truth generated by ChatGPT, keeping the URL and the four
# entity columns.
gold_columns = ['url', 'organisation', 'gpe', 'location', 'person']
cna_gold = pd.read_csv(path).loc[:, gold_columns]
print(cna_gold.shape)
cna_gold

(50, 5)


Unnamed: 0,url,organisation,gpe,location,person
0,https://www.channelnewsasia.com/sport/djokovic...,ATP Tour,"Serbia, Czech Republic","Adelaide, Melbourne Park, Dubai","Novak Djokovic, Rafa Nadal, Steffi Graf, Tomas..."
1,https://www.channelnewsasia.com/sport/future-l...,"Manchester United, Ajax Amsterdam, Barcelona, ...","London, Wembley.",Old Trafford,"Cristiano Ronaldo, Erik ten Hag, Marcus Rashfo..."
2,https://www.channelnewsasia.com/sport/clinical...,"Manchester United, Newcastle United, VAR","London, Wembley, Saudi Arabia",Tyneside,"Casemiro, Sven Botman, Eddie Howe, Erik ten Ha..."
3,https://www.channelnewsasia.com/sport/klinsman...,"South Korea Football Association (KFA), Bayern...","South Korea, Germany, United States, Canada, M...",the United States,"Juergen Klinsmann, Paulo Bento, Gus Hiddink"
4,https://www.channelnewsasia.com/sport/spurs-be...,"Tottenham Hotspur, Chelsea",London,London,"Oliver Skipp, Harry Kane, Enzo Fernandez, Kepa..."
5,https://www.channelnewsasia.com/sport/psg-romp...,"Paris St Germain, Olympique de Marseille, RC L...",France,Velodrome,"Lionel Messi, Kylian Mbappe, Neymar, Edinson C..."
6,https://www.channelnewsasia.com/sport/la-liga-...,"Barcelona, Real Madrid, Atletico Madrid, Europ...",Spain,Almeria,"El Bilal Toure, Luis Suarez, Leo Baptistao, Ma..."
7,https://www.channelnewsasia.com/sport/inter-ti...,"Inter Milan, Napoli, Empoli, AC Milan, AS Roma...",Italy,Stadio Renato Dall'Ara,"Riccardo Orsolini, Musa Barrow, Roberto Sorian..."
8,https://www.channelnewsasia.com/sport/ighalo-h...,"Asian Champions League, Al Hilal, Al Duhail, U...","Qatar, Doha, Saudi Arabia, Japan",Al Thumama Stadium,"Odion Ighalo, Moussa Marega, Salem Al Dawsari,..."
9,https://www.channelnewsasia.com/sport/rashford...,"Manchester United, Newcastle United, Carabao Cup","Manchester United, Newcastle, and West Ham United",Old Trafford,"Marcus Rashford, Casemiro, Sven Botman"


In [16]:
# Left-join the gold truth onto the extracted entities by article URL;
# articles without gold truth get NaN in the gold columns.
eval_cna = cna.set_index('url').join(cna_gold.set_index('url'), how='left')
print(eval_cna.shape)
eval_cna.head()

(139, 13)


Unnamed: 0_level_0,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location,organisation,gpe,location,person
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://www.channelnewsasia.com/sport/abde-goal-gives-osasuna-first-leg-win-against-bilbao-copa-del-rey-3317026,"PAMPLONA, Spain : Moroccan forward Ez Abde sco...",,"PAMPLONA, Spain, Bilbao, Barcelona, Pamplona","Abde, Osasuna","Athletic Bilbao, Real Madrid, Basque Country, ...","PAMPLONA,Real Madrid","Spain, Osasuna, Bilbao, Barcelona, Pamplona","Ez Abde, Abde","Athletic Bilbao, Iker Muniain",,,,
https://www.channelnewsasia.com/sport/ac-milan-owner-says-serie-should-steer-clear-outside-investment-3320416,LONDON: Italy's Serie A soccer league should a...,,"LONDON, Italy, Germany, Italian, New York","Serie, Milan, Gerry Cardinale, Serie A","AC, RedBird Capital, Financial Times Business,...","Serie A,AC Milan,RedBird Capital,Serie A's,the...","LONDON, Italy, Germany, Bundesliga",Gerry Cardinale,,,,,
https://www.channelnewsasia.com/sport/all-blacks-coach-foster-says-wont-re-apply-job-post-world-cup-3314316,New Zealand head coach Ian Foster said on Wedn...,,"New Zealand, France","Ian Foster, Foster","Rugby, New Zealand Rugby, NZR, All Blacks","the Rugby World Cup,NZR","New Zealand, France","Ian Foster, Foster",,,,,
https://www.channelnewsasia.com/sport/alonso-privileged-have-raced-longer-piastris-age-3320381,MANAMA: Oscar Piastri was not yet born when Fe...,,"MANAMA, Bahrain, Australian, Melbourne, Piastri","Oscar Piastri, Fernando Alonso, Aston, Martin,...","Formula One, Sakhir, FIA, Fernando Alonso, Fer...","Formula One,Alonso's,Renault,Alpine,Fernando,F...","Bahrain, Melbourne, US","Oscar Piastri, Fernando Alonso, Aston Martin, ...",,,,,
https://www.channelnewsasia.com/sport/argentina-schedule-first-home-friendlies-after-world-cup-triumph-3320061,BUENOS AIRES: Argentina will play their first ...,,"Argentina, Santiago, France, Qatar","Panama, Curacao, Estero, Lionel Messi, Lionel ...","Lionel Scaloni, Monumental, Madre, Argentine, AFA",AFA,"Argentina, Panama, Curacao, France, Qatar",Lionel Messi,,,,,


In [17]:
# Gold truth was not generated for every article, so keep only the rows
# where all four gold-truth columns are present.
eval_cna2 = eval_cna.dropna(
    subset=['organisation', 'gpe', 'location', 'person'],
    how='any',
)
print(eval_cna2.shape)
eval_cna2.head()

(48, 13)


Unnamed: 0_level_0,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location,organisation,gpe,location,person
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://www.channelnewsasia.com/sport/australias-starc-still-feels-discomfort-finger-eyes-return-third-test-3308601,BENGALURU :Australia pace man Mitchell Starc s...,,"Australia, India, Indore, Cummins, Indian, New...","Starc, Pat Cummins, David Warner, Cameron Gree...",Mitchell,,"Australia, India, New Delhi","Mitchell Starc, Pat Cummins, David Warner, Sta...",,"Australia cricket team, India cricket team","India, Australia","Bengaluru, Indore, New Delhi","Mitchell Starc, Pat Cummins, David Warner, Cam..."
https://www.channelnewsasia.com/sport/besiktas-fans-throw-toys-field-children-affected-earthquake-3308381,Besiktas fans threw thousands of stuffed toys ...,,"Antalyaspor, Turkey, Ghana","Besiktas, Christian Atsu","Turkish Super Lig, Vodafone Park","Besiktas,The Turkish Super Lig","Turkey, Syria","Ghana, Christian Atsu",,Besiktas,"Antalyaspor, Ghana, Vodafone Park, Turkey, Syria","Turkey, Syria",Christian Atsu
https://www.channelnewsasia.com/sport/brathwaite-braced-south-africa-challenge-first-test-3309491,West Indies captain Kraigg Brathwaite believes...,"South Africa,Caribbean","West, Zimbabwe, Australia, South Africa, Brath...","Kraigg Brathwaite, Brathwaite","Indies, Centurion Park",,"South Africa, Zimbabwe, Australia, Gqeberha","Kraigg Brathwaite, Brathwaite","West Indies, Caribbean","West Indies cricket team, South Africa cricket...",West Indies,"Centurion Park, South Africa, Zimbabwe, Austra...",Kraigg Brathwaite
https://www.channelnewsasia.com/sport/brathwaite-braced-south-africa-challenge-first-test-3309491,West Indies captain Kraigg Brathwaite believes...,"South Africa,Caribbean","West, Zimbabwe, Australia, South Africa, Brath...","Kraigg Brathwaite, Brathwaite","Indies, Centurion Park",,"South Africa, Zimbabwe, Australia, Gqeberha","Kraigg Brathwaite, Brathwaite","West Indies, Caribbean","West Indies cricket team, South Africa cricket...",West Indies,"Centurion Park, South Africa, Zimbabwe, Austra...",Kraigg Brathwaite
https://www.channelnewsasia.com/sport/british-teenager-brookes-becomes-youngest-world-champion-3310191,Britain's Mia Brookes outclassed Olympic gold ...,,"Britain, Bakuriani, Georgia, Briton, Brooks, N...","Zoi, Brooks, Brookes","Mia Brookes, FIS, Freestyle Ski, Beijing Olymp...","FIS,Briton,Brooks,Sadowski-Synnott,Onitsuka Mi...","Britain, Bakuriani, Georgia, New Zealand's, Japan","Mia Brookes, Zoi Sadowski-Synnott, Brooks",,FIS (International Ski Federation),"Britain, New Zealand, Japan","Bakuriani, Georgia, Beijing (implicitly mentio...","Mia Brookes, Zoi Sadowski-Synnott, Onitsuka Mi..."


#### NLTK

Location

In [18]:
# Gold-truth locations: flatten the comma-separated strings of all evaluated
# articles into one list of names with leading whitespace removed.
true_loc = eval_cna2["location"].values.tolist()

new_true_loc = [
    name.lstrip()
    for cell in true_loc
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_true_loc)

['Bengaluru', 'Indore', 'New Delhi', 'Turkey', 'Syria', 'Centurion Park', 'South Africa', 'Zimbabwe', 'Australia', 'Gqeberha', 'Centurion Park', 'South Africa', 'Zimbabwe', 'Australia', 'Gqeberha', 'Bakuriani', 'Georgia', 'Beijing (implicitly mentioned)', 'Canada', 'Munich', 'Paris', 'VfB Stuttgart', 'Tyneside', 'London', 'north London', 'Dubai', 'Melbourne', 'Rotterdam', 'Doha', 'Adelaide', 'Melbourne Park', 'Dubai', 'Adelaide', 'Melbourne Park', 'Dubai', 'Bristol', 'England', 'Leicester', 'Blackburn', 'Fulham', 'Leeds', 'Brighton and Hove', 'Stoke', 'Manchester', 'West Ham', 'United States', 'Singapore', 'Japan', 'Italy', 'Netherlands', 'Belgium', 'Hungary', 'Britain', 'Austria', 'Canada', 'Spain', 'Monaco', 'Emilia Romagna', 'Miami', 'Qatar', 'France', 'China', 'Saudi Arabia', 'Azerbaijan', 'Australia', 'Bahrain', 'Las Vegas', 'Abu Dhabi.', 'Paris', 'France', 'Norway', 'Denmark', 'Uruguay', 'England', 'Australia', 'New Zealand', 'Old Trafford', 'Manchester', 'Bristol', 'Manchester',

In [19]:
# NLTK locations: flatten the comma-separated strings of all evaluated
# articles into one list of names with leading whitespace removed.
nltk_loc = eval_cna2['nltk_location'].values.tolist()

new_nltk_loc = [
    name.lstrip()
    for cell in nltk_loc
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_nltk_loc)

['South Africa', 'Caribbean', 'South Africa', 'Caribbean', 'Bayern', 'West Ham United', 'South Korean', 'Southeast', 'West', 'West Ham United', 'West Ham United']


In [20]:
# NLTK precision, recall and F1 score for LOCATION entities.
# NOTE(review): matching is pooled over all articles (both lists are flat),
# and every predicted occurrence found in the gold list counts as a hit, so
# repeated names are counted more than once.
correct = sum(1 for x in new_nltk_loc if x in new_true_loc)

# Guard every ratio: an empty list or zero hits gives 0.0 instead of raising
# ZeroDivisionError.
nltk_loc_recall = correct/len(new_true_loc) if new_true_loc else 0.0
nltk_loc_precision = correct/len(new_nltk_loc) if new_nltk_loc else 0.0
if nltk_loc_precision + nltk_loc_recall:
    nltk_loc_f1_score = 2*((nltk_loc_precision * nltk_loc_recall)/ (nltk_loc_precision + nltk_loc_recall))
else:
    nltk_loc_f1_score = 0.0

print("Recall = ", nltk_loc_recall)
print()
print("Precision = ", nltk_loc_precision)
print()
print("f1 score = ", nltk_loc_f1_score)

Recall =  0.01282051282051282

Precision =  0.18181818181818182

f1 score =  0.02395209580838323


GPE

In [21]:
# Gold-truth GPEs: flatten the comma-separated strings of all evaluated
# articles into one list of names with leading whitespace removed.
true_gpe = eval_cna2["gpe"].values.tolist()

new_true_gpe = [
    name.lstrip()
    for cell in true_gpe
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_true_gpe)

['India', 'Australia', 'Antalyaspor', 'Ghana', 'Vodafone Park', 'Turkey', 'Syria', 'West Indies', 'West Indies', 'Britain', 'New Zealand', 'Japan', 'SheBelievesCup', 'Germany', 'France', 'London', 'Wembley', 'Saudi Arabia', 'Premier League', 'Netherlands', 'Serbia', 'Czech Republic', 'Serbia', 'Czech Republic', 'Premier League', 'Championship', 'Bahrain', 'Las Vegas', 'China', 'France', 'Qatar', 'Australia', 'Azerbaijan', 'Miami', 'Emilia Romagna', 'Monaco', 'Spain', 'Canada', 'Austria', 'Britain', 'Hungary', 'Belgium', 'Netherlands', 'Italy', 'Singapore', 'Japan', 'United States', 'Mexico', 'Brazil', 'Abu Dhabi', 'Jeddah', 'Montreal', 'Spielberg', 'Silverstone', 'Spa-Francorchamps', 'Zandvoort', 'Interlagos', 'Suzuka', 'and Monza.', 'France', 'London', 'Wembley.', 'England', 'England', 'Sentosa Golf Club', 'Qatar', 'Doha', 'Saudi Arabia', 'Japan', 'Italy', 'Spain', 'Fair Trade Commission', 'Dentsu', 'Hakuhodo DY Holdings Inc', 'Tokyu Agency Inc', 'Fuji Creative Corporation', 'Cerespo 

In [22]:
# NLTK GPEs: flatten the comma-separated strings of all evaluated articles
# into one list of names with leading whitespace removed.
nltk_gpe = eval_cna2['nltk_gpe'].values.tolist()

new_nltk_gpe = [
    name.lstrip()
    for cell in nltk_gpe
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_nltk_gpe)

['Australia', 'India', 'Indore', 'Cummins', 'Indian', 'New Delhi', 'Antalyaspor', 'Turkey', 'Ghana', 'West', 'Zimbabwe', 'Australia', 'South Africa', 'Brathwaite', 'Discipline', 'West', 'Zimbabwe', 'Australia', 'South Africa', 'Brathwaite', 'Discipline', 'Britain', 'Bakuriani', 'Georgia', 'Briton', 'Brooks', 'New Zealand', 'Japan', 'Canada', 'Bontis', 'Canadian', 'MUNICH', 'Germans', 'Paris', 'Bayern', 'Europe', 'French', 'Messi', 'Vitinha', 'Dutch', 'Newcastle', 'Saudi', 'United', 'English', 'Tottenham', 'London', 'Italian', 'Italy', 'Australian', 'Djokovic', 'Netherlands', 'Doha', 'Italian', 'American', 'Botic', 'Dubai', 'Djokovic', 'Dubai', 'Djokovic', 'BRISTOL', 'England', 'Foden', 'Bristol', 'Fulham', 'Brighton', 'Bristol City', 'Haaland', 'European', 'Palhinha', 'Israeli Solomon', 'Bahrain', 'Abu Dhabi', 'U.S.', 'China', 'April', 'France', 'Qatar', 'Azerbaijan', 'Belgium', 'Brazil', 'Calendar', 'Sakhir', 'Saudi Arabia', 'Jeddah', 'Australia', 'Spain', 'Barcelona', 'Canada', 'Aust

In [23]:
# NLTK precision, recall and F1 score for GPE entities.
# NOTE(review): matching is pooled over all articles (both lists are flat),
# and every predicted occurrence found in the gold list counts as a hit, so
# repeated names are counted more than once.
correct = sum(1 for x in new_nltk_gpe if x in new_true_gpe)

# Guard every ratio: an empty list or zero hits gives 0.0 instead of raising
# ZeroDivisionError.
nltk_gpe_recall = correct/len(new_true_gpe) if new_true_gpe else 0.0
nltk_gpe_precision = correct/len(new_nltk_gpe) if new_nltk_gpe else 0.0
if nltk_gpe_precision + nltk_gpe_recall:
    nltk_gpe_f1_score = 2*((nltk_gpe_precision * nltk_gpe_recall)/ (nltk_gpe_precision + nltk_gpe_recall))
else:
    nltk_gpe_f1_score = 0.0

print("Recall = ", nltk_gpe_recall)
print()
print("Precision = ", nltk_gpe_precision)
print()
print("f1 score = ", nltk_gpe_f1_score)

Recall =  0.8174603174603174

Precision =  0.3344155844155844

f1 score =  0.4746543778801842


Person

In [24]:
# Gold-truth persons: flatten the comma-separated strings of all evaluated
# articles into one list of names with leading whitespace removed.
true_per = eval_cna2["person"].values.tolist()

new_true_per = [
    name.lstrip()
    for cell in true_per
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_true_per)

['Mitchell Starc', 'Pat Cummins', 'David Warner', 'Cameron Green', 'Steve Smith', 'Srikar Bharat', 'Christian Atsu', 'Kraigg Brathwaite', 'Kraigg Brathwaite', 'Mia Brookes', 'Zoi Sadowski-Synnott', 'Onitsuka Miyabi', 'Nick Bontis', 'Eric-Maxim Choupo-Moting', 'Serge Gnabry', 'Leon Goretzka', 'Lionel Messi', 'Kylian Mbappe', 'Julian Nagelsmann', 'Neymar', 'Gianluigi Donnarumma', 'Jamal Musiala', 'Marquinhos', 'Vitinha', 'Matthijs de Ligt', 'Sergio Ramos', 'Christophe Galtier', 'Casemiro', 'Sven Botman', 'Eddie Howe', 'Erik ten Hag', 'Jose Mourinho', 'Bruno Fernandes', 'Luke Shaw', 'Loris Karius', 'Marcus Rashford.', 'Antonio Conte', 'Novak Djokovic', 'Tomas Machac', 'Steffi Graf', 'Tallon Griekspoor', 'Daniil Medvedev', 'Matteo Arnaldi', 'Felix Auger-Aliassime', 'Maxime Cressy', 'Karen Khachanov', 'Botic van de Zandschulp', 'Novak Djokovic', 'Rafa Nadal', 'Steffi Graf', 'Tomas Machac', 'Novak Djokovic', 'Rafa Nadal', 'Steffi Graf', 'Tomas Machac', 'Phil Foden', 'Pep Guardiola', 'Kevin D

In [25]:
# NLTK persons: flatten the comma-separated strings of all evaluated articles
# into one list of names with leading whitespace removed.
nltk_per = eval_cna2['nltk_person'].values.tolist()

new_nltk_per = [
    name.lstrip()
    for cell in nltk_per
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_nltk_per)

['Starc', 'Pat Cummins', 'David Warner', 'Cameron Green', 'Steve Smith', 'Srikar Bharat', 'Bharat', 'Besiktas', 'Christian Atsu', 'Kraigg Brathwaite', 'Brathwaite', 'Kraigg Brathwaite', 'Brathwaite', 'Zoi', 'Brooks', 'Brookes', 'Soccer', 'Nick Bontis', 'Canada Soccer', 'Bontis', 'Bayern Munich', 'Serge Gnabry', 'Paris St Germain', 'Leon Goretzka', 'Gnabry', 'Lionel Messi', 'Kylian Mbappe', 'Julian Nagelsmann', 'Neymar', 'Mbappe', 'Jamal Musiala', 'Gianluigi Donnarumma', 'Marquinhos', 'Yann Sommer', 'Matthijs', 'Sergio Ramos', 'Christophe Galtier', 'United', 'Casemiro', 'Sven Botman', 'Eddie Howe', 'Erik', 'Hag', 'Jose Mourinho', 'Bruno Fernandes', 'Brazilian Casemiro', 'Luke Shaw', 'Loris Karius', 'Marcus Rashford', 'Karius', 'Dutchman Ten Hag', 'Antonio Conte', 'Conte', 'Juventus', 'Chelsea', 'Tottenham', 'Wolverhampton Wanderers', 'Milan', 'Novak', 'Djokovic', 'Czech', 'Tomas Machac', 'Dubai Tennis', 'Steffi Graf', 'Machac', 'Tomas', 'Tallon Griekspoor', 'Rotterdam', 'Daniil Medvedev

In [26]:
# NLTK precision, recall and F1 score for PERSON entities.
# NOTE(review): matching is pooled over all articles (both lists are flat),
# and every predicted occurrence found in the gold list counts as a hit, so
# repeated names are counted more than once.
correct = sum(1 for x in new_nltk_per if x in new_true_per)

# Guard every ratio: an empty list or zero hits gives 0.0 instead of raising
# ZeroDivisionError.
nltk_per_recall = correct/len(new_true_per) if new_true_per else 0.0
nltk_per_precision = correct/len(new_nltk_per) if new_nltk_per else 0.0
if nltk_per_precision + nltk_per_recall:
    nltk_per_f1_score = 2*((nltk_per_precision * nltk_per_recall)/ (nltk_per_precision + nltk_per_recall))
else:
    nltk_per_f1_score = 0.0

print("Recall = ", nltk_per_recall)
print()
print("Precision = ", nltk_per_precision)
print()
print("f1 score = ", nltk_per_f1_score)

Recall =  0.7689393939393939

Precision =  0.4134419551934827

f1 score =  0.5377483443708608


Org

In [27]:
# Gold-truth organisations: flatten the comma-separated strings of all
# evaluated articles into one list of names with leading whitespace removed.
true_org = eval_cna2["organisation"].values.tolist()

new_true_org = [
    name.lstrip()
    for cell in true_org
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_true_org)

['Australia cricket team', 'India cricket team', 'Besiktas', 'West Indies cricket team', 'South Africa cricket team', 'West Indies cricket team', 'South Africa cricket team', 'FIS (International Ski Federation)', 'Canada Soccer', "men's and women's national teams", "players' association", 'Bayern Munich', 'Paris St Germain', 'Manchester United', 'Newcastle United', 'VAR', 'Tottenham Hotspur', 'AC Milan', 'Juventus', 'Inter Milan', 'Chelsea', 'ATP Tour', 'Dubai Tennis Championships', 'ATP Tour', 'ATP Tour', 'Manchester City', 'Leicester City', 'Blackburn Rovers', 'Fulham', 'Leeds United', 'Brighton and Hove Albion', 'West Ham United', 'Formula One', 'French Football Federation (FFF)', 'Manchester United', 'Ajax Amsterdam', 'Barcelona', 'Brentford', 'Manchester City', 'American Glazer family.', 'Manchester City', 'Manchester City', "HSBC Women's World Championship", 'Asian Champions League', 'Al Hilal', 'Al Duhail', 'Urawa Red Diamonds', 'Manchester United', 'J.League', 'Porto', 'Inter M

In [28]:
# NLTK organisations: flatten the comma-separated strings of all evaluated
# articles into one list of names with leading whitespace removed.
nltk_org = eval_cna2['nltk_org'].values.tolist()

new_nltk_org = [
    name.lstrip()
    for cell in nltk_org
    if len(cell) > 1
    for name in cell.split(",")
    if name != ''
]

print(new_nltk_org)

['Mitchell', 'Turkish Super Lig', 'Vodafone Park', 'Indies', 'Centurion Park', 'Indies', 'Centurion Park', 'Mia Brookes', 'FIS', 'Freestyle Ski', 'Beijing Olympics', 'Onitsuka Miyabi', 'SheBelievesCup', 'National Team', 'Champions League', 'PSG', 'VfB Stuttgart', 'Sommer', 'Spaniard', 'Newcastle United', 'League', 'Manchester United', 'Europa League', 'VAR', 'Newcastle', 'Tyneside', 'Champions League', 'Tottenham Hotspur', 'AC Milan', 'Inter Milan', 'Conte', 'Amazon', 'Europa Conference League', 'Sheffield United', 'ATP', 'Czech', 'Melbourne', 'ATP Tour', 'Adelaide', 'ATP Tour', 'Adelaide', 'Blackburn Rovers', 'Sammie Szmodics', 'Kelechi Iheanacho', 'City', 'Blackburn', 'Leicester', 'Vegas', 'Melbourne', 'Baku', 'Imola', 'Montreal', 'Spielberg', 'Zandvoort', 'Interlagos', 'FFF', 'Sports', 'Kadidiatou Diani', 'RMC Sport', 'Euro', 'European', 'League', 'Newcastle United', 'Wembley', 'Old Trafford', 'Ajax Amsterdam', 'American Glazer', 'Europa League', 'Brentford', 'Ashton Gate', 'League'

In [29]:
#nltk precision, recall, f1 score of organisation entity
# A prediction is counted as correct when its exact string appears anywhere in
# new_true_org; repeated predictions are each counted.
# NOTE(review): recall reuses this prediction-side count, so duplicated
# predictions can inflate it -- confirm this is the intended definition.
correct = 0

for x in new_nltk_org:
    if x in new_true_org:
        correct+=1

# Every ratio is guarded: an empty list or zero matches yields 0.0 instead of
# raising ZeroDivisionError and aborting the notebook run.
nltk_org_recall = correct/len(new_true_org) if len(new_true_org) else 0.0
nltk_org_precision = correct/len(new_nltk_org) if len(new_nltk_org) else 0.0
if nltk_org_precision + nltk_org_recall:
    nltk_org_f1_score = 2*((nltk_org_precision * nltk_org_recall)/ (nltk_org_precision + nltk_org_recall))
else:
    # Both precision and recall are 0, so the harmonic mean is taken to be 0.
    nltk_org_f1_score = 0.0

print("Recall = ", nltk_org_recall)
print()
print("Precision = ", nltk_org_precision)
print()
print("f1 score = ", nltk_org_f1_score)

Recall =  0.3815789473684211

Precision =  0.25892857142857145

f1 score =  0.30851063829787234


Average recall, precision & f1 score

In [30]:
# Unweighted mean of each NLTK score over the four entity types
# (organisation, person, GPE, location).
avg_recall = (
    nltk_org_recall + nltk_per_recall + nltk_gpe_recall + nltk_loc_recall
) / 4
avg_precision = (
    nltk_org_precision + nltk_per_precision + nltk_gpe_precision + nltk_loc_precision
) / 4
avg_f1_score = (
    nltk_org_f1_score + nltk_per_f1_score + nltk_gpe_f1_score + nltk_loc_f1_score
) / 4

# end="\n\n" emits the same blank separator line a bare print() would.
print("Recall = ", avg_recall, end="\n\n")
print("Precision = ", avg_precision, end="\n\n")
print("f1 score = ", avg_f1_score)

Recall =  0.4951997928971613

Precision =  0.2971510732139551

f1 score =  0.33621636408932515


#### spaCy

In [31]:
# locations extracted using spaCy (one entry per row of eval_cna2)
spacy_loc = eval_cna2['spacy_location'].values.tolist()

# Flatten the comma-separated rows into one list; zero-length rows are skipped
# and each fragment is stripped of surrounding whitespace.
new_spacy_loc = [
    place.strip()
    for row in spacy_loc
    if len(row) != 0
    for place in row.split(",")
]

print(new_spacy_loc)

['West Indies', 'Caribbean', 'West Indies', 'Caribbean', 'Europe', 'Erling Haaland', 'West Ham United', 'Asia', 'Al Thumama Stadium', 'The Isle of Man', "the Isle of Man's", 'the SEA Games', 'Indian Wells', 'Europe', 'Indian Wells', 'West Indies', 'the West Indies', 'Europe', 'Europe', 'West Ham United', 'West Ham United', 'Bealham', 'Strip']


In [32]:
#spacy precision, recall, f1 score of locations
# A prediction is counted as correct when its exact string appears anywhere in
# new_true_loc; repeated predictions are each counted.
# NOTE(review): recall reuses this prediction-side count, so duplicated
# predictions can inflate it -- confirm this is the intended definition.
correct = 0

for x in new_spacy_loc:
    if x in new_true_loc:
        correct+=1

# Every ratio is guarded: an empty list or zero matches yields 0.0 instead of
# raising ZeroDivisionError and aborting the notebook run.
spacy_loc_recall = correct/len(new_true_loc) if len(new_true_loc) else 0.0
spacy_loc_precision = correct/len(new_spacy_loc) if len(new_spacy_loc) else 0.0
if spacy_loc_precision + spacy_loc_recall:
    spacy_loc_f1_score = 2*((spacy_loc_precision * spacy_loc_recall)/ (spacy_loc_precision + spacy_loc_recall))
else:
    # Both precision and recall are 0, so the harmonic mean is taken to be 0.
    spacy_loc_f1_score = 0.0

print("Recall = ", spacy_loc_recall)
print()
print("Precision = ", spacy_loc_precision)
print()
print("f1 score = ", spacy_loc_f1_score)

Recall =  0.04487179487179487

Precision =  0.30434782608695654

f1 score =  0.0782122905027933


GPE

In [33]:
# GPE extracted using spaCy (one entry per row of eval_cna2)
spacy_gpe = eval_cna2['spacy_gpe'].values.tolist()

# Flatten the comma-separated rows into one list; rows of length <= 1 are
# skipped and each fragment is stripped of surrounding whitespace.
new_spacy_gpe = [
    gpe.strip()
    for row in spacy_gpe
    if len(row) > 1
    for gpe in row.split(",")
]

print(new_spacy_gpe)

['Australia', 'India', 'New Delhi', 'Turkey', 'Syria', 'South Africa', 'Zimbabwe', 'Australia', 'Gqeberha', 'South Africa', 'Zimbabwe', 'Australia', 'Gqeberha', 'Britain', 'Bakuriani', 'Georgia', "New Zealand's", 'Japan', 'Canada', 'Paris', 'Neymar', 'Messi', 'LONDON', "Newcastle United's", 'Casemiro', 'Newcastle', 'Saudi Arabia', 'Europa League', 'LONDON', 'London', 'Tottenham', 'Italy', 'Amazon Prime', 'Sheffield United', 'Milan', 'Dubai', 'Netherlands', 'Doha', 'Melbourne', 'Dubai', 'Dubai', 'England', 'Manchester City', 'Bristol City', "Leicester City's", 'Leeds United', 'Stoke City', 'Bahrain', 'Abu Dhabi', 'Las Vegas', 'U.S.', 'China', 'France', 'Qatar', 'Azerbaijan', 'Austria', 'Belgium', 'Austin', 'Brazil', 'Jeddah', 'Melbourne', 'Baku', 'Imola', 'Barcelona', 'Silverstone', 'Suzuka', 'Interlagos', 'PARIS', 'France', 'Diacre', 'Katoto', 'Kadidiatou Diani', 'Norway', 'Denmark', 'Uruguay', 'England', 'Germany', 'Australia', 'New Zealand', 'LONDON', "Newcastle United's", 'Barcelona

In [34]:
#spacy precision, recall, f1 score of gpe
# A prediction is counted as correct when its exact string appears anywhere in
# new_true_gpe; repeated predictions are each counted.
# NOTE(review): recall reuses this prediction-side count, so duplicated
# predictions can inflate it -- confirm this is the intended definition.
correct = 0

for x in new_spacy_gpe:
    if x in new_true_gpe:
        correct+=1

# Every ratio is guarded: an empty list or zero matches yields 0.0 instead of
# raising ZeroDivisionError and aborting the notebook run.
spacy_gpe_recall = correct/len(new_true_gpe) if len(new_true_gpe) else 0.0
spacy_gpe_precision = correct/len(new_spacy_gpe) if len(new_spacy_gpe) else 0.0
if spacy_gpe_precision + spacy_gpe_recall:
    spacy_gpe_f1_score = 2*((spacy_gpe_precision * spacy_gpe_recall)/ (spacy_gpe_precision + spacy_gpe_recall))
else:
    # Both precision and recall are 0, so the harmonic mean is taken to be 0.
    spacy_gpe_f1_score = 0.0

print("Recall = ", spacy_gpe_recall)
print()
print("Precision = ", spacy_gpe_precision)
print()
print("f1 score = ", spacy_gpe_f1_score)

Recall =  0.8174603174603174

Precision =  0.45982142857142855

f1 score =  0.5885714285714285


Person

In [35]:
# person extracted using spaCy (one entry per row of eval_cna2)
spacy_per = eval_cna2['spacy_person'].values.tolist()

# Flatten the comma-separated rows into one list of names.
new_spacy_per = []
for row in spacy_per:
    # Rows of length <= 1 contribute nothing.
    if len(row) <= 1:
        continue
    new_spacy_per.extend(name.strip() for name in row.split(","))

print(new_spacy_per)

['Mitchell Starc', 'Pat Cummins', 'David Warner', 'Starc', 'Cameron Green', 'Cummins', 'Steve Smith', 'Srikar Bharat', 'Ghana', 'Christian Atsu', 'Kraigg Brathwaite', 'Brathwaite', 'Kraigg Brathwaite', 'Brathwaite', 'Mia Brookes', 'Zoi Sadowski-Synnott', 'Brooks', 'Nick Bontis', 'Bayern Munich', 'Eric-Maxim Choupo-Moting', 'Serge Gnabry', "Leon Goretzka's", 'Gnabry', 'Messi', 'Kylian Mbappe', 'Julian Nagelsmann', "Jamal Musiala's", 'Gianluigi Donnarumma', 'Marquinhos', 'Yann Sommer', 'Vitinha', "Sergio Ramos's", 'Sommer', 'Eddie Howe', "Erik ten Hag's", 'Jose Mourinho', 'Bruno Fernandes', 'Brazilian Casemiro', 'Luke Shaw', 'Loris Karius', 'Marcus Rashford', "Antonio Conte's", 'Conte', 'Wolverhampton Wanderers', 'Novak Djokovic', 'Tomas Machac', 'Djokovic', 'Rotterdam', 'Daniil Medvedev', 'Felix Auger-Aliassime', 'Karen Khachanov', 'Botic van de Zandschulp', 'Novak Djokovic', 'Rafa Nadal', 'Tomas Machac', 'Novak Djokovic', 'Rafa Nadal', 'Tomas Machac', "Phil Foden's", 'Kevin De Bruyne',

In [36]:
#spacy precision, recall, f1 score for person
# A prediction is counted as correct when its exact string appears anywhere in
# new_true_per; repeated predictions are each counted.
# NOTE(review): recall reuses this prediction-side count, so duplicated
# predictions can inflate it -- confirm this is the intended definition.
correct = 0

for x in new_spacy_per:
    if x in new_true_per:
        correct+=1

# Every ratio is guarded: an empty list or zero matches yields 0.0 instead of
# raising ZeroDivisionError and aborting the notebook run.
spacy_per_recall = correct/len(new_true_per) if len(new_true_per) else 0.0
spacy_per_precision = correct/len(new_spacy_per) if len(new_spacy_per) else 0.0
if spacy_per_precision + spacy_per_recall:
    spacy_per_f1_score = 2*((spacy_per_precision * spacy_per_recall)/ (spacy_per_precision + spacy_per_recall))
else:
    # Both precision and recall are 0, so the harmonic mean is taken to be 0.
    spacy_per_f1_score = 0.0

print("Recall = ", spacy_per_recall)
print()
print("Precision = ", spacy_per_precision)
print()
print("f1 score = ", spacy_per_f1_score)

Recall =  0.6477272727272727

Precision =  0.5377358490566038

f1 score =  0.5876288659793815


Organisation

In [37]:
# Organisations extracted using spaCy (one entry per row of eval_cna2)
spacy_org = eval_cna2['spacy_org'].values.tolist()

# Flatten the comma-separated rows into one list; rows of length <= 1 are
# skipped and each fragment is stripped of surrounding whitespace.
new_spacy_org = [
    org.strip()
    for row in spacy_org
    if len(row) > 1
    for org in row.split(",")
]

print(new_spacy_org)

['Besiktas', 'The Turkish Super Lig', 'FIS', 'Briton', 'Brooks', 'Sadowski-Synnott', 'Onitsuka Miyabi', "Women's National Team", 'Choupo-Moting', 'Mbappe', 'Matthijs de Ligt', 'Bayern', 'Manchester United', 'Sven Botman', 'United', 'Botman', 'Wembley', 'Bruno Fernandes', 'Champions League', 'AC Milan', 'Juventus', 'Inter Milan', 'Chelsea', 'the Premier League', 'Tottenham', 'the Europa Conference League', "Steffi Graf's", 'Machac', 'American Maxime Cressy', 'the ATP Tour', 'Adelaide', 'the ATP Tour', 'Adelaide', 'BRISTOL', "Pep Guardiola's", 'Sammie Szmodics', 'Kelechi Iheanacho', 'Fulham', "Brighton and Hove Albion's", 'City', 'Bristol', 'Championship', 'Cottagers', 'the Premier League', 'League Cup', 'Noel Le Graet', 'FFF', 'the Sports ministry', 'Le Graet', "Women's World Cup", 'RMC Sport', 'Renard', 'Clermont', 'the European Championship', "The Women's World Cup", "Manchester United's", 'Ajax Amsterdam', 'Old Trafford', 'Cristiano Ronaldo', 'the Europa League', 'the Premier League'

In [38]:
#spacy precision, recall, f1 score of organisations
# A prediction is counted as correct when its exact string appears anywhere in
# new_true_org; repeated predictions are each counted.
# NOTE(review): recall reuses this prediction-side count, so duplicated
# predictions can inflate it -- confirm this is the intended definition.
correct = 0

for x in new_spacy_org:
    if x in new_true_org:
        correct+=1

# Every ratio is guarded: an empty list or zero matches yields 0.0 instead of
# raising ZeroDivisionError and aborting the notebook run.
spacy_org_recall = correct/len(new_true_org) if len(new_true_org) else 0.0
spacy_org_precision = correct/len(new_spacy_org) if len(new_spacy_org) else 0.0
if spacy_org_precision + spacy_org_recall:
    spacy_org_f1_score = 2*((spacy_org_precision * spacy_org_recall)/ (spacy_org_precision + spacy_org_recall))
else:
    # Both precision and recall are 0, so the harmonic mean is taken to be 0.
    spacy_org_f1_score = 0.0

print("Recall = ", spacy_org_recall)
print()
print("Precision = ", spacy_org_precision)
print()
print("f1 score = ", spacy_org_f1_score)

Recall =  0.28289473684210525

Precision =  0.19457013574660634

f1 score =  0.23056300268096513


Average precision, recall, f1-score

In [39]:
# Unweighted mean of each spaCy score over the four entity types
# (organisation, person, GPE, location).
avg_recall = (
    spacy_org_recall + spacy_per_recall + spacy_gpe_recall + spacy_loc_recall
) / 4
avg_precision = (
    spacy_org_precision + spacy_per_precision + spacy_gpe_precision + spacy_loc_precision
) / 4
avg_f1_score = (
    spacy_org_f1_score + spacy_per_f1_score + spacy_gpe_f1_score + spacy_loc_f1_score
) / 4

# end="\n\n" emits the same blank separator line a bare print() would.
print("Recall = ", avg_recall, end="\n\n")
print("Precision = ", avg_precision, end="\n\n")
print("f1 score = ", avg_f1_score)

Recall =  0.4482385304753726

Precision =  0.3741188098653988

f1 score =  0.3712438969336421
