In [1]:
import pandas as pd

import re

import nltk
from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

## Definitions

In [2]:
# extracting words with labels: person, gpe, location and organisation
def extract_ne_from_tree ( tree ):
    result = []
    for s in tree.subtrees():
        label = s.label()
        if (label == 'PERSON' or label == 'GPE' or label == 'LOCATION' or label == 'ORGANIZATION'):
            leaves = s.leaves()
            ne = ''
            for l in leaves:
                ne = ne + ' ' + l[0]
            result.append((label, ne[1:]))
    return result

In [3]:
#tokenise, pos tagging, extracting words with labels above & place them into a list
def ner_info_extraction(content ,content_list):
    for x in content:
        res_sents = nltk.sent_tokenize(x)
        res_tokens = [word_tokenize(sents) for sents in res_sents]
        tagged_res = [nltk.pos_tag(tokens) for tokens in res_tokens]
        res_tree = [nltk.ne_chunk(elem) for elem in tagged_res]
        res_list = [extract_ne_from_tree(tree) for tree in res_tree]
        content_list+=[(res_list)]

In [4]:
# Since cnn is present in all articles, it creates noise in the entities extracted
def ner_sorting(content ,location, gpe, person, org):
    for row in content:
        check_location = []
        check_gpe = []
        check_person = []
        check_org = []
        for sent in row:
            if len(sent)!=0:
                for tup in sent:
                    if tup[1] != 'CNN':
                        if tup[0] == "LOCATION":
                            if tup[1] not in check_location:
                                check_location.append(tup[1])
                        if tup[0] == "GPE":
                            if tup[1] not in check_gpe:
                                check_gpe.append(tup[1])
                        if tup[0] == "PERSON":
                            if tup[1] not in check_person:
                                check_person.append(tup[1])
                        if tup[0] == "ORGANIZATION":
                            if tup[1] not in check_org:
                                check_org.append(tup[1])
        location.append(check_location)
        gpe.append(check_gpe)
        person.append(check_person)
        org.append(check_org)

In [5]:
# Extraction of entities using spaCy
def spacy_ner(content):
    nlist = []
    for x in content:
        cont_doc = nlp(x)
        temp = []
        for ent in cont_doc.ents:
            if ent.label_=="ORG" or ent.label_=="PERSON" or ent.label_=="GPE" or ent.label_=="LOC":
                temp.append([ent.text, ent.label_])
        nlist.append(temp)
    return nlist

In [6]:
# Since cnn is present at the start of all articles, it creates noise in the entities extracted
def spacy_sorting(content):
    org = []
    gpe = []
    person = []
    location = []
    for row in content:
        check_org = []
        check_gpe = []
        check_person = []
        check_loc = []
        if len(row)!=0:
            for tup in row:
                if tup[0]!= "CNN":
                    if tup[1] == "ORG":
                        if tup[0] not in check_org:
                            check_org.append(tup[0])
                    if tup[1] == "GPE":
                        if tup[0] not in check_gpe:
                            check_gpe.append(tup[0])
                    if tup[1] == "PERSON":
                        if tup[0] not in check_person:
                            check_person.append(tup[0])
                    if tup[1] == "LOC":
                        if tup[0] not in check_loc:
                            check_loc.append(tup[0])
        org.append(check_org)
        gpe.append(check_gpe)
        person.append(check_person)
        location.append(check_loc)
    return [org, gpe, person, location]

## cnn news

In [7]:
import glob
import os

path = r'../2.text_preprocessing/cnn/cnn_FINAL.csv' # use your path

#Importing in the news articles
cnn = pd.read_csv(path)
cnn = cnn.loc[:,['url','text']]
print(cnn.shape)
cnn

(82, 2)


Unnamed: 0,url,text
0,https://edition.cnn.com/2023/02/28/sport/los-a...,During the Lakers' 27-point comeback victory o...
1,https://edition.cnn.com/2023/02/16/sport/lesle...,(CNN) It's while running in the moors and hill...
2,https://edition.cnn.com/2023/02/27/sport/damia...,(CNN) Damian Lillard set an NBA record in his ...
3,https://edition.cnn.com/2023/02/27/football/su...,(CNN) Betrayal has formed part of European pol...
4,https://edition.cnn.com/2023/02/22/football/pa...,(CNN) The Palestinian team has never taken par...
...,...,...
77,https://edition.cnn.com/2023/02/28/us/irv-cros...,"(CNN) Irv Cross, a former NFL star and broadca..."
78,https://edition.cnn.com/2023/03/01/football/pa...,(CNN) Paul Pogba made his long-awaited return ...
79,https://edition.cnn.com/2023/03/06/sport/enriq...,(CNN) Spanish hurdler Enrique Llopis had to be...
80,https://edition.cnn.com/2023/03/06/football/ma...,(CNN) Manchester United manager Erik ten Hag s...


NLTK extraction

In [8]:
# Extracting using NLTK with the help of defined function above
cnn_content = []

ner_info_extraction(cnn['text'],cnn_content)
    
print(cnn_content)

[[[('ORGANIZATION', 'Lakers'), ('ORGANIZATION', 'Dallas Mavericks'), ('ORGANIZATION', 'LeBron')], [], [('ORGANIZATION', 'Lakers'), ('PERSON', 'James')], [], [('PERSON', 'James'), ('ORGANIZATION', 'Memphis Grizzlies')], [], [('ORGANIZATION', 'Lakers')], [('ORGANIZATION', 'NBA')], [('PERSON', 'Dennis Schröder'), ('GPE', 'Los Angeles'), ('PERSON', 'Jonas Valančiūnas'), ('ORGANIZATION', 'New Orleans Pelicans')], [('ORGANIZATION', 'NBA')], [('PERSON', 'Rob Pelinka'), ('PERSON', 'James'), ('PERSON', 'Anthony Davis'), ('ORGANIZATION', 'Russell Westbrook')], [('PERSON', 'Patrick Beverley'), ('GPE', 'August'), ('ORGANIZATION', 'Lakers')], [('PERSON', 'James'), ('PERSON', 'Pelinka'), ('ORGANIZATION', 'NBA')], [('ORGANIZATION', 'Lakers')], [('LOCATION', 'Western Conference')], [('PERSON', 'Darvin Ham'), ('ORGANIZATION', 'Lakers')], [('ORGANIZATION', 'Lakers')], [('GPE', 'Midseason'), ('PERSON', 'Malik Beasley'), ('PERSON', 'Jarred Vanderbilt'), ('PERSON', 'Mo Bamba'), ('PERSON', 'Davon Reed'), ('

In [9]:
#sort words according to labelled entities
location_with_brac = []
gpe_with_brac = []
person_with_brac = []
org_with_brac = []

ner_sorting(cnn_content, location_with_brac, gpe_with_brac, person_with_brac, org_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

print(org_with_brac)

[['Western Conference'], ['Western Front'], ['Western Conference'], [], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West Asian'], [], [], [], [], ['Southern California'], ['South Carolina'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Southern Oregon'], [], [], ['South Africa'], [], ['Cardinals'], ['Southern California'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Caribbean'], [], [], [], [], [], [], [], [], [], [], ['South Korean'], [], [], [], [], [], [], [], [], [], [], [], [], [], []]

[['Los Angeles', 'August', 'Midseason', 'Dallas', 'James', 'Portland'], ['Scotland', 'Germany', 'Unbeknownst', 'Bäumer', 'Russian', 'Ukraine', 'German', 'French', 'Costa Rica', 'Birmingham', 'Alabama', 'Scottish', 'California', 'English'], ['Portland', 'Lillard'], ['European', 'Europe', 'Russian', 'American', 'Middle Eastern', 'British', 'Public', 'Zimbalist', 'Juventus', 'Turin', 'Barcelona', 'Justice'], ['P

In [10]:
# add into dataframe
cnn['location_with_brac'] = location_with_brac
cnn['gpe_with_brac'] = gpe_with_brac
cnn['person_with_brac'] = person_with_brac
cnn['org_with_brac'] = org_with_brac

cnn['nltk_location']=cnn['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['nltk_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_org'] = cnn['org_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'org_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,text,nltk_location,nltk_gpe,nltk_person,nltk_org
0,https://edition.cnn.com/2023/02/28/sport/los-a...,During the Lakers' 27-point comeback victory o...,Western Conference,"Los Angeles, August, Midseason, Dallas, James,...","James, Dennis Schröder, Jonas Valančiūnas, Rob...","Lakers, Dallas Mavericks, LeBron, Memphis Griz..."
1,https://edition.cnn.com/2023/02/16/sport/lesle...,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice..."
2,https://edition.cnn.com/2023/02/27/sport/damia...,(CNN) Damian Lillard set an NBA record in his ...,Western Conference,"Portland, Lillard","Damian Lillard, Lillard, Donovan Mitchell, Kla...","NBA, Houston Rockets, Chicago Bulls, Blazers, ..."
3,https://edition.cnn.com/2023/02/27/football/su...,(CNN) Betrayal has formed part of European pol...,,"European, Europe, Russian, American, Middle Ea...","Julius Caesar, Brute, Cards, Super League, Jef...","Betrayal, Roman Republic, European Super Leagu..."
4,https://edition.cnn.com/2023/02/22/football/pa...,(CNN) The Palestinian team has never taken par...,"West Bank,Gaza City,East Jerusalem,Gaza Strip,...","Palestinian, Palestine, Portugal, African, Isr...","Arab, Gaza, East Jerusalem, Rami, Arab Emirate...","Qatar, Atlas Lions, Hamadeh, Mohammed Bin Zaye..."


spacy

In [11]:
#extract entities using above defined function
cont = cnn['text'].tolist()

cnn_list = spacy_ner(cont)
    
print(cnn_list)

[[['Lakers', 'PERSON'], ['the Dallas Mavericks', 'ORG'], ['LeBron James', 'PERSON'], ['James looks', 'PERSON'], ['the Memphis Grizzlies', 'LOC'], ['NBA', 'ORG'], ['Dennis Schröder', 'PERSON'], ['Jonas Valančiūnas', 'GPE'], ['the New Orleans Pelicans', 'ORG'], ['NBA', 'ORG'], ['Rob Pelinka', 'PERSON'], ['James', 'PERSON'], ['Anthony Davis', 'PERSON'], ['Russell Westbrook', 'PERSON'], ['Patrick Beverley', 'PERSON'], ['Lakers', 'PERSON'], ['James', 'PERSON'], ['Pelinka', 'PERSON'], ['NBA', 'ORG'], ['Lakers', 'PERSON'], ['Lakers', 'PERSON'], ['Midseason', 'PERSON'], ['Malik Beasley', 'PERSON'], ["D'Angelo Russell", 'PERSON'], ['Jarred Vanderbilt', 'PERSON'], ['Davon Reed', 'PERSON'], ['Mavericks', 'ORG'], ['Dallas', 'GPE'], ['Jason Kidd', 'PERSON'], ['Vanderbilt', 'PERSON'], ['a*s', 'PERSON'], ['Jayson Tatum', 'PERSON'], ['Boston Celtics', 'ORG'], ['NBA', 'ORG'], ['Lakers', 'PERSON'], ['Lakers', 'PERSON'], ['Lakers', 'PERSON'], ['Portland Trail Blazers', 'ORG'], ['NBA', 'ORG']], [['CNN', '

In [12]:
# sort according to entities
sort = spacy_sorting(cnn_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])
print()

print(sort[3])

[['the Dallas Mavericks', 'NBA', 'the New Orleans Pelicans', 'Mavericks', 'Boston Celtics', 'Portland Trail Blazers'], ['BAFTA', 'XTERRA'], ['Damian Lillard', 'NBA', 'the Portland Trail Blazers', 'Houston', 'the Chicago Bulls', 'Blazers', 'Lillard', 'CBS Sports', 'the Phoenix Suns', 'The Trail Blazers'], ['Julius Caesar', 'European Super League', 'ESL', 'the Champions League', 'House of Cards', 'UEFA', 'Apple', 'Super League: The War for Football', 'CNN Sports', 'Juventus', 'European Club Association', 'Real Madrid', 'Paris Saint-Germain', 'the European Club Association', 'the Super League', 'the Premier League', 'Champions League', 'Super League', 'FIFA', 'Manchester United', 'PSG', 'the European Court of Justice', 'ECJ', 'European Union'], ['the Atlas Lions', 'State', 'AFC Asian Cup', 'CNN Sports', 'Tubas', 'the Israel Defense Forces', 'IDF', 'Reuters', 'Hamas', 'Al-Shifa', 'Islamic Jihad', 'the Palestinian Football Association', 'AP', 'FIFA', 'Henley & Partners', 'Visa', 'AFC', 'The

In [13]:
#append it to dataframe
cnn['org_with_brac'] = sort[0]
cnn['gpe_with_brac'] = sort[1]
cnn['person_with_brac'] = sort[2]
cnn['location_with_brac'] = sort[3]

cnn['spacy_org']=cnn['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['spacy_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_location'] = cnn['location_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac','location_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location
0,https://edition.cnn.com/2023/02/28/sport/los-a...,During the Lakers' 27-point comeback victory o...,Western Conference,"Los Angeles, August, Midseason, Dallas, James,...","James, Dennis Schröder, Jonas Valančiūnas, Rob...","Lakers, Dallas Mavericks, LeBron, Memphis Griz...","the Dallas Mavericks,NBA,the New Orleans Pelic...","Jonas Valančiūnas, Dallas","Lakers, LeBron James, James looks, Dennis Schr...",the Memphis Grizzlies
1,https://edition.cnn.com/2023/02/16/sport/lesle...,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley Paterson, Paterson, Oscar, Erich Maria ...",Triathlete Lesley Paterson
2,https://edition.cnn.com/2023/02/27/sport/damia...,(CNN) Damian Lillard set an NBA record in his ...,Western Conference,"Portland, Lillard","Damian Lillard, Lillard, Donovan Mitchell, Kla...","NBA, Houston Rockets, Chicago Bulls, Blazers, ...","Damian Lillard,NBA,the Portland Trail Blazers,...",Portland,"Lillard, Donovan Mitchell's, Klay Thompson's, ...","the west coast, the Bay Area"
3,https://edition.cnn.com/2023/02/27/football/su...,(CNN) Betrayal has formed part of European pol...,,"European, Europe, Russian, American, Middle Ea...","Julius Caesar, Brute, Cards, Super League, Jef...","Betrayal, Roman Republic, European Super Leagu...","Julius Caesar,European Super League,ESL,the Ch...","the Roman Republic, Leicester City, Manchester...","Betrayal, Jeff Zimbalist, Aleksander Čeferin, ...","Europe, Middle Eastern"
4,https://edition.cnn.com/2023/02/22/football/pa...,(CNN) The Palestinian team has never taken par...,"West Bank,Gaza City,East Jerusalem,Gaza Strip,...","Palestinian, Palestine, Portugal, African, Isr...","Arab, Gaza, East Jerusalem, Rami, Arab Emirate...","Qatar, Atlas Lions, Hamadeh, Mohammed Bin Zaye...","the Atlas Lions,State,AFC Asian Cup,CNN Sports...","Portugal, Israel, the West Bank, Gaza, East Je...","Rami Hamadeh, Jordan, Ahmed Daraghmeh, Daraghm...","the Middle East, Asia"


In [14]:
cnn.to_csv('NER_data/cnn_NER.csv')

### Evaluation

In [15]:
path = r'../1.raw_data/golden_truth/golden_truth_cnn.csv' # use your path

# import gold truth generated by chatgpt
cnn_gold = pd.read_csv(path)
cnn_gold = cnn_gold.loc[:,['url','organisation','gpe','location','person']]
print(cnn_gold.shape)
cnn_gold

(54, 5)


Unnamed: 0,url,organisation,gpe,location,person
0,https://edition.cnn.com/2023/02/26/sport/austr...,International Cricket Council (ICC),Cape Town,Newlands Cricket Ground,"Meg Lanning, Alyssa Healy, Beth Mooney, Nadine..."
1,https://edition.cnn.com/2023/02/27/football/su...,"Roman Republic, European Super League (ESL), U...","Europe, British, American, Middle Eastern",Switzerland,"Julius Caesar, Aleksander Ceferin, Andrea Agne..."
2,https://edition.cnn.com/2023/02/16/golf/tiger-...,"PGA Tour, Riviera Country Club, TGR Foundation...",California,"Riviera Country Club, California, The Masters ...","Tiger Woods, Max Homa, Keith Mitchell, Sam Sne..."
3,https://edition.cnn.com/2023/02/19/sport/tiger...,PGA Tour,"California, USA","Genesis Invitational, Riviera Country Club","Tiger Woods, Jon Rahm, Justin Thomas"
4,https://edition.cnn.com/2023/02/27/sport/jake-...,Love Island,Saudi Arabia,"Diriyah Arena, Riyadh","Jake Paul, Tommy Fury, Tyson Fury, Mike Tyson,..."
5,https://edition.cnn.com/2023/02/27/football/li...,"Paris Saint-Germain (PSG), Marseille, Opta, FE...","Paris, France.",Ligue 1 (French soccer league).,"Kylian Mbappé, Lionel Messi, Cristiano Ronaldo..."
6,https://edition.cnn.com/2023/02/26/sport/byron...,"Dallas Cowboys, Miami Dolphins, NFL",United States,Dallas,Byron Jones
7,https://edition.cnn.com/2023/02/25/tennis/barb...,WTA (Women's Tennis Association),"Dubai, Czech Republic, Poland",Dubai,"Serena Williams, Caroline Wozniacki, Ons Jabeu..."
8,https://edition.cnn.com/2023/02/27/tennis/nova...,"Association of Tennis Professionals (ATP), Wom...",United States,"Dubai, Australian Open, Indian Wells, Miami Op...","Novak Djokovic, Steffi Graf, Rafael Nadal"
9,https://edition.cnn.com/2023/02/26/sport/arman...,"All Star Perche, FFAthlétisme (French Athletic...","France, United States, Ukraine","Clermont-Ferrand, France, Oregon, Donetsk","Armand Duplantis, Renaud Lavillenie, Kurtis Ma..."


In [16]:
# joining gold truth data with extracted entities
eval_cnn = cnn.set_index('url').join(cnn_gold.set_index('url'))
print(eval_cnn.shape)
eval_cnn.head()

(82, 13)


Unnamed: 0_level_0,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location,organisation,gpe,location,person
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://edition.cnn.com/2023/02/15/sport/jonathan-gannon-eagles-arizona-cardinals-nfl-spt-intl,(CNN) The Arizona Cardinals have hired Philade...,Cardinals,Gannon,"Philadelphia Eagles, Jonathan Gannon, Shane St...","Arizona Cardinals, Eagles, Kansas City Chiefs,...","The Arizona Cardinals,the Kansas City Chiefs,t...",Eagles,"Philadelphia Eagles, Jonathan Gannon, Shane St...",,"Arizona Cardinals, Philadelphia Eagles, Indian...",Super Bowl,"Arizona, Philadelphia, Kansas City, Indianapol...","Jonathan Gannon, Shane Steichen"
https://edition.cnn.com/2023/02/15/sport/kansas-city-chiefs-super-bowl-parade-spt-intl,(CNN) It was party time in Missouri on Wednesd...,,"Missouri, Chiefs, City","Super Bowl, Kansas City, Patrick Mahomes, Maho...","MVP, NFL, Super Bowl, Union Station, Chiefs","Super Bowl,Mahomes,MVP,NFL,Union Station,Kansa...","Missouri, Kansas City",Patrick Mahomes,,"NFL, Kansas City Chiefs, Bud Light",Eagles,"Missouri, Kansas City, Union Station, Arrowhea...","Patrick Mahomes, Travis Kelce, Jamaal Charles,..."
https://edition.cnn.com/2023/02/16/golf/tiger-woods-genesis-invitational-pga-tour-spt-intl,"(CNN) In February 1992, a sprightly 16-year-ol...",,"California, Match, Record, Snead, Sound, American","Tiger Woods, Woods, Max Homa, Keith Mitchell, ...","PGA Tour, Riviera Country Club, Genesis Invita...","the Riviera Country Club,Woods,the Genesis Inv...",California,"Tiger Woods, Max Homa, Keith Mitchell, LeBron ...",,"PGA Tour, Riviera Country Club, TGR Foundation...",California,"Riviera Country Club, California, The Masters ...","Tiger Woods, Max Homa, Keith Mitchell, Sam Sne..."
https://edition.cnn.com/2023/02/16/sport/lesley-paterson-all-quiet-on-the-western-front-film-triathlon-spt-intl,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley Paterson, Paterson, Oscar, Erich Maria ...",Triathlete Lesley Paterson,Netflix,"France, Great Britain","Scotland, Germany, Ukraine, Costa Rica, Birmin...","Lesley Paterson, Paul Bäumer, Ian Stokell"
https://edition.cnn.com/2023/02/19/sport/tiger-woods-genesis-invitational-third-round-spt-intl,(CNN) It was an impressive and eventful third ...,,,"Tiger Woods, Woods","Genesis Invitational, Riviera Country Club, PG...","Woods,the Riviera Country Club,the PGA Tour,Ge...",,,,PGA Tour,"California, USA","Genesis Invitational, Riviera Country Club","Tiger Woods, Jon Rahm, Justin Thomas"


In [17]:
# Since we did not extract gold truth from chatgpt for all articles, we only keep those that has gold truth extracted.
eval_cnn2 = eval_cnn[ (eval_cnn['organisation'].notna()) & (eval_cnn['gpe'].notna()) & (eval_cnn['location'].notna()) & (eval_cnn['person'].notna())]
print(eval_cnn2.shape)
eval_cnn2.head()

(40, 13)


Unnamed: 0_level_0,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location,organisation,gpe,location,person
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://edition.cnn.com/2023/02/15/sport/jonathan-gannon-eagles-arizona-cardinals-nfl-spt-intl,(CNN) The Arizona Cardinals have hired Philade...,Cardinals,Gannon,"Philadelphia Eagles, Jonathan Gannon, Shane St...","Arizona Cardinals, Eagles, Kansas City Chiefs,...","The Arizona Cardinals,the Kansas City Chiefs,t...",Eagles,"Philadelphia Eagles, Jonathan Gannon, Shane St...",,"Arizona Cardinals, Philadelphia Eagles, Indian...",Super Bowl,"Arizona, Philadelphia, Kansas City, Indianapol...","Jonathan Gannon, Shane Steichen"
https://edition.cnn.com/2023/02/15/sport/kansas-city-chiefs-super-bowl-parade-spt-intl,(CNN) It was party time in Missouri on Wednesd...,,"Missouri, Chiefs, City","Super Bowl, Kansas City, Patrick Mahomes, Maho...","MVP, NFL, Super Bowl, Union Station, Chiefs","Super Bowl,Mahomes,MVP,NFL,Union Station,Kansa...","Missouri, Kansas City",Patrick Mahomes,,"NFL, Kansas City Chiefs, Bud Light",Eagles,"Missouri, Kansas City, Union Station, Arrowhea...","Patrick Mahomes, Travis Kelce, Jamaal Charles,..."
https://edition.cnn.com/2023/02/16/golf/tiger-woods-genesis-invitational-pga-tour-spt-intl,"(CNN) In February 1992, a sprightly 16-year-ol...",,"California, Match, Record, Snead, Sound, American","Tiger Woods, Woods, Max Homa, Keith Mitchell, ...","PGA Tour, Riviera Country Club, Genesis Invita...","the Riviera Country Club,Woods,the Genesis Inv...",California,"Tiger Woods, Max Homa, Keith Mitchell, LeBron ...",,"PGA Tour, Riviera Country Club, TGR Foundation...",California,"Riviera Country Club, California, The Masters ...","Tiger Woods, Max Homa, Keith Mitchell, Sam Sne..."
https://edition.cnn.com/2023/02/16/sport/lesley-paterson-all-quiet-on-the-western-front-film-triathlon-spt-intl,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley Paterson, Paterson, Oscar, Erich Maria ...",Triathlete Lesley Paterson,Netflix,"France, Great Britain","Scotland, Germany, Ukraine, Costa Rica, Birmin...","Lesley Paterson, Paul Bäumer, Ian Stokell"
https://edition.cnn.com/2023/02/19/sport/tiger-woods-genesis-invitational-third-round-spt-intl,(CNN) It was an impressive and eventful third ...,,,"Tiger Woods, Woods","Genesis Invitational, Riviera Country Club, PG...","Woods,the Riviera Country Club,the PGA Tour,Ge...",,,,PGA Tour,"California, USA","Genesis Invitational, Riviera Country Club","Tiger Woods, Jon Rahm, Justin Thomas"


#### NLTK

Location

In [18]:
# locations from gold truth
true_loc = eval_cnn2["location"].values.tolist()

new_true_loc = []
for x in true_loc:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_true_loc.append(n.lstrip())

print(new_true_loc)

['Arizona', 'Philadelphia', 'Kansas City', 'Indianapolis', 'NFL draft.', 'Missouri', 'Kansas City', 'Union Station', 'Arrowhead Stadium', 'Philadelphia', 'Riviera Country Club', 'California', 'The Masters at Augusta', 'Scotland', 'Germany', 'Ukraine', 'Costa Rica', 'Birmingham', 'Alabama', 'California', 'Genesis Invitational', 'Riviera Country Club', 'Southern California', 'Qatar', 'Portugal', 'Israel', 'West Bank', 'Gaza', 'East Jerusalem', 'Abu Dhabi', 'Nablus', 'Tubas', 'Al-Shifa hospital', 'Mongolia', 'Yemen', 'Singapore', 'United States', 'Japan', 'London', 'Not specified', 'New Zealand', 'Wales', 'France', 'Spain', 'US', 'United States', 'Indian Wells', 'Miami', 'Belgrade', 'New York', 'Australia', 'Dubai', 'Ancient Greece', 'Paris', 'Tokyo', 'New Delhi', 'Tashkent', 'UK', 'US', 'Russia', 'Belarus', 'Ukraine', 'Kyiv', 'Wembley Stadium', 'London', 'Los Angeles', 'Australia', 'New Zealand', 'Ukraine', 'Irpin', 'Thailand', 'Dublin', 'Ireland', 'central Ukraine', 'Dubai', 'Manchester

In [19]:
# locations extracted using NLTK
nltk_loc = eval_cnn2['nltk_location'].values.tolist()

new_nltk_loc = []
for x in nltk_loc:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_nltk_loc.append(n.lstrip())

print(new_nltk_loc)

['Cardinals', 'Western Front', 'Southern California', 'West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West Asian', 'South Carolina', 'Southern California', 'South Africa', 'Southern Oregon', 'Western Conference']


In [20]:
#nltk precision, recall, f1 score of location entity
correct = 0

for x in new_nltk_loc:
    if x in new_true_loc:
        correct+=1

nltk_loc_recall = correct/len(new_true_loc)
nltk_loc_precision = correct/len(new_nltk_loc)
nltk_loc_f1_score = 2*((nltk_loc_precision * nltk_loc_recall)/ (nltk_loc_precision + nltk_loc_recall))
    
print("Recall = ", nltk_loc_recall)
print()
print("Precision = ", nltk_loc_precision)
print()
print("f1 score = ", nltk_loc_f1_score)

Recall =  0.03636363636363636

Precision =  0.42857142857142855

f1 score =  0.0670391061452514


GPE

In [21]:
# GPE from gold truth
true_gpe = eval_cnn2["gpe"].values.tolist()

new_true_gpe = []
for x in true_gpe:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_true_gpe.append(n.lstrip())

print(new_true_gpe)

['Super Bowl', 'Eagles', 'California', 'France', 'Great Britain', 'California', 'USA', 'None mentioned.', 'Palestinian', 'Morocco', 'Africa', 'Arab', 'United Arab Emirates', 'Middle East', 'FIFA', 'Not specified', 'ew Zealand', 'France', 'Wales', 'Spain', 'US', 'Serbian', 'Russia', 'US', 'Cuba', 'Dutch', 'US', 'England', 'Southern California', 'Philippines', 'Peru', 'Germany', 'Russia', 'Britain', 'United States', 'United Nations High Commissioner for Human Rights (OHCHR)', 'Dubai', 'Czech Republic', 'Poland', 'Saudi Arabia', 'United States.', 'France', 'United States', 'Ukraine', 'Cape Town', 'United States', 'Southern Oregon', 'Virginia', 'Turkey', 'eleven Turkish provinces', 'Paris', 'France.', 'Europe', 'British', 'American', 'Middle Eastern', 'America', 'Georgia', 'UN', 'Saudi Arabia', 'United States', 'Paris', 'France', 'Morocco', 'Argentina', 'USA', 'England', 'Lyon', 'Argentina', 'Turkey', 'US', 'Alaska', 'Hawaii', 'Western Conference', 'British', 'New Zealand', 'Japan', 'Real 

In [22]:
# GPEs extracted using NLTK
nltk_gpe = eval_cnn2['nltk_gpe'].values.tolist()

new_nltk_gpe = []
for x in nltk_gpe:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_nltk_gpe.append(n.lstrip())

print(new_nltk_gpe)

['Gannon', 'Missouri', 'Chiefs', 'City', 'California', 'Match', 'Record', 'Snead', 'Sound', 'American', 'Scotland', 'Germany', 'Unbeknownst', 'Bäumer', 'Russian', 'Ukraine', 'German', 'French', 'Costa Rica', 'Birmingham', 'Alabama', 'Scottish', 'California', 'English', 'Palestinian', 'Palestine', 'Portugal', 'African', 'Israeli-Palestinian', 'Middle East', 'Israel', 'Palestinians', 'Palestianian', 'Jordan', 'Abu Dhabi', 'United', 'Arabic', 'Israeli', 'Tubas', 'Nablus', 'Hamas', 'Yemen', 'Jerusalem', 'Japan', 'United States', 'Singapore', 'Canaan', 'Tunisian', 'Egyptian', 'Entry', 'Asian', 'Egypt', 'Qatar', 'Morocco', 'Mongolia', 'Uzbekistan', 'Saudi Arabia', 'Spain', 'Spanish', 'Hamed', 'Ramallah', 'Portuguese', 'North Korea', 'Jiangyin', 'China', 'Dabbagh', 'European', 'Arouca', 'Belgian', 'Infrastructure', 'Bethlehem', 'Tel Aviv', 'Asia', 'Tiger', 'France', 'Bangkok', 'North', 'South Korea', 'Barbed', 'Uummannaq', 'Greenland', 'Hawaii', 'Kilauea', 'California', 'Celsius', 'South Afri

In [23]:
#nltk precision, recall, f1 score of GPE
correct = 0

for x in new_nltk_gpe:
    if x in new_true_gpe:
        correct+=1

nltk_gpe_recall = correct/len(new_true_gpe)
nltk_gpe_precision = correct/len(new_nltk_gpe)
nltk_gpe_f1_score = 2*((nltk_gpe_precision * nltk_gpe_recall)/ (nltk_gpe_precision + nltk_gpe_recall))
    
print("Recall = ", nltk_gpe_recall)
print()
print("Precision = ", nltk_gpe_precision)
print()
print("f1 score = ", nltk_gpe_f1_score)

Recall =  0.7608695652173914

Precision =  0.24054982817869416

f1 score =  0.36553524804177545


Person

In [24]:
# Person from gold truth
true_per = eval_cnn2["person"].values.tolist()

new_true_per = []
for x in true_per:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_true_per.append(n.lstrip())

print(new_true_per)

['Jonathan Gannon', 'Shane Steichen', 'Patrick Mahomes', 'Travis Kelce', 'Jamaal Charles', 'Donna Kelce', 'Derrick Nnadi', 'Trey Smith', 'Orlando Brown Jr.', 'JuJu Smith-Schuster', 'James Bradberry', 'AJ Brown', 'Tiger Woods', 'Max Homa', 'Keith Mitchell', 'Sam Snead', 'LeBron James', 'Kareem Abdul-Jabbar', 'Rory McIlroy', 'Justin Thomas', 'Scottie Scheffler', 'Lesley Paterson', 'Paul Bäumer', 'Ian Stokell', 'Tiger Woods', 'Jon Rahm', 'Justin Thomas', 'Tiger Woods', 'Jon Rahm', 'Scottie Scheffler', 'Max Homa', 'Patrick Cantlay', 'Rami Hamadeh', 'Ahmed Daraghmeh', 'Mahmoud Sarsak', 'Jibril Rajoub', 'Payne Stewart', 'Tiger Woods', 'Rickie Fowler', 'Shingo Katayama', 'John Daly', 'Campbell Johnstone', 'Ruby Tui', 'Aaron Smith', 'Phil Rees', 'Nigel Owens', 'Gareth Thomas', 'Dan Palmer', 'Grant Robertson', 'Novak Djokovic', 'Rafael Nadal', 'Daniil Medvedev', 'Roger Federer', 'Steffi Graf', 'Muhammad Ali', 'George Foreman', 'Joe Frazier', 'Nicola Adams', 'Claressa Shields', 'Katie Taylor', '

In [25]:
# Persons extracted using NLTK
nltk_per = eval_cnn2['nltk_person'].values.tolist()

new_nltk_per = []
for x in nltk_per:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_nltk_per.append(n.lstrip())

print(new_nltk_per)

['Philadelphia Eagles', 'Jonathan Gannon', 'Shane Steichen', 'Gannon', 'Super Bowl', 'Super Bowl', 'Kansas City', 'Patrick Mahomes', 'Mahomes', 'Kansas', 'Tiger Woods', 'Woods', 'Max Homa', 'Keith Mitchell', 'Sam Snead', 'Kareem Abdul-Jabbar', 'James', 'Kareem', 'Sam', 'Snead', 'Rory McIlroy', 'Justin Thomas', 'Thomas', 'Scottie Scheffler', 'Scheffler', 'Tiger', 'Joaquin Niemann', 'Niemann', 'Lesley Paterson', 'Paterson', 'Erich Maria Remarque', 'Paul Bäumer', 'Felix', 'Ian Stokell', 'Netflix', 'Triathlete', 'Great Britain', 'Sam Mendes', 'Edward Berger', 'Malte Grunert', 'Tiger Woods', 'Woods', 'Tiger Woods', 'Jon Rahm', 'Woods', 'Arab', 'Gaza', 'East Jerusalem', 'Rami', 'Arab Emirates', 'Rami Hamadeh', 'Ahmed Daraghmeh', 'Daraghmeh', 'Mahmoud Sarsak', 'Sarsak', 'Mongolia', 'Hamadeh', 'Jabal', 'Bnei Sakhnin', 'Jibril Rajoub', 'Henley', 'Makram', 'Makram Daboub', 'Daboub', 'Arab Cup', 'Gianni Infantino', 'Ismail El Haddad', 'Yaser Hamed', 'Hamed', 'Thomas Bach', 'Rajoub', 'Firas Abu Hi

In [26]:
#nltk precision, recall, f1 score for person
correct = 0

for x in new_nltk_per:
    if x in new_true_per:
        correct+=1

nltk_per_recall = correct/len(new_true_per)
nltk_per_precision = correct/len(new_nltk_per)
nltk_per_f1_score = 2*((nltk_per_precision * nltk_per_recall)/ (nltk_per_precision + nltk_per_recall))
    
print("Recall = ", nltk_per_recall)
print()
print("Precision = ", nltk_per_precision)
print()
print("f1 score = ", nltk_per_f1_score)

Recall =  0.6

Precision =  0.30982367758186397

f1 score =  0.4086378737541528


Org

In [27]:
# Organisations from gold truth
true_org = eval_cnn2["organisation"].values.tolist()

new_true_org = []
for x in true_org:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_true_org.append(n.lstrip())

print(new_true_org)

['Arizona Cardinals', 'Philadelphia Eagles', 'Indianapolis Colts', 'NFL', 'NFL', 'Kansas City Chiefs', 'Bud Light', 'PGA Tour', 'Riviera Country Club', 'TGR Foundation', 'Sports Illustrated/Getty Images', 'PNC Championship', 'TNT', 'Augusta', 'Netflix', 'PGA Tour', 'PGA Tour', 'CBS', 'Hamas', 'Israel Defense Forces (IDF)', 'Islamic Jihad group', 'Palestinian Football Association (PFA)', 'PGA Tour', 'Team USA', 'All Blacks', 'Canterbury', 'Crusaders', 'Ospreys', 'Biarritz', 'Not mentioned', 'International Boxing Association (IBA)', 'International Olympic Committee (IOC)', 'USA Boxing', 'Angel City FC', 'Harvard-Westlake', "US Women's National Team", 'Total Futbol Academy of MLS Next', 'Nike', 'NWSL', 'Bellator MMA', "WTA (Women's Tennis Association)", 'Manchester United', 'Newcastle United', 'Sky Sports', 'Saudi Arabian Public Investment Fund (PIF)', 'Real Madrid.', 'All Star Perche', 'FFAthlétisme (French Athletics Federation)', 'International Cricket Council (ICC)', 'Dallas Cowboys', 

In [28]:
# Organisations extracted using NLTK
nltk_org = eval_cnn2['nltk_org'].values.tolist()

new_nltk_org = []
for x in nltk_org:
    if len(x)>1:
        txt = x.split(",")
        for n in txt:
            if n != '':
                new_nltk_org.append(n.lstrip())

print(new_nltk_org)

['Arizona Cardinals', 'Eagles', 'Kansas City Chiefs', 'Super Bowl', 'MVP', 'NFL', 'Super Bowl', 'Union Station', 'Chiefs', 'PGA Tour', 'Riviera Country Club', 'Genesis Invitational', 'TGR Foundation', 'Open', 'ZOZO', 'PNC', 'TNT', 'Masters', 'Augusta', 'PGA', 'LeBron James', 'NBA', 'LeBron', 'McIlroy', 'Phoenix Open', 'LIV Golf Tour', 'McIroy', 'First', 'Scottish Highlands', 'Kammerer', 'Armistice Talks', 'BAFTA', 'Oscars', 'XTERRA', 'Genesis Invitational', 'Riviera Country Club', 'PGA Tour', 'Genesis Invitational', 'PGA Tour', 'Open', 'CBS', 'Qatar', 'Atlas Lions', 'Hamadeh', 'Mohammed Bin Zayed Stadium', 'AFC Asian', 'CNN Sports', 'IDF', 'Reuters', 'Islamic Jihad', 'Philippines', 'IDs', 'PFA', 'AP', 'FIFA', 'Partners', 'Daboub', 'AFC', 'WAFF', 'FA Jibril', 'IOC', 'General', 'Shaheen', 'National', 'IFA', 'FIFA Forward', 'Futsal Hall', 'Faisal', 'International', 'Palestine', 'PFA Hilal', 'UEFA', 'PGA Tour', 'Kantarat Golf Course', 'Demilitarized', 'World', 'Washington Post', 'Volcano G

In [29]:
#nltk precision, recall, f1 score of organisation entity
correct = 0

for x in new_nltk_org:
    if x in new_true_org:
        correct+=1

nltk_org_recall = correct/len(new_true_org)
nltk_org_precision = correct/len(new_nltk_org)
nltk_org_f1_score = 2*((nltk_org_precision * nltk_org_recall)/ (nltk_org_precision + nltk_org_recall))
    
print("Recall = ", nltk_org_recall)
print()
print("Precision = ", nltk_org_precision)
print()
print("f1 score = ", nltk_org_f1_score)

Recall =  0.4041095890410959

Precision =  0.18849840255591055

f1 score =  0.2570806100217865


Average recall, precision & f1 score

In [30]:
avg_recall = (nltk_org_recall + nltk_per_recall + nltk_gpe_recall + nltk_loc_recall)/4
avg_precision = (nltk_org_precision + nltk_per_precision + nltk_gpe_precision + nltk_loc_precision)/4
avg_f1_score = (nltk_org_f1_score + nltk_per_f1_score + nltk_gpe_f1_score + nltk_loc_f1_score)/4

print("Recall = ", avg_recall)
print()
print("Precision = ", avg_precision)
print()
print("f1 score = ", avg_f1_score)

Recall =  0.4503356976555309

Precision =  0.2918608342219743

f1 score =  0.27457320949074154


#### spacy

In [31]:
# locations extracted using spaCy
spacy_loc = eval_cnn2['spacy_location'].values.tolist()

new_spacy_loc = []
for x in spacy_loc:
    if len(x)!=0:
        txt = x.split(",")
        for i in txt:
            new_spacy_loc.append(i.lstrip())

print(new_spacy_loc)

['Triathlete Lesley Paterson', 'Southern California', 'the Middle East', 'Asia', 'Death Valley', 'Africa', 'Indian Wells', 'Belarusian', 'Europe', 'Southern California', 'Pacific', 'Cape Town', 'Green Bay Packers', 'Southern Oregon', 'Europe', 'Middle Eastern', 'the Memphis Grizzlies', 'Lucas Oil Stadium']


In [32]:
#spacy precision, recall, f1 score of locations
correct = 0

for x in new_spacy_loc:
    if x in new_true_loc:
        correct+=1

spacy_loc_recall = correct/len(new_true_loc)
spacy_loc_precision = correct/len(new_spacy_loc)
spacy_loc_f1_score = 2*((spacy_loc_precision * spacy_loc_recall)/ (spacy_loc_precision + spacy_loc_recall))
    
print("Recall = ", spacy_loc_recall)
print()
print("Precision = ", spacy_loc_precision)
print()
print("f1 score = ", spacy_loc_f1_score)

Recall =  0.01818181818181818

Precision =  0.16666666666666666

f1 score =  0.032786885245901634


GPE

In [33]:
# GPE extracted using spaCy
spacy_gpe = eval_cnn2['spacy_gpe'].values.tolist()

new_spacy_gpe = []
for x in spacy_gpe:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_spacy_gpe.append(i.lstrip())

print(new_spacy_gpe)

['Eagles', 'Missouri', 'Kansas City', 'California', 'Scotland', 'Germany', 'Ukraine', 'Hollywood', 'Netflix', 'Costa Rica', 'Great Britain', 'Birmingham', 'Alabama', 'California', 'Portugal', 'Israel', 'the West Bank', 'Gaza', 'East Jerusalem', 'Abu Dhabi', 'United Arab Emirates', 'Nablus', 'Gaza City', 'Philippines', 'Mongolia', 'Yemen', 'Hamadeh', 'West Bank', 'the Gaza Strip', 'Jerusalem', 'London', 'Japan', 'the United States', 'Singapore', 'Daboub', 'The Gaza Strip', 'Egypt', 'Qatar', 'Morocco', 'Saudi Arabia', 'Spain', 'Palestine', 'Ramallah', 'North Korea', 'Jiangyin city', 'China', 'Jiangsu', 'Bethlehem', 'Tel Aviv', 'boycotts', 'Asian Cups', 'La Jenny', 'France', 'Bangkok', 'North', 'South Korea', 'Uummannaq', 'Greenland', 'Hawaii', 'Idaho', 'Death Valley', 'California', "South Africa's", 'South Carolina', 'Bolivia', 'Soviet Army', 'Kenya', 'Australia', 'Ceduna', 'New Zealand', 'Wales', 'France', 'Canterbury', 'Australia', 'Spain', 'US', 'US', 'Miami Open', 'The United States'

In [34]:
#spacy precision, recall, f1 score of gpe
correct = 0

for x in new_spacy_gpe:
    if x in new_true_gpe:
        correct+=1

spacy_gpe_recall = correct/len(new_true_gpe)
spacy_gpe_precision = correct/len(new_spacy_gpe)
spacy_gpe_f1_score = 2*((spacy_gpe_precision * spacy_gpe_recall)/ (spacy_gpe_precision + spacy_gpe_recall))
    
print("Recall = ", spacy_gpe_recall)
print()
print("Precision = ", spacy_gpe_precision)
print()
print("f1 score = ", spacy_gpe_f1_score)

Recall =  0.6413043478260869

Precision =  0.30569948186528495

f1 score =  0.4140350877192982


Person

In [35]:
# person extracted using spaCy
spacy_per = eval_cnn2['spacy_person'].values.tolist()

new_spacy_per = []
for x in spacy_per:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_spacy_per.append(i.lstrip())

print(new_spacy_per)

['Philadelphia Eagles', 'Jonathan Gannon', 'Shane Steichen', 'Patrick Mahomes', 'Tiger Woods', 'Max Homa', 'Keith Mitchell', 'LeBron James', 'Kareem Abdul-Jabbar', 'Kareem', 'LeBron', 'Sam', 'Rory McIlroy', 'Justin Thomas', 'McIlroy', 'Thomas', 'Scottie Scheffler', 'Joaquin Niemann', 'Niemann', 'Lesley Paterson', 'Paterson', 'Oscar', "Erich Maria Remarque's", 'Paul Bäumer', 'Felix Kammerer', 'Ian Stokell', "Sam Mendes'", 'Edward Berger', 'Malte Grunert', 'Tiger Woods', 'Jon Rahm', 'Rami Hamadeh', 'Jordan', 'Ahmed Daraghmeh', 'Daraghmeh', 'Mahmoud Sarsak', 'Sarsak', 'Jabal Al-Mukaber', 'Bnei Sakhnin', 'Hamadeh', 'Gazan', 'Makram Daboub', 'Gianni Infantino', 'Ismail El Haddad', 'Yaser Hamed', 'Thomas Bach', 'Firas Abu Hilal', 'Diala Shaheen', 'Song Kum', 'Faisal Al-Husseini Stadium', 'Jibril Rajoub', 'Faisal Al-Husseini', 'Hilal', 'Abu Hilal', "Payne Stewart's", 'Tiger Woods', 'Safari Resort', 'Kabul Golf Course', 'Mohammed Afzal Abdul', 'Vipingo Ridge', 'Atin Kenya', 'Mi Jung Hur', 'Ric

In [36]:
#spacy precision, recall, f1 score for person
correct = 0

for x in new_spacy_per:
    if x in new_true_per:
        correct+=1

spacy_per_recall = correct/len(new_true_per)
spacy_per_precision = correct/len(new_spacy_per)
spacy_per_f1_score = 2*((spacy_per_precision * spacy_per_recall)/ (spacy_per_precision + spacy_per_recall))
    
print("Recall = ", spacy_per_recall)
print()
print("Precision = ", spacy_per_precision)
print()
print("f1 score = ", spacy_per_f1_score)

Recall =  0.551219512195122

Precision =  0.4109090909090909

f1 score =  0.4708333333333334


Organisation

In [37]:
# Organisations extracted using spaCy
spacy_org = eval_cnn2['spacy_org'].values.tolist()

new_spacy_org = []
for x in spacy_org:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_spacy_org.append(i.lstrip())

print(new_spacy_org)

['The Arizona Cardinals', 'the Kansas City Chiefs', 'the Indianapolis Colts', 'Super Bowl LVII', 'Super Bowl', 'Mahomes', 'MVP', 'NFL', 'Union Station', 'Kansas City Chiefs', 'the Riviera Country Club', 'Woods', 'the Genesis Invitational', 'TGR Foundation', 'Riviera Country Club', 'PGA Tour', 'TNT', 'NBA', 'Scheffler', 'The Genesis Invitational', "the PGA Tour's", 'Tour', 'Genesis Invitational', 'McIlroy', 'BAFTA', 'XTERRA', 'Woods', 'the Riviera Country Club', 'the PGA Tour', 'Genesis Invitational', 'the Genesis Invitational', 'PGA Tour', 'Woods', 'CBS', 'the Atlas Lions', 'State', 'AFC Asian Cup', 'CNN Sports', 'Tubas', 'the Israel Defense Forces', 'IDF', 'Reuters', 'Hamas', 'Al-Shifa', 'Islamic Jihad', 'the Palestinian Football Association', 'AP', 'FIFA', 'Henley & Partners', 'Visa', 'AFC', 'The Asian Football Confederation', 'the West Asian Football Federation', 'WAFF', 'IOC', 'PFA', 'the Palestinian National Olympic Committee', 'Oday Dabbagh', 'F.C. Arouca', 'R. Charleroi S.C', "t

In [38]:
#spacy precision, recall, f1 score of organisations
correct = 0

for x in new_spacy_org:
    if x in new_true_org:
        correct+=1

spacy_org_recall = correct/len(new_true_org)
spacy_org_precision = correct/len(new_spacy_org)
spacy_org_f1_score = 2*((spacy_org_precision * spacy_org_recall)/ (spacy_org_precision + spacy_org_recall))
    
print("Recall = ", spacy_org_recall)
print()
print("Precision = ", spacy_org_precision)
print()
print("f1 score = ", spacy_org_f1_score)

Recall =  0.3767123287671233

Precision =  0.2

f1 score =  0.2612826603325416


Average precision, recall, f1-score

In [39]:
avg_recall = (spacy_org_recall + spacy_per_recall + spacy_gpe_recall + spacy_loc_recall)/4
avg_precision = (spacy_org_precision + spacy_per_precision + spacy_gpe_precision + spacy_loc_precision)/4
avg_f1_score = (spacy_org_f1_score + spacy_per_f1_score + spacy_gpe_f1_score + spacy_loc_f1_score)/4

print("Recall = ", avg_recall)
print()
print("Precision = ", avg_precision)
print()
print("f1 score = ", avg_f1_score)

Recall =  0.39685450174253756

Precision =  0.27081880986026063

f1 score =  0.2947344916577687
