In [20]:
import pandas as pd

import re

import nltk
from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

from sympy.combinatorics.subsets import Subset

## Definitions

In [2]:
def extract_ne_from_tree ( tree ):
    result = []
    for s in tree.subtrees():
        label = s.label()
        if (label == 'PERSON' or label == 'GPE' or label == 'LOCATION' or label == 'ORGANIZATION'):
            leaves = s.leaves()
            ne = ''
            for l in leaves:
                ne = ne + ' ' + l[0]
            result.append((label, ne[1:]))
    return result

In [3]:
def ner_info_extraction(content ,content_list):
    for x in content:
        res_sents = nltk.sent_tokenize(x)
        res_tokens = [word_tokenize(sents) for sents in res_sents]
        tagged_res = [nltk.pos_tag(tokens) for tokens in res_tokens]
        res_tree = [nltk.ne_chunk(elem) for elem in tagged_res]
        res_list = [extract_ne_from_tree(tree) for tree in res_tree]
        content_list+=[(res_list)]

In [4]:
def ner_sorting(content ,location, gpe, person, org):
    for row in content:
        check_location = []
        check_gpe = []
        check_person = []
        check_org = []
        for sent in row:
            if len(sent)!=0:
                for tup in sent:
                    if tup[1] != 'CNN':
                        if tup[0] == "LOCATION":
                            if tup[1] not in check_location:
                                check_location.append(tup[1])
                        if tup[0] == "GPE":
                            if tup[1] not in check_gpe:
                                check_gpe.append(tup[1])
                        if tup[0] == "PERSON":
                            if tup[1] not in check_person:
                                check_person.append(tup[1])
                        if tup[0] == "ORGANIZATION":
                            if tup[1] not in check_org:
                                check_org.append(tup[1])
        location.append(check_location)
        gpe.append(check_gpe)
        person.append(check_person)
        org.append(check_org)

In [5]:
def spacy_ner(content):
    cnn_list = []
    for doc in content:
        temp = []
        for token in doc:
            ner_tags = [token.text, token.ent_type_]
            if ner_tags[1]=="ORG" or ner_tags[1]=="PERSON" or ner_tags[1]=="GPE" or ner_tags[1]=="LOC":
                temp.append(ner_tags)
        cnn_list.append(temp)
    return cnn_list

In [6]:
def spacy_sorting(content):
    org = []
    gpe = []
    person = []
    location = []
    for row in content:
        check_org = []
        check_gpe = []
        check_person = []
        check_loc = []
        if len(row)!=0:
            for tup in row:
                if tup[0]!= "CNN":
                    if tup[1] == "ORG":
                        if tup[0] not in check_org:
                            check_org.append(tup[0])
                    if tup[1] == "GPE":
                        if tup[0] not in check_gpe:
                            check_gpe.append(tup[0])
                    if tup[1] == "PERSON":
                        if tup[0] not in check_person:
                            check_person.append(tup[0])
                    if tup[1] == "LOC":
                        if tup[0] not in check_loc:
                            check_loc.append(tup[0])
        org.append(check_org)
        gpe.append(check_gpe)
        person.append(check_person)
        location.append(check_loc)
    return [org, gpe, person, location]

## cnn news

In [7]:
import glob
import os

path = r'../2.text_preprocessing/cnn/cnn_FINAL.csv' # use your path

cnn = pd.read_csv(path)
cnn = cnn.loc[:,['url','text']]
print(cnn.shape)
cnn

(82, 2)


Unnamed: 0,url,text
0,https://edition.cnn.com/2023/02/28/sport/los-a...,During the Lakers' 27-point comeback victory o...
1,https://edition.cnn.com/2023/02/16/sport/lesle...,(CNN) It's while running in the moors and hill...
2,https://edition.cnn.com/2023/02/27/sport/damia...,(CNN) Damian Lillard set an NBA record in his ...
3,https://edition.cnn.com/2023/02/27/football/su...,(CNN) Betrayal has formed part of European pol...
4,https://edition.cnn.com/2023/02/22/football/pa...,(CNN) The Palestinian team has never taken par...
...,...,...
77,https://edition.cnn.com/2023/02/28/us/irv-cros...,"(CNN) Irv Cross, a former NFL star and broadca..."
78,https://edition.cnn.com/2023/03/01/football/pa...,(CNN) Paul Pogba made his long-awaited return ...
79,https://edition.cnn.com/2023/03/06/sport/enriq...,(CNN) Spanish hurdler Enrique Llopis had to be...
80,https://edition.cnn.com/2023/03/06/football/ma...,(CNN) Manchester United manager Erik ten Hag s...


NLTK extraction

In [8]:
cnn_content = []

ner_info_extraction(cnn['text'],cnn_content)
    
print(cnn_content)

[[[('ORGANIZATION', 'Lakers'), ('ORGANIZATION', 'Dallas Mavericks'), ('ORGANIZATION', 'LeBron')], [], [('ORGANIZATION', 'Lakers'), ('PERSON', 'James')], [], [('PERSON', 'James'), ('ORGANIZATION', 'Memphis Grizzlies')], [], [('ORGANIZATION', 'Lakers')], [('ORGANIZATION', 'NBA')], [('PERSON', 'Dennis Schröder'), ('GPE', 'Los Angeles'), ('PERSON', 'Jonas Valančiūnas'), ('ORGANIZATION', 'New Orleans Pelicans')], [('ORGANIZATION', 'NBA')], [('PERSON', 'Rob Pelinka'), ('PERSON', 'James'), ('PERSON', 'Anthony Davis'), ('ORGANIZATION', 'Russell Westbrook')], [('PERSON', 'Patrick Beverley'), ('GPE', 'August'), ('ORGANIZATION', 'Lakers')], [('PERSON', 'James'), ('PERSON', 'Pelinka'), ('ORGANIZATION', 'NBA')], [('ORGANIZATION', 'Lakers')], [('LOCATION', 'Western Conference')], [('PERSON', 'Darvin Ham'), ('ORGANIZATION', 'Lakers')], [('ORGANIZATION', 'Lakers')], [('GPE', 'Midseason'), ('PERSON', 'Malik Beasley'), ('PERSON', 'Jarred Vanderbilt'), ('PERSON', 'Mo Bamba'), ('PERSON', 'Davon Reed'), ('

In [9]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []
org_with_brac = []

ner_sorting(cnn_content, location_with_brac, gpe_with_brac, person_with_brac, org_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

print(org_with_brac)

[['Western Conference'], ['Western Front'], ['Western Conference'], [], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West Asian'], [], [], [], [], ['Southern California'], ['South Carolina'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Southern Oregon'], [], [], ['South Africa'], [], ['Cardinals'], ['Southern California'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Caribbean'], [], [], [], [], [], [], [], [], [], [], ['South Korean'], [], [], [], [], [], [], [], [], [], [], [], [], [], []]

[['Los Angeles', 'August', 'Midseason', 'Dallas', 'James', 'Portland'], ['Scotland', 'Germany', 'Unbeknownst', 'Bäumer', 'Russian', 'Ukraine', 'German', 'French', 'Costa Rica', 'Birmingham', 'Alabama', 'Scottish', 'California', 'English'], ['Portland', 'Lillard'], ['European', 'Europe', 'Russian', 'American', 'Middle Eastern', 'British', 'Public', 'Zimbalist', 'Juventus', 'Turin', 'Barcelona', 'Justice'], ['P

In [10]:
cnn['location_with_brac'] = location_with_brac
cnn['gpe_with_brac'] = gpe_with_brac
cnn['person_with_brac'] = person_with_brac
cnn['org_with_brac'] = org_with_brac

cnn['nltk_location']=cnn['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['nltk_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_org'] = cnn['org_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'org_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,text,nltk_location,nltk_gpe,nltk_person,nltk_org
0,https://edition.cnn.com/2023/02/28/sport/los-a...,During the Lakers' 27-point comeback victory o...,Western Conference,"Los Angeles, August, Midseason, Dallas, James,...","James, Dennis Schröder, Jonas Valančiūnas, Rob...","Lakers, Dallas Mavericks, LeBron, Memphis Griz..."
1,https://edition.cnn.com/2023/02/16/sport/lesle...,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice..."
2,https://edition.cnn.com/2023/02/27/sport/damia...,(CNN) Damian Lillard set an NBA record in his ...,Western Conference,"Portland, Lillard","Damian Lillard, Lillard, Donovan Mitchell, Kla...","NBA, Houston Rockets, Chicago Bulls, Blazers, ..."
3,https://edition.cnn.com/2023/02/27/football/su...,(CNN) Betrayal has formed part of European pol...,,"European, Europe, Russian, American, Middle Ea...","Julius Caesar, Brute, Cards, Super League, Jef...","Betrayal, Roman Republic, European Super Leagu..."
4,https://edition.cnn.com/2023/02/22/football/pa...,(CNN) The Palestinian team has never taken par...,"West Bank,Gaza City,East Jerusalem,Gaza Strip,...","Palestinian, Palestine, Portugal, African, Isr...","Arab, Gaza, East Jerusalem, Rami, Arab Emirate...","Qatar, Atlas Lions, Hamadeh, Mohammed Bin Zaye..."


spacy

In [11]:
cont = cnn['text'].apply(nlp)

cnn_list = spacy_ner(cont)
    
print(cnn_list)

[[['Lakers', 'PERSON'], ['the', 'ORG'], ['Dallas', 'ORG'], ['Mavericks', 'ORG'], ['LeBron', 'PERSON'], ['James', 'PERSON'], ['James', 'PERSON'], ['looks', 'PERSON'], ['the', 'LOC'], ['Memphis', 'LOC'], ['Grizzlies', 'LOC'], ['NBA', 'ORG'], ['Dennis', 'PERSON'], ['Schröder', 'PERSON'], ['Jonas', 'GPE'], ['Valančiūnas', 'GPE'], ['the', 'ORG'], ['New', 'ORG'], ['Orleans', 'ORG'], ['Pelicans', 'ORG'], ['NBA', 'ORG'], ['Rob', 'PERSON'], ['Pelinka', 'PERSON'], ['James', 'PERSON'], ['Anthony', 'PERSON'], ['Davis', 'PERSON'], ['Russell', 'PERSON'], ['Westbrook', 'PERSON'], ['Patrick', 'PERSON'], ['Beverley', 'PERSON'], ['Lakers', 'PERSON'], ['James', 'PERSON'], ['Pelinka', 'PERSON'], ['NBA', 'ORG'], ['Lakers', 'PERSON'], ['Lakers', 'PERSON'], ['Midseason', 'PERSON'], ['Malik', 'PERSON'], ['Beasley', 'PERSON'], ["D'Angelo", 'PERSON'], ['Russell', 'PERSON'], ['Jarred', 'PERSON'], ['Vanderbilt', 'PERSON'], ['Davon', 'PERSON'], ['Reed', 'PERSON'], ['Mavericks', 'ORG'], ['Dallas', 'GPE'], ['Jason',

In [12]:
sort = spacy_sorting(cnn_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])
print()

print(sort[3])

[['the', 'Dallas', 'Mavericks', 'NBA', 'New', 'Orleans', 'Pelicans', 'Boston', 'Celtics', 'Portland', 'Trail', 'Blazers'], ['BAFTA', 'XTERRA'], ['Damian', 'Lillard', 'NBA', 'the', 'Portland', 'Trail', 'Blazers', 'Houston', 'Chicago', 'Bulls', 'CBS', 'Sports', 'Phoenix', 'Suns', 'The'], ['Julius', 'Caesar', 'European', 'Super', 'League', 'ESL', 'the', 'Champions', 'House', 'of', 'Cards', 'UEFA', 'Apple', ':', 'The', 'War', 'for', 'Football', 'Sports', 'Juventus', 'Club', 'Association', 'Real', 'Madrid', 'Paris', 'Saint', '-', 'Germain', 'Premier', 'FIFA', 'Manchester', 'United', 'PSG', 'Court', 'Justice', 'ECJ', 'Union'], ['the', 'Atlas', 'Lions', 'State', 'AFC', 'Asian', 'Cup', 'Sports', 'Tubas', 'Israel', 'Defense', 'Forces', 'IDF', 'Reuters', 'Hamas', 'Al', '-', 'Shifa', 'Islamic', 'Jihad', 'Palestinian', 'Football', 'Association', 'AP', 'FIFA', 'Henley', '&', 'Partners', 'Visa', 'The', 'Confederation', 'West', 'Federation', 'WAFF', 'IOC', 'PFA', 'National', 'Olympic', 'Committee', '

In [13]:
cnn['org_with_brac'] = sort[0]
cnn['gpe_with_brac'] = sort[1]
cnn['person_with_brac'] = sort[2]
cnn['location_with_brac'] = sort[3]

cnn['spacy_org']=cnn['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['spacy_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_location'] = cnn['location_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac','location_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location
0,https://edition.cnn.com/2023/02/28/sport/los-a...,During the Lakers' 27-point comeback victory o...,Western Conference,"Los Angeles, August, Midseason, Dallas, James,...","James, Dennis Schröder, Jonas Valančiūnas, Rob...","Lakers, Dallas Mavericks, LeBron, Memphis Griz...","the,Dallas,Mavericks,NBA,New,Orleans,Pelicans,...","Jonas, Valančiūnas, Dallas","Lakers, LeBron, James, looks, Dennis, Schröder...","the, Memphis, Grizzlies"
1,https://edition.cnn.com/2023/02/16/sport/lesle...,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley, Paterson, Oscar, Erich, Maria, Remarqu...","Triathlete, Lesley, Paterson"
2,https://edition.cnn.com/2023/02/27/sport/damia...,(CNN) Damian Lillard set an NBA record in his ...,Western Conference,"Portland, Lillard","Damian Lillard, Lillard, Donovan Mitchell, Kla...","NBA, Houston Rockets, Chicago Bulls, Blazers, ...","Damian,Lillard,NBA,the,Portland,Trail,Blazers,...",Portland,"Lillard, Donovan, Mitchell, 's, Klay, Thompson...","the, west, coast, Bay, Area"
3,https://edition.cnn.com/2023/02/27/football/su...,(CNN) Betrayal has formed part of European pol...,,"European, Europe, Russian, American, Middle Ea...","Julius Caesar, Brute, Cards, Super League, Jef...","Betrayal, Roman Republic, European Super Leagu...","Julius,Caesar,European,Super,League,ESL,the,Ch...","the, Roman, Republic, Leicester, City, Manches...","Betrayal, Jeff, Zimbalist, Aleksander, Čeferin...","Europe, Middle, Eastern"
4,https://edition.cnn.com/2023/02/22/football/pa...,(CNN) The Palestinian team has never taken par...,"West Bank,Gaza City,East Jerusalem,Gaza Strip,...","Palestinian, Palestine, Portugal, African, Isr...","Arab, Gaza, East Jerusalem, Rami, Arab Emirate...","Qatar, Atlas Lions, Hamadeh, Mohammed Bin Zaye...","the,Atlas,Lions,State,AFC,Asian,Cup,Sports,Tub...","Portugal, Israel, the, West, Bank, Gaza, East,...","Rami, Hamadeh, Jordan, Ahmed, Daraghmeh, Mahmo...","the, Middle, East, Asia"


In [14]:
#cnn.to_csv('NER_data/cnn_NER.csv')

### Evaluation

In [15]:
path = r'../1.raw_data/golden_truth/golden_truth_cnn.csv' # use your path

cnn_gold = pd.read_csv(path)
cnn_gold = cnn_gold.loc[:,['url','organisation','gpe','location','person']]
print(cnn_gold.shape)
cnn_gold

(54, 5)


Unnamed: 0,url,organisation,gpe,location,person
0,https://edition.cnn.com/2023/02/26/sport/austr...,International Cricket Council (ICC),Cape Town,Newlands Cricket Ground,"Meg Lanning, Alyssa Healy, Beth Mooney, Nadine..."
1,https://edition.cnn.com/2023/02/27/football/su...,"Roman Republic, European Super League (ESL), U...","Europe, British, American, Middle Eastern",Switzerland,"Julius Caesar, Aleksander Ceferin, Andrea Agne..."
2,https://edition.cnn.com/2023/02/16/golf/tiger-...,"PGA Tour, Riviera Country Club, TGR Foundation...",California,"Riviera Country Club, California, The Masters ...","Tiger Woods, Max Homa, Keith Mitchell, Sam Sne..."
3,https://edition.cnn.com/2023/02/19/sport/tiger...,PGA Tour,"California, USA","Genesis Invitational, Riviera Country Club","Tiger Woods, Jon Rahm, Justin Thomas"
4,https://edition.cnn.com/2023/02/27/sport/jake-...,Love Island,Saudi Arabia,"Diriyah Arena, Riyadh","Jake Paul, Tommy Fury, Tyson Fury, Mike Tyson,..."
5,https://edition.cnn.com/2023/02/27/football/li...,"Paris Saint-Germain (PSG), Marseille, Opta, FE...","Paris, France.",Ligue 1 (French soccer league).,"Kylian Mbappé, Lionel Messi, Cristiano Ronaldo..."
6,https://edition.cnn.com/2023/02/26/sport/byron...,"Dallas Cowboys, Miami Dolphins, NFL",United States,Dallas,Byron Jones
7,https://edition.cnn.com/2023/02/25/tennis/barb...,WTA (Women's Tennis Association),"Dubai, Czech Republic, Poland",Dubai,"Serena Williams, Caroline Wozniacki, Ons Jabeu..."
8,https://edition.cnn.com/2023/02/27/tennis/nova...,"Association of Tennis Professionals (ATP), Wom...",United States,"Dubai, Australian Open, Indian Wells, Miami Op...","Novak Djokovic, Steffi Graf, Rafael Nadal"
9,https://edition.cnn.com/2023/02/26/sport/arman...,"All Star Perche, FFAthlétisme (French Athletic...","France, United States, Ukraine","Clermont-Ferrand, France, Oregon, Donetsk","Armand Duplantis, Renaud Lavillenie, Kurtis Ma..."


In [16]:
eval_cnn = cnn.set_index('url').join(cnn_gold.set_index('url'))
print(eval_cnn.shape)
eval_cnn.head()

(82, 13)


Unnamed: 0_level_0,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location,organisation,gpe,location,person
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://edition.cnn.com/2023/02/15/sport/jonathan-gannon-eagles-arizona-cardinals-nfl-spt-intl,(CNN) The Arizona Cardinals have hired Philade...,Cardinals,Gannon,"Philadelphia Eagles, Jonathan Gannon, Shane St...","Arizona Cardinals, Eagles, Kansas City Chiefs,...","The,Arizona,Cardinals,the,Kansas,City,Chiefs,I...",Eagles,"Philadelphia, Eagles, Jonathan, Gannon, Shane,...",,"Arizona Cardinals, Philadelphia Eagles, Indian...",Super Bowl,"Arizona, Philadelphia, Kansas City, Indianapol...","Jonathan Gannon, Shane Steichen"
https://edition.cnn.com/2023/02/15/sport/kansas-city-chiefs-super-bowl-parade-spt-intl,(CNN) It was party time in Missouri on Wednesd...,,"Missouri, Chiefs, City","Super Bowl, Kansas City, Patrick Mahomes, Maho...","MVP, NFL, Super Bowl, Union Station, Chiefs","Super,Bowl,Mahomes,MVP,NFL,Union,Station,Kansa...","Missouri, Kansas, City","Patrick, Mahomes",,"NFL, Kansas City Chiefs, Bud Light",Eagles,"Missouri, Kansas City, Union Station, Arrowhea...","Patrick Mahomes, Travis Kelce, Jamaal Charles,..."
https://edition.cnn.com/2023/02/16/golf/tiger-woods-genesis-invitational-pga-tour-spt-intl,"(CNN) In February 1992, a sprightly 16-year-ol...",,"California, Match, Record, Snead, Sound, American","Tiger Woods, Woods, Max Homa, Keith Mitchell, ...","PGA Tour, Riviera Country Club, Genesis Invita...","the,Riviera,Country,Club,Woods,Genesis,Invitat...",California,"Tiger, Woods, Max, Homa, Keith, Mitchell, LeBr...",,"PGA Tour, Riviera Country Club, TGR Foundation...",California,"Riviera Country Club, California, The Masters ...","Tiger Woods, Max Homa, Keith Mitchell, Sam Sne..."
https://edition.cnn.com/2023/02/16/sport/lesley-paterson-all-quiet-on-the-western-front-film-triathlon-spt-intl,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley, Paterson, Oscar, Erich, Maria, Remarqu...","Triathlete, Lesley, Paterson",Netflix,"France, Great Britain","Scotland, Germany, Ukraine, Costa Rica, Birmin...","Lesley Paterson, Paul Bäumer, Ian Stokell"
https://edition.cnn.com/2023/02/19/sport/tiger-woods-genesis-invitational-third-round-spt-intl,(CNN) It was an impressive and eventful third ...,,,"Tiger Woods, Woods","Genesis Invitational, Riviera Country Club, PG...","Woods,the,Riviera,Country,Club,PGA,Tour,Genesi...",,,,PGA Tour,"California, USA","Genesis Invitational, Riviera Country Club","Tiger Woods, Jon Rahm, Justin Thomas"


In [24]:
#eval_cnn2 = eval_cnn.dropna(Subset['organisation', 'gpe', 'location', 'person'], how = 'all')
eval_cnn2 = eval_cnn[ (eval_cnn['organisation'].notna()) & (eval_cnn['gpe'].notna()) & (eval_cnn['location'].notna()) & (eval_cnn['person'].notna())]
print(eval_cnn2.shape)
eval_cnn2.head()

(40, 13)


Unnamed: 0_level_0,text,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location,organisation,gpe,location,person
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://edition.cnn.com/2023/02/15/sport/jonathan-gannon-eagles-arizona-cardinals-nfl-spt-intl,(CNN) The Arizona Cardinals have hired Philade...,Cardinals,Gannon,"Philadelphia Eagles, Jonathan Gannon, Shane St...","Arizona Cardinals, Eagles, Kansas City Chiefs,...","The,Arizona,Cardinals,the,Kansas,City,Chiefs,I...",Eagles,"Philadelphia, Eagles, Jonathan, Gannon, Shane,...",,"Arizona Cardinals, Philadelphia Eagles, Indian...",Super Bowl,"Arizona, Philadelphia, Kansas City, Indianapol...","Jonathan Gannon, Shane Steichen"
https://edition.cnn.com/2023/02/15/sport/kansas-city-chiefs-super-bowl-parade-spt-intl,(CNN) It was party time in Missouri on Wednesd...,,"Missouri, Chiefs, City","Super Bowl, Kansas City, Patrick Mahomes, Maho...","MVP, NFL, Super Bowl, Union Station, Chiefs","Super,Bowl,Mahomes,MVP,NFL,Union,Station,Kansa...","Missouri, Kansas, City","Patrick, Mahomes",,"NFL, Kansas City Chiefs, Bud Light",Eagles,"Missouri, Kansas City, Union Station, Arrowhea...","Patrick Mahomes, Travis Kelce, Jamaal Charles,..."
https://edition.cnn.com/2023/02/16/golf/tiger-woods-genesis-invitational-pga-tour-spt-intl,"(CNN) In February 1992, a sprightly 16-year-ol...",,"California, Match, Record, Snead, Sound, American","Tiger Woods, Woods, Max Homa, Keith Mitchell, ...","PGA Tour, Riviera Country Club, Genesis Invita...","the,Riviera,Country,Club,Woods,Genesis,Invitat...",California,"Tiger, Woods, Max, Homa, Keith, Mitchell, LeBr...",,"PGA Tour, Riviera Country Club, TGR Foundation...",California,"Riviera Country Club, California, The Masters ...","Tiger Woods, Max Homa, Keith Mitchell, Sam Sne..."
https://edition.cnn.com/2023/02/16/sport/lesley-paterson-all-quiet-on-the-western-front-film-triathlon-spt-intl,(CNN) It's while running in the moors and hill...,Western Front,"Scotland, Germany, Unbeknownst, Bäumer, Russia...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Kammerer, Armistice...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley, Paterson, Oscar, Erich, Maria, Remarqu...","Triathlete, Lesley, Paterson",Netflix,"France, Great Britain","Scotland, Germany, Ukraine, Costa Rica, Birmin...","Lesley Paterson, Paul Bäumer, Ian Stokell"
https://edition.cnn.com/2023/02/19/sport/tiger-woods-genesis-invitational-third-round-spt-intl,(CNN) It was an impressive and eventful third ...,,,"Tiger Woods, Woods","Genesis Invitational, Riviera Country Club, PG...","Woods,the,Riviera,Country,Club,PGA,Tour,Genesi...",,,,PGA Tour,"California, USA","Genesis Invitational, Riviera Country Club","Tiger Woods, Jon Rahm, Justin Thomas"


#### NLTK

Location

In [60]:
true_loc = eval_cnn2["location"].values.tolist()

new_true_loc = []
for x in true_loc:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_true_loc.append(i)

print(new_true_loc)

['Arizona', ' Philadelphia', ' Kansas City', ' Indianapolis', ' NFL draft.', 'Missouri', ' Kansas City', ' Union Station', ' Arrowhead Stadium', ' Philadelphia', 'Riviera Country Club', ' California', ' The Masters at Augusta', 'Scotland', ' Germany', ' Ukraine', ' Costa Rica', ' Birmingham', ' Alabama', ' California', 'Genesis Invitational', ' Riviera Country Club', 'Southern California', 'Qatar', ' Portugal', ' Israel', ' West Bank', ' Gaza', ' East Jerusalem', ' Abu Dhabi', ' Nablus', ' Tubas', ' Al-Shifa hospital', ' Mongolia', ' Yemen', ' Singapore', ' United States', ' Japan', ' London', 'Not specified', 'New Zealand', ' Wales', ' France', ' Spain', ' US', 'United States', ' Indian Wells', ' Miami', ' Belgrade', ' New York', ' Australia', ' Dubai', 'Ancient Greece', ' Paris', ' Tokyo', ' New Delhi', ' Tashkent', ' UK', ' US', ' Russia', ' Belarus', ' Ukraine', ' Kyiv', 'Wembley Stadium', ' London', ' Los Angeles', ' Australia', ' New Zealand', 'Ukraine', ' Irpin', ' Thailand', ' 

In [61]:
nltk_loc = eval_cnn2['nltk_location'].values.tolist()
print(nltk_loc)

new_nltk_loc = []
for x in nltk_loc:
    if len(x)!=0:
        txt = x.split(",")
        for i in txt:
            new_nltk_loc.append(i)

print(new_nltk_loc)

['Cardinals', '', '', 'Western Front', '', 'Southern California', 'West Bank,Gaza City,East Jerusalem,Gaza Strip,Gaza,West Asian', 'South Carolina', '', '', '', 'Southern California', '', '', '', '', 'South Africa', '', 'Southern Oregon', '', '', '', '', '', '', '', '', '', '', '', '', 'Western Conference', '', '', '', '', '', '', '', '']
['Cardinals', 'Western Front', 'Southern California', 'West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West Asian', 'South Carolina', 'Southern California', 'South Africa', 'Southern Oregon', 'Western Conference']


In [62]:
#nltk precision, recall, f1 score
correct = 0

for x in new_nltk_loc:
    if x in new_true_loc:
        correct+=1

nltk_loc_recall = correct/len(new_true_loc)
nltk_loc_precision = correct/len(new_nltk_loc)
nltk_loc_f1_score = 2*((nltk_loc_precision * nltk_loc_recall)/ (nltk_loc_precision + nltk_loc_recall))
    
print("Recall = ", nltk_loc_recall)
print()
print("Precision = ", nltk_loc_precision)
print()
print("f1 score = ", nltk_loc_f1_score)

Recall =  0.012121212121212121

Precision =  0.14285714285714285

f1 score =  0.0223463687150838


GPE

In [63]:
true_gpe = eval_cnn2["gpe"].values.tolist()

new_true_gpe = []
for x in true_gpe:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_true_gpe.append(i)

print(new_true_gpe)

['Super Bowl', 'Eagles', ' California', 'France', ' Great Britain', 'California', ' USA', 'None mentioned.', 'Palestinian', ' Morocco', ' Africa', ' Arab', ' United Arab Emirates', ' Middle East', ' FIFA', 'Not specified', 'ew Zealand', ' France', ' Wales', ' Spain', ' US', 'Serbian', 'Russia', ' US', ' Cuba', ' Dutch', 'US', ' England', ' Southern California', ' Philippines', ' Peru', ' Germany', 'Russia', ' Britain', ' United States', ' United Nations High Commissioner for Human Rights (OHCHR)', 'Dubai', ' Czech Republic', ' Poland', 'Saudi Arabia', ' United States.', 'France', ' United States', ' Ukraine', 'Cape Town', 'United States', 'Southern Oregon', ' Virginia', ' Turkey', 'eleven Turkish provinces', 'Paris', ' France.', 'Europe', ' British', ' American', ' Middle Eastern', 'America', ' Georgia', 'UN', 'Saudi Arabia', 'United States', 'Paris', ' France', ' Morocco', 'Argentina', ' USA', ' England', ' Lyon', 'Argentina', 'Turkey', 'US', ' Alaska', ' Hawaii', 'Western Conference'

In [64]:
nltk_gpe = eval_cnn2['nltk_gpe'].values.tolist()

new_nltk_gpe = []
for x in nltk_gpe:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_nltk_gpe.append(i)

print(new_nltk_gpe)

['Gannon', 'Missouri', ' Chiefs', ' City', 'California', ' Match', ' Record', ' Snead', ' Sound', ' American', 'Scotland', ' Germany', ' Unbeknownst', ' Bäumer', ' Russian', ' Ukraine', ' German', ' French', ' Costa Rica', ' Birmingham', ' Alabama', ' Scottish', ' California', ' English', 'Palestinian', ' Palestine', ' Portugal', ' African', ' Israeli-Palestinian', ' Middle East', ' Israel', ' Palestinians', ' Palestianian', ' Jordan', ' Abu Dhabi', ' United', ' Arabic', ' Israeli', ' Tubas', ' Nablus', ' Hamas', ' Yemen', ' Jerusalem', ' Japan', ' United States', ' Singapore', ' Canaan', ' Tunisian', ' Egyptian', ' Entry', ' Asian', ' Egypt', ' Qatar', ' Morocco', ' Mongolia', ' Uzbekistan', ' Saudi Arabia', ' Spain', ' Spanish', ' Hamed', ' Ramallah', ' Portuguese', ' North Korea', ' Jiangyin', ' China', ' Dabbagh', ' European', ' Arouca', ' Belgian', ' Infrastructure', ' Bethlehem', ' Tel Aviv', ' Asia', 'Tiger', ' France', ' Bangkok', ' North', ' South Korea', ' Barbed', ' Uummanna

In [65]:
#nltk precision, recall, f1 score
correct = 0

for x in new_nltk_gpe:
    if x in new_true_gpe:
        correct+=1

nltk_gpe_recall = correct/len(new_true_gpe)
nltk_gpe_precision = correct/len(new_nltk_gpe)
nltk_gpe_f1_score = 2*((nltk_gpe_precision * nltk_gpe_recall)/ (nltk_gpe_precision + nltk_gpe_recall))
    
print("Recall = ", nltk_gpe_recall)
print()
print("Precision = ", nltk_gpe_precision)
print()
print("f1 score = ", nltk_gpe_f1_score)

Recall =  0.6086956521739131

Precision =  0.19243986254295534

f1 score =  0.2924281984334204


Person

In [66]:
true_per = eval_cnn2["person"].values.tolist()

new_true_per = []
for x in true_per:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_true_per.append(i)

print(new_true_per)

['Jonathan Gannon', ' Shane Steichen', 'Patrick Mahomes', ' Travis Kelce', ' Jamaal Charles', ' Donna Kelce', ' Derrick Nnadi', ' Trey Smith', ' Orlando Brown Jr.', ' JuJu Smith-Schuster', ' James Bradberry', ' AJ Brown', 'Tiger Woods', ' Max Homa', ' Keith Mitchell', ' Sam Snead', ' LeBron James', ' Kareem Abdul-Jabbar', ' Rory McIlroy', ' Justin Thomas', ' Scottie Scheffler', 'Lesley Paterson', ' Paul Bäumer', ' Ian Stokell', 'Tiger Woods', ' Jon Rahm', ' Justin Thomas', 'Tiger Woods', ' Jon Rahm', ' Scottie Scheffler', ' Max Homa', ' Patrick Cantlay', 'Rami Hamadeh', ' Ahmed Daraghmeh', ' Mahmoud Sarsak', ' Jibril Rajoub', 'Payne Stewart', ' Tiger Woods', ' Rickie Fowler', ' Shingo Katayama', ' John Daly', 'Campbell Johnstone', ' Ruby Tui', ' Aaron Smith', ' Phil Rees', ' Nigel Owens', ' Gareth Thomas', ' Dan Palmer', ' Grant Robertson', 'Novak Djokovic', ' Rafael Nadal', ' Daniil Medvedev', ' Roger Federer', ' Steffi Graf', 'Muhammad Ali', ' George Foreman', ' Joe Frazier', ' Nicol

In [67]:
nltk_per = eval_cnn2['nltk_person'].values.tolist()

new_nltk_per = []
for x in nltk_per:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_nltk_per.append(i)
    else:
        new_nltk_per.append(x)

print(new_nltk_per)

['Philadelphia Eagles', ' Jonathan Gannon', ' Shane Steichen', ' Gannon', ' Super Bowl', 'Super Bowl', ' Kansas City', ' Patrick Mahomes', ' Mahomes', ' Kansas', 'Tiger Woods', ' Woods', ' Max Homa', ' Keith Mitchell', ' Sam Snead', ' Kareem Abdul-Jabbar', ' James', ' Kareem', ' Sam', ' Snead', ' Rory McIlroy', ' Justin Thomas', ' Thomas', ' Scottie Scheffler', ' Scheffler', ' Tiger', ' Joaquin Niemann', ' Niemann', 'Lesley Paterson', ' Paterson', ' Erich Maria Remarque', ' Paul Bäumer', ' Felix', ' Ian Stokell', ' Netflix', ' Triathlete', ' Great Britain', ' Sam Mendes', ' Edward Berger', ' Malte Grunert', 'Tiger Woods', ' Woods', 'Tiger Woods', ' Jon Rahm', ' Woods', 'Arab', ' Gaza', ' East Jerusalem', ' Rami', ' Arab Emirates', ' Rami Hamadeh', ' Ahmed Daraghmeh', ' Daraghmeh', ' Mahmoud Sarsak', ' Sarsak', ' Mongolia', ' Hamadeh', ' Jabal', ' Bnei Sakhnin', ' Jibril Rajoub', ' Henley', ' Makram', ' Makram Daboub', ' Daboub', ' Arab Cup', ' Gianni Infantino', ' Ismail El Haddad', ' 

In [68]:
#nltk precision, recall, f1 score
correct = 0

for x in new_nltk_per:
    if x in new_true_per:
        correct+=1

nltk_per_recall = correct/len(new_true_per)
nltk_per_precision = correct/len(new_nltk_per)
nltk_per_f1_score = 2*((nltk_per_precision * nltk_per_recall)/ (nltk_per_precision + nltk_per_recall))
    
print("Recall = ", nltk_per_recall)
print()
print("Precision = ", nltk_per_precision)
print()
print("f1 score = ", nltk_per_f1_score)

Recall =  0.551219512195122

Precision =  0.28463476070528965

f1 score =  0.37541528239202654


Org

In [70]:
true_org = eval_cnn2["organisation"].values.tolist()

new_true_org = []
for x in true_org:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_true_org.append(i)

print(new_true_org)

['Arizona Cardinals', ' Philadelphia Eagles', ' Indianapolis Colts', ' NFL', 'NFL', ' Kansas City Chiefs', ' Bud Light', 'PGA Tour', ' Riviera Country Club', ' TGR Foundation', ' Sports Illustrated/Getty Images', ' PNC Championship', ' TNT', ' Augusta', 'Netflix', 'PGA Tour', 'PGA Tour', ' CBS', 'Hamas', ' Israel Defense Forces (IDF)', ' Islamic Jihad group', ' Palestinian Football Association (PFA)', 'PGA Tour', ' Team USA', 'All Blacks', ' Canterbury', ' Crusaders', ' Ospreys', ' Biarritz', 'Not mentioned', 'International Boxing Association (IBA)', ' International Olympic Committee (IOC)', ' USA Boxing', 'Angel City FC', ' Harvard-Westlake', " US Women's National Team", ' Total Futbol Academy of MLS Next', ' Nike', ' NWSL', 'Bellator MMA', "WTA (Women's Tennis Association)", 'Manchester United', ' Newcastle United', ' Sky Sports', ' Saudi Arabian Public Investment Fund (PIF)', ' Real Madrid.', 'All Star Perche', ' FFAthlétisme (French Athletics Federation)', 'International Cricket Co

In [71]:
nltk_orgz = eval_cnn2['nltk_org'].values.tolist()

new_nltk_org = []
for x in nltk_orgz:
    if len(x)>1:
        txt = x.split(",")
        for i in txt:
            new_nltk_org.append(i)

print(new_nltk_org)

['Arizona Cardinals', ' Eagles', ' Kansas City Chiefs', ' Super Bowl', 'MVP', ' NFL', ' Super Bowl', ' Union Station', ' Chiefs', 'PGA Tour', ' Riviera Country Club', ' Genesis Invitational', ' TGR Foundation', ' Open', ' ZOZO', ' PNC', ' TNT', ' Masters', ' Augusta', ' PGA', ' LeBron James', ' NBA', ' LeBron', ' McIlroy', ' Phoenix Open', ' LIV Golf Tour', ' McIroy', 'First', ' Scottish Highlands', ' Kammerer', ' Armistice Talks', ' BAFTA', ' Oscars', ' XTERRA', 'Genesis Invitational', ' Riviera Country Club', ' PGA Tour', 'Genesis Invitational', ' PGA Tour', ' Open', ' CBS', 'Qatar', ' Atlas Lions', ' Hamadeh', ' Mohammed Bin Zayed Stadium', ' AFC Asian', ' CNN Sports', ' IDF', ' Reuters', ' Islamic Jihad', ' Philippines', ' IDs', ' PFA', ' AP', ' FIFA', ' Partners', ' Daboub', ' AFC', ' WAFF', ' FA Jibril', ' IOC', ' General', ' Shaheen', ' National', ' IFA', ' FIFA Forward', ' Futsal Hall', ' Faisal', ' International', ' Palestine', ' PFA Hilal', ' UEFA', 'PGA Tour', ' Kantarat Gol

In [72]:
#nltk precision, recall, f1 score
correct = 0

for x in new_nltk_org:
    if x in new_true_org:
        correct+=1

nltk_org_recall = correct/len(new_true_org)
nltk_org_precision = correct/len(new_nltk_org)
nltk_org_f1_score = 2*((nltk_org_precision * nltk_org_recall)/ (nltk_org_precision + nltk_org_recall))
    
print("Recall = ", nltk_org_recall)
print()
print("Precision = ", nltk_org_precision)
print()
print("f1 score = ", nltk_org_f1_score)

Recall =  0.3219178082191781

Precision =  0.1501597444089457

f1 score =  0.20479302832244012


Average recall, precision & f1 score

In [73]:
avg_recall = (nltk_org_recall + nltk_per_recall + nltk_gpe_recall + nltk_loc_recall)/4
avg_precision = (nltk_org_precision + nltk_per_precision + nltk_gpe_precision + nltk_loc_precision)/4
avg_f1_score = (nltk_org_f1_score + nltk_per_f1_score + nltk_gpe_f1_score + nltk_loc_f1_score)/4

print("Recall = ", avg_recall)
print()
print("Precision = ", avg_precision)
print()
print("f1 score = ", avg_f1_score)

Recall =  0.37348854617735633

Precision =  0.1925228776285834

f1 score =  0.2237457194657427


## cna news

In [None]:
# Merging cna news into 1 dataframe
path = r'./2.text_preprocessing/cna/cna_FINAL.csv' # use your path

cna = pd.read_csv(path)
cna = cna.loc[:,['url','content']]
print(cna.shape)
cna

NLTK extraction

In [16]:
cna_content = []

ner_info_extraction(cna['content'],cna_content)
    
print(cna_content)

[[[('ORGANIZATION', 'AC Milan'), ('ORGANIZATION', 'Champions League'), ('ORGANIZATION', 'Tottenham Hotspur')], [('ORGANIZATION', 'Italians'), ('ORGANIZATION', 'San Siro'), ('GPE', 'Tottenham')], [('PERSON', 'Tottenham'), ('PERSON', 'Antonio Conte'), ('PERSON', 'Cristian Romero')], [('PERSON', 'Milan')], [('PERSON', 'Stefano Pioli'), ('PERSON', 'Milan'), ('PERSON', 'Tottenham'), ('ORGANIZATION', 'Sheffield United'), ('PERSON', 'Wolverhampton Wanderers')], [('PERSON', 'Milan'), ('GPE', 'European'), ('PERSON', 'Pioli')], [], [], [('PERSON', 'Pioli')], [], [('PERSON', 'Tottenham')], [], [('GPE', 'Tottenham'), ('PERSON', 'Harry Kane'), ('PERSON', 'Son Heung-min'), ('PERSON', 'Dejan Kulusevski'), ('PERSON', 'Milan'), ('PERSON', 'Junior Messias')], [('ORGANIZATION', 'Italians'), ('PERSON', 'Brahim Diaz'), ('PERSON', 'Fraser Forster')], [('PERSON', 'Tottenham'), ('PERSON', 'Milan'), ('PERSON', 'Mike Maignan')], [('PERSON', 'Tottenham'), ('PERSON', 'Rafael Leao')], [('PERSON', 'Kane'), ('PERSON

In [17]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []
org_with_brac = []

ner_sorting(cna_content, location_with_brac, gpe_with_brac, person_with_brac, org_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

print(org_with_brac)

[[], [], [], [], [], [], [], ['Northern Irishman'], [], ['Southern California'], [], [], [], [], [], ['East Brabant'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United'], [], ['South Africa', 'Caribbean'], [], [], [], ['West'], [], [], [], [], [], [], [], [], [], ['West', 'West Indies'], [], [], [], [], [], [], [], [], ['South'], [], [], [], [], [], ['Southeast'], ['South Korean'], [], [], [], [], ['West Ham United'], [], [], [], [], [], [], [], [], [], [], ['Sebastian Munoz'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United', 'West Bromwich'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Germany', 'West German'], ['West'], ['West Ham'], [], ['Western Force'], [], [], [], [], [], [], ['West Ham'], [], [], []]

[['Tottenham', 'European'], ['MUNICH', 'Germans', 'Paris', 'Bayern', 'Europe', 'Messi', 'Vitinha', 'Dutch'], ['Brighton', 'United', 'Chelsea', 'Nor

In [18]:
cna['location_with_brac'] = location_with_brac
cna['gpe_with_brac'] = gpe_with_brac
cna['person_with_brac'] = person_with_brac
cna['org_with_brac'] = org_with_brac

cna['nltk_location']=cna['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['nltk_gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['nltk_person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['nltk_org'] = cna['org_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'org_with_brac'],axis = 1)

cna.head()

Unnamed: 0,url,content,nltk_location,nltk_gpe,nltk_person,nltk_org
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...,,"Tottenham, European","Tottenham, Antonio Conte, Cristian Romero, Mil...","AC Milan, Champions League, Tottenham Hotspur,..."
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH: Bayern Munich forward Eric-Maxim Choup...,,"MUNICH, Germans, Paris, Bayern, Europe, Messi,...","Bayern Munich, Serge Gnabry, Paris St Germain,...","Champions League, PSG, Leon Goretzka, VfB Stut..."
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...,,"Brighton, United, Chelsea, Norway, Blues, Women","Chelsea, Guro Reiten, Johanna Rytting Kaneryd,...","Super League, Jess Carter, Swede"
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...,,"London, Europe, European, Milan, Serie","Milan, Stefano Pioli, Tottenham Hotspur, Pioli...","Champions League, AS Roma"
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...,,"Tottenham, London, Italian, Italy","Antonio Conte, Conte, Juventus, Chelsea, Totte...","Champions League, Tottenham Hotspur, AC Milan,..."


spacy NER extraction

In [19]:
cont = cna['content'].apply(nlp)

cna_list = spacy_ner(cont)

print(cna_list)

[[['LONDON', 'GPE'], ['AC', 'ORG'], ['Milan', 'ORG'], ['London', 'GPE'], ['Tottenham', 'GPE'], ['Tottenham', 'GPE'], ['Antonio', 'PERSON'], ['Conte', 'PERSON'], ['Romero', 'PERSON'], ['Milan', 'GPE'], ['Stefano', 'PERSON'], ['Pioli', 'PERSON'], ["'s", 'PERSON'], ['Milan', 'GPE'], ['Tottenham', 'GPE'], ['Sheffield', 'GPE'], ['United', 'GPE'], ['Wolverhampton', 'PERSON'], ['Wanderers', 'PERSON'], ['Milan', 'GPE'], ['Pioli', 'GPE'], ['Tottenham', 'ORG'], ['Pioli', 'GPE'], ['GEAR', 'ORG'], ['Tottenham', 'GPE'], ['Tottenham', 'ORG'], ['Harry', 'PERSON'], ['Kane', 'PERSON'], ['Son', 'PERSON'], ['Heung', 'PERSON'], ['-', 'PERSON'], ['min', 'PERSON'], ['Dejan', 'PERSON'], ['Kulusevski', 'PERSON'], ['Milan', 'GPE'], ['Brahim', 'PERSON'], ['Diaz', 'PERSON'], ['Fraser', 'GPE'], ['Forster', 'GPE'], ['Tottenham', 'GPE'], ['Milan', 'GPE'], ['Mike', 'PERSON'], ['Maignan', 'PERSON'], ['Pierre', 'PERSON'], ['-', 'PERSON'], ['Emile', 'PERSON'], ['Hojbjerg', 'PERSON'], ['Tottenham', 'ORG'], ['Rafael', 'P

In [20]:
sort = spacy_sorting(cna_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])
print()

print(sort[3])

[['AC', 'Milan', 'Tottenham', 'GEAR', 'Emerson'], ['Choupo', '-', 'Moting', 'Bayern', 'PSG', 'Matthijs', 'de', 'Ligt', 'Mbappe'], ['Chelsea', 'Brighton', '&', 'Hove', 'Albion', 'Women', "'s", 'Super', 'League', 'Arsenal', 'the', 'Cup'], ['AC', 'Milan', 'the', 'Champions', 'League', 'Serie', 'A', 'Chelsea'], ['Champions', 'League', 'AC', 'Milan', 'Juventus', 'Inter', 'Chelsea', 'the', 'Premier', 'Tottenham', 'Europa', 'Conference'], ['the', 'Champions', 'League', 'Bayern', 'Mbappe'], [], ['the', 'PGA', 'Tour', 'a', 'Hall', 'of', 'Fame', 'Sawgrass', 'Player', 'TRIUMPH', '\r\n\r\n', 'McIlroy', "'s", 'Scheffler', 'LIV', 'Golf', 'top-10', 'TPC', 'Anirban', 'Lahiri'], ['Woods'], ['Indian', 'Wells', 'Grand', 'Slam'], ['Commonwealth', 'Sport', 'Canada', 'CSC', "'s", 'the', 'Alberta', '2030', 'Games', 'Corporation', 'Federation'], ['WELLS', 'Netflix'], ['Juventus', "'", 'Europa', 'League', 'last-16', 'Pogba', 'Manchester', 'United', 'Serie', 'A.'], ['Mercedes', 'Bahrain', 'Grand', 'Prix', 'BBC'

In [21]:
cna['org_with_brac'] = sort[0]
cna['gpe_with_brac'] = sort[1]
cna['person_with_brac'] = sort[2]
cna['loc_with_brac'] = sort[3]

cna['spacy_org']=cna['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['spacy_gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['spacy_person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['spacy_location'] = cna['loc_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac', 'loc_with_brac'],axis = 1)

cna.head()

Unnamed: 0,url,content,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...,,"Tottenham, European","Tottenham, Antonio Conte, Cristian Romero, Mil...","AC Milan, Champions League, Tottenham Hotspur,...","AC,Milan,Tottenham,GEAR,Emerson","LONDON, London, Tottenham, Milan, Sheffield, U...","Antonio, Conte, Romero, Stefano, Pioli, 's, Wo...",
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH: Bayern Munich forward Eric-Maxim Choup...,,"MUNICH, Germans, Paris, Bayern, Europe, Messi,...","Bayern Munich, Serge Gnabry, Paris St Germain,...","Champions League, PSG, Leon Goretzka, VfB Stut...","Choupo,-,Moting,Bayern,PSG,Matthijs,de,Ligt,Mb...","Paris, Neymar, Sommer","Bayern, Munich, Eric, -, Maxim, Choupo, Moting...",Europe
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...,,"Brighton, United, Chelsea, Norway, Blues, Women","Chelsea, Guro Reiten, Johanna Rytting Kaneryd,...","Super League, Jess Carter, Swede","Chelsea,Brighton,&,Hove,Albion,Women,'s,Super,...","LONDON, Manchester, United, Norway, Brighton, ...","Johanna, Rytting, Kaneryd, Sam, Kerr, Jess, Ca...",
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...,,"London, Europe, European, Milan, Serie","Milan, Stefano Pioli, Tottenham Hotspur, Pioli...","Champions League, AS Roma","AC,Milan,the,Champions,League,Serie,A,Chelsea","LONDON, Milan, London, Pioli, Tottenham","Stefano, Pioli, Bayern, Munich, Benfica",Europe
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...,,"Tottenham, London, Italian, Italy","Antonio Conte, Conte, Juventus, Chelsea, Totte...","Champions League, Tottenham Hotspur, AC Milan,...","Champions,League,AC,Milan,Juventus,Inter,Chels...","LONDON, London, Tottenham, Italy, Amazon, Prim...","Antonio, Conte, 's, Wolverhampton, Wanderers",


In [22]:
cna.to_csv('NER_data/cna_NER.csv')