In [1]:
import pandas as pd

import re

import nltk
from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

### Definitions

In [2]:
def extract_ne_from_tree ( tree ):
    result = []
    for s in tree.subtrees():
        label = s.label()
        if (label == 'PERSON' or label == 'GPE' or label == 'LOCATION' or label == 'ORGANIZATION'):
            leaves = s.leaves()
            ne = ''
            for l in leaves:
                ne = ne + ' ' + l[0]
            result.append((label, ne[1:]))
    return result

In [3]:
def ner_info_extraction(content ,content_list):
    for x in content:
        res_sents = nltk.sent_tokenize(x)
        res_tokens = [word_tokenize(sents) for sents in res_sents]
        tagged_res = [nltk.pos_tag(tokens) for tokens in res_tokens]
        res_tree = [nltk.ne_chunk(elem) for elem in tagged_res]
        res_list = [extract_ne_from_tree(tree) for tree in res_tree]
        content_list+=[(res_list)]

In [4]:
def ner_sorting(content ,location, gpe, person, org):
    for row in content:
        check_location = []
        check_gpe = []
        check_person = []
        check_org = []
        for sent in row:
            if len(sent)!=0:
                for tup in sent:
                    if tup[0] == "LOCATION":
                        if tup[1] not in check_location:
                            check_location.append(tup[1])
                    if tup[0] == "GPE":
                        if tup[1] not in check_gpe:
                            check_gpe.append(tup[1])
                    if tup[0] == "PERSON":
                        if tup[1] not in check_person:
                            check_person.append(tup[1])
                    if tup[0] == "ORGANIZATION":
                        if tup[1] not in check_org:
                            check_org.append(tup[1])
        location.append(check_location)
        gpe.append(check_gpe)
        person.append(check_person)
        org.append(check_org)

In [5]:
def spacy_ner(content):
    cnn_list = []
    for doc in content:
        temp = []
        for token in doc:
            ner_tags = [token.text, token.ent_type_]
            if ner_tags[1]=="ORG" or ner_tags[1]=="PERSON" or ner_tags[1]=="GPE" or ner_tags[1]=="LOC":
                temp.append(ner_tags)
        cnn_list.append(temp)
    return cnn_list

In [6]:
def spacy_sorting(content):
    org = []
    gpe = []
    person = []
    location = []
    for row in content:
        check_org = []
        check_gpe = []
        check_person = []
        check_loc = []
        if len(row)!=0:
            for tup in row:
                if tup[1] == "ORG":
                    if tup[0] not in check_org:
                        check_org.append(tup[0])
                if tup[1] == "GPE":
                    if tup[0] not in check_gpe:
                        check_gpe.append(tup[0])
                if tup[1] == "PERSON":
                    if tup[0] not in check_person:
                        check_person.append(tup[0])
                if tup[1] == "LOC":
                    if tup[0] not in check_loc:
                        check_loc.append(tup[0])
        org.append(check_org)
        gpe.append(check_gpe)
        person.append(check_person)
        location.append(check_loc)
    return [org, gpe, person, location]

### cnn news

In [7]:
import glob
import os

path = r'./text_preprocessing/cnn_FINAL.csv' # use your path

cnn = pd.read_csv(path)
cnn = cnn.loc[:,['url','content']]
print(cnn.shape)
cnn

(85, 2)


Unnamed: 0,url,content
0,https://edition.cnn.com/2023/02/28/sport/los-a...,The Los Angeles Lakers‚Äô rollercoaster season ...
1,https://edition.cnn.com/2023/02/16/sport/lesle...,It‚Äôs while running in the moors and hills of ...
2,https://edition.cnn.com/2023/02/27/sport/damia...,Damian Lillard set an NBA record in his monst...
3,https://edition.cnn.com/2023/02/27/football/su...,Betrayal has formed part of European politics...
4,https://edition.cnn.com/2023/02/22/football/pa...,The Palestinian team has never taken part in ...
...,...,...
80,https://edition.cnn.com/2023/03/08/tennis/nova...,Time might have run out for Novak Djokovic to...
81,https://edition.cnn.com/2023/03/08/sport/ja-mo...,"Facing three injuries, the Memphis Grizzlies ..."
82,https://edition.cnn.com/2023/03/07/football/li...,UEFA plans to refund all Liverpool supporters...
83,https://edition.cnn.com/2023/03/08/football/me...,More than a year after US Soccer and the Unit...


NLTK extraction

In [8]:
cnn_content = []

ner_info_extraction(cnn['content'],cnn_content)
    
print(cnn_content)

[[[('GPE', 'Los Angeles'), ('ORGANIZATION', 'Lakers'), ('ORGANIZATION', 'Dallas Mavericks'), ('ORGANIZATION', 'LeBron')], [('ORGANIZATION', 'Lakers'), ('PERSON', 'James'), ('ORGANIZATION', 'Memphis Grizzlies')], [('ORGANIZATION', 'Lakers')], [('ORGANIZATION', 'NBA'), ('PERSON', 'Dennis Schr√∂der'), ('GPE', 'Los Angeles'), ('PERSON', 'Jonas Valanƒçi≈´nas'), ('ORGANIZATION', 'New Orleans Pelicans')], [('PERSON', 'Kevork'), ('ORGANIZATION', 'NBA'), ('PERSON', 'Rob Pelinka'), ('PERSON', 'James'), ('PERSON', 'Anthony Davis'), ('ORGANIZATION', 'Russell'), ('PERSON', 'Patrick Beverley'), ('GPE', 'August'), ('ORGANIZATION', 'Lakers'), ('PERSON', 'James'), ('PERSON', 'Pelinka'), ('ORGANIZATION', 'NBA'), ('ORGANIZATION', 'Lakers')], [('LOCATION', 'Western'), ('PERSON', 'Darvin Ham'), ('ORGANIZATION', 'Lakers'), ('ORGANIZATION', 'Lakers'), ('PERSON', 'Malik Beasley'), ('PERSON', 'D'), ('PERSON', 'Angelo Russell'), ('PERSON', 'Jarred Vanderbilt'), ('PERSON', 'Mo Bamba'), ('PERSON', 'Davon Reed'), 

In [9]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []
org_with_brac = []

ner_sorting(cnn_content, location_with_brac, gpe_with_brac, person_with_brac, org_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

print(org_with_brac)

[['Western', 'Western Conference'], ['Western Front'], ['Western Conference'], [], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West', 'West Asian'], [], [], [], [], ['Southern'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['South Africa', 'South African'], [], [], [], [], ['Southern Oregon'], [], ['Cardinals'], ['Southern California'], [], [], [], [], [], [], ['West Virginia'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['South Korean'], [], [], [], [], ['Caribbean'], [], ['Western Conference'], [], [], []]

[['Los Angeles', 'August', 'Dallas', 'Portland'], ['Scotland', 'Germany', 'B√§umer', 'Russian', 'Ukraine', 'German', 'French', 'Costa Rica', 'Birmingham', 'Alabama', 'Scottish', 'California'], ['Damian', 'Portland', 'Lillard'], ['Betrayal', 'European', 'Europe', 'Russian', 'American', 'Middle Eastern', 'British', 'Juventus', 'Turin.Barcelona', 'Justice']

In [10]:
cnn['location_with_brac'] = location_with_brac
cnn['gpe_with_brac'] = gpe_with_brac
cnn['person_with_brac'] = person_with_brac
cnn['org_with_brac'] = org_with_brac

cnn['nltk_location']=cnn['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['nltk_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_org'] = cnn['org_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'org_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,content,nltk_location,nltk_gpe,nltk_person,nltk_org
0,https://edition.cnn.com/2023/02/28/sport/los-a...,The Los Angeles Lakers‚Äô rollercoaster season ...,"Western,Western Conference","Los Angeles, August, Dallas, Portland","James, Dennis Schr√∂der, Jonas Valanƒçi≈´nas, Kev...","Lakers, Dallas Mavericks, LeBron, Memphis Griz..."
1,https://edition.cnn.com/2023/02/16/sport/lesle...,It‚Äôs while running in the moors and hills of ...,Western Front,"Scotland, Germany, B√§umer, Russian, Ukraine, G...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Armistice Talks, BA..."
2,https://edition.cnn.com/2023/02/27/sport/damia...,Damian Lillard set an NBA record in his monst...,Western Conference,"Damian, Portland, Lillard","Lillard, Donovan Mitchell, Klay Thompson, Chau...","Lillard, NBA, Houston Rockets, Chicago Bulls, ..."
3,https://edition.cnn.com/2023/02/27/football/su...,Betrayal has formed part of European politics...,,"Betrayal, European, Europe, Russian, American,...","Julius Caesar, Brute, Cards, Jeff Zimbalist, A...","Roman, European Super League, ESL, Champions L..."
4,https://edition.cnn.com/2023/02/22/football/pa...,The Palestinian team has never taken part in ...,"West Bank,Gaza City,East Jerusalem,Gaza Strip,...","Palestinian, Palestine, Portugal, African, Mid...","Arab, Gaza, East Jerusalem, Nakba, Arab Emirat...","Qatar, Atlas Lions, Hamadeh, Mohammed Bin Zaye..."


spacy

In [11]:
cont = cnn['content'].apply(nlp)

cnn_list = spacy_ner(cont)
    
print(cnn_list)

[[['The', 'ORG'], ['Los', 'ORG'], ['Angeles', 'ORG'], ['Lakers', 'ORG'], ['‚Äô', 'ORG'], ['the', 'ORG'], ['Dallas', 'ORG'], ['Mavericks', 'ORG'], ['LeBron', 'PERSON'], ['James', 'PERSON'], ['James', 'PERSON'], ['looks', 'PERSON'], ['James', 'PERSON'], ['the', 'LOC'], ['Memphis', 'LOC'], ['Grizzlies', 'LOC'], ['Fkn', 'PERSON'], ['Lakers', 'PERSON'], ['NBA', 'ORG'], ['Jonas', 'GPE'], ['Valanƒçi≈´nas', 'GPE'], ['the', 'ORG'], ['New', 'ORG'], ['Orleans', 'ORG'], ['Pelicans', 'ORG'], ['Kevork', 'PERSON'], ['Djansezian', 'PERSON'], ['NBA', 'ORG'], ['Rob', 'PERSON'], ['Pelinka', 'PERSON'], ['James', 'PERSON'], ['Anthony', 'PERSON'], ['Davis', 'PERSON'], ['Russell', 'PERSON'], ['Westbrook', 'PERSON'], ['Patrick', 'PERSON'], ['Beverley', 'PERSON'], ['Lakers', 'PERSON'], ['James', 'PERSON'], ['Pelinka', 'PERSON'], ['NBA', 'ORG'], ['Lakers', 'PERSON'], ['Lakers', 'PERSON'], ['Malik', 'PERSON'], ['Beasley', 'PERSON'], ['D‚ÄôAngelo', 'PERSON'], ['Russell', 'PERSON'], ['Jarred', 'PERSON'], ['Vanderb

In [12]:
sort = spacy_sorting(cnn_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])
print()

print(sort[3])

[['The', 'Los', 'Angeles', 'Lakers', '‚Äô', 'the', 'Dallas', 'Mavericks', 'NBA', 'New', 'Orleans', 'Pelicans', 'Boston', 'Celtics', 'Portland', 'Trail', 'Blazers'], ['BAFTA', 'XTERRA'], ['Damian', 'Lillard', 'NBA', 'the', 'Portland', 'Trail', 'Blazers', 'Houston', 'Chicago', 'Bulls', 'CBS', 'Sports', 'Phoenix', 'Suns', 'The'], ['Julius', 'Caesar', 'European', 'Super', 'League', 'ESL', 'the', 'Champions', 'UEFA', 'Apple', ':', 'The', 'War', 'for', 'Football', 'Juventus', 'Club', 'Association', 'Real', 'Madrid', 'Paris', 'Saint', '-', 'Germain', 'Premier', 'FIFA', 'Manchester', 'United', 'PSG', 'Marchetti', 'Barcelona', 'Court', 'of', 'Justice', 'ECJ', 'Union'], ['the', 'Atlas', 'Lions', 'AFC', 'Asian', 'Cup', 'Tubas', 'Israel', 'Defense', 'Forces', 'IDF', 'Reuters', 'Hamas', 'Al', '-', 'Shifa', 'Islamic', 'Jihad', 'Palestinian', 'Football', 'Association', 'AP', 'Henley', '&', 'Partners', 'FIFA', 'Suhaib', 'Salem', 'ReutersThe', 'Visa', 'The', 'Confederation', 'West', 'Federation', 'WAFF

In [13]:
cnn['org_with_brac'] = sort[0]
cnn['gpe_with_brac'] = sort[1]
cnn['person_with_brac'] = sort[2]
cnn['location_with_brac'] = sort[3]

cnn['spacy_org']=cnn['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['spacy_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_location'] = cnn['location_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac','location_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,content,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location
0,https://edition.cnn.com/2023/02/28/sport/los-a...,The Los Angeles Lakers‚Äô rollercoaster season ...,"Western,Western Conference","Los Angeles, August, Dallas, Portland","James, Dennis Schr√∂der, Jonas Valanƒçi≈´nas, Kev...","Lakers, Dallas Mavericks, LeBron, Memphis Griz...","The,Los,Angeles,Lakers,‚Äô,the,Dallas,Mavericks,...","Jonas, Valanƒçi≈´nas, Dallas","LeBron, James, looks, Fkn, Lakers, Kevork, Dja...","the, Memphis, Grizzlies"
1,https://edition.cnn.com/2023/02/16/sport/lesle...,It‚Äôs while running in the moors and hills of ...,Western Front,"Scotland, Germany, B√§umer, Russian, Ukraine, G...","Lesley Paterson, Paterson, Erich Maria Remarqu...","First, Scottish Highlands, Armistice Talks, BA...","BAFTA,XTERRA","Scotland, Germany, Ukraine, Hollywood, Netflix...","Lesley, Paterson, Oscar, Erich, Maria, Remarqu...",
2,https://edition.cnn.com/2023/02/27/sport/damia...,Damian Lillard set an NBA record in his monst...,Western Conference,"Damian, Portland, Lillard","Lillard, Donovan Mitchell, Klay Thompson, Chau...","Lillard, NBA, Houston Rockets, Chicago Bulls, ...","Damian,Lillard,NBA,the,Portland,Trail,Blazers,...",Portland,"Lillard, Donovan, Mitchell, ‚Äôs, Klay, Thompson...","the, west, coast, Bay, Area"
3,https://edition.cnn.com/2023/02/27/football/su...,Betrayal has formed part of European politics...,,"Betrayal, European, Europe, Russian, American,...","Julius Caesar, Brute, Cards, Jeff Zimbalist, A...","Roman, European Super League, ESL, Champions L...","Julius,Caesar,European,Super,League,ESL,the,Ch...","the, Roman, Republic, Leicester, City, Manches...","Jeff, Zimbalist, Aleksander, ƒåeferin, Andrea, ...","Europe, Middle, Eastern"
4,https://edition.cnn.com/2023/02/22/football/pa...,The Palestinian team has never taken part in ...,"West Bank,Gaza City,East Jerusalem,Gaza Strip,...","Palestinian, Palestine, Portugal, African, Mid...","Arab, Gaza, East Jerusalem, Nakba, Arab Emirat...","Qatar, Atlas Lions, Hamadeh, Mohammed Bin Zaye...","the,Atlas,Lions,AFC,Asian,Cup,Tubas,Israel,Def...","Qatar, Portugal, Israel, the, West, Bank, Gaza...","Rami, Hamadeh, Jordan, Ulrik, Pedersen, /, CSM...","the, Middle, East, Asia"


In [14]:
cnn.to_csv('NER_data/cnn_NER.csv')

### cna news

In [15]:
# Merging cna news into 1 dataframe
path = r'./text_preprocessing/cna_FINAL.csv' # use your path

cna = pd.read_csv(path)
cna = cna.loc[:,['url','content']]
print(cna.shape)
cna

(139, 2)


Unnamed: 0,url,content
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH: Bayern Munich forward Eric-Maxim Choup...
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...
...,...,...
134,https://www.channelnewsasia.com/sport/fa-cup-q...,LONDON : FA Cup quarter-final draw made on Wed...
135,https://www.channelnewsasia.com/sport/man-unit...,"MANCHESTER, England :Manchester United fought ..."
136,https://www.channelnewsasia.com/sport/formula-...,Statistics for Sunday's season-opening Bahrain...
137,https://www.channelnewsasia.com/sport/prop-hao...,Prop Mohamed Haouas will miss France's remaini...


NLTK extraction

In [16]:
cna_content = []

ner_info_extraction(cna['content'],cna_content)
    
print(cna_content)

[[[('ORGANIZATION', 'AC Milan'), ('ORGANIZATION', 'Champions League'), ('ORGANIZATION', 'Tottenham Hotspur')], [('ORGANIZATION', 'Italians'), ('ORGANIZATION', 'San Siro'), ('GPE', 'Tottenham')], [('PERSON', 'Tottenham'), ('PERSON', 'Antonio Conte'), ('PERSON', 'Cristian Romero')], [('PERSON', 'Milan')], [('PERSON', 'Stefano Pioli'), ('PERSON', 'Milan'), ('PERSON', 'Tottenham'), ('ORGANIZATION', 'Sheffield United'), ('PERSON', 'Wolverhampton Wanderers')], [('PERSON', 'Milan'), ('GPE', 'European'), ('PERSON', 'Pioli')], [], [], [('PERSON', 'Pioli')], [], [('PERSON', 'Tottenham')], [], [('GPE', 'Tottenham'), ('PERSON', 'Harry Kane'), ('PERSON', 'Son Heung-min'), ('PERSON', 'Dejan Kulusevski'), ('PERSON', 'Milan'), ('PERSON', 'Junior Messias')], [('ORGANIZATION', 'Italians'), ('PERSON', 'Brahim Diaz'), ('PERSON', 'Fraser Forster')], [('PERSON', 'Tottenham'), ('PERSON', 'Milan'), ('PERSON', 'Mike Maignan')], [('PERSON', 'Tottenham'), ('PERSON', 'Rafael Leao')], [('PERSON', 'Kane'), ('PERSON

In [17]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []
org_with_brac = []

ner_sorting(cna_content, location_with_brac, gpe_with_brac, person_with_brac, org_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

print(org_with_brac)

[[], [], [], [], [], [], [], ['Northern Irishman'], [], ['Southern California'], [], [], [], [], [], ['East Brabant'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United'], [], ['South Africa', 'Caribbean'], [], [], [], ['West'], [], [], [], [], [], [], [], [], [], ['West', 'West Indies'], [], [], [], [], [], [], [], [], ['South'], [], [], [], [], [], ['Southeast'], ['South Korean'], [], [], [], [], ['West Ham United'], [], [], [], [], [], [], [], [], [], [], ['Sebastian Munoz'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United', 'West Bromwich'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Germany', 'West German'], ['West'], ['West Ham'], [], ['Western Force'], [], [], [], [], [], [], ['West Ham'], [], [], []]

[['Tottenham', 'European'], ['MUNICH', 'Germans', 'Paris', 'Bayern', 'Europe', 'Messi', 'Vitinha', 'Dutch'], ['Brighton', 'United', 'Chelsea', 'Nor

In [18]:
cna['location_with_brac'] = location_with_brac
cna['gpe_with_brac'] = gpe_with_brac
cna['person_with_brac'] = person_with_brac
cna['org_with_brac'] = org_with_brac

cna['nltk_location']=cna['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['nltk_gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['nltk_person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['nltk_org'] = cna['org_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'org_with_brac'],axis = 1)

cna.head()

Unnamed: 0,url,content,nltk_location,nltk_gpe,nltk_person,nltk_org
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...,,"Tottenham, European","Tottenham, Antonio Conte, Cristian Romero, Mil...","AC Milan, Champions League, Tottenham Hotspur,..."
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH: Bayern Munich forward Eric-Maxim Choup...,,"MUNICH, Germans, Paris, Bayern, Europe, Messi,...","Bayern Munich, Serge Gnabry, Paris St Germain,...","Champions League, PSG, Leon Goretzka, VfB Stut..."
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...,,"Brighton, United, Chelsea, Norway, Blues, Women","Chelsea, Guro Reiten, Johanna Rytting Kaneryd,...","Super League, Jess Carter, Swede"
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...,,"London, Europe, European, Milan, Serie","Milan, Stefano Pioli, Tottenham Hotspur, Pioli...","Champions League, AS Roma"
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...,,"Tottenham, London, Italian, Italy","Antonio Conte, Conte, Juventus, Chelsea, Totte...","Champions League, Tottenham Hotspur, AC Milan,..."


spacy NER extraction

In [19]:
cont = cna['content'].apply(nlp)

cna_list = spacy_ner(cont)

print(cna_list)

[[['LONDON', 'GPE'], ['AC', 'ORG'], ['Milan', 'ORG'], ['London', 'GPE'], ['Tottenham', 'GPE'], ['Tottenham', 'GPE'], ['Antonio', 'PERSON'], ['Conte', 'PERSON'], ['Romero', 'PERSON'], ['Milan', 'GPE'], ['Stefano', 'PERSON'], ['Pioli', 'PERSON'], ["'s", 'PERSON'], ['Milan', 'GPE'], ['Tottenham', 'GPE'], ['Sheffield', 'GPE'], ['United', 'GPE'], ['Wolverhampton', 'PERSON'], ['Wanderers', 'PERSON'], ['Milan', 'GPE'], ['Pioli', 'GPE'], ['Tottenham', 'ORG'], ['Pioli', 'GPE'], ['GEAR', 'ORG'], ['Tottenham', 'GPE'], ['Tottenham', 'ORG'], ['Harry', 'PERSON'], ['Kane', 'PERSON'], ['Son', 'PERSON'], ['Heung', 'PERSON'], ['-', 'PERSON'], ['min', 'PERSON'], ['Dejan', 'PERSON'], ['Kulusevski', 'PERSON'], ['Milan', 'GPE'], ['Brahim', 'PERSON'], ['Diaz', 'PERSON'], ['Fraser', 'GPE'], ['Forster', 'GPE'], ['Tottenham', 'GPE'], ['Milan', 'GPE'], ['Mike', 'PERSON'], ['Maignan', 'PERSON'], ['Pierre', 'PERSON'], ['-', 'PERSON'], ['Emile', 'PERSON'], ['Hojbjerg', 'PERSON'], ['Tottenham', 'ORG'], ['Rafael', 'P

In [20]:
sort = spacy_sorting(cna_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])
print()

print(sort[3])

[['AC', 'Milan', 'Tottenham', 'GEAR', 'Emerson'], ['Choupo', '-', 'Moting', 'Bayern', 'PSG', 'Matthijs', 'de', 'Ligt', 'Mbappe'], ['Chelsea', 'Brighton', '&', 'Hove', 'Albion', 'Women', "'s", 'Super', 'League', 'Arsenal', 'the', 'Cup'], ['AC', 'Milan', 'the', 'Champions', 'League', 'Serie', 'A', 'Chelsea'], ['Champions', 'League', 'AC', 'Milan', 'Juventus', 'Inter', 'Chelsea', 'the', 'Premier', 'Tottenham', 'Europa', 'Conference'], ['the', 'Champions', 'League', 'Bayern', 'Mbappe'], [], ['the', 'PGA', 'Tour', 'a', 'Hall', 'of', 'Fame', 'Sawgrass', 'Player', 'TRIUMPH', '\r\n\r\n', 'McIlroy', "'s", 'Scheffler', 'LIV', 'Golf', 'top-10', 'TPC', 'Anirban', 'Lahiri'], ['Woods'], ['Indian', 'Wells', 'Grand', 'Slam'], ['Commonwealth', 'Sport', 'Canada', 'CSC', "'s", 'the', 'Alberta', '2030', 'Games', 'Corporation', 'Federation'], ['WELLS', 'Netflix'], ['Juventus', "'", 'Europa', 'League', 'last-16', 'Pogba', 'Manchester', 'United', 'Serie', 'A.'], ['Mercedes', 'Bahrain', 'Grand', 'Prix', 'BBC'

In [21]:
cna['org_with_brac'] = sort[0]
cna['gpe_with_brac'] = sort[1]
cna['person_with_brac'] = sort[2]
cna['loc_with_brac'] = sort[3]

cna['spacy_org']=cna['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['spacy_gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['spacy_person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['spacy_location'] = cna['loc_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac', 'loc_with_brac'],axis = 1)

cna.head()

Unnamed: 0,url,content,nltk_location,nltk_gpe,nltk_person,nltk_org,spacy_org,spacy_gpe,spacy_person,spacy_location
0,https://www.channelnewsasia.com/sport/steely-m...,LONDON :Seven-time winners AC Milan reached th...,,"Tottenham, European","Tottenham, Antonio Conte, Cristian Romero, Mil...","AC Milan, Champions League, Tottenham Hotspur,...","AC,Milan,Tottenham,GEAR,Emerson","LONDON, London, Tottenham, Milan, Sheffield, U...","Antonio, Conte, Romero, Stefano, Pioli, 's, Wo...",
1,https://www.channelnewsasia.com/sport/choupo-m...,MUNICH: Bayern Munich forward Eric-Maxim Choup...,,"MUNICH, Germans, Paris, Bayern, Europe, Messi,...","Bayern Munich, Serge Gnabry, Paris St Germain,...","Champions League, PSG, Leon Goretzka, VfB Stut...","Choupo,-,Moting,Bayern,PSG,Matthijs,de,Ligt,Mb...","Paris, Neymar, Sommer","Bayern, Munich, Eric, -, Maxim, Choupo, Moting...",Europe
2,https://www.channelnewsasia.com/sport/chelsea-...,LONDON: Chelsea's Guro Reiten scored from the ...,,"Brighton, United, Chelsea, Norway, Blues, Women","Chelsea, Guro Reiten, Johanna Rytting Kaneryd,...","Super League, Jess Carter, Swede","Chelsea,Brighton,&,Hove,Albion,Women,'s,Super,...","LONDON, Manchester, United, Norway, Brighton, ...","Johanna, Rytting, Kaneryd, Sam, Kerr, Jess, Ca...",
3,https://www.channelnewsasia.com/sport/nothing-...,LONDON: AC Milan can dare to dream about going...,,"London, Europe, European, Milan, Serie","Milan, Stefano Pioli, Tottenham Hotspur, Pioli...","Champions League, AS Roma","AC,Milan,the,Champions,League,Serie,A,Chelsea","LONDON, Milan, London, Pioli, Tottenham","Stefano, Pioli, Bayern, Munich, Benfica",Europe
4,https://www.channelnewsasia.com/sport/contes-c...,LONDON: Antonio Conte's dismal record in the C...,,"Tottenham, London, Italian, Italy","Antonio Conte, Conte, Juventus, Chelsea, Totte...","Champions League, Tottenham Hotspur, AC Milan,...","Champions,League,AC,Milan,Juventus,Inter,Chels...","LONDON, London, Tottenham, Italy, Amazon, Prim...","Antonio, Conte, 's, Wolverhampton, Wanderers",


In [22]:
cna.to_csv('NER_data/cna_NER.csv')