In [1]:
import pandas as pd

import re

import nltk
from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

### Definitions

In [2]:
def cleaning(content):
    cleaned = []
    for x in content:
        remove_cnn = re.sub("CNN", "", x)
        remove_nextline = re.sub("[\n]{2}", "", remove_cnn)
        remove_dash = re.sub("\u2014", "", remove_nextline)
        cleaned.append(remove_dash)
    return cleaned

In [3]:
def extract_ne_from_tree ( tree ):
    result = []
    for s in tree.subtrees():
        label = s.label()
        if (label == 'PERSON' or label == 'GPE' or label == 'LOCATION' or label == 'ORGANIZATION'):
            leaves = s.leaves()
            ne = ''
            for l in leaves:
                ne = ne + ' ' + l[0]
            result.append((label, ne[1:]))
    return result

In [4]:
def ner_info_extraction(content ,content_list):
    for x in content:
        res_sents = nltk.sent_tokenize(x)
        res_tokens = [word_tokenize(sents) for sents in res_sents]
        tagged_res = [nltk.pos_tag(tokens) for tokens in res_tokens]
        res_tree = [nltk.ne_chunk(elem) for elem in tagged_res]
        res_list = [extract_ne_from_tree(tree) for tree in res_tree]
        content_list+=[(res_list)]

In [5]:
def ner_sorting(content ,location, gpe, person):
    for row in content:
        check_location = []
        check_gpe = []
        check_person = []
        for sent in row:
            if len(sent)!=0:
                for tup in sent:
                    if tup[0] == "LOCATION":
                        if tup[1] not in check_location:
                            check_location.append(tup[1])
                    if tup[0] == "GPE":
                        if tup[1] not in check_gpe:
                            check_gpe.append(tup[1])
                    if tup[0] == "PERSON":
                        if tup[1] not in check_person:
                            check_person.append(tup[1])
        location.append(check_location)
        gpe.append(check_gpe)
        person.append(check_person)

In [6]:
def spacy_ner(content):
    cnn_list = []
    for doc in content:
        temp = []
        for token in doc:
            ner_tags = [token.text, token.ent_type_]
            if ner_tags[1]=="ORG" or ner_tags[1]=="PERSON" or ner_tags[1]=="GPE":
                temp.append(ner_tags)
        cnn_list.append(temp)
    return cnn_list

In [7]:
def spacy_sorting(content):
    org = []
    gpe = []
    person = []
    for row in content:
        check_org = []
        check_gpe = []
        check_person = []
        if len(row)!=0:
            for tup in row:
                if tup[1] == "ORG":
                    if tup[0] not in check_org:
                        check_org.append(tup[0])
                if tup[1] == "GPE":
                    if tup[0] not in check_gpe:
                        check_gpe.append(tup[0])
                if tup[1] == "PERSON":
                    if tup[0] not in check_person:
                        check_person.append(tup[0])
        org.append(check_org)
        gpe.append(check_gpe)
        person.append(check_person)
    return [org, gpe, person]

### cnn news

In [8]:
# merge cnn news into 1 dataframe
cnn_27feb = pd.read_csv('cnn_27feb.csv')
cnn_28feb = pd.read_csv('cnn_28feb.csv')
cnn_1mar = pd.read_csv('cnn_1mar.csv')
cnn_2mar = pd.read_csv('cnn_2mar.csv')
cnn_3mar = pd.read_csv('cnn_3mar.csv')
cnn_8mar = pd.read_csv('cnn_8mar.csv')
cnn_9mar = pd.read_csv('cnn_9mar.csv')
cnn = pd.concat([cnn_27feb, cnn_28feb, cnn_1mar, cnn_2mar, cnn_3mar, cnn_8mar, cnn_9mar])
print(cnn.shape)
cnn.head()

# import glob
# import os

# path = r'C:\latest_news_scraper\cna_news' # use your path
# all_files = glob.glob(path + "/*.csv")

# df_list = (pd.read_csv(file) for file in all_files)
# cnn   = pd.concat(df_list, ignore_index=True)
# cnn = cnn.loc[:,['url','content']]
# cnn

(212, 3)


Unnamed: 0.1,Unnamed: 0,url,content
0,0,https://edition.cnn.com/2023/02/26/sport/austr...,CNN —\n\nAustralia won the Women’s T20 World C...
1,1,https://edition.cnn.com/2023/02/27/football/su...,CNN —\n\nBetrayal has formed part of European ...
2,2,https://edition.cnn.com/2023/02/16/golf/tiger-...,"CNN —\n\nIn February 1992, a sprightly 16-year..."
3,3,https://edition.cnn.com/2023/02/19/sport/tiger...,CNN —\n\nIt was an impressive and eventful thi...
4,4,https://edition.cnn.com/2023/02/27/sport/jake-...,CNN —\n\nYouTuber Jake Paul suffered the first...


Cleaning
- Remove CNN
- Remove stopwords

In [9]:
cleaned = cleaning(cnn['content'])
print(cleaned)

[" Australia won the Women’s T20 World Cup in brilliant fashion, defeating home side South Africa by 19 runs in front of a sold out Newlands Cricket Ground in Cape Town on SaturdayVictory once again underlined Australia’s dominance in the sport, as the team completed a repeat three-peat under captain Meg Lanning and won the tournament for the sixth time in seven editions.“It is a pretty special effort from the group,” Lanning told Sky Sports afterwards.“We felt we had a good score and felt confident if we could hit our areas. We set the tone in an excellent powerplay. We have a special group, not just the players but also the support staff.”After Lanning won the toss and elected to bat first, the Australian openers, Alyssa Healy and Beth Mooney, navigated their way through the first few overs as the home crowd urged on the South African attack.Healy fell in the fifth over, caught by Nadine de Klerk off Marizanne Kapp’s bowling, but Mooney stayed at the crease for an impressive unbeaten

In [10]:
cnn['cleaned_content'] = cleaned

cnn.head()

Unnamed: 0.1,Unnamed: 0,url,content,cleaned_content
0,0,https://edition.cnn.com/2023/02/26/sport/austr...,CNN —\n\nAustralia won the Women’s T20 World C...,Australia won the Women’s T20 World Cup in br...
1,1,https://edition.cnn.com/2023/02/27/football/su...,CNN —\n\nBetrayal has formed part of European ...,Betrayal has formed part of European politics...
2,2,https://edition.cnn.com/2023/02/16/golf/tiger-...,"CNN —\n\nIn February 1992, a sprightly 16-year...","In February 1992, a sprightly 16-year-old gol..."
3,3,https://edition.cnn.com/2023/02/19/sport/tiger...,CNN —\n\nIt was an impressive and eventful thi...,It was an impressive and eventful third round...
4,4,https://edition.cnn.com/2023/02/27/sport/jake-...,CNN —\n\nYouTuber Jake Paul suffered the first...,YouTuber Jake Paul suffered the first defeat ...


NLTK extraction

In [11]:
cnn_content = []

ner_info_extraction(cnn['cleaned_content'],cnn_content)
    
print(cnn_content)

[[[('GPE', 'Australia'), ('PERSON', 'T20'), ('LOCATION', 'South Africa'), ('ORGANIZATION', 'Newlands Cricket Ground'), ('GPE', 'Cape Town'), ('ORGANIZATION', 'SaturdayVictory'), ('GPE', 'Australia'), ('PERSON', 'Meg Lanning'), ('PERSON', 'Sky Sports')], [], [('GPE', 'Australian'), ('PERSON', 'Alyssa Healy'), ('PERSON', 'Beth Mooney'), ('GPE', 'South'), ('GPE', 'African'), ('PERSON', 'Nadine de Klerk'), ('PERSON', 'Marizanne Kapp'), ('PERSON', 'Mooney'), ('GPE', 'Women'), ('ORGANIZATION', 'T20')], [('PERSON', 'Matthew'), ('LOCATION', 'South African'), ('PERSON', 'Gardner'), ('PERSON', 'Mooney'), ('GPE', 'Australia'), ('GPE', 'South Africa'), ('GPE', 'South Africa'), ('PERSON', 'Laura Wolvaardt'), ('PERSON', 'Australia')]], [[('GPE', 'Betrayal'), ('GPE', 'European'), ('PERSON', 'Julius Caesar'), ('PERSON', 'Brute'), ('ORGANIZATION', 'Roman'), ('ORGANIZATION', 'European Super League'), ('ORGANIZATION', 'ESL'), ('ORGANIZATION', 'Champions League'), ('ORGANIZATION', 'House'), ('PERSON', 'Ca

In [12]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []

ner_sorting(cnn_content, location_with_brac, gpe_with_brac, person_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

[['South Africa', 'South African'], [], [], [], [], [], [], [], [], [], ['Western Front'], [], [], [], ['Southern Oregon'], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West', 'West Asian'], [], ['Southern'], [], ['Cardinals'], [], ['Western Conference'], [], ['Southern California'], [], [], [], [], [], [], [], [], ['Southern'], ['Southern Oregon'], [], [], [], [], [], ['South Africa', 'South African'], [], [], ['Cardinals'], ['Western Conference'], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West', 'West Asian'], [], [], ['Southern California'], [], [], [], ['Western Front'], [], [], ['Western', 'Western Conference'], ['Western Front'], ['Western Conference'], [], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West', 'West Asian'], [], [], [], [], ['Southern'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Bank', 'Gaza City', 'East Jerusalem', 'Gaza Strip', 'Gaza', 'West', 

In [13]:
cnn['location_with_brac'] = location_with_brac
cnn['gpe_with_brac'] = gpe_with_brac
cnn['person_with_brac'] = person_with_brac

cnn['nltk_location']=cnn['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['nltk_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'Unnamed: 0'],axis = 1)

cnn.head()

Unnamed: 0,url,content,cleaned_content,nltk_location,nltk_gpe,nltk_person
0,https://edition.cnn.com/2023/02/26/sport/austr...,CNN —\n\nAustralia won the Women’s T20 World C...,Australia won the Women’s T20 World Cup in br...,"South Africa,South African","Australia, Cape Town, Australian, South, Afric...","T20, Meg Lanning, Sky Sports, Alyssa Healy, Be..."
1,https://edition.cnn.com/2023/02/27/football/su...,CNN —\n\nBetrayal has formed part of European ...,Betrayal has formed part of European politics...,,"Betrayal, European, Europe, Russian, American,...","Julius Caesar, Brute, Cards, Jeff Zimbalist, A..."
2,https://edition.cnn.com/2023/02/16/golf/tiger-...,"CNN —\n\nIn February 1992, a sprightly 16-year...","In February 1992, a sprightly 16-year-old gol...",,"California, Match, Sound, American","Tiger Woods, Woods, Max Homa, Keith, Robert, C..."
3,https://edition.cnn.com/2023/02/19/sport/tiger...,CNN —\n\nIt was an impressive and eventful thi...,It was an impressive and eventful third round...,,California,"Tiger Woods, Woods, Jon Rahm, Justin Thomas"
4,https://edition.cnn.com/2023/02/27/sport/jake-...,CNN —\n\nYouTuber Jake Paul suffered the first...,YouTuber Jake Paul suffered the first defeat ...,,"Saudi, British, US, American, Riyadh, Saudi Ar...","Paul, Tommy Fury, Fury, Fayez, Tyson Fury, Mik..."


spacy

In [14]:
cont = cnn['cleaned_content'].apply(nlp)

cnn_list = spacy_ner(cont)
    
print(cnn_list)

[[['Australia', 'GPE'], ['the', 'ORG'], ['Women', 'ORG'], ['’s', 'ORG'], ['T20', 'ORG'], ['World', 'ORG'], ['Cup', 'ORG'], ['South', 'GPE'], ['Africa', 'GPE'], ['Newlands', 'ORG'], ['Cricket', 'ORG'], ['Ground', 'ORG'], ['SaturdayVictory', 'ORG'], ['Australia', 'GPE'], ['Meg', 'PERSON'], ['Lanning', 'PERSON'], ['Sky', 'ORG'], ['Sports', 'ORG'], ['Lanning', 'PERSON'], ['Alyssa', 'PERSON'], ['Healy', 'PERSON'], ['Beth', 'PERSON'], ['Mooney', 'PERSON'], ['Healy', 'GPE'], ['Nadine', 'ORG'], ['de', 'ORG'], ['Klerk', 'ORG'], ['Marizanne', 'ORG'], ['Kapp', 'ORG'], ['’s', 'ORG'], ['Mooney', 'PERSON'], ['Australia', 'GPE'], ['Matthew', 'PERSON'], ['Lewis', 'PERSON'], ['/', 'PERSON'], ['ICC', 'PERSON'], ['/', 'PERSON'], ['Getty', 'PERSON'], ['ImagesShe', 'PERSON'], ['Ash', 'PERSON'], ['Gardner', 'PERSON'], ['Australia', 'GPE'], ['South', 'GPE'], ['Africa', 'GPE'], ['South', 'GPE'], ['Africa', 'GPE'], ['Laura', 'PERSON'], ['Wolvaardt', 'PERSON'], ['’s', 'PERSON'], ['Australia', 'GPE']], [['Julius

In [15]:
sort = spacy_sorting(cnn_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])

[['the', 'Women', '’s', 'T20', 'World', 'Cup', 'Newlands', 'Cricket', 'Ground', 'SaturdayVictory', 'Sky', 'Sports', 'Nadine', 'de', 'Klerk', 'Marizanne', 'Kapp'], ['Julius', 'Caesar', 'European', 'Super', 'League', 'ESL', 'the', 'Champions', 'UEFA', 'Apple', ':', 'The', 'War', 'for', 'Football', 'Juventus', 'Club', 'Association', 'Real', 'Madrid', 'Paris', 'Saint', '-', 'Germain', 'Premier', 'FIFA', 'Manchester', 'United', 'PSG', 'Marchetti', 'Barcelona', 'Court', 'of', 'Justice', 'ECJ', 'Union'], ['the', 'Riviera', 'Country', 'Club', 'Woods', 'Genesis', 'Invitational', 'TGR', 'Foundation', 'PGA', 'Tour', 'TNT', 'Cliff', 'Hawkins', '/', 'Getty', 'ImagesRecord', 'NBA', 'Scheffler', 'The', '’s', 'McIlroy'], ['Woods', 'the', 'Riviera', 'Country', 'Club', 'PGA', 'Tour', 'Genesis', 'Invitational'], ['Fury', 'Fayez', 'Nureldine', 'Tyson', 'YouTuber', 'Tommy'], ['Paris', 'Saint', '-', 'Germain', 'Marseille', 'PSG', 'Pays', 'de', 'Cassel', 'Nantes'], ['NFL', 'the', 'Dallas', 'Cowboys', 'Miami'

In [16]:
cnn['org_with_brac'] = sort[0]
cnn['gpe_with_brac'] = sort[1]
cnn['person_with_brac'] = sort[2]

cnn['spacy_org']=cnn['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['spacy_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac'],axis = 1)

cnn.head()

Unnamed: 0,url,content,cleaned_content,nltk_location,nltk_gpe,nltk_person,spacy_org,spacy_gpe,spacy_person
0,https://edition.cnn.com/2023/02/26/sport/austr...,CNN —\n\nAustralia won the Women’s T20 World C...,Australia won the Women’s T20 World Cup in br...,"South Africa,South African","Australia, Cape Town, Australian, South, Afric...","T20, Meg Lanning, Sky Sports, Alyssa Healy, Be...","the,Women,’s,T20,World,Cup,Newlands,Cricket,Gr...","Australia, South, Africa, Healy","Meg, Lanning, Alyssa, Healy, Beth, Mooney, Mat..."
1,https://edition.cnn.com/2023/02/27/football/su...,CNN —\n\nBetrayal has formed part of European ...,Betrayal has formed part of European politics...,,"Betrayal, European, Europe, Russian, American,...","Julius Caesar, Brute, Cards, Jeff Zimbalist, A...","Julius,Caesar,European,Super,League,ESL,the,Ch...","the, Roman, Republic, Leicester, City, Manches...","Jeff, Zimbalist, Aleksander, Čeferin, Andrea, ..."
2,https://edition.cnn.com/2023/02/16/golf/tiger-...,"CNN —\n\nIn February 1992, a sprightly 16-year...","In February 1992, a sprightly 16-year-old gol...",,"California, Match, Sound, American","Tiger Woods, Woods, Max Homa, Keith, Robert, C...","the,Riviera,Country,Club,Woods,Genesis,Invitat...",California,"Tiger, Woods, Max, Homa, Keith, Mitchell, Robe..."
3,https://edition.cnn.com/2023/02/19/sport/tiger...,CNN —\n\nIt was an impressive and eventful thi...,It was an impressive and eventful third round...,,California,"Tiger Woods, Woods, Jon Rahm, Justin Thomas","Woods,the,Riviera,Country,Club,PGA,Tour,Genesi...",California,"Gary, A., Vasquez, /, USA, Today, Sports, Jon,..."
4,https://edition.cnn.com/2023/02/27/sport/jake-...,CNN —\n\nYouTuber Jake Paul suffered the first...,YouTuber Jake Paul suffered the first defeat ...,,"Saudi, British, US, American, Riyadh, Saudi Ar...","Paul, Tommy Fury, Fury, Fayez, Tyson Fury, Mik...","Fury,Fayez,Nureldine,Tyson,YouTuber,Tommy","Saudi, Arabia, Riyadh, US","YouTuber, Jake, Paul, Tommy, Fury, Mike, Tyson..."


In [17]:
cnn.to_csv('cnn_cleaned_with_NER.csv')

### cna news

In [18]:
# Merging cna news into 1 dataframe
cna_27feb = pd.read_csv('cna_27feb.csv')
cna_28feb = pd.read_csv('cna_28feb.csv')
cna_1mar = pd.read_csv('cna_1mar.csv')
cna_2mar = pd.read_csv('cna_2mar.csv')
cna_3mar = pd.read_csv('cna_3mar.csv')
cna_8mar = pd.read_csv('cna_8mar.csv')
cna_9mar = pd.read_csv('cna_9mar.csv')
cna = pd.concat([cna_27feb, cna_28feb, cna_1mar, cna_2mar, cna_3mar, cna_8mar, cna_9mar])
print(cna.shape)
cna.head()

(145, 3)


Unnamed: 0.1,Unnamed: 0,url,content
0,0,https://www.channelnewsasia.com/sport/djokovic...,Novak Djokovic said he was now playing without...
1,2,https://www.channelnewsasia.com/sport/future-l...,LONDON: Manchester United's near six-year trop...
2,4,https://www.channelnewsasia.com/sport/clinical...,LONDON: Manchester United crushed Newcastle Un...
3,6,https://www.channelnewsasia.com/sport/klinsman...,Juergen Klinsmann has been named head coach of...
4,7,https://www.channelnewsasia.com/sport/spurs-be...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...


cleaning

In [19]:
cleaned = cleaning(cna['content'])
print(cleaned)

['Novak Djokovic said he was now playing without pain after recovering from a hamstring injury as he prepares to return to the ATP Tour in Dubai this week following his record-extending 10th Australian Open title last month.Djokovic suffered a 3cm hamstring tear en route to winning the Adelaide warm-up event before claiming his 22nd Grand Slam title at Melbourne Park to go level with Rafa Nadal.The 35-year-old Serb said his Dubai participation was in doubt until a few days ago but he had passed all his fitness tests."Had a couple of weeks of no tennis. The last few days it\'s really getting as much practice as possible to get myself back in shape to be able to compete at a high level," Djokovic said."I haven\'t felt pain on the court for a week. I\'m getting closer to 100 per cent. Still not there in terms of the game and how I feel on court. But the important thing is there\'s no pain. I don\'t have a hindrance in the way I move on the court."After every injury it takes time for mecha

In [20]:
cna['cleaned_content'] = cleaned

cna.head()

Unnamed: 0.1,Unnamed: 0,url,content,cleaned_content
0,0,https://www.channelnewsasia.com/sport/djokovic...,Novak Djokovic said he was now playing without...,Novak Djokovic said he was now playing without...
1,2,https://www.channelnewsasia.com/sport/future-l...,LONDON: Manchester United's near six-year trop...,LONDON: Manchester United's near six-year trop...
2,4,https://www.channelnewsasia.com/sport/clinical...,LONDON: Manchester United crushed Newcastle Un...,LONDON: Manchester United crushed Newcastle Un...
3,6,https://www.channelnewsasia.com/sport/klinsman...,Juergen Klinsmann has been named head coach of...,Juergen Klinsmann has been named head coach of...
4,7,https://www.channelnewsasia.com/sport/spurs-be...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...


NLTK extraction

In [21]:
cna_content = []

ner_info_extraction(cna['cleaned_content'],cna_content)
    
print(cna_content)

[[[('PERSON', 'Novak'), ('PERSON', 'Djokovic'), ('ORGANIZATION', 'ATP Tour'), ('GPE', 'Dubai'), ('ORGANIZATION', 'Adelaide'), ('PERSON', 'Rafa'), ('PERSON', 'Serb'), ('GPE', 'Dubai')], [], [('PERSON', 'Djokovic')], [], [], [], [], [], [], [], [], [('PERSON', 'Djokovic'), ('GPE', 'Dubai'), ('PERSON', 'Steffi Graf')], [('PERSON', 'Graf'), ('PERSON', 'Djokovic')], [], [], [('PERSON', 'Czech Tomas Machac')]], [[('GPE', 'United'), ('ORGANIZATION', 'League'), ('ORGANIZATION', 'Newcastle United'), ('ORGANIZATION', 'Wembley'), ('ORGANIZATION', 'Old Trafford'), ('GPE', 'Dutch'), ('PERSON', 'Erik'), ('ORGANIZATION', 'Ajax Amsterdam'), ('ORGANIZATION', 'Old Trafford'), ('ORGANIZATION', 'American Glazer'), ('GPE', 'Portuguese'), ('ORGANIZATION', 'Hag')], [], [('ORGANIZATION', 'Europa League'), ('GPE', 'Barcelona'), ('PERSON', 'Hag')], [('ORGANIZATION', 'Brentford'), ('GPE', 'August'), ('ORGANIZATION', 'Manchester'), ('GPE', 'United'), ('PERSON', 'Marcus Rashford'), ('PERSON', 'Casemiro'), ('PERSON

In [22]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []

ner_sorting(cna_content, location_with_brac, gpe_with_brac, person_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

[[], [], [], [], [], [], [], [], [], ['West Ham United'], [], ['South Africa'], [], [], [], ['West'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['West Ham United'], [], ['South Africa'], ['Southeast'], ['South Korean'], [], [], [], [], ['West Ham United'], [], [], [], [], [], [], [], [], [], [], ['Sebastian Munoz'], [], [], [], [], [], [], [], [], [], ['West Germany', 'West German'], ['West'], ['West Ham'], [], [], [], [], [], [], [], [], ['West Ham'], [], [], [], [], [], [], [], ['West', 'West Indies'], [], [], [], [], [], [], [], [], ['South'], [], [], [], [], [], [], [], [], [], ['West Ham United', 'West Bromwich'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Northern Irishman'], [], ['Southern California'], [], [], [], [], [], ['East Brabant'], [], [], [], [], []]

[['Dubai'], ['United', 'Dutch', 'Portuguese', 'Barcelona', 'August', 'Newcastle', 'Weghorst'], ['Newcastle', 'Saudi', 'U

In [23]:
cna['location_with_brac'] = location_with_brac
cna['gpe_with_brac'] = gpe_with_brac
cna['person_with_brac'] = person_with_brac

cna['location']=cna['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'Unnamed: 0'],axis = 1)

cna.head()

Unnamed: 0,url,content,cleaned_content,location,gpe,person
0,https://www.channelnewsasia.com/sport/djokovic...,Novak Djokovic said he was now playing without...,Novak Djokovic said he was now playing without...,,Dubai,"Novak, Djokovic, Rafa, Serb, Steffi Graf, Graf..."
1,https://www.channelnewsasia.com/sport/future-l...,LONDON: Manchester United's near six-year trop...,LONDON: Manchester United's near six-year trop...,,"United, Dutch, Portuguese, Barcelona, August, ...","Erik, Hag, Marcus Rashford, Casemiro, Lisandro..."
2,https://www.channelnewsasia.com/sport/clinical...,LONDON: Manchester United crushed Newcastle Un...,LONDON: Manchester United crushed Newcastle Un...,,"Newcastle, Saudi, United, English","United, Casemiro, Sven Botman, Eddie Howe, Eri..."
3,https://www.channelnewsasia.com/sport/klinsman...,Juergen Klinsmann has been named head coach of...,Juergen Klinsmann has been named head coach of...,,"South Korea, Germany, United States, Canada, M...","Juergen, Klinsmann, Paulo Bento, Brazil, Bayer..."
4,https://www.channelnewsasia.com/sport/spurs-be...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...,,"British, Chelsea, Spurs","Tottenham Hotspur, Chelsea, Oliver Skipp, Harr..."


spacy NER extraction

In [24]:
cont = cna['cleaned_content'].apply(nlp)

cna_list = spacy_ner(cont)

print(cna_list)

[[['Novak', 'PERSON'], ['Djokovic', 'PERSON'], ['the', 'ORG'], ['ATP', 'ORG'], ['Tour', 'ORG'], ['Dubai', 'GPE'], ['Adelaide', 'ORG'], ['Rafa', 'PERSON'], ['Nadal', 'PERSON'], ['Dubai', 'GPE'], ['Dubai', 'GPE'], ['Tomas', 'PERSON'], ['Machac', 'PERSON']], [['LONDON', 'GPE'], ['Manchester', 'ORG'], ['United', 'ORG'], ["'s", 'ORG'], ['Newcastle', 'GPE'], ['United', 'GPE'], ["'s", 'GPE'], ['Wembley', 'PERSON'], ['Erik', 'PERSON'], ['ten', 'PERSON'], ['Hag', 'PERSON'], ['Ajax', 'ORG'], ['Amsterdam', 'ORG'], ['Old', 'ORG'], ['Trafford', 'ORG'], ['Glazer', 'PERSON'], ['Cristiano', 'ORG'], ['Ronaldo', 'ORG'], ['the', 'ORG'], ['Europa', 'ORG'], ['League', 'ORG'], ['Barcelona', 'GPE'], ['the', 'ORG'], ['Premier', 'ORG'], ['League', 'ORG'], ['Brentford', 'PERSON'], ['Manchester', 'GPE'], ['City', 'GPE'], ['United', 'ORG'], ['Marcus', 'PERSON'], ['Rashford', 'PERSON'], ['Casemiro', 'GPE'], ['Lisandro', 'PERSON'], ['Martinez', 'PERSON'], ['Raphael', 'PERSON'], ['Varane', 'PERSON'], ['Newcastle', '

In [25]:
sort = spacy_sorting(cna_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])

[['the', 'ATP', 'Tour', 'Adelaide'], ['Manchester', 'United', "'s", 'Ajax', 'Amsterdam', 'Old', 'Trafford', 'Cristiano', 'Ronaldo', 'the', 'Europa', 'League', 'Premier', 'Ten', 'Hag'], ['Manchester', 'United', 'Sven', 'Botman', 'Wembley', 'Bruno', 'Fernandes'], ['The', 'Korea', 'Football', 'Association', 'KFA', 'Klinsmann'], ['Skipp', 'Kepa', 'Arrizabalaga', 'Chelsea'], ['Mbappe', 'PSG', 'Marseille', 'Nice', 'Montpellier', 'Marquinhos', 'Nuno', 'Tavares', "'s", 'Nantes', 'the', 'Parc', 'des', 'Princes'], ['LaLiga', 'Real', 'Madrid', 'Atletico', 'the', 'Europa', 'League', 'Barca', 'Malian', 'Marc', '-', 'Andre'], ['BOLOGNA', 'Inter', 'Milan', 'Serie', 'A', 'Simone', 'Inzaghi', "'s", 'AC', 'AS', 'Roma', 'Skriniar', 'Champions', 'League', 'Alessandro', 'Bastoni', 'Dominguez'], ['Al', 'Duhail', 'the', 'Asian', 'Champions', 'League', 'Urawa', 'Red', 'Diamonds', 'Ighalo', 'put', 'Hilal', 'Manchester', 'United', 'Ittihad'], ['Manchester', 'United', "'s", 'Sven', 'Botman', 'Rashford', 'the', '

In [26]:
cna['org_with_brac'] = sort[0]
cna['gpe_with_brac'] = sort[1]
cna['person_with_brac'] = sort[2]

cna['spacy_org']=cna['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['spacy_gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['spacy_person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac'],axis = 1)

cna.head()

Unnamed: 0,url,content,cleaned_content,location,gpe,person,spacy_org,spacy_gpe,spacy_person
0,https://www.channelnewsasia.com/sport/djokovic...,Novak Djokovic said he was now playing without...,Novak Djokovic said he was now playing without...,,Dubai,"Novak, Djokovic, Rafa, Serb, Steffi Graf, Graf...","the,ATP,Tour,Adelaide",Dubai,"Novak, Djokovic, Rafa, Nadal, Tomas, Machac"
1,https://www.channelnewsasia.com/sport/future-l...,LONDON: Manchester United's near six-year trop...,LONDON: Manchester United's near six-year trop...,,"United, Dutch, Portuguese, Barcelona, August, ...","Erik, Hag, Marcus Rashford, Casemiro, Lisandro...","Manchester,United,'s,Ajax,Amsterdam,Old,Traffo...","LONDON, Newcastle, United, 's, Barcelona, Manc...","Wembley, Erik, ten, Hag, Glazer, Brentford, Ma..."
2,https://www.channelnewsasia.com/sport/clinical...,LONDON: Manchester United crushed Newcastle Un...,LONDON: Manchester United crushed Newcastle Un...,,"Newcastle, Saudi, United, English","United, Casemiro, Sven Botman, Eddie Howe, Eri...","Manchester,United,Sven,Botman,Wembley,Bruno,Fe...","LONDON, Newcastle, United, 's, Casemiro, Saudi...","Eddie, Howe, Erik, ten, Hag, 's, Jose, Mourinh..."
3,https://www.channelnewsasia.com/sport/klinsman...,Juergen Klinsmann has been named head coach of...,Juergen Klinsmann has been named head coach of...,,"South Korea, Germany, United States, Canada, M...","Juergen, Klinsmann, Paulo Bento, Brazil, Bayer...","The,Korea,Football,Association,KFA,Klinsmann","South, Korea, Germany, 's, Brazil, the, United...","Juergen, Klinsmann, Paulo, Bento, Gus, Hiddink..."
4,https://www.channelnewsasia.com/sport/spurs-be...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...,LONDON: Tottenham Hotspur beat Chelsea 2-0 in ...,,"British, Chelsea, Spurs","Tottenham Hotspur, Chelsea, Oliver Skipp, Harr...","Skipp,Kepa,Arrizabalaga,Chelsea","LONDON, Spurs","Chelsea, Oliver, Skipp, Harry, Kane, Spurs, Gr..."


In [27]:
cna.to_csv('cna_cleaned_with_NER.csv')