In [1]:
import pandas as pd

import re

import nltk
from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

### Definitions

In [2]:
def cleaning(content):
    cleaned = []
    for x in content:
        remove_cnn = re.sub("CNN", "", x)
        remove_nextline = re.sub("[\n]{2}", "", remove_cnn)
        remove_dash = re.sub("\u2014", "", remove_nextline)
        cleaned.append(remove_dash)
    return cleaned

In [3]:
def extract_ne_from_tree ( tree ):
    result = []
    for s in tree.subtrees():
        label = s.label()
        if (label == 'PERSON' or label == 'GPE' or label == 'LOCATION' or label == 'ORGANIZATION'):
            leaves = s.leaves()
            ne = ''
            for l in leaves:
                ne = ne + ' ' + l[0]
            result.append((label, ne[1:]))
    return result

In [4]:
def ner_info_extraction(content ,content_list):
    for x in content:
        res_sents = nltk.sent_tokenize(x)
        res_tokens = [word_tokenize(sents) for sents in res_sents]
        tagged_res = [nltk.pos_tag(tokens) for tokens in res_tokens]
        res_tree = [nltk.ne_chunk(elem) for elem in tagged_res]
        res_list = [extract_ne_from_tree(tree) for tree in res_tree]
        content_list+=[(res_list)]

In [5]:
def ner_sorting(content ,location, gpe, person):
    for row in content:
        check_location = []
        check_gpe = []
        check_person = []
        for sent in row:
            if len(sent)!=0:
                for tup in sent:
                    if tup[0] == "LOCATION":
                        if tup[1] not in check_location:
                            check_location.append(tup[1])
                    if tup[0] == "GPE":
                        if tup[1] not in check_gpe:
                            check_gpe.append(tup[1])
                    if tup[0] == "PERSON":
                        if tup[1] not in check_person:
                            check_person.append(tup[1])
        location.append(check_location)
        gpe.append(check_gpe)
        person.append(check_person)

In [6]:
def spacy_ner(content):
    cnn_list = []
    for doc in content:
        temp = []
        for token in doc:
            ner_tags = [token.text, token.ent_type_]
            if ner_tags[1]=="ORG" or ner_tags[1]=="PERSON" or ner_tags[1]=="GPE":
                temp.append(ner_tags)
        cnn_list.append(temp)
    return cnn_list

In [7]:
def spacy_sorting(content):
    org = []
    gpe = []
    person = []
    for row in content:
        check_org = []
        check_gpe = []
        check_person = []
        if len(row)!=0:
            for tup in row:
                if tup[1] == "ORG":
                    if tup[0] not in check_org:
                        check_org.append(tup[0])
                if tup[1] == "GPE":
                    if tup[0] not in check_gpe:
                        check_gpe.append(tup[0])
                if tup[1] == "PERSON":
                    if tup[0] not in check_person:
                        check_person.append(tup[0])
        org.append(check_org)
        gpe.append(check_gpe)
        person.append(check_person)
    return [org, gpe, person]

### cnn news

In [9]:
# merge cnn news into 1 dataframe
# cnn_27feb = pd.read_csv('cnn_27feb.csv')
# cnn_28feb = pd.read_csv('cnn_28feb.csv')
# cnn_1mar = pd.read_csv('cnn_1mar.csv')
# cnn_2mar = pd.read_csv('cnn_2mar.csv')
# cnn_3mar = pd.read_csv('cnn_3mar.csv')
# cnn_8mar = pd.read_csv('cnn_8mar.csv')
# cnn_9mar = pd.read_csv('cnn_9mar.csv')
# cnn = pd.concat([cnn_27feb, cnn_28feb, cnn_1mar, cnn_2mar, cnn_3mar, cnn_8mar, cnn_9mar])
# print(cnn.shape)
# cnn.head()

import glob
import os

path = r'C:\latest_news_scraper\cna_news' # use your path
all_files = glob.glob(path + "/*.csv")

df_list = (pd.read_csv(file) for file in all_files)
cnn   = pd.concat(df_list, ignore_index=True)
cnn = cnn.loc[:,['url','content']]
cnn

ValueError: No objects to concatenate

Cleaning
- Remove CNN
- Remove stopwords

In [None]:
cleaned = cleaning(cnn['content'])
print(cleaned)

In [None]:
cnn['cleaned_content'] = cleaned

cnn.head()

NLTK extraction

In [None]:
cnn_content = []

ner_info_extraction(cnn['cleaned_content'],cnn_content)
    
print(cnn_content)

In [None]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []

ner_sorting(cnn_content, location_with_brac, gpe_with_brac, person_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

In [None]:
cnn['location_with_brac'] = location_with_brac
cnn['gpe_with_brac'] = gpe_with_brac
cnn['person_with_brac'] = person_with_brac

cnn['nltk_location']=cnn['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['nltk_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['nltk_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'Unnamed: 0'],axis = 1)

cnn.head()

spacy

In [None]:
cont = cnn['cleaned_content'].apply(nlp)

cnn_list = spacy_ner(cont)
    
print(cnn_list)

In [None]:
sort = spacy_sorting(cnn_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])

In [None]:
cnn['org_with_brac'] = sort[0]
cnn['gpe_with_brac'] = sort[1]
cnn['person_with_brac'] = sort[2]

cnn['spacy_org']=cnn['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cnn['spacy_gpe'] = cnn['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cnn['spacy_person'] = cnn['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cnn = cnn.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac'],axis = 1)

cnn.head()

In [None]:
cnn.to_csv('cnn_cleaned_with_NER.csv')

### cna news

In [None]:
# Merging cna news into 1 dataframe
cna_27feb = pd.read_csv('cna_27feb.csv')
cna_28feb = pd.read_csv('cna_28feb.csv')
cna_1mar = pd.read_csv('cna_1mar.csv')
cna_2mar = pd.read_csv('cna_2mar.csv')
cna_3mar = pd.read_csv('cna_3mar.csv')
cna_8mar = pd.read_csv('cna_8mar.csv')
cna_9mar = pd.read_csv('cna_9mar.csv')
cna = pd.concat([cna_27feb, cna_28feb, cna_1mar, cna_2mar, cna_3mar, cna_8mar, cna_9mar])
print(cna.shape)
cna.head()

cleaning

In [None]:
cleaned = cleaning(cna['content'])
print(cleaned)

In [None]:
cna['cleaned_content'] = cleaned

cna.head()

NLTK extraction

In [None]:
cna_content = []

ner_info_extraction(cna['cleaned_content'],cna_content)
    
print(cna_content)

In [None]:
location_with_brac = []
gpe_with_brac = []
person_with_brac = []

ner_sorting(cna_content, location_with_brac, gpe_with_brac, person_with_brac)

print(location_with_brac)
print()

print(gpe_with_brac)
print()

print(person_with_brac)
print()

In [None]:
cna['location_with_brac'] = location_with_brac
cna['gpe_with_brac'] = gpe_with_brac
cna['person_with_brac'] = person_with_brac

cna['location']=cna['location_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['location_with_brac','gpe_with_brac','person_with_brac', 'Unnamed: 0'],axis = 1)

cna.head()

spacy NER extraction

In [None]:
cont = cna['cleaned_content'].apply(nlp)

cna_list = spacy_ner(cont)

print(cna_list)

In [None]:
sort = spacy_sorting(cna_list)

print(sort[0])
print()

print(sort[1])
print()

print(sort[2])

In [None]:
cna['org_with_brac'] = sort[0]
cna['gpe_with_brac'] = sort[1]
cna['person_with_brac'] = sort[2]

cna['spacy_org']=cna['org_with_brac'].apply(lambda x: ','.join(map(str, x)))
cna['spacy_gpe'] = cna['gpe_with_brac'].apply(lambda x: ', '.join(map(str, x)))
cna['spacy_person'] = cna['person_with_brac'].apply(lambda x: ', '.join(map(str, x)))

cna = cna.drop(columns=['org_with_brac','gpe_with_brac','person_with_brac'],axis = 1)

cna.head()

In [None]:
cna.to_csv('cna_cleaned_with_NER.csv')