In [1]:
import pandas as pd
import nltk
from entity_extraction import get_cameo_mappings, parse_sentence, tokenize, format_parsed_str, send_to_petr
import csv
import json
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lawli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lawli\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
news = pd.read_json("News_Category_Dataset_v3.json", lines= "True")
news = news[news['category'] != "SPORTS"]
news.reset_index(inplace=True)
print(news.shape)
news.head()

(204450, 7)


Unnamed: 0,index,link,headline,category,short_description,authors,date
0,0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
config_path = "config.ini"
map_dict = get_cameo_mappings(config_path)

In [4]:
def get_entity_information(df):
    story_id = str(df[0])
    text = df[2] + ". " +  df[4]
    date = "202209230908"

    event_dict = {story_id: {}}
    event_dict[story_id]["sents"] = {}
    event_dict[story_id]["meta"] = {}
    event_dict[story_id]["meta"]["date"] = date

    sentences = nltk.sent_tokenize(text)

    for i, _ in enumerate(sentences):
        sent = sentences[i]
        event_dict[story_id]["sents"][str(i)] = {}
        event_dict[story_id]["sents"][str(i)]["content"] = ' '.join(tokenize(sent))
        corenlp_parsed = parse_sentence(sent)
        event_dict[story_id]["sents"][str(i)]["parsed"] = format_parsed_str(corenlp_parsed)
    event_updated = send_to_petr(event_dict)
    event_updated[story_id]["meta"]["date"] = df[6]
    return event_updated

def resolve_list(x):
    if isinstance(x, list):
        return [a for i in x for a in resolve_list(i)]
    else:
        return [str(x).strip().lower()]

def resolve_list_join(x):

    if isinstance(x, list):
        if any(isinstance(item, list) for item in x):
            return [a for i in x for a in resolve_list(i)]
        else:
            return [" ".join(x).strip().lower()]
    else:
        return [x.strip().lower()]


In [5]:
field_names = ['link', 'headline', 'category', 'short_description', 'authors', 'date',
              'actor1', 'actor2', 'event_code', 'goldstein_score', 'nouns', 'actortext', 'actorroot',
              'eventtext', 'issues', 'events']

country_relations = pd.DataFrame(columns = field_names)
country_relations.head()

Unnamed: 0,link,headline,category,short_description,authors,date,actor1,actor2,event_code,goldstein_score,nouns,actortext,actorroot,eventtext,issues,events


In [52]:

rows = []
start = 0
interval = 100
end = start + interval
i = start

drop_rows = [28473, 65613, 73486, 119174, 124136, 125657, 128507, 128562, 177432, 182749, 206601]
news_resume = news.iloc[start : ][:]
drop_index = news_resume[news_resume['index'].isin(drop_rows)].index
news_resume.drop(drop_index, inplace=True)
news_resume.head(45)

total = news.shape[0]

for row in tqdm(news_resume.iterrows()):

    if i >= start and i <end:
        
        info = get_entity_information(row[1]) 
        story_id = str(list(info.keys())[0])

        if info[story_id]['sents']:
            for sent in info[story_id]['sents']:

                if 'events' in info[story_id]['sents'][sent]:

                    row_dict = { column_name : None for column_name in field_names }

                    row_dict['events'] = resolve_list(info[story_id]['sents'][sent]['events'])
                    row_dict['link'] = row[1][1]
                    row_dict['headline'] = row[1][2]
                    row_dict['short_description'] = row[1][4]
                    row_dict['authors'] = row[1][5]
                    row_dict['date'] = str(row[1][6])
                    row_dict['category'] = row[1][3]
                    row_dict['actor1'] = row_dict['events'][0]
                    row_dict['actor2'] = row_dict['events'][1]
                    row_dict['event_code'] = row_dict['events'][2]
                    try:
                        row_dict['goldstein_score'] = map_dict[row_dict['event_code']]
                    except:
                        print("skipping..")
                    row_dict['nouns'] = resolve_list_join(info[story_id]['sents'][sent]['meta']['nouns'])
                    row_dict['actortext'] = resolve_list(info[story_id]['sents'][sent]['meta']['actortext'])
                    row_dict['actorroot'] = resolve_list(info[story_id]['sents'][sent]['meta']['actorroot'])
                    row_dict['eventtext'] = resolve_list(info[story_id]['sents'][sent]['meta']['eventtext'])

                    if 'issues' in info[story_id]['sents'][sent]:
                        row_dict['issues'] = resolve_list(info[story_id]['sents'][sent]['issues'])
                    
                    rows.append(row_dict)
 
    else:
        print(f"Processing rows {start} to {end} out of {total} total rows")
        print(f"Saving {start} to {end} to disk...")
        country_relations_dict = pd.DataFrame.from_dict(rows)
        country_relations_dict.to_json(f"json_dir/country_relations_{start}_{end}.json", indent = 2)
        rows = []
        
        start = end
        end = end + interval
        
    i += 1
    
    if i > total:
        break

       

100it [02:05,  1.05it/s]

Processing rows 203200 to 203300 out of 204450 total rows
Saving 203200 to 203300 to disk...


200it [03:39,  1.24it/s]

Processing rows 203300 to 203400 out of 204450 total rows
Saving 203300 to 203400 to disk...


300it [05:10,  1.64it/s]

Processing rows 203400 to 203500 out of 204450 total rows
Saving 203400 to 203500 to disk...


400it [06:50,  1.16it/s]

Processing rows 203500 to 203600 out of 204450 total rows
Saving 203500 to 203600 to disk...


500it [08:44,  2.15s/it]

Processing rows 203600 to 203700 out of 204450 total rows
Saving 203600 to 203700 to disk...


600it [10:27,  1.74s/it]

Processing rows 203700 to 203800 out of 204450 total rows
Saving 203700 to 203800 to disk...


700it [12:13,  1.85it/s]

Processing rows 203800 to 203900 out of 204450 total rows
Saving 203800 to 203900 to disk...


800it [13:51,  1.40it/s]

Processing rows 203900 to 204000 out of 204450 total rows
Saving 203900 to 204000 to disk...


900it [15:42,  1.10it/s]

Processing rows 204000 to 204100 out of 204450 total rows
Saving 204000 to 204100 to disk...


1000it [17:30,  1.65it/s]

Processing rows 204100 to 204200 out of 204450 total rows
Saving 204100 to 204200 to disk...


1100it [19:16,  1.27s/it]

Processing rows 204200 to 204300 out of 204450 total rows
Saving 204200 to 204300 to disk...


1200it [20:59,  1.21it/s]

Processing rows 204300 to 204400 out of 204450 total rows
Saving 204300 to 204400 to disk...


1250it [21:51,  1.05s/it]


In [53]:
import glob, os

json_dir = 'json_dir'

json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

country_relations = pd.DataFrame(columns = field_names)

for file in file_list:
    temp_df = pd.read_json(file)
    country_relations = pd.concat([country_relations, temp_df], axis =0)

country_relations.reset_index(inplace=True)
country_relations.drop(['index'], axis = 1, inplace =True)
print(country_relations.shape)
country_relations.head(40)

(5075, 16)


Unnamed: 0,link,headline,category,short_description,authors,date,actor1,actor2,event_code,goldstein_score,nouns,actortext,actorroot,eventtext,issues,events
0,https://www.huffpost.com/entry/reporter-gets-a...,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22 00:00:00,usa,---med,20,3.0,"[new york, usa, united, states, of, america, j...","[new york, journalist]","[united states of america, ---]",[asked],,"[usa, ---med, 020]"
1,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22 00:00:00,---ppl,usa,180,-9.0,"[people, ~ppl, ~, u.s., usa, united, states, o...","[people, u.s.]","[---, united states of america]",[remained ... lashed],,"[---ppl, usa, 180]"
2,https://www.huffpost.com/entry/russian-control...,4 Russian-Controlled Ukrainian Regions Schedul...,WORLD NEWS,The concerted and quickening Kremlin-backed ef...,"Jon Gambrell, AP",2022-09-20 00:00:00,ukr,rus,40,1.0,"[ukrainian, ukr, ukraine, russia, rus, russian...","[ukrainian, russia]","[ukraine, russian federation]",[schedule ... join],,"[ukr, rus, 040]"
3,https://www.huffpost.com/entry/mark-frerichs-r...,Family Of American Held In Afghanistan Says He...,U.S. NEWS,"Mark Frerichs, a Navy veteran and civilian con...","Eric Tucker and Rahim Faiez, AP",2022-09-19 00:00:00,afg,---milcvl,181,-9.0,"[navy, civilian, ~milcvl, ~, ~, afghanistan, a...","[afghanistan, navy ... civilian]","[afghanistan, ---]",[<~milcvl> ... kidnapped in afg ... in <--->],,"[afg, ---milcvl, 181]"
4,https://www.huffpost.com/entry/bc-as-pakistan-...,WHO Raises Alarm On Disease In Flood-hit Areas...,WORLD NEWS,The World Health Organization is raising the a...,"ZARAR KHAN, AP",2022-09-17 00:00:00,igounohlhwho,pakhlhlab,8,5.0,"[world health organization, igounohlhwho, worl...","[world health organization, ---]","[world health organization, ---]",[raising ... alarm],"[natural_disaster, 1]","[igounohlhwho, pakhlhlab, 08]"
5,https://www.huffpost.com/entry/african-land-sn...,German Customs Officials Follow Trail Of Slime...,WEIRD NEWS,“Never in the history of the Duesseldorf custo...,,2022-09-16 00:00:00,---med,---med,10,0.0,"[spokesman, ~med, ~, media, ~med, ~]","[spokesman, spokesman]","[---, ---]",[told],"[id_smuggling, 1]","[---med, ---med, 010]"
6,https://www.huffpost.com/entry/afghan-adjustme...,Bill To Help Afghans Who Escaped Taliban Faces...,POLITICS,Republican outrage over the shoddy U.S. withdr...,Hamed Ahmadi and Arthur Delaney,2022-09-16 00:00:00,usa,---ref,51,3.4,"[afghanistan, afg, afghanistan, u.s., usa, uni...","[u.s., refugees]","[united states of america, ---]",[has ... spurred],"[refugees, 1]","[usa, ---ref, 051]"
7,https://www.huffpost.com/entry/sept-11-anniver...,"Biden Honors 9/11 Victims, Vows Commitment To ...",POLITICS,President Joe Biden has marked the 21st annive...,"COLLEEN LONG and AAMER MADHANI, AP",2022-09-11 00:00:00,usagov,usamil,30,4.0,"[president, joe biden, usagov, ~, joe, biden, ...","[president joe biden, pentagon]","[joe biden, united states military]",[has marked ... anniversary ... taking part],,"[usagov, usamil, 030]"
8,https://www.huffpost.com/entry/ukraine-forces-...,Ukraine Claws Back Some Territory; Nuclear Pla...,WORLD NEWS,Ukrainian forces are claiming new success in t...,"HANNA ARHIROVA and YURAS KARMANAU, AP",2022-09-10 00:00:00,ukr,rus,13,0.4,"[ukrainian, ukr, ukraine, russian, rus, russia...","[ukrainian, russian]","[ukraine, russian federation]",[are claiming ... success],,"[ukr, rus, 013]"
9,https://www.huffpost.com/entry/newspaper-front...,'Our Hearts Are Broken': Historic Front Pages ...,WORLD NEWS,Both British and international newspapers hono...,Kate Nicholson,2022-09-09 00:00:00,---med,gbr,51,3.4,"[newspapers, ~med, ~, u.k, gbr, united, kingdom]","[newspapers, u.k]","[---, united kingdom]",[honor],,"[---med, gbr, 051]"


In [54]:
#Save final processed file to disk
country_relations.to_json(f"country_relations_final.json", indent = 2)

In [58]:
#Reload final file
country_relations_final = pd.read_json("country_relations_final.json")
country_relations_final.head()

Unnamed: 0,link,headline,category,short_description,authors,date,actor1,actor2,event_code,goldstein_score,nouns,actortext,actorroot,eventtext,issues,events
0,https://www.huffpost.com/entry/reporter-gets-a...,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22,usa,---med,20,3.0,"[new york, usa, united, states, of, america, j...","[new york, journalist]","[united states of america, ---]",[asked],,"[usa, ---med, 020]"
1,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22,---ppl,usa,180,-9.0,"[people, ~ppl, ~, u.s., usa, united, states, o...","[people, u.s.]","[---, united states of america]",[remained ... lashed],,"[---ppl, usa, 180]"
2,https://www.huffpost.com/entry/russian-control...,4 Russian-Controlled Ukrainian Regions Schedul...,WORLD NEWS,The concerted and quickening Kremlin-backed ef...,"Jon Gambrell, AP",2022-09-20,ukr,rus,40,1.0,"[ukrainian, ukr, ukraine, russia, rus, russian...","[ukrainian, russia]","[ukraine, russian federation]",[schedule ... join],,"[ukr, rus, 040]"
3,https://www.huffpost.com/entry/mark-frerichs-r...,Family Of American Held In Afghanistan Says He...,U.S. NEWS,"Mark Frerichs, a Navy veteran and civilian con...","Eric Tucker and Rahim Faiez, AP",2022-09-19,afg,---milcvl,181,-9.0,"[navy, civilian, ~milcvl, ~, ~, afghanistan, a...","[afghanistan, navy ... civilian]","[afghanistan, ---]",[<~milcvl> ... kidnapped in afg ... in <--->],,"[afg, ---milcvl, 181]"
4,https://www.huffpost.com/entry/bc-as-pakistan-...,WHO Raises Alarm On Disease In Flood-hit Areas...,WORLD NEWS,The World Health Organization is raising the a...,"ZARAR KHAN, AP",2022-09-17,igounohlhwho,pakhlhlab,8,5.0,"[world health organization, igounohlhwho, worl...","[world health organization, ---]","[world health organization, ---]",[raising ... alarm],"[natural_disaster, 1]","[igounohlhwho, pakhlhlab, 08]"
