## Set up Wikipedia API

In [5]:
!pip install wikipedia



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import wikipedia
# the following imports resolve the certificate error
import os
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
os.environ['CURL_CA_BUNDLE'] = ""
os.environ['PYTHONWARNINGS']="ignore:Unverified HTTPS request"

## Load in Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("GNI88.csv")
data.head(2)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Source Religion,Legacy Source Tag,Constituent Group,Media Name,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group
0,2018-01-10,3759306,North Korea makes deals and threats,7023849,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Neutral,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
1,2018-01-10,3759306,North Korea makes deals and threats,7023842,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,


In [4]:
# Filter out the non-names from the dataset

#non_name_regex = ".*(?:staff$|staff ).*|Letters to the Editor|from.*|(?i).*editorial.*|(?i).*editors.*|No by-line,|(?i).*readers.*|(?i)blog$"
nonname_flags = ['Unnamed', 'Unknown', 'Official', 'Spokesperson', 'Statement', 'Foreign', 'Lawmaker',
                 'Military','Navy','Advocacy','Journal', 'Analyst',
                 'Government','Agency', 'Agencies', 'Ministry','Department']
df = data[~data['Source Name'].str.contains('|'.join(nonname_flags), na=False)]

In [5]:
speakers = df['Source Name'].unique()
print("Number of unique speakers:", len(speakers))
print("Number of non-names:", len(data['Source Name'].unique()) - len(speakers))

Number of unique speakers: 14239
Number of non-names: 1162


In [6]:
data['Source Type'].unique()

array(['Foreign Gov/Mil Official', 'Federal Official', 'US Military',
       'Citizen', 'Media/Journalist', 'US Rep. & Staff',
       'State/Local Official', 'US Senate & Staff', 'Analyst/Commentator',
       'Nuke Organization - Academic', 'Non-Profit/NGO',
       'Nuke Organization', 'Other', 'Former Admin. Officials',
       'Think Tanks', 'Academic', 'Partisans/Fmr. Politicians',
       'International Orgs', 'Judicial Official', 'Regulator',
       'Corporate Official', 'Blogger', 'Public Polling', 'Attorney', nan,
       'Nuclear Scientist', 'Religious/Clerical', 'US Police',
       'Terrorist/Extremist', 'Ambassador', 'Defense', 'Defense Forces',
       'Information minister', 'Deputy', 'Nuke Organization - Other',
       'Nuclear Official', 'Chairman'], dtype=object)

## Testing Wikipedia API on Speaker Names

Using wikipedia.summary() directly can raise a disambiguation error because there are multiple possible search results. To work around this while keeping runtime costs down, it is assumed that the first item that wikipedia.search() returns is what we'd want.

However, this can also lead to an increase of false results if there just isn't a corresponding wikipedia page. sameName() is a helper function to determine whether two different strings are referring to the same person.

In [7]:
import re
from difflib import SequenceMatcher

"""Compares two strings and returns true if they're likely the same name.
Approx parameter accounts for nicknames (e.g., James Mattis as Jim)"""
def sameName(name1, name2, approx=True):
    name1 = re.sub(r'[^a-zA-Z]', '', name1).lower() #only alphabetic characters, lowercase
    name2 = re.sub(r'[^a-zA-Z]', '', name2).lower()
    if approx:
        if SequenceMatcher(a=name1,b=name2).ratio() > 0.7: #somewhat arbitrary; ratio of James Mattis / Jim Mattis = 0.81
            return True
    if name1 == name2 or name1 in name2 or name2 in name1:
        return True
    return False

#test
print(sameName('Kim Jong-un', 'kim jong'))
print(sameName('James Mattis', 'Jim Mattis'))

True
True


In [8]:
"""Takes in list of speakers, outputs 
- a dictionary with the speaker name and the corresponding first two sentences from the wikipedia summary
- a list of names that errored."""
def getSummary(speakers):
    error_names = []
    dict = {}
    for name in speakers: 
        try: 
            pageTitle = wikipedia.search(name)[0] #assuming the first term will most closely match
            if not sameName(name, pageTitle):
                error_names.append(name)
                continue
            wiki_summary = wikipedia.summary(pageTitle, sentences = 2, auto_suggest=False) 
            #auto_suggest=False makes sure the function will use the literal input; code will bug otherwise
        except: 
            error_names.append(name)
        else: 
            dict[name] = wiki_summary
    return dict, error_names

In [9]:
%%time 

summary, error_names = getSummary(speakers[:5001]) #~3hrs to run whole dataset

Wall time: 59min 58s


In [10]:
summary_df = pd.DataFrame.from_dict(summary, orient='index').reset_index()

In [11]:
summary_df.to_csv("wiki_summary_5000.csv",index=False)

In [12]:
summary_df.head()

Unnamed: 0,index,0
0,\nMoon Jae-in,Moon Jae-in (Korean: 문재인; Hanja: 文在寅; Korean p...
1,Ri Son Gwon,Ri Son-gwon (Korean: 리선권) is a North Korean po...
2,Cho Myoung- Gyon,Cho Myoung-gyon (Korean: 조명균; Hanja: 趙明均; born...
3,Paul Selva,"Paul Joseph Selva (born September 27, 1958) is..."
4,Donald Trump OLD,"Donald John Trump (born June 14, 1946) is an A..."


In [13]:
print("Number of Errors:", len(error_names))
print("Percent of Errors:", len(error_names)/5000)
error_names #still has some non-names from misspellings and noncomprehensive filter list earlier; account for case-sensitive
# more filter words: diplomat, think tank, media, advisers, Administration, U.N., Media, Air Force, 
# Army, Diplomat, Experts, Report, source, deal, poll, command, citizen

Number of Errors: 2290


['Rob Soofer',
 'Lisa Foxen',
 'Atji Pai',
 'Hiroyuku Suenaga',
 'Baek Tae-Hyun',
 'Lu Kang',
 'Many analysts',
 'Fumiaki Kubo',
 'Greg Weaver',
 'Sapan Shah',
 'Uri Friedman (OLD)',
 'David Welna',
 'Alexander Golts',
 'Malcolm Chalmers',
 'Report DOD',
 'Euan Graham',
 'Zhang Liangui',
 'Evans J.R. Revere',
 'Stephen Noerper',
 'Citizen South Korea',
 'Former U.S official',
 'Pavel Podvig (Russian Nuclear Forces Project)',
 'Go Myong-Hyun',
 'Masoud Jazayeri',
 'Charles Schumer',
 'Michele Kelemen',
 'Joshua Pollack',
 'Tong Zhao',
 'Aidan Foster-Carter',
 'Du Hyeogn Cha',
 'Koh Yu-hwan',
 'Rahul Bedi',
 'Goverment China',
 'Christopher Hill',
 'Kim Hack-yong',
 'Yang Shilong',
 'Robert A. Wood',
 'Marta Vicentini',
 'Harry Harris',
 'Patrick McEachern',
 'Alexander Shein',
 'William Potter',
 'Joseph Pae',
 'Leif-Eric Easley',
 'Alexandra Bell (Constituent)',
 'Christine Wormuth (OLD)',
 'Daryl Kimball',
 'Jenny Town',
 'Kingston Reif',
 'Yuri Borisov',
 'Yoon Young-chan',
 'Chun Ha

## Feature Testing

Ideas
- flag for music / popculture summaries (may indicate wrong associated person)
- account for  key words like "former" and "was..." (e.g., if a politican has formerly served in the military, can confuse classification)

In [None]:
#'|'.join(nonname_flags)
# case insensitive regex: /aBc/i would match AbC
media_flags = 'journalist|newspaper|website|magazine|news|agency'
US_military_flags = 'general|air force|commander|army|navy|marine' #add indicator for US as well
foreign_gov_flags = 'foreign|south korea|iran|north korea|china|prime minister'
academic_flags = 'professor|academic|academia|college|university'

## Suggested Key Words

Government
- politican
- administration
- diplomat
- government official
- president
- senator
- democrat
- republican
- representative
- policy advisor
- prime minister
- judge

Academic
- professor
- academic

Media
- journalist
- newspaper
- website
- magazine
- news agency

Military
- general
- air force
- commander
- army
- navy
- marine