# Celebrity Death Data (via Wikipedia)

Creating a notebook to produce the dataset found at the [Kaggle Celebrity Deaths Page](https://www.kaggle.com/hugodarwood/celebrity-deaths).

This notebook aims to replace the current dataset, which is incomplete (not up to date) because there is no notebook that can be re-run to refresh it, and which has bad parses in some of its fields.

**Current branch: Consolidating pipeline into one notebook while implementing batch queries.**

In [1]:
import numpy as np
import pandas as pd
import re
import json
import csv
import requests

from os import listdir
from os.path import isfile, join
import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

from bs4 import BeautifulSoup as bs

### Batch query Wikipedia for monthly death pages

Iterators:

In [2]:
# Years whose monthly death pages will be queried. range() excludes its stop
# value, so this covers 2004 through 2016. (The Wikipedia batch query limit is
# 50 titles per request, so these pages are fetched in several batches.)
year_list = range(2004, 2017)
# A single pre-formatted argument prints the same text under Python 2 and 3.
print('Date range: %d - %d' % (min(year_list), max(year_list)))

Date range: 2004 - 2016


Other global variables

In [3]:
# Lookup from English month name to calendar month number (January = 1).
month_to_num = dict(zip(
    [
        'January', 'February', 'March', 'April',
        'May', 'June', 'July', 'August',
        'September', 'October', 'November', 'December',
    ],
    range(1, 13),
))

In [4]:
# max number of terms in one Wikipedia batch query
api_qmax = 50

# Fixed pieces of the API request URL: the page titles being requested are
# placed between the prefix (ends with 'titles=') and the suffix (asks for the
# content of each page's revision, returned as JSON).
mo_yr_url_prefix = 'https://en.wikipedia.org/w/api.php?action=query&titles='
mo_yr_url_suffix = '&prop=revisions&rvprop=content&format=json'

### Helper functions

Functions from previous version (master branch) that ran individual queries (instead of batch queries) in serial.

In [5]:
# NOTE(review): desc_death_re is not referenced by any code visible in this
# notebook section; it is kept unchanged in case other cells rely on it.
desc_death_re = re.compile('(.*?),? ?((?:.+)*?). (?:.*?)?', re.DOTALL|re.MULTILINE)
# Captures the leading text of a string that is followed by a '<' (e.g. a
# <ref> tag) or by an optionally bracketed 'http' link.
death_clean_no_url_re = re.compile(r'\s?(.[^<]+)\.? ?(?=<|\[?http)(?:.*)?$')

# bad design below: just remove urls and refs instead of extracting
def get_description_and_death(text):
    """
    Split an entry's free text into a description and a cause of death.

    Input: single text string to be processed (None or '' is tolerated)

    Output: list of two string elements
      - first string is description of person
      - second string is cause of death
        (text after the last comma; '' when the text has no comma)

    Any '=' characters are removed before splitting. When the text is followed
    by a '<' tag or an http link, only the part captured by
    death_clean_no_url_re is used.
    """
    # name_age_re's trailing group is optional, so callers can hand over None
    # when nothing follows the age; treat that like empty text.
    if not text:
        return ['', '']

    url_match = death_clean_no_url_re.match(text)
    text_no_url = url_match.group(1) if url_match else text

    # rpartition splits on the LAST comma, so commas inside the description
    # are preserved.
    description, comma, death = text_no_url.replace('=', '').rpartition(',')
    if not comma:
        # no comma at all: rpartition puts the whole text in the last slot
        return [death, '']
    return [description, death]
    
def add_description_and_death(entry_list):
    """
    Runs get_description_and_death() on the last element of a list.

    Input: list of length n
    Output: list of length (n+1): every element but the last is kept as is,
      and the last one is replaced by its description and death strings
    """
    leading_fields = entry_list[:-1]
    description_and_death = get_description_and_death(entry_list[-1])
    return leading_fields + description_and_death

# Raw strings keep the regex escapes (\d, \s, \[) out of Python's own string
# escape processing; the compiled patterns are unchanged.

# two '_'-separated runs of digits at the start of a key
mo_yr_key_re = re.compile(r'(\d+)_(\d+).*?')
# '[[<name link>]], <age digits>, <optional remaining text>'
name_age_re = re.compile(r'\s?\[\[(.*?)\]\], (\d+), (.+)?$', re.MULTILINE)

def add_month_year_list(entry_list, mo_yr_key='_'):
    """
    Prepend the '_'-separated parts of a key to an entry.

    Input: entry (list of fields) and a key string such as '2013_9'
    Output: new list with the key parts first, followed by the entry's fields
      (the default key '_' contributes two empty strings)
    """
    return mo_yr_key.split('_') + list(entry_list)

def parse_month_year_name_age(text_entry):
    """
    Parse one entry with name_age_re after dropping its newline characters.

    Input: text entry string
    Output: list of the three groups captured by name_age_re (text inside the
      leading [[...]] link, age digits, and the remaining text, which is None
      when nothing follows the age), or None when the entry does not match
    """
    single_line = text_entry.replace('\n', '')
    parsed = name_age_re.match(single_line)
    if parsed is None:
        return None
    return list(parsed.groups())

# Captures the text that follows '[[' up to the first '|' or the closing ']]'.
link_re = re.compile(r'\[\[([^\|\]]*)(?=\||\]\])', re.DOTALL)
# Matches one complete [[...]] link that contains no nested square brackets.
link_all_re = re.compile(r'(\[\[(?:[^\[\]])+\]\])')

# Used to be messy, not anymore!

def extract_link_text(link_block):
    """
    Find the first wikitext link in a text block and extract its leading part.

    Input: text block
    Output: when a link is found, the tuple of groups captured by link_re (one
      element: the link text before any '|'); otherwise the input text block
      itself, unchanged
    """
    found = link_re.search(link_block)
    return found.groups() if found else link_block

def link_only(matchobj):
    """
    Replacement callback for re.sub: reduces a matched wikitext link to the
    text extracted from it.

    Input: re match object whose first group holds the link
    Output: first element of what extract_link_text() returns for that group
    """
    return extract_link_text(matchobj.group(1))[0]


def remove_link_text(text_block):
    """
    Replace every wikitext link matched by link_all_re with the plain text
    that link_only() produces for it.

    Input: text block
    Output: text block with links replaced
    """
    return link_all_re.sub(link_only, text_block)


# One or more words that each begin with a capital A-Z letter, separated by
# single spaces and followed by a space.
natl_pattern1 = re.compile(r' ?((?:[A-Z][^\s]+ ?)+) ', re.UNICODE)

# Collects every description for which natl_pattern1 found no match.
natl_unmatched_list = []

def get_nationality_text(desc_text):
    """
    Extract the leading run of capitalized words from a description.

    Leading and trailing '[' characters are stripped before matching.

    Input: description text
    Output: the captured run of capitalized words; when nothing matches, the
      original description is appended to natl_unmatched_list and returned
    """
    matched = natl_pattern1.match(desc_text.strip('['))
    if matched is None:
        natl_unmatched_list.append(desc_text)
        return desc_text
    return matched.group(1)
    
        
# essentially does the same thing as extract_link_text
def get_wiki_url(name_text):
    """
    Return the part of a link before the first '|', with surrounding '['
    characters stripped first and surrounding ']' characters stripped second.
    """
    before_pipe = name_text.partition('|')[0]
    without_open_brackets = before_pipe.strip('[')
    return without_open_brackets.strip(']')


def remove_end_period(text):
    """
    Strip a whitespace character at the end of the text (pattern '\\s$'), then
    a period at the end of what remains (pattern '\\.$').
    """
    without_trailing_space = re.sub(r'\s$', '', text)
    return re.sub(r'\.$', '', without_trailing_space)

def remove_beginning_space(text):
    """Strip the run of space characters (spaces only) at the start of the text."""
    leading_spaces = re.compile(r'^ +')
    return leading_spaces.sub('', text)

def clean_text(text):
    """
    Clean one text field; values whose type is not exactly str are returned
    untouched.

    Steps, in order:
      1. keep only the part captured by death_clean_no_url_re when it matches
      2. replace wikitext links via remove_link_text()
      3. strip trailing whitespace/period via remove_end_period()
      4. delete any remaining '[' and ']' characters
      5. strip leading spaces via remove_beginning_space()
    """
    if type(text) != str:
        return text

    url_match = death_clean_no_url_re.match(text)
    without_url = url_match.group(1) if url_match else text

    without_links = remove_link_text(without_url)
    without_period = remove_end_period(without_links)
    without_brackets = without_period.replace('[', '').replace(']', '')
    return remove_beginning_space(without_brackets)

New functions in this branch/notebook:

In [6]:
# A capitalized word, a space and a four-digit year at the end of the string.
date_eol_re = re.compile(r'([A-Z][a-z]+) (\d{4})$')

def month_str2key(month_str):
    """
    Convert Wikipedia 'Deaths in (str: month) (int: year)' titles into
    '(int: year)_(int: month)' keys, e.g. '... December 2013' -> '2013_12'.

    The month name is looked up in month_to_num. A string that does not end
    with '<Word> <4 digits>' is returned unchanged.
    """
    found = date_eol_re.search(month_str)
    if found is None:
        return month_str
    month_name, year_num = found.groups()
    month_num = str(month_to_num[month_name])
    return '_'.join([year_num, month_num])

# Matches an optional '[' plus an http:// or https:// URL through the end of
# its line, including any line-break characters that follow.
no_url_re = re.compile('\[?https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

def remove_urls(text):
    """
    Remove URLs from a text block.

    Everything from the URL to the end of its line (and the line breaks after
    it) is deleted.
    """
    return no_url_re.sub('', text)

In [7]:
# Sanity check: only the trailing '<Month> <year>' of the title decides the key.
assert month_str2key('asdfasdf asdf asdf December 2013') == '2013_12'

### API Batch Scrape

Batching the queries avoids API call limits and throttling, and also makes the scrape faster.

#### List of monthly death pages to be queried

Create individual search terms for API query:

In [8]:
# Prefix shared by every monthly page title.
death_rep_words = 'Deaths_in_'
# One title per (month, year) pair, e.g. 'Deaths_in_January_2004'.
mo_yr_elems = [death_rep_words + month + '_' + str(year) 
              for month in month_to_num.keys()
              for year in year_list]

num_mo_yr_elems = len(mo_yr_elems)
# Number of completely full batches. '//' keeps this an integer under
# Python 3 (or `from __future__ import division`) as well as Python 2, where
# plain '/' on ints already floors.
num_mo_yr_batch_queries = num_mo_yr_elems // api_qmax

Group query terms into batches of 50 (Wikipedia API's batch query limit):

In [10]:
# Collect the titles into consecutive groups of at most api_qmax and join each
# group with '|', the separator between titles in one API request.
#
# Stepping through the list by api_qmax also covers the final partial group,
# so no separate "remaining entries" step is needed. Unlike computing the tail
# from a leftover loop variable, this works when there are fewer than api_qmax
# titles and emits no empty trailing query when the count is an exact multiple
# of api_qmax.
mo_yr_batch_queries = [
    "|".join(mo_yr_elems[start_idx:start_idx + api_qmax])
    for start_idx in range(0, len(mo_yr_elems), api_qmax)
]

#### Query Wikipedia API

Batch queries to Wikipedia API stored as a list of results for each batch query.

In [11]:
%%time

# query API
mo_yr_batch_results = []

for mo_yr_batch_query in mo_yr_batch_queries:
    batch_url = mo_yr_url_prefix + mo_yr_batch_query + mo_yr_url_suffix
    json_ret_val = requests.get(batch_url).json()
    mo_yr_batch_results.append(json_ret_val)

CPU times: user 836 ms, sys: 133 ms, total: 969 ms
Wall time: 12.7 s


In [12]:
# q_headers: one key per normalized title reported by the API
# q_contents: [page title, page wikitext] pairs for every returned page
q_headers = []
q_contents = []

for result in mo_yr_batch_results:
    # Each 'normalized' entry maps the requested title ('from', with
    # underscores) to the title the API resolved it to ('to', with spaces).
    # Only the 'to' form ends in '<Month> <year>' as month_str2key expects, so
    # read it by key rather than taking whichever value the dict yields first.
    new_headers = [month_str2key(date_elem['to'])
                   for date_elem in result['query']['normalized']]
    q_headers.extend(new_headers)
    
    # '*' holds the wikitext of the page's returned revision
    new_contents = [[page_result['title'], page_result['revisions'][0]['*']] 
                    for page_result in result['query']['pages'].values()]
    q_contents.extend(new_contents)

Unpacking queries into lists of summaries by month

In [13]:
%%time
q_contents_dict = {}

for q_page in q_contents:
    q_key = month_str2key(q_page[0])
    q_list = [
        add_month_year_list(
            add_description_and_death(
                parse_month_year_name_age(remove_urls(entry))),
        q_key)
        for entry in q_page[1].encode('utf-8').rstrip().split('*')
        if re.match(name_age_re, entry.replace('\n', ''))
    ]
    q_contents_dict[q_key] = q_list

CPU times: user 1.68 s, sys: 121 ms, total: 1.8 s
Wall time: 1.78 s


In [14]:
# Flatten the per-key entry lists into one list of entries.
all_entries = []
for entry_list in q_contents_dict.values():
    all_entries.extend(entry_list)

In [15]:
# One row per entry; columns keep pandas' default integer labels.
df_full = pd.DataFrame(all_entries)
# A single parenthesized argument prints the same text under Python 2 and 3.
print(df_full.shape)
df_full.head()

(55492, 6)


Unnamed: 0,0,1,2,3,4,5
0,2013,9,Zvonko Bušić,67,Croatian airplane hijacker ([[TWA Flight 355]]),suicide by gunshot.
1,2013,9,Joaquim Justino Carreira,63,Portuguese-born Brazilian Roman Catholic prelate,Bishop of [[Roman Catholic Diocese of Guarulh...
2,2013,9,Pál Csernai,80,Hungarian footballer and manager ([[FC Bayern ...,[[North Korea national football team|North Ko...
3,2013,9,Ignacio Eizaguirre,92,"Spanish footballer ([[Valencia CF|Valencia]], ...",[[Spain national football team|national team]]).
4,2013,9,Ole Ernst,73,Danish actor.,


### Python Pooling Multi-Processing Scrape

Multiprocessing could provide additional speed; however, it might be very taxing on Wikipedia's servers.