# Celebrity Death Data (via Wikipedia)

Creating a notebook to produce the dataset found at the [Kaggle Celebrity Deaths Page](https://www.kaggle.com/hugodarwood/celebrity-deaths).

This notebook aims to replace the current dataset, which is incomplete (not up to date): there is no notebook to re-run to fetch current information, and some of its fields are badly parsed.

# Part I: Scraping Raw Pages, Local Download

This notebook gets all Wikipedia death summary pages and stores them locally as JSON files.

### Description

This notebook downloads all monthly death lists to a local folder.

In [104]:
import csv
import json
import os
import re
import sys
from os import listdir
from os.path import isfile, join
#reload(sys)
#sys.setdefaultencoding('utf-8')

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# English month name -> calendar month number (1 for January ... 12 for December).
month_to_num = {
    month_name: month_number
    for month_number, month_name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        1)
}

Iterators:

In [126]:
# batch query limit is 50 (presumably titles per API request -- TODO confirm).
# The batch cells below send one title per month for every year listed here,
# so this 3-year range produces 36 titles in the all-years request.
year_list = range(2004,2007)

### Scrape all monthly death pages

Scrape every monthly death page and store each one as a JSON file in the '../out/raw_pages' directory.

In [54]:
base_url_prefix = 'https://en.wikipedia.org/w/api.php?action=query&titles='
base_url_suffix = '&prop=revisions&rvprop=content&format=json'

%%time
for year in year_list:
    for month in month_to_num.keys():
        filename = str(year) + '_' + str(month_to_num[month]) + '_deaths.json'
        url = (base_url_prefix +  
               'Deaths_in_' +
               month + '_' + str(year) + 
               base_url_suffix)
        content = requests.get(url).json()
        with open("../out/raw_pages/" + filename, "wb") as outfile:
            json.dump(content, outfile)
            outfile.close()

### Helper functions

In [105]:
# Raw strings keep the regex escapes (\s, \[, \.) from being read as (invalid)
# string escapes, which newer Python versions warn about.
desc_death_re = re.compile(r'(.*?),? ?((?:.+)*?). (?:.*?)?', re.DOTALL|re.MULTILINE)
# Group 1 is the text that precedes the first '<' or (optionally bracketed) 'http'.
death_clean_no_url_re = re.compile(r'\s?(.[^<]+)\.? ?(?=<|\[?http)(?:.*)?$')
# death_clean_no_url_re = re.compile('\s?\w(.[^<])+[.]?(?:<|\[?http)(?:.*)?$')


# bad design below: just remove urls and refs instead of extracting
def get_description_and_death(text):
    """Split a free-text entry into a description and a cause of death.

    Input: single text string to be processed

    Output: list of two string elements
      - first string is the description of the person (everything before the
        last comma)
      - second string is the cause of death (the text after the last comma);
        it is '' when the text contains no comma

    When the text contains 'http' or '<ref', only the part captured by
    death_clean_no_url_re is used. If that pattern does not match, the text is
    printed for inspection and processed unchanged. All '=' characters are
    removed before splitting.
    """
    text_no_url = text
    if ('http' in text) or ('<ref' in text):
        url_match = death_clean_no_url_re.match(text)
        if url_match:
            text_no_url = url_match.group(1)
        else:
            # Report the entry that could not be cleaned and carry on with the
            # raw text; a blocking prompt here would stall a batch run.
            print(text)
    # str.split always yields at least one element, so only the one-part and
    # the many-part cases exist.
    text_parts = text_no_url.replace('=', '').split(',')
    if len(text_parts) == 1:
        return text_parts + ['']
    return [",".join(text_parts[:-1]), text_parts[-1]]
    
"""
Runs get_description_and_death() on the last element of a list

Input: list of length n
Output: list of length (n+1) with last element broken into description and death
"""
def add_description_and_death(entry_list):
    """Replace the last element of entry_list by its description/death pair.

    The final element is handed to get_description_and_death(), and the two
    strings it returns are appended to a copy of all preceding elements.
    """
    leading_fields = entry_list[:-1]
    final_field = entry_list[-1]
    return leading_fields + get_description_and_death(final_field)

# Raw strings keep \d, \s and \[ from being treated as string escapes.
# Two underscore-separated numbers at the start of a key such as '2004_1'.
mo_yr_key_re = re.compile(r'(\d+)_(\d+).*?')
# '[[name]], age, remaining text' -- the remaining text is optional.
name_age_re = re.compile(r'\s?\[\[(.*?)\]\], (\d+), (.+)?$', re.MULTILINE)


def parse_month_year_name_age(my_key, text_entry):
    """Pull the key numbers, name, age and trailing text out of one entry.

    Inputs:
      my_key     - string starting with two numbers joined by '_'; the raw-page
                   filenames in this notebook use <year>_<month number>
      text_entry - wikitext entry shaped like '[[name]], age, rest'; newlines
                   are removed before matching

    Output: list of five elements, all taken verbatim from the regex groups:
      [first key number, second key number, name link text, age, rest]
    The numbers are returned as strings, and the last element is None when
    nothing follows the age.

    Raises AttributeError when either pattern fails to match, so callers are
    expected to pre-filter entries with name_age_re.
    """
    key_fields = mo_yr_key_re.match(my_key).groups()
    entry_fields = name_age_re.match(text_entry.replace('\n', '')).groups()
    return list(key_fields) + list(entry_fields)

# Raw strings keep the bracket and pipe escapes from being read as string escapes.
# Captures the text after '[[' up to the first '|' or the closing ']]'.
link_re = re.compile(r'\[\[([^\|\]]*)(?=\||\]\])', re.DOTALL)
# Captures one complete '[[...]]' link that has no nested square brackets.
link_all_re = re.compile(r'(\[\[(?:[^\[\]])+\]\])')


def extract_link_text(link_block):
    """Find the first wikitext link in a text block and extract its leading part.

    Input: text block

    Output:
      - when a link is found: a 1-tuple holding the text between '[[' and the
        first '|' (or the closing ']]' when there is no '|')
      - when no link is found: link_block itself, unchanged

    NOTE(review): the two return types differ (tuple vs. string). Callers index
    the result with [0], which yields the first character in the no-link case.
    """
    link_present = link_re.search(link_block)
    if link_present:
        return link_present.groups()
    return link_block

"""
Helper function for removing link text when using re.sub--identifies a wikitext URL

Input: re.match object
Output: text of matched object 
"""
def link_only(matchobj):
    """Replacement callback: turn one matched wikitext link into plain text.

    Takes the first captured group of the match, passes it through
    extract_link_text() and returns element 0 of the result.
    """
    matched_link = matchobj.group(1)
    return extract_link_text(matched_link)[0]

"""
Testing function
"""
def link_only_special(text):
    """Debugging aid: print the captured groups of a match object.

    Input: re match object
    Output: None (the groups tuple is written to stdout)
    """
    # The call form prints the same tuple on Python 2 and also parses on Python 3.
    print(text.groups())

"""
Substitute all wikitext URL links with the display text for the URL

Input: text block
Output: text block with links removed
"""
def remove_link_text(text_block):
    """Rewrite every wikitext link in a text block as plain text.

    Each span matched by link_all_re is replaced by whatever link_only()
    returns for it; text outside the links is kept as is.
    """
    return link_all_re.sub(link_only, text_block)


# One or more space-separated words that each start with A-Z, followed by a space.
# Raw string keeps \s from being treated as a string escape.
natl_pattern1 = re.compile(r' ?((?:[A-Z][^\s]+ ?)+) ', re.UNICODE)

# Descriptions for which get_nationality_text() found no leading capitalised
# words; kept so they can be inspected afterwards.
natl_unmatched_list = []


def get_nationality_text(desc_text):
    """Return the leading run of capitalised words of a description.

    Input: description text; '[' characters at either end are ignored when
           matching
    Output: the matched words as one string, or None when the description does
            not start with such a run

    Side effect: an unmatched description is appended (unmodified) to the
    module-level natl_unmatched_list.
    """
    natl_match = natl_pattern1.match(desc_text.strip('['))
    if natl_match:
        return natl_match.group(1)
    natl_unmatched_list.append(desc_text)
    return None
    
        
# essentially does the same thing as extract_link_text
def get_wiki_url(name_text):
    """Return the text before the first '|' with surrounding brackets removed.

    '[' characters are stripped from both ends first, then ']' characters.
    """
    link_target = name_text.split('|', 1)[0]
    without_open_brackets = link_target.strip('[')
    return without_open_brackets.strip(']')


def remove_end_period(text):
    """Drop a whitespace character at the end of the text, then a final period.

    Only one period is removed, so 'abc..' becomes 'abc.'.
    """
    # Raw strings keep \s and \. from being treated as string escapes.
    without_trailing_space = re.sub(r'\s$', '', text)
    return re.sub(r'\.$', '', without_trailing_space)

def remove_beginning_space(text):
    """Remove the run of space characters (not tabs or newlines) that opens the text."""
    leading_spaces = '^ +'
    return re.sub(leading_spaces, '', text)

def text_clean(text):
    """Normalise one text field; values that are not of type str pass through.

    Steps, in order:
      1. if death_clean_no_url_re matches, keep only its first group
      2. rewrite wikitext links with remove_link_text()
      3. drop the trailing whitespace/period with remove_end_period()
      4. delete every remaining '[' and ']'
      5. strip leading spaces with remove_beginning_space()
    """
    if type(text) != str:
        return text

    url_match = death_clean_no_url_re.match(text)
    body = url_match.group(1) if url_match else text

    unlinked = remove_link_text(body)
    trimmed = remove_end_period(unlinked)
    unbracketed = trimmed.replace('[', '').replace(']', '')
    return remove_beginning_space(unbracketed)

### API Batch Scrape

Requesting many page titles in a single API call avoids API call limits and throttling, and it also makes the queries faster.

In [127]:
death_rep_words = 'Deaths_in_'

# One '|'-separated string of monthly page titles per year in year_list.
batch_year_titles = []

for year in year_list:
    year_suffix = '_' + str(year)
    month_titles_list = [death_rep_words + month + year_suffix
                         for month in month_to_num.keys()]
    year_base_str = '|'.join(month_titles_list)
    batch_year_titles.append(year_base_str)

# Every year's titles combined into a single '|'-separated string.
batch_year_str = '|'.join(batch_year_titles)

Single year:

In [128]:
# Display the '|'-joined page titles built for the first year in year_list.
batch_year_titles[0]

'Deaths_in_February_2004|Deaths_in_October_2004|Deaths_in_January_2004|Deaths_in_April_2004|Deaths_in_November_2004|Deaths_in_March_2004|Deaths_in_August_2004|Deaths_in_May_2004|Deaths_in_December_2004|Deaths_in_June_2004|Deaths_in_September_2004|Deaths_in_July_2004'

In [129]:
# Query URL covering every monthly page of the first year.
batch_url1 = ''.join([base_url_prefix, batch_year_titles[0], base_url_suffix])

In [130]:
# Display the assembled single-year query URL.
batch_url1

'https://en.wikipedia.org/w/api.php?action=query&titles=Deaths_in_February_2004|Deaths_in_October_2004|Deaths_in_January_2004|Deaths_in_April_2004|Deaths_in_November_2004|Deaths_in_March_2004|Deaths_in_August_2004|Deaths_in_May_2004|Deaths_in_December_2004|Deaths_in_June_2004|Deaths_in_September_2004|Deaths_in_July_2004&prop=revisions&rvprop=content&format=json'

In [131]:
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops on an HTTP error instead of parsing an error body.
batch_response1 = requests.get(batch_url1, timeout=30)
batch_response1.raise_for_status()
batch_raw1 = batch_response1.json()

In [132]:
# get page names for each month
# Each 'normalized' entry maps the requested title ('from') to the page title
# the API uses ('to'). Reading 'to' by key avoids relying on dict ordering and
# also works on Python 3, where dict.values() cannot be indexed.
page_names = [date_elem['to'] for date_elem in batch_raw1['query']['normalized']]

All years:

In [133]:
# Query URL covering the monthly pages of every year in year_list.
batch_url_all = ''.join([base_url_prefix, batch_year_str, base_url_suffix])

In [134]:
%%time
batch_raw_all = requests.get(batch_url_all).json()

CPU times: user 68.2 ms, sys: 15.2 ms, total: 83.5 ms
Wall time: 1.28 s


In [135]:
# Read the normalized page title ('to') by key; indexing dict.values() depends
# on dict ordering and is not possible on Python 3.
page_names_all = [date_elem['to'] for date_elem in batch_raw_all['query']['normalized']]

In [146]:
# Page titles are expected to read 'Deaths in <Month> <Year>'.
title_month_year_re = re.compile(r'Deaths in (\w+) (\d+)$')
# Lines that start with a bare URL are removed from an entry before parsing.
url_line_re = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

# Page title -> list of parsed entries for that month.
month_year_lists = {}

# Walk the pages themselves and key each result by the page's own title.
# Pairing the i-th normalized title with the i-th page is unsafe: the
# 'normalized' list and the 'pages' dict need not share an order.
for page in batch_raw_all['query']['pages'].values():
    month_str = page.get('title', '')
    title_match = title_month_year_re.match(month_str)
    if (not title_match
            or title_match.group(1) not in month_to_num
            or 'revisions' not in page):
        # Make skipped pages visible instead of failing midway or mislabeling.
        print('Skipping page without a parsable title or content: ' + repr(month_str))
        continue

    month_name, year_str = title_match.groups()
    # <year>_<month number>, the same order as the raw-page filenames
    # (assumed to be the order downstream code expects -- TODO confirm).
    my_key = year_str + '_' + str(month_to_num[month_name])

    raw_page = page['revisions'][0]['*']
    if sys.version_info[0] < 3:
        # Keep the Python 2 behavior of working on utf-8 encoded byte strings.
        raw_page = raw_page.encode('utf-8')

    parsed_entries = []
    for entry in raw_page.rstrip().split('*'):
        entry = url_line_re.sub('', entry)
        # Test the same text that is parsed, so that parsing cannot fail on an
        # entry that matched only before its URL lines were removed.
        if name_age_re.match(entry.replace('\n', '')):
            parsed_entries.append(
                add_description_and_death(
                    parse_month_year_name_age(my_key, entry)))
    month_year_lists[month_str] = parsed_entries

In [148]:
# Number of parsed entries for the first month in the dictionary.
# list() is needed because dict.values() cannot be indexed on Python 3.
len(list(month_year_lists.values())[0])

179

In [None]:
# Preview the first 1000 characters of the first returned page's wikitext;
# the full page would flood the notebook output.
list(batch_raw_all['query']['pages'].values())[0]['revisions'][0]['*'][:1000]

### Python Pooling Multi-Processing Scrape

A multiprocessing pool could fetch pages in parallel for additional speed; however, this might be very taxing on Wikipedia's servers.