# Celebrity Death Data (via Wikipedia)

Creating a notebook to produce the dataset found at the [Kaggle Celebrity Deaths Page](https://www.kaggle.com/hugodarwood/celebrity-deaths).

This notebook aims to replace the current dataset, which is out of date (there is no notebook to re-run to get up-to-date information) and has bad parses for some of the fields.

**Current branch: Consolidating pipeline into one notebook while implementing batch queries.**

In [376]:
import numpy as np
import pandas as pd
import re
import json
import csv
import time
import requests
import pickle

from os import listdir
from os.path import isfile, join


"""
use for writing out characters
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from bs4 import BeautifulSoup as bs

In [295]:
import requests

# Start from the headers requests would send by default, then identify this
# scraper. default_headers() returns a case-insensitive dict, so key casing
# does not matter when overriding an existing entry.
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Celeb Death Scraper GZ'
headers['From'] = 'geordgez@gmail.com'




### Batch query Wikipedia for monthly death pages

Iterators:

In [160]:
# Years whose monthly "Deaths in ..." pages are scraped (end of range is exclusive).
year_list = range(2004,2017)
# Single-argument parenthesised print: same output under the Python 2 print
# statement and the Python 3 print function.
print('Date range: %s - %s' % (min(year_list), max(year_list)))

Date range: 2004 - 2016


Other global variables

In [161]:
# English month names in calendar order; each maps to its 1-based month number.
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_to_num = {name: number for number, name in enumerate(month_names, 1)}

In [382]:
# max number of terms in one Wikipedia batch query
api_qmax = 50

# All three query kinds use the same endpoint prefix; only the suffix
# (which revision properties are requested) differs.
mo_yr_url_prefix = indiv_url_prefix = base_bday_prefix = (
    'https://en.wikipedia.org/w/api.php?action=query&titles='
)

# death summaries by month (and year): full page content
mo_yr_url_suffix = '&prop=revisions&rvprop=content&format=json'

# page sizes
indiv_url_suffix = '&prop=revisions&rvprop=size&format=json'

# dates of birth and death: content of section 0 only
base_bday_suffix = '&prop=revisions&rvprop=content&rvsection=0&format=json'

### Helper functions

Functions from previous version (master branch) that ran individual queries (instead of batch queries) in serial.

In [163]:
desc_death_re = re.compile('(.*?),? ?((?:.+)*?). (?:.*?)?', re.DOTALL|re.MULTILINE)
# Captures the text that precedes the first '<' / 'http' marker in an entry.
death_clean_no_url_re = re.compile('\s?(.[^<]+)\.? ?(?=<|\[?http)(?:.*)?$')
# death_clean_no_url_re = re.compile('\s?\w(.[^<])+[.]?(?:<|\[?http)(?:.*)?$')

# bad design below: just remove urls and refs instead of extracting
def get_description_and_death(text):
    """
    Split one entry's free text into a description and a cause of death.

    Input: single text string to be processed

    Output: list of two string elements
      - first string is the description of the person; it is the whole
        comma-separated text (with '=' removed), including the last clause
      - second string is the cause of death: the last comma-separated clause,
        or '' when there is only one clause or the last clause ends with ')'
    """
    # Drop anything from the first '<' / 'http' marker onwards, if present.
    ref_match = death_clean_no_url_re.match(text)
    body = ref_match.group(1) if ref_match else text

    clauses = body.replace('=', '').split(',')
    if len(clauses) == 1:
        return clauses + ['']

    full_desc = ','.join(clauses)
    last_clause = clauses[-1]
    # A last clause ending in ')' closes a parenthetical, so it is not a cause.
    if last_clause and last_clause[-1] == ')':
        return [full_desc, '']
    return [full_desc, last_clause]
    
"""
Runs get_description_and_death() on the last element of a list

Input: list of length n
Output: list of length (n+1) with last element broken into description and death
"""
def add_description_and_death(entry_list):
    """
    Replace the last element of a list with its description / cause-of-death pair.

    Input: list of length n
    Output: new list of length (n+1); the first n-1 elements are unchanged and
    the last element is replaced by the two strings returned by
    get_description_and_death()
    """
    leading_fields = entry_list[:-1]
    desc_and_death = get_description_and_death(entry_list[-1])
    return leading_fields + desc_and_death

# Matches a key made of two digit runs joined by '_' and captures both runs.
mo_yr_key_re = re.compile('(\d+)_(\d+).*?')
# Matches an entry shaped like '[[link]], <digits>, <rest>' and captures the
# link contents, the digits (age) and the optional remaining text.
name_age_re = re.compile('\s?\[\[(.*?)\]\], (\d+), (.+)?$', re.MULTILINE)

"""
Add the month and year as elements to an entry of type list
"""
def add_month_year_list(entry_list, mo_yr_key='_'):
    """
    Prefix an entry with the parts of an underscore-separated key.

    Input: iterable of entry fields, key string such as '2013_9'
    Output: new list holding the key's '_'-separated parts followed by the
    entry fields (the default key '_' contributes two empty strings)
    """
    key_parts = mo_yr_key.split('_')
    return key_parts + list(entry_list)

"""
Input: text entry string
Output: list of length 3 (name link text, age, remaining text), or None when the entry does not match
"""
def parse_month_year_name_age(text_entry):
    """
    Parse one entry of the form '[[name]], age, rest'.

    Input: text entry string (newlines are removed before matching)
    Output: list [name link text, age, remaining text] taken from the groups
    of name_age_re, or None when the entry does not match. The last element
    is None when nothing follows the age.
    """
    flattened = text_entry.replace('\n', '')
    entry_match = re.match(name_age_re, flattened)
    return list(entry_match.groups()) if entry_match else None

# Captures the part of a wikitext link that precedes '|' or the closing ']]'.
link_re = re.compile('\[\[([^\|\]]*)(?=\||\]\])', re.DOTALL)
# Matches a whole '[[...]]' link that contains no nested brackets.
link_all_re = re.compile('(\[\[(?:[^\[\]])+\]\])')

# Used to be messy, not anymore!

def extract_link_text(link_block):
    """
    Find the first wikitext link in a text block.

    Input: text block
    Output: 1-tuple holding the link text that precedes any '|' (i.e. the
    link target, not the piped display text) when a link is found; otherwise
    the input text block unchanged
    """
    found = link_re.search(link_block)
    if found is None:
        return link_block
    return found.groups()

"""
Helper function for removing link text when using re.sub--identifies a wikitext URL

Input: re.match object
Output: text of matched object 
"""
def link_only(matchobj):
    """
    Replacement callback for re.sub over wikitext links.

    Input: match object whose first group is a '[[...]]' link
    Output: first element of extract_link_text() applied to that link
    """
    wikilink = matchobj.group(1)
    return extract_link_text(wikilink)[0]


"""
Substitute all wikitext URL links with the display text for the URL

Input: text block
Output: text block with links removed
"""
def remove_link_text(text_block):
    """
    Replace every '[[...]]' link matched by link_all_re with link_only()'s result.

    Input: text block
    Output: text block with each matched link replaced by plain text
    """
    return link_all_re.sub(link_only, text_block)


# One or more leading capitalised words, followed by a space.
natl_pattern1 = re.compile(' ?((?:[A-Z][^\s]+ ?)+) ', re.UNICODE)

# Descriptions for which no leading capitalised words were found.
natl_unmatched_list = []

def get_nationality_text(desc_text):
    """
    Extract the leading run of capitalised words from a description.

    Input: description text (leading/trailing '[' characters are ignored)
    Output: the matched capitalised words; when nothing matches, the input is
    returned unchanged and also appended to natl_unmatched_list
    """
    leading_caps = natl_pattern1.match(desc_text.strip('['))
    if leading_caps is None:
        natl_unmatched_list.append(desc_text)
        return desc_text
    return leading_caps.group(1)
    
        
# essentially does the same thing as extract_link_text
def get_wiki_url(name_text):
    """
    Return the text before the first '|', with '[' and then ']' stripped
    from both ends.
    """
    before_pipe = name_text.split('|')[0]
    without_open = before_pipe.strip('[')
    return without_open.strip(']')


def remove_end_period(text):
    """
    Remove one trailing whitespace character, then one trailing period.
    """
    without_trailing_space = re.sub('\s$', '', text)
    return re.sub('\.$', '', without_trailing_space)

def remove_beginning_space(text):
    """
    Remove any run of space characters at the very start of the text.
    """
    leading_spaces = re.compile('^ +')
    return leading_spaces.sub('', text)

def clean_text(text):
    """
    Clean one free-text field; values that are not exactly `str` pass through.

    Steps, in order: keep only the text before a '<' / 'http' marker (when
    death_clean_no_url_re matches), replace wikitext links with plain text,
    drop a trailing whitespace character and period, delete any remaining
    '[' / ']' characters, and strip leading spaces.
    """
    if type(text) != str:
        return text

    ref_match = re.match(death_clean_no_url_re, text)
    kept = ref_match.groups()[0] if ref_match else text

    unlinked = remove_link_text(kept)
    trimmed = remove_end_period(unlinked)
    unbracketed = trimmed.replace('[', '').replace(']', '')
    return remove_beginning_space(unbracketed)

New functions in this branch/notebook:

In [164]:
"""
Convert Wikipedia 'Deaths in (str: month) (int: year)' titles into '(int: year)_(int: month)'
"""
# A capitalised word followed by a four-digit number at the end of the string.
date_eol_re = re.compile('([A-Z][a-z]+) (\d{4})$')

def month_str2key(month_str):
    """
    Turn a title ending in '<Month> <year>' into the key '<year>_<month number>'.

    The month name is looked up in month_to_num. A string that does not end
    in such a date is returned unchanged.
    """
    date_eol = date_eol_re.search(month_str)
    if date_eol is None:
        return month_str
    month_name, year_num = date_eol.groups()
    return year_num + '_' + str(month_to_num[month_name])

"""
Remove URLs from a text block
"""
# An optional '[', then http(s)://, the rest of that line and any line breaks after it.
no_url_re = re.compile('\[?https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

def remove_urls(text):
    """
    Delete every URL from the text, together with whatever follows it on the
    same line and the line breaks immediately after.
    """
    return no_url_re.sub('', text)


In [165]:
assert(month_str2key('asdfasdf asdf asdf December 2013') == '2013_12')

Additional function for easily batch scraping in the future:

In [216]:
def get_batch_queries(term_list, prefix_url, suffix_url, max_qlim = 48):
    """
    Build batch query URLs, each covering at most max_qlim terms.

    Input:
      - term_list: sequence of term strings (page titles)
      - prefix_url: URL text placed before the '|'-joined terms
      - suffix_url: URL text placed after the '|'-joined terms
      - max_qlim: maximum number of terms per query (the default of 48 stays
        under the 50-term batch limit noted elsewhere in this notebook)

    Output: list of URL strings in term order. The last URL holds the
    remaining terms; an empty term_list yields an empty list.

    Raises: ValueError if max_qlim is smaller than 1.

    Terms are joined as-is; this function does no URL escaping.
    """
    if max_qlim < 1:
        raise ValueError('max_qlim must be at least 1')

    terms = list(term_list)
    # Stepping by max_qlim covers short lists (fewer terms than one batch)
    # and never produces a trailing query with no terms.
    return [
        prefix_url + "|".join(terms[start_idx:start_idx + max_qlim]) + suffix_url
        for start_idx in range(0, len(terms), max_qlim)
    ]

### API Batch Scrape

Batching the requests avoids API call limits and throttling, and also makes the queries faster.

#### List of monthly death pages to be queried

Create individual search terms for API query:

In [167]:
death_rep_words = 'Deaths_in_'

# One page title per (month, year) pair: months in the outer loop, years inner.
mo_yr_elems = []
for month in month_to_num.keys():
    for year in year_list:
        mo_yr_elems.append(death_rep_words + month + '_' + str(year))

# January 2017 lies outside year_list and is added explicitly.
mo_yr_elems.append('Deaths_in_January_2017')

mo_yr_batch_queries = get_batch_queries(mo_yr_elems, mo_yr_url_prefix, mo_yr_url_suffix)

#### Query Wikipedia API

Batch queries to Wikipedia API stored as a list of results for each batch query.

In [189]:
%%time

# query API
# One decoded JSON response per batch query, in query order.
mo_yr_batch_results = [
    requests.get(mo_yr_batch_query, headers=headers).json()
    for mo_yr_batch_query in mo_yr_batch_queries
]

CPU times: user 823 ms, sys: 157 ms, total: 980 ms
Wall time: 7.75 s


Unpack query results:

In [190]:
# [title, wikitext] pairs, one per page that came back with revision content.
q_contents = []
# Titles of pages returned without revisions, kept so gaps in the data are
# visible instead of raising a KeyError part-way through unpacking.
q_missing_titles = []

for result in mo_yr_batch_results:
    for page_result in result['query']['pages'].values():
        revisions = page_result.get('revisions')
        if not revisions:
            q_missing_titles.append(page_result.get('title'))
            continue
        # '*' holds the revision text in this response format.
        q_contents.append([page_result['title'], revisions[0]['*']])

Unpacking queries into lists of summaries by month

In [191]:
%%time
# Maps each 'year_month' key to the list of parsed entries for that page.
q_contents_dict = {}

for q_page in q_contents:
    q_key = month_str2key(q_page[0])
    q_list = []
    # Each '*' in the page text starts a new bulleted entry.
    for entry in q_page[1].encode('utf-8').rstrip().split('*'):
        # Keep only entries shaped like '[[name]], age, ...'.
        if not re.match(name_age_re, entry.replace('\n', '')):
            continue
        parsed_entry = parse_month_year_name_age(remove_urls(entry))
        with_desc = add_description_and_death(parsed_entry)
        q_list.append(add_month_year_list(with_desc, q_key))
    q_contents_dict[q_key] = q_list

CPU times: user 1.57 s, sys: 134 ms, total: 1.71 s
Wall time: 1.67 s


In [192]:
# Flatten the per-month entry lists into one list of rows.
all_entries = []
for entry_list in q_contents_dict.values():
    all_entries.extend(entry_list)

In [193]:
# Column order follows the row layout: key parts, then name/age, then text fields.
full_columns = ['year', 'month', 'name', 'age', 'desc', 'cause_of_death']

df_full = pd.DataFrame(all_entries)
df_full.columns = full_columns
print(df_full.shape)
df_full.head()

(55492, 6)


Unnamed: 0,year,month,name,age,desc,cause_of_death
0,2013,9,Zvonko Bušić,67,Croatian airplane hijacker ([[TWA Flight 355]]...,suicide by gunshot.
1,2013,9,Joaquim Justino Carreira,63,Portuguese-born Brazilian Roman Catholic prela...,Bishop of [[Roman Catholic Diocese of Guarulh...
2,2013,9,Pál Csernai,80,Hungarian footballer and manager ([[FC Bayern ...,[[North Korea national football team|North Ko...
3,2013,9,Ignacio Eizaguirre,92,"Spanish footballer ([[Valencia CF|Valencia]], ...",[[Spain national football team|national team]]).
4,2013,9,Ole Ernst,73,Danish actor.,


### Cleaning up (similar to old notebook)

In [175]:
%%time
# NOTE: this cell overwrites df_full columns in place, so re-running it applies
# clean_text to already-cleaned values again.
# First cleaning pass over the two free-text columns.
df_full['desc'] = df_full.desc.map(clean_text)
df_full['cause_of_death'] = df_full.cause_of_death.map(clean_text)
# Nationality is derived from the once-cleaned description, so it must follow the pass above.
df_full['nationality'] = df_full.desc.map(get_nationality_text)
# Keep only the part of the name before any '|', with surrounding brackets stripped.
df_full['name'] = df_full.name.map(get_wiki_url)

# Second cleaning pass over the same two columns.
df_full['desc'] = df_full.desc.map(clean_text)
df_full['cause_of_death'] = df_full.cause_of_death.map(clean_text)

CPU times: user 4.95 s, sys: 132 ms, total: 5.09 s
Wall time: 5.12 s


In [176]:
print df_full.shape
df_full.head()

(55492, 7)


Unnamed: 0,year,month,name,age,desc,cause_of_death,nationality
0,2013,9,Zvonko Bušić,67,"Croatian airplane hijacker (TWA Flight 355), s...",suicide by gunshot,Croatian
1,2013,9,Joaquim Justino Carreira,63,Portuguese-born Brazilian Roman Catholic prela...,Bishop of Roman Catholic Diocese of Guarulhos ...,Portuguese-born Brazilian Roman Catholic
2,2013,9,Pál Csernai,80,Hungarian footballer and manager (FC Bayern Mu...,North Korea national football team),Hungarian
3,2013,9,Ignacio Eizaguirre,92,"Spanish footballer (Valencia CF, Real Sociedad...",Spain national football team),Spanish
4,2013,9,Ole Ernst,73,Danish actor,,Danish


Parentheses issue:

In [177]:
# Header row followed by one list per data row.
full_2_list = [df_full.columns.tolist()] + df_full.values.tolist()

# Column positions within each row.
DESC_COL = 4
CAUSE_COL = 5

for row in full_2_list[1:]:
    cause = row[CAUSE_COL]
    # A "cause" ending in ')' is the tail of a parenthetical in the
    # description, not a cause of death. get_description_and_death() already
    # keeps every clause in the description, so the cause is only blanked;
    # appending it to row[DESC_COL] would duplicate that text.
    if len(cause) > 0 and cause[-1] == ')':
        row[CAUSE_COL] = ''

### Write out file


In [178]:
# The with-block closes the file on exit, so no explicit close() is needed.
with open('../out/celeb_deaths_wikipedia_full_1.csv', 'wb') as df_full_2_outfile:
    csv.writer(df_full_2_outfile, delimiter=',').writerows(full_2_list)

### Getting page size, date of birth, date of death, etc.

In [205]:
# Page titles for the API: spaces in each name are replaced by underscores.
all_names = [name.replace(' ', '_') for name in df_full.name.values]
len(all_names)

55492

In [213]:
batch_names_q = get_batch_queries(all_names, indiv_url_prefix, indiv_url_suffix)

In [214]:
len(batch_names_q)

1388

In [234]:
# NOTE(review): name_batch_results is only assigned in the page-size query cell
# further down, so this cell fails on a fresh top-to-bottom run; it looks like a
# leftover inspection of one returned page record.
print name_batch_results[66]['query']['pages'].values()[0]

{u'ns': 0, u'pageid': 10258809, u'revisions': [{u'size': 2579}], u'title': u'Roger Terry'}


### Page sizes

Find remaining entries with missing page sizes:

In [336]:
# Previously saved per-name info; rows whose page_size is NaN still need a query.
names_info_current = pd.read_csv('../out/names_fame_birth_death_1.csv')
missing_size_mask = names_info_current.page_size.map(np.isnan)
names_wo_pg_size = names_info_current[missing_size_mask]

remaining_names = list(names_wo_pg_size.name.values)
batch_names_q = get_batch_queries(remaining_names, indiv_url_prefix, indiv_url_suffix)

In [338]:
len(batch_names_q)

136

Query Wikipedia for page sizes. The 136 batch queries (x48 titles per query ~= 6,500 titles) take about 4 min 30 sec (270 seconds).

In [359]:
%%time
# Seconds to wait after every request so the API is not hammered.
query_pause_secs = 1

# Decoded JSON responses, one per batch query that returned valid JSON.
name_batch_results = []
# Query URLs whose response body was not valid JSON. They are kept so they
# can be inspected or re-run instead of silently disappearing.
failed_name_batches = []

for counter, batch in enumerate(batch_names_q):
    ret_val = requests.get(batch, headers=headers)
    try:
        name_batch_results.append(ret_val.json())
    except ValueError:
        # .json() raises ValueError when the body cannot be decoded.
        failed_name_batches.append(batch)
    else:
        # Progress marker: index of each batch that succeeded.
        sys.stdout.write('%d - ' % counter)

    # Throttle after every request, including failed ones.
    time.sleep(query_pause_secs)

print('')
print('%d batches succeeded, %d failed' % (len(name_batch_results), len(failed_name_batches)))

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 80 - 81 - 82 - 83 - 84 - 85 - 86 - 87 - 88 - 89 - 90 - 91 - 92 - 93 - 94 - 95 - 96 - 97 - 98 - 99 - 100 - 101 - 102 - 103 - 104 - 105 - 106 - 107 - 108 - 109 - 110 - 111 - 112 - 113 - 114 - 115 - 116 - 117 - 118 - 119 - 120 - 121 - 122 - 123 - 124 - 125 - 126 - 127 - 128 -CPU times: user 4.24 s, sys: 311 ms, total: 4.55 s
Wall time: 4min 22s



In [368]:
# [title, page size] pairs for every returned page that has revision info.
names_q_contents = []

for result in name_batch_results:
    for page_result in result['query']['pages'].values():
        # Pages without a non-empty 'revisions' list carry no size and are skipped.
        revisions = page_result.get('revisions')
        if revisions:
            names_q_contents.append([page_result['title'], revisions[0]['size']])

In [369]:
len(names_q_contents)

5952

Save new page sizes

In [375]:
# The with-block closes the file on exit, so no explicit close() is needed.
with open('../out/names_fame_birth_death_1_rem.csv', 'wb') as remaining_outfile:
    csv.writer(remaining_outfile, delimiter=',').writerows(names_q_contents)