In [None]:
# Import all the required packages
import requests #This allows you to get the HTML code
from pattern import web #This allows you to explore the dom (i.e. the HTML structure)
from bs4 import BeautifulSoup #This is an alternaticeve way of exploring the dom
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
%matplotlib inline

In [None]:
def stripOffUselessCharacters(column,decimaldelimiter=".",thousanddelimeter=",",listofstringstoremove=None):
    """Clean a column of badly formatted numeric strings, leaving it as strings.

    Every value is converted to ``str``; then the thousands delimiter and each
    string in ``listofstringstoremove`` are deleted, and finally the decimal
    delimiter is replaced by a period. No numeric conversion is done here.

    Parameters
    ----------
    column : pandas.Series
        The column to clean.
    decimaldelimiter : str
        The decimal mark used in ``column`` (replaced by ".").
    thousanddelimeter : str
        The thousands separator used in ``column`` (removed).
    listofstringstoremove : iterable of str, optional
        Additional substrings to delete. Defaults to ["-", " ", "%"].

    Returns
    -------
    pandas.Series
        The cleaned column, still holding strings.
    """
    import re  # local import: only this helper needs it

    if listofstringstoremove is None:
        listofstringstoremove = ["-", " ", "%"]

    # Every token is escaped so that delimiters such as "." or "|" are matched
    # literally instead of being interpreted as regex syntax. Empty tokens are
    # skipped because an empty alternative would match everywhere.
    escapedtokens = [re.escape(token)
                     for token in list(listofstringstoremove) + [thousanddelimeter]
                     if token]

    cleancolumn = column.astype(str)
    if escapedtokens:
        # regex is passed explicitly: the pandas default changed between versions
        cleancolumn = cleancolumn.str.replace("|".join(escapedtokens), "", regex=True)
    if decimaldelimiter and decimaldelimiter != ".":
        # literal replacement, the decimal mark is never a pattern
        cleancolumn = cleancolumn.str.replace(decimaldelimiter, ".", regex=False)
    return cleancolumn

def getNationalitiesAndProbabilities(website_ending, timeout=30):
    """Scrape the surname-prevalence table for one surname from http://forebears.io/.

    The page at "http://forebears.io/" + website_ending is downloaded and the
    table inside its first <data-tabset> element is turned into a dataframe.

    Parameters
    ----------
    website_ending : str
        The part of the URL after "http://forebears.io/", e.g. the link ending
        returned by getProbableNameAndWebsite.
    timeout : number
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per table row, with the column titles taken from the <th>
        cells. The whole table is returned, not only "Country" and
        "Incidence". Every column except "Country" is converted to float;
        a column written as "1:xxxx" (one person in xxxx) becomes 1/xxxx.
        The "Incidence" column is rescaled so that it sums to 1, i.e. it is
        the relative probability that the surname comes from that country.

    Raises
    ------
    requests.exceptions.RequestException
        On a timeout, connection problem or HTTP error status.
    IndexError
        If the page has no <data-tabset> element or the table has no rows.
    """
    url = "http://forebears.io/" + website_ending
    # Without a timeout a single stalled connection would block the run forever
    r = requests.get(url, timeout=timeout)
    # Fail on an HTTP error here rather than while parsing an error page
    r.raise_for_status()

    # Walk the DOM: each table row is a <tr>, the column headers are <th> cells
    soup = BeautifulSoup(r.text, 'html.parser')
    frequencytable = soup.find_all('data-tabset')[0]
    tablerows = frequencytable.find_all('tr')
    columntitles = [title.get_text() for title in tablerows[0].find_all('th')]

    # Collect the cell texts row by row; the table is cut at the first row
    # that has no <td> cells at all
    tabledata = [[entry.get_text() for entry in row.find_all("td")] for row in tablerows[1:]]
    if [] in tabledata:
        tabledata = tabledata[:tabledata.index([])]
    tabledata = pd.DataFrame(tabledata, columns=columntitles)

    # All columns except "Country" hold numbers written as messy strings
    for colname in tabledata:
        if colname == "Country":
            continue
        cleaned = stripOffUselessCharacters(tabledata[colname])
        # One column has the format 1:xxxx (one person in xxxx has the name)
        withoutOneColon = cleaned.str.replace("1:", "", regex=False)
        # Plain column assignment is used (not .loc[:, colname] = ...) so the
        # column really takes the float dtype of the converted values
        if withoutOneColon.equals(cleaned):
            # a numeric column that is not a ratio
            tabledata[colname] = pd.to_numeric(cleaned).astype(float)
        else:
            # the ratio column
            tabledata[colname] = 1.0 / pd.to_numeric(withoutOneColon).astype(float)

    # Turn the Incidence counts into per-country probabilities
    tabledata["Incidence"] = tabledata["Incidence"] / tabledata["Incidence"].sum()

    return tabledata

def getProbableNameAndWebsite(surname, timeout=30):
    """Search http://forebears.io for a surname and return its closest match.

    The search page "http://forebears.io/surnames?q=<surname>" is downloaded
    and the first link found inside the first <div class="bigItms"> element
    (two <div> levels down) is taken as the closest match.

    Parameters
    ----------
    surname : str
        The surname to search for.
    timeout : number
        Seconds to wait for the server before giving up.

    Returns
    -------
    list
        [name, link ending]. The link ending is meant to be passed to
        getNationalitiesAndProbabilities. When the result container holds no
        usable link, [surname, "unknown"] is returned.

    Raises
    ------
    requests.exceptions.RequestException
        On a timeout, connection problem or HTTP error status.
    IndexError
        If the page has no <div class="bigItms"> element at all.
    """
    url = "http://forebears.io/surnames"
    # Without a timeout a single stalled connection would block the run forever
    r = requests.get(url, params={"q": surname}, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    listofcases = soup.find_all("div", class_="bigItms")[0]

    # Follow the path div -> div -> a one step at a time. Counting the
    # container's children is not a reliable emptiness test because
    # whitespace text nodes are children too, so every step is checked.
    mostprobablecase = listofcases
    for tagname in ("div", "div", "a"):
        if mostprobablecase is None:
            break
        mostprobablecase = mostprobablecase.find(tagname)

    link = None
    if mostprobablecase is not None:
        link = mostprobablecase.get("href")
    if link is None:
        # no match on the website: fall back to the name we were given
        return [surname, "unknown"]
    return [mostprobablecase.get_text(), link]

def mostLikelyCountry(surname):
    """Return the country listed first for the given surname on forebears.io.

    The surname (a string) is first resolved to its closest match with
    getProbableNameAndWebsite. If that lookup yields the link "unknown",
    the string "Unknown" is returned. Otherwise the "Country" entry of the
    first row of the table from getNationalitiesAndProbabilities is returned.

    No pause is inserted between the two website requests.
    """
    matchedname, linkending = getProbableNameAndWebsite(surname)
    if linkending == "unknown":
        return "Unknown"
    return getNationalitiesAndProbabilities(linkending)["Country"][0]

def pickOutLastName(fullname):
    """Return "Surname" from a name written as "Surname, title and firstname".

    Everything from the first comma onwards is dropped. Of what remains,
    everything from the first hyphen onwards is dropped as well, so only the
    first part of a hyphenated surname is kept.
    """
    # partition() returns the whole string as its first item when the
    # separator is absent, so no membership test is needed
    beforecomma = fullname.partition(",")[0]
    return beforecomma.partition("-")[0]

In [None]:
#================================================================================================
# USER INPUT!
# List the csv data files that have to be read in
datafilenames = ["cleantrain.csv","cleantest.csv"]
#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# One dataframe per data file, in the same order as datafilenames
alldataframes = list(map(pd.read_csv, datafilenames))

In [None]:
#================================================================================================
# USER INPUT!
# ONLY RUN THIS IF YOU HAVEN'T RUN IT ALREADY! IT TAKES QUITE A LONG TIME, BECAUSE EVERY DISTINCT
# SURNAME IS LOOKED UP ON THE WEBSITE http://forebears.io/
# We now want to add columns that can be inferred from the other columns (feature engineering for feature selection)

# The same surname shows up many times, within one data file and across data files. Each distinct
# surname is therefore looked up only once and the answer is reused. This gives the same column
# with far fewer requests to the website.
countrybysurname = {}

# Every loaded dataframe is handled, however many files were listed in datafilenames
for ii, dataframe in enumerate(alldataframes):
    namecolumn = dataframe["Name"].apply(pickOutLastName)

    # Get the countries for the surnames we have not seen yet
    for surname in namecolumn.unique():
        if surname not in countrybysurname:
            countrybysurname[surname] = mostLikelyCountry(surname)

    # Plug the countries into a new column
    countrycolumn = namecolumn.map(countrybysurname)
    dataframe["Nationality"] = countrycolumn
#================================================================================================


In [None]:
#================================================================================================
# USER INPUT!
# The data is now complete and gets written out to new csv files.
# List the names of the output files, one per dataframe and in the same order
outputfilenames = ["natinalitycleantrain.csv","natinalitycleantest.csv"]
#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# Write each dataframe to the output file at the same position
for ii, dataframe in enumerate(alldataframes):
    dataframe.to_csv(outputfilenames[ii])