In [1]:
import os
import sys

# Put the project root first on the import path.
# Insert instead of overwriting slot 0 so the entry the interpreter placed
# there is preserved; the guard keeps repeated runs of this cell from
# stacking duplicates. '../' is kept in slot 0 so code that uses
# sys.path[0] as the project-root prefix keeps working.
if not sys.path or sys.path[0] != '../':
    sys.path.insert(0, '../')

import pandas as pd
import nltk
from mordecai import Geoparser
import pycountry

# Download NLP data for country extraction.
# TODO: make this modular by setting the nltk data path.
for nltk_package in (
    'treebank',
    'maxent_treebank_pos_tagger',
    'punkt',  # corpora for GPE extraction
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(nltk_package)

# Display options
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

Using TensorFlow backend.
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/gregory/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /Users/gregory/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/gregory/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gregory/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/gregory/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/gregory/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
# Load the processed Titanic dataset.
# The relative path is spelled out instead of being derived from sys.path[0],
# so loading the data does not depend on the state of the import path.
data = pd.read_csv('../data/processed/titanic_final.csv')

In [13]:
# Build a lookup of state / province abbreviations and their full names.
# Tokens of `home.dest` that match an abbreviation are replaced by the full name (see remap_abbrev).

# Abbreviation -> full name, written directly in lookup direction.
# Covers US states and territories, Canadian provinces and territories,
# and Northern Ireland.
abbrev_map = {
    # United States
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'MP': 'Northern Mariana Islands',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VI': 'Virgin Islands',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    # Canada
    'AB': 'Alberta',
    'BC': 'British Columbia',
    'MB': 'Manitoba',
    'NB': 'New Brunswick',
    'NL': 'Newfoundland',
    'NT': 'Northwest Territories',
    'NS': 'Nova Scotia',
    'NU': 'Nunavut',
    'ON': 'Ontario',
    'PE': 'Prince Edward Island',
    'PQ': 'Quebec',
    'SK': 'Saskatchewan',
    'YT': 'Yukon',
    # United Kingdom
    'NI': 'Northern Ireland',
}

# Apply mappings to replace state abbreviations
#data['home.dest'] = data['home.dest'].str.split().apply(lambda x: ' '.join([abbrev_map.get(word, word) for word in x]))

def remap_abbrev(series, mapping=None):
    """Expand region abbreviations in a Series of location strings.

    Every string is split on whitespace. A token that is a key of the
    mapping is replaced by the mapped full name; a trailing ',' or ';' on
    the token (as in "Bryn Mawr, PA, USA") is ignored for the lookup and put
    back afterwards. Tokens are re-joined with single spaces.

    Parameters
    ----------
    series : pandas.Series
        Strings such as "St Louis, MO". Entries that are not strings
        (NaN / None) are returned unchanged instead of raising.
    mapping : dict, optional
        Abbreviation -> full name. Defaults to the notebook-level
        ``abbrev_map``.

    Returns
    -------
    pandas.Series
        Series with the same index as ``series`` and abbreviations expanded.
    """
    lookup = abbrev_map if mapping is None else mapping

    def expand(text):
        # Missing values have nothing to split; pass them through untouched.
        if not isinstance(text, str):
            return text
        expanded = []
        for token in text.split():
            # Separate trailing list punctuation so "PA," still matches "PA".
            core = token.rstrip(',;')
            suffix = token[len(core):]
            expanded.append(lookup.get(core, core) + suffix)
        return ' '.join(expanded)

    return series.apply(expand)

In [15]:
# Preview only: the expanded strings are displayed, not written back to `data`.
# Limited to the first rows because display.max_rows is unlimited in this notebook.
remap_abbrev(data['home.dest']).head(20)

0                                      St Louis, Missouri
1                Montreal, Quebec / Chesterville, Ontario
2                Montreal, Quebec / Chesterville, Ontario
3                Montreal, Quebec / Chesterville, Ontario
4                Montreal, Quebec / Chesterville, Ontario
5                                      New York, New York
6                                        Hudson, New York
7                               Belfast, Northern Ireland
8                               Bayside, Queens, New York
9                                     Montevideo, Uruguay
10                                     New York, New York
11                                     New York, New York
12                                          Paris, France
13                                Unspecified Destination
14                                          Hessle, Yorks
15                                     New York, New York
16                                       Montreal, Quebec
17            

In [322]:
# Can be used with Pandas apply method. Use batch version for better speed.
def extract_country(row, geo=None):
    """Infer the countries mentioned in one free-text location string.

    Suitable as the function passed to ``Series.apply``; use
    ``batch_extract_country`` when parsing a whole column.

    Parameters
    ----------
    row : str
        Text to geoparse, e.g. one ``home.dest`` value.
    geo : object, optional
        Object exposing ``geoparse(text)``. When omitted, a
        ``Geoparser(country_threshold=0.9)`` is created on the first call
        and reused afterwards instead of being rebuilt for every row.

    Returns
    -------
    str
        The distinct ``country_predicted`` values joined by ", " in order
        of first appearance, or "" when there are none.
    """
    if geo is None:
        # Cache the default parser on the function object: constructing one
        # per row makes Series.apply needlessly slow.
        geo = getattr(extract_country, '_geo', None)
        if geo is None:
            geo = extract_country._geo = Geoparser(country_threshold=0.9)

    predicted = (entry['country_predicted'] for entry in geo.geoparse(row))
    # dict.fromkeys removes duplicates but keeps first-seen order, so the
    # output is deterministic (iteration order of a set of strings is not).
    # Blank predictions are skipped so they cannot leave dangling separators.
    distinct = dict.fromkeys(code for code in predicted if code)
    return ", ".join(distinct)

In [323]:
# Finished
def batch_extract_country(series, geo=None):
    """Infer the countries mentioned in each string of a collection.

    Parameters
    ----------
    series : pandas.Series or iterable of str
        Texts to geoparse; passed unchanged to ``geo.batch_geoparse``.
    geo : object, optional
        Object exposing ``batch_geoparse(texts)``. Defaults to a new
        ``Geoparser()``.

    Returns
    -------
    pandas.Series
        One string per input document: its distinct ``country_predicted``
        values joined by ", " in order of first appearance ("" when there
        are none). When ``series`` is a pandas Series the result carries the
        same index, so it aligns row-for-row when assigned to a column.
    """
    if geo is None:
        geo = Geoparser()

    countries = []
    for doc_list in geo.batch_geoparse(series):
        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # output is deterministic; blank predictions are skipped so they
        # cannot leave dangling separators.
        codes = dict.fromkeys(
            entry['country_predicted']
            for entry in doc_list
            if entry['country_predicted']
        )
        countries.append(", ".join(codes))

    # Re-use the caller's index; a fresh 0..n-1 index would misalign (or give
    # NaN) on assignment whenever the input index is not the default one.
    # Only done when the lengths agree, so no new error is introduced.
    if isinstance(series, pd.Series) and len(series) == len(countries):
        return pd.Series(countries, index=series.index)
    return pd.Series(countries)

In [324]:
# Finished
def lookup_country_name(row):
    """Translate comma-separated country codes into country names.

    Parameters
    ----------
    row : str
        Codes separated by commas, e.g. "USA, CAN". Values that are not
        strings (NaN / None) are treated as empty.

    Returns
    -------
    str
        The ``name`` of each ``pycountry.countries.lookup`` match joined by
        ", ", or "" when there is nothing to look up.

    Raises
    ------
    LookupError
        Propagated from ``pycountry.countries.lookup`` when a code is not
        recognised.
    """
    # An empty string saved to CSV is read back by pandas as NaN, which has
    # no .split(); treat every non-string value as "no country".
    if not isinstance(row, str):
        return ""

    # Split on the bare comma and strip, so spacing around the separator does
    # not matter, and drop blank tokens rather than looking them up.
    codes = [code.strip() for code in row.split(',')]
    names = [pycountry.countries.lookup(code).name for code in codes if code]
    return ", ".join(names)

In [325]:
# Parse `home.dest` to infer the countries mentioned for each passenger.
# This step takes several minutes for the full dataset and needs the geonames
# Elasticsearch index to be reachable at http://localhost:9200.
#data['home.country'] = batch_extract_country(data['home.dest'])

 19%|█▊        | 245/1309 [01:52<05:05,  3.48it/s] GET http://localhost:9200/geonames/_search [status:N/A request:10.141s]
Traceback (most recent call last):
  File "/Users/gregory/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "/Users/gregory/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/gregory/anaconda3/lib/python3.7/http/client.py", line 1344, in getresponse
    response.begin()
  File "/Users/gregory/anaconda3/lib/python3.7/http/client.py", line 306, in begin
    version, status, reason = self._read_status()
  File "/Users/gregory/anaconda3/lib/python3.7/http/client.py", line 267, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/gregory/anaconda3/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.r

In [328]:
# Swap the ISO country codes stored in `home.country` for full country names.
# The column is overwritten in place, so re-running this cell passes the
# already-converted names through lookup_country_name again.
data['home.country'] = data['home.country'].map(lookup_country_name)

In [332]:
# Save dataset
#data.to_csv(sys.path[0] + 'data/clean/titanic_final.csv', index=False)