# Build Features

## Setting up the Environment

An initialization step is needed to set up the environment:
- The locale needs to be set for all categories to the user’s default setting (typically specified in the LANG environment variable) to enable correct sorting of words with accents.

In [None]:
import locale
    
# The empty string selects the user's default locale for every category;
# `locale.strxfrm` is used later in this notebook as a sort key for place names.
locale.setlocale(locale.LC_ALL, '')

In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
from pycountry_convert import country_alpha2_to_country_name
from pycountry_convert import country_name_to_country_alpha3
from pycountry_convert import country_alpha2_to_continent_code
from pycountry_convert import country_alpha3_to_country_alpha2
from sklearn.preprocessing import MultiLabelBinarizer

from src.data.country_utils import nationality_to_alpha2_code 

In [None]:
# Load the processed training set of physicists and preview its first rows.
train_physicists = pd.read_csv(
    filepath_or_buffer='../data/processed/train-physicists-from-1901.csv')
train_physicists.head()

In [None]:
# Load the raw list of Nobel Prize in Physics laureates and preview it.
nobel_physicists = pd.read_csv(
    filepath_or_buffer='../data/raw/nobel-physics-prize-laureates.csv')
nobel_physicists.head()

In [None]:
# Left-join the laureates onto the training physicists (`Laureate` matched
# against `fullName`) and keep only the columns listed in `nobel_columns`.
# Laureates without a match in the training data keep NaN in the joined columns.
nobel_columns = ['Year', 'Laureate', 'name', 'Country', 'Rationale']
nobel_physicists = nobel_physicists.merge(
    train_physicists, how='left', left_on='Laureate', right_on='fullName')
nobel_physicists = nobel_physicists[nobel_columns]
nobel_physicists.head()

In [None]:
# Load the raw list of Nobel Prize in Chemistry laureates and preview it.
nobel_chemists = pd.read_csv(
    filepath_or_buffer='../data/raw/nobel-chemistry-prize-laureates.csv')
nobel_chemists.head()

In [None]:
# Left-join the chemistry laureates onto the training physicists (`Laureate`
# matched against `fullName`) and keep only the columns in `nobel_columns`.
nobel_chemists = nobel_chemists.merge(
    train_physicists, how='left', left_on='Laureate', right_on='fullName')
nobel_chemists = nobel_chemists[nobel_columns]
nobel_chemists.head()

In [None]:
# Disable pandas' default NA markers so the continent code 'NA' is read as
# text, then turn the genuinely empty cells into NaN.
places = pd.read_csv(
    '../data/processed/places.csv', keep_default_na=False).replace('', np.nan)
# Guard: every USA row must still carry the literal continent code 'NA'.
assert all(
    places.loc[places.countryAlpha3Code == 'USA', 'continentCode'].values
    == 'NA')
places.head()

In [None]:
# Disable pandas' default NA markers so Namibia's ISO code 'NA' is read as
# text, then turn the genuinely empty cells into NaN.
nationalities = pd.read_csv(
    '../data/processed/Countries-List.csv',
    keep_default_na=False).replace('', np.nan)
# Guard: Namibia's code must still be the literal string 'NA'.
assert (
    nationalities.loc[nationalities.Name == 'Namibia', 'ISO 3166 Code'].values
    == 'NA')
nationalities.head()

In [None]:
def build_features(physicists, nobel_physicists, nobel_chemists,
                   places, nationalities):
    """Assemble the feature dataframe for the given physicists.

    Starts from the `fullName` (renamed `full_name`), `name` and `gender`
    columns, adds `years_lived`, then lets each helper add its own group of
    columns. List-valued columns are finally expanded into yes/no indicator
    columns and the `name` column is dropped.

    Args:
        physicists: Dataframe of physicists to build features for.
        nobel_physicists: Dataframe of physics laureates (`Laureate`, `name`).
        nobel_chemists: Dataframe of chemistry laureates (`Laureate`, `name`).
        places: Dataframe of places with country and continent codes.
        nationalities: Dataframe passed on to the citizenship lookup.

    Returns:
        A new dataframe of features, one row per physicist.
    """
    base_columns = ['fullName', 'name', 'gender']
    features = physicists.copy()[base_columns].rename(
        columns={'fullName': 'full_name'})
    features['years_lived'] = _build_years_lived(
        physicists.birthDate, physicists.deathDate)

    # Each of these helpers adds its columns to `features` in place.
    _build_physics_subfield_features(features, physicists)
    _build_num_laureates_features(
        features, physicists, nobel_physicists, nobel_chemists)
    _build_citizenship_features(features, physicists, nationalities)
    _build_places_features(features, physicists, places)

    # Binarizing returns a new frame instead of mutating the old one.
    features = _binarize_list_features(features)
    return features.drop(columns='name')


def _build_physics_subfield_features(features, physicists):
    """Add one subfield column per physics subfield to `features` in place.

    Each entry pairs the term looked up in the `categories` column with the
    (lower-case) term looked up in the free-text columns.
    """
    subfield_terms = {
        'theoretical_physicist': ('Theoretical physicists',
                                  'theoretical physic'),
        'experimental_physicist': ('Experimental physicists',
                                   'experimental physic'),
        'astronomer': ('astronomers', 'astronom'),
    }

    for column, (category_term, other_term) in subfield_terms.items():
        terms = {'categories': category_term, 'others': other_term}
        features[column] = _build_physics_subfield(
            physicists.categories, physicists.field, physicists.description,
            physicists.comment, search_terms=terms)
    


def _build_num_laureates_features(features, physicists, nobel_physicists,
                                  nobel_chemists):
    """Add laureate counts per relationship column to `features` in place.

    For every relationship (advisors, students, family, influences) two
    columns are added: the number of related people matched as physics
    laureates and the number matched as chemistry laureates.
    """
    relation_by_feature = {
        'laureate_academic_advisors': 'academicAdvisor',
        'laureate_doctoral_advisors': 'doctoralAdvisor',
        'laureate_doctoral_students': 'doctoralStudent',
        'laureate_notable_students': 'notableStudent',
        'laureate_children': 'child',
        'laureate_parents': 'parent',
        'laureate_spouses': 'spouse',
        'laureate_influenced': 'influenced',
        'laureate_influenced_by': 'influencedBy'
    }
    # Physics first, then chemistry, for each relationship.
    prize_tables = (('num_physics_', nobel_physicists),
                    ('num_chemistry_', nobel_chemists))

    for feature, relation in relation_by_feature.items():
        for prefix, laureate_table in prize_tables:
            features[prefix + feature] = _build_num_laureates(
                physicists[relation], laureate_table.Laureate,
                laureate_table.name)


    
def _build_places_features(features, physicists, places):
    """Add place-derived list columns and their lengths to `features` in place.

    Every feature maps to the `physicists` column it is derived from. The
    `alma_mater` and `workplaces` features hold institution names; all others
    hold country alpha-3 codes or continent codes looked up in `places`. A
    matching `num_<feature>` column stores the length of each list.
    """
    source_by_feature = {
        'birth_country_alpha_3_codes': 'birthPlace',
        'birth_continent_codes': 'birthPlace',
        'death_country_alpha_3_codes': 'deathPlace',
        'death_continent_codes': 'deathPlace',
        'residence_country_alpha_3_codes': 'residence',
        'residence_continent_codes': 'residence',
        'alma_mater': 'almaMater',
        'alma_mater_country_alpha_3_codes': 'almaMater',
        'alma_mater_continent_codes': 'almaMater',
        'workplaces': 'workplaces',
        'workplaces_country_alpha_3_codes': 'workplaces',
        'workplaces_continent_codes': 'workplaces'
    }
    institution_features = ['alma_mater', 'workplaces']

    for feature, source in source_by_feature.items():
        if feature in institution_features:
            features[feature] = physicists[source].apply(
                _get_alma_mater_or_workplaces)
        else:
            # Continent features read the continent column, the rest the
            # country column.
            code_column = ('continentCode' if 'continent' in feature
                           else 'countryAlpha3Code')
            features[feature] = _build_places_codes(
                physicists[source], places.fullName, places[code_column])
        features['num_' + feature] = features[feature].apply(len)


    
def _build_citizenship_features(features, physicists, nationalities):
    """Add citizenship country/continent code lists and counts in place.

    Country codes are gathered from the `citizenship`, `nationality` and
    `description` columns, de-duplicated and sorted. Continent codes are
    derived from those country codes.
    """
    def _codes_in(column):
        return column.apply(_get_citizenship_codes, args=(nationalities,))

    def _continents_of(alpha_3_codes):
        return sorted({
            country_alpha2_to_continent_code(
                country_alpha3_to_country_alpha2(alpha_3_code))
            for alpha_3_code in alpha_3_codes})

    # Adding series of lists concatenates the lists row by row.
    all_codes = (_codes_in(physicists.citizenship)
                 + _codes_in(physicists.nationality)
                 + _codes_in(physicists.description))

    features['citizenship_country_alpha_3_codes'] = all_codes.apply(
        lambda codes: sorted(set(codes)))
    features['num_citizenship_country_alpha_3_codes'] = (
        features['citizenship_country_alpha_3_codes'].apply(len))
    features['citizenship_continent_codes'] = (
        features['citizenship_country_alpha_3_codes'].apply(_continents_of))
    features['num_citizenship_continent_codes'] = (
        features['citizenship_continent_codes'].apply(len))



def _binarize_list_features(features):
    """Replace every list-valued column by yes/no indicator columns.

    The mapping covers the places and citizenship list columns (not their
    `num_` counts) and gives the prefix used for the new column names.

    Returns:
        A new dataframe; the list columns are dropped and the indicator
        columns are joined on.
    """
    prefix_by_column = {
        'birth_country_alpha_3_codes': 'born_in_',
        'birth_continent_codes': 'born_in_',
        'death_country_alpha_3_codes': 'died_in_',
        'death_continent_codes': 'died_in_',
        'residence_country_alpha_3_codes': 'lived_in_',
        'residence_continent_codes': 'lived_in_',
        'alma_mater': 'alumnus_of_',
        'alma_mater_country_alpha_3_codes': 'alumnus_in_',
        'alma_mater_continent_codes': 'alumnus_in_',
        'workplaces': 'worked_at_',
        'workplaces_country_alpha_3_codes': 'worked_in_',
        'workplaces_continent_codes': 'worked_in_',
        'citizenship_country_alpha_3_codes': 'citizen_of_',
        'citizenship_continent_codes': 'citizen_in_'
    }

    for column, prefix in prefix_by_column.items():
        indicators = _binarize_list_feature(features[column], prefix)
        features = features.drop(columns=column).join(indicators)
    return features
    
    

def _build_years_lived(birth_date, death_date):
    """Return the whole number of years lived for each physicist.

    Both series are passed through `_date_no_nan`, which substitutes today's
    date for any value that is not a string. A missing death date therefore
    yields the age as of today.

    Args:
        birth_date: Series of 'YYYY-MM-DD' strings (or NaN).
        death_date: Series of 'YYYY-MM-DD' strings (or NaN).

    Returns:
        An int64 series of years, truncated towards zero.
    """
    # The 'Y' unit of `pd.to_timedelta` is ambiguous and is rejected by
    # current pandas. Spell out the mean Gregorian year it stood for.
    one_year = pd.Timedelta(days=365.2425)
    lifespan = death_date.apply(_date_no_nan) - birth_date.apply(_date_no_nan)
    return (lifespan / one_year).astype('int64')


def _build_physics_subfield(categories, field, description, comment, search_terms):
    """Flag physicists belonging to a subfield with 'yes' or 'no'.

    A physicist is flagged when `search_terms['categories']` occurs
    (case-sensitively) in the categories cell, or when
    `search_terms['others']` occurs in the lower-cased field, description or
    comment cell. Cells that are not strings (e.g. NaN) never match.

    Args:
        categories: Series of category text.
        field: Series of field text.
        description: Series of description text.
        comment: Series of comment text.
        search_terms: Dict with the keys 'categories' and 'others'.

    Returns:
        A series of 'yes'/'no' strings aligned with the inputs.
    """
    category_term = search_terms['categories']
    other_term = search_terms['others']

    def _has_category(cell):
        # Guard against NaN cells, which do not support `in`.
        return isinstance(cell, str) and category_term in cell

    def _mentions_subfield(cell):
        return isinstance(cell, str) and other_term in cell.lower()

    # The comment column must be searched itself, not the description again.
    subfield = (categories.apply(_has_category) |
                field.apply(_mentions_subfield) |
                description.apply(_mentions_subfield) |
                comment.apply(_mentions_subfield))
    return subfield.map({True: 'yes', False: 'no'})


def _binarize_list_feature(series, prefix):
    """Expand a series of label lists into yes/no indicator columns.

    Args:
        series: Series whose cells are lists of string labels.
        prefix: Text put in front of every label to form the column name;
            spaces in the label are replaced by underscores.

    Returns:
        A dataframe indexed like `series`, with one column per distinct label
        holding 'yes' where the label is in the row's list and 'no' otherwise.
    """
    mlb = MultiLabelBinarizer()
    indicator_matrix = mlb.fit_transform(series)
    column_names = [prefix + class_.replace(' ', '_')
                    for class_ in mlb.classes_]
    # A vectorised `np.where` replaces the element-wise `DataFrame.applymap`,
    # which is deprecated in recent pandas. The labels produced are identical.
    labels = np.where(indicator_matrix == 1, 'yes', 'no').astype(object)
    return pd.DataFrame(labels, columns=column_names, index=series.index)
    


def _build_num_laureates(series, laureates, names):
    """Count, for each cell of `series`, the people matched as laureates.

    The matching itself is delegated to `_get_nobel_laureates`.
    """
    def _count_laureates(cell):
        return len(_get_nobel_laureates(cell, laureates, names))

    return series.apply(_count_laureates)


def _build_places_codes(places_in_physicists, full_name_in_places, places_codes):
    """Map each cell of place names to its sorted list of codes.

    Each cell is resolved by `_get_places_codes` against the given place
    names and their codes.
    """
    lookup_args = (full_name_in_places, places_codes)
    return places_in_physicists.apply(_get_places_codes, args=lookup_args)


def _get_alma_mater_or_workplaces(cell):
    """Split a '|'-separated cell into a sorted list of unique institutions.

    Names ending in ', Cambridge' or ', Oxford' are replaced by the name of
    the corresponding university. A float cell (NaN) gives an empty list. The
    result is sorted using the current locale's collation.
    """
    if isinstance(cell, float):
        return []

    def _canonical(institution):
        if institution.endswith(', Cambridge'):
            return 'University of Cambridge'
        if institution.endswith(', Oxford'):
            return 'University of Oxford'
        return institution

    unique_institutions = {_canonical(part) for part in cell.split('|')}
    return sorted(unique_institutions, key=locale.strxfrm)


def _get_citizenship_codes(series, nationalities):
    """Return the country alpha-3 codes found for one cell of text.

    `nationality_to_alpha2_code` supplies '|'-separated alpha-2 codes; a float
    result is treated as "nothing found" (presumably NaN - confirm against
    that helper). Each alpha-2 code is converted to alpha-3 via the country
    name.

    Note: despite its name, `series` is the single value handed over by
    `Series.apply`.
    """
    joined_alpha_2_codes = nationality_to_alpha2_code(series, nationalities)
    if isinstance(joined_alpha_2_codes, float):
        return []

    alpha_3_codes = []
    for alpha_2_code in joined_alpha_2_codes.split('|'):
        country_name = country_alpha2_to_country_name(alpha_2_code)
        alpha_3_codes.append(country_name_to_country_alpha3(country_name))
    return alpha_3_codes


def _get_nobel_laureates(cell, laureates, names):
    """Return the people listed in `cell` who are matched as Nobel laureates.

    Args:
        cell: '|'-separated names, or a non-string (e.g. NaN) when empty.
        laureates: Series of laureate full names, compared for equality.
        names: Series of laureate names, searched for the value as a literal
            substring (missing entries are ignored).

    Returns:
        A list of the distinct matching values, in no particular order.
    """
    if not isinstance(cell, str):
        return []

    laureate_full_names = laureates.values
    laureates_in_cell = set()
    # Assume the same name if it only differs by a hyphen, or by whitespace
    # at the front or end of the cell.
    for value in cell.strip().replace('-', ' ').split('|'):
        if not value.strip():
            # An empty segment is a substring of every name and would be
            # counted as a laureate, so skip it.
            continue
        if (value in laureate_full_names
                or names.str.contains(value, regex=False).sum() > 0):
            laureates_in_cell.add(value)

    return list(laureates_in_cell)

    
def _get_places_codes(cell, full_name_in_places, places_codes):
    """Return the sorted, unique codes of the places named in `cell`.

    Args:
        cell: '|'-separated place names, or a non-string when empty.
        full_name_in_places: Series of known place names; each name may occur
            at most once.
        places_codes: Series of '|'-separated codes sharing the index of
            `full_name_in_places`; float cells (NaN) mean "no code".

    Returns:
        A sorted list of codes. Unknown places are ignored.
    """
    if not isinstance(cell, str):
        return []

    found_codes = set()
    for place_name in cell.split('|'):
        matching_labels = full_name_in_places[
            full_name_in_places == place_name].index
        assert len(matching_labels) <= 1
        if len(matching_labels) != 1:
            continue
        joined_codes = places_codes[matching_labels[0]]
        if isinstance(joined_codes, float):
            continue
        # Drop the empty strings left by stray separators.
        found_codes.update(
            code for code in joined_codes.split('|') if code)

    return sorted(found_codes)
    

def _date_no_nan(date):
    """Parse a 'YYYY-MM-DD' string into a date; use today for anything else.

    Non-string input (such as NaN for a missing date) is replaced by the
    current local date.
    """
    if not isinstance(date, str):
        return datetime.now().date()
    parsed = datetime.strptime(date, '%Y-%m-%d')
    return parsed.date()

In [None]:
# Build the training features and check the expected shape: one row per
# physicist and 779 columns.
train_physicists_features = build_features(
    train_physicists, nobel_physicists, nobel_chemists, places, nationalities)
assert len(train_physicists_features) == len(train_physicists)
assert len(train_physicists_features.columns) == 779
train_physicists_features.head()

## Build Target

In [None]:
def build_target(full_name, laureate):
    """Label each full name 'yes' if it is among the laureates, else 'no'.

    Args:
        full_name: Series of physicists' full names.
        laureate: Series of laureate names.

    Returns:
        A series named 'physics_laureate' aligned with `full_name`.
    """
    laureate_names = laureate.values
    labels = {True: 'yes', False: 'no'}
    is_laureate = full_name.apply(
        lambda candidate: candidate in laureate_names)
    target = is_laureate.map(labels)
    target.name = 'physics_laureate'
    return target

In [None]:
# Build the target and check its type, its length and the expected number of
# physics laureates in the training set.
target = build_target(train_physicists.fullName, nobel_physicists.Laureate)
assert isinstance(target, pd.Series)
assert len(target) == len(train_physicists_features)
assert (target == 'yes').sum() == 123
target.head()

## Persisting the Data

Now that I have the training features dataframe and the target series, I'll persist them for future use.

In [None]:
# Put the feature columns in sorted order so the persisted file has a stable
# column layout.
train_physicists_features = train_physicists_features.reindex(
    columns=sorted(train_physicists_features.columns))
train_physicists_features.head()

In [None]:
# Write features and target without their index; the target keeps its header
# so the column name is stored in the file.
train_physicists_features.to_csv(
    path_or_buf='../data/processed/train-features.csv', index=False)
target.to_csv(
    path_or_buf='../data/processed/train-target.csv', index=False,
    header=True)

Let's perform a quick sanity check to make sure the data is as expected.

In [None]:
# Read both files back and confirm they round-trip unchanged.
train_on_disk = pd.read_csv('../data/processed/train-features.csv')
# The `squeeze=True` option of `read_csv` is no longer accepted by current
# pandas. Squeezing the single-column frame along its columns afterwards
# yields the same Series (and stays a Series even for a one-row file).
target_on_disk = pd.read_csv(
    '../data/processed/train-target.csv').squeeze('columns')
assert train_on_disk.equals(train_physicists_features)
assert target_on_disk.equals(target)