# Build Features

## Setting up the Environment

An initialization step is needed to set up the environment:
- The locale needs to be set for all categories to the user’s default setting (typically specified in the LANG environment variable) to enable correct sorting of words with accents.

In [None]:
import locale
    
# The empty string selects the user's default locale for every category
# (LC_ALL). The locale-aware sort key `locale.strxfrm`, used when sorting
# place codes further down, depends on this setting.
locale.setlocale(locale.LC_ALL, '')

In [None]:
from datetime import datetime

import numpy as np
import pandas as pd

In [None]:
# Load the processed physicists CSV (the "train" split, going by its file
# name) and preview the first rows.
train_physicists = pd.read_csv(
    '../data/processed/train-physicists-from-1901.csv')
train_physicists.head()

In [None]:
# Load the raw list of Nobel Prize in Physics laureates and preview it.
nobel_physicists = pd.read_csv(
    '../data/raw/nobel-physics-prize-laureates.csv')
nobel_physicists.head()

In [None]:
# Columns kept after joining the laureate list with the physicists table.
nobel_columns = ['Year', 'Laureate', 'name', 'Country', 'Rationale']
# Left join on Laureate == fullName: every laureate row is kept and `name`
# is filled in only where a physicist with the same full name exists.
# NOTE: this rebinds `nobel_physicists`, so the cell is not safe to re-run
# without reloading the raw CSV first.
nobel_physicists = pd.merge(nobel_physicists, train_physicists, how = 'left', left_on = 'Laureate',
                            right_on = 'fullName')[nobel_columns]
nobel_physicists.head()

In [None]:
# Load the raw list of Nobel Prize in Chemistry laureates and preview it.
nobel_chemists = pd.read_csv(
    '../data/raw/nobel-chemistry-prize-laureates.csv')
nobel_chemists.head()

In [None]:
# Left join the chemistry laureates with the *physicists* table on
# Laureate == fullName, keeping the columns listed in `nobel_columns`
# (defined in an earlier cell). `name` is therefore only populated for
# chemistry laureates who also appear in `train_physicists`.
# NOTE: this rebinds `nobel_chemists`, so the cell is not safe to re-run
# without reloading the raw CSV first.
nobel_chemists = pd.merge(nobel_chemists, train_physicists, how = 'left', left_on = 'Laureate',
                          right_on = 'fullName')[nobel_columns]
nobel_chemists.head()

In [None]:
# keep_default_na=False stops pandas from parsing strings such as 'NA' as
# missing values, so the continent code 'NA' survives as text.
places = pd.read_csv('../data/processed/places.csv', keep_default_na=False)
# Sanity check: every USA row must still carry the continent code 'NA'.
assert(all(places[places.countryAlpha3Code == 'USA']['continentCode'].values == 'NA'))
places.head()

In [None]:
def build_features(physicists, nobel_physicists, nobel_chemists, places):
    """Assemble the feature table for a frame of physicists.

    The result starts from the ``fullName`` and ``name`` columns of
    ``physicists`` and gains, in this order: ``gender``, ``years_lived``,
    the physics-subfield flags, the laureate-count columns and the
    place-related columns.

    Args:
        physicists: DataFrame of physicists to derive features from.
        nobel_physicists: DataFrame of physics laureates (``Laureate`` and
            ``name`` columns are used).
        nobel_chemists: DataFrame of chemistry laureates (``Laureate`` and
            ``name`` columns are used).
        places: DataFrame of places (``fullName``, ``countryAlpha3Code`` and
            ``continentCode`` columns are used).

    Returns:
        A new DataFrame of features; ``physicists`` is not modified.
    """
    # Independent copy of the identifying columns only.
    features = physicists[['fullName', 'name']].copy()

    features['gender'] = _build_gender(physicists['gender'])
    features['years_lived'] = _build_years_lived(
        physicists['birthDate'], physicists['deathDate'])

    # The helpers below add their columns to `features` in place.
    _build_physics_subfield_features(features, physicists)
    _build_num_laureates_features(
        features, physicists, nobel_physicists, nobel_chemists)
    _build_places_features(features, physicists, places)

    return features


def _build_physics_subfield_features(features, physicists):
    """Add the 0/1 physics-subfield flag columns to ``features`` in place.

    One column is written per subfield: ``theoretical_physicist``,
    ``experimental_physicist`` and ``astronomer``. Each is computed by
    ``_build_physics_subfield`` from the ``categories``, ``field``,
    ``description`` and ``comment`` columns of ``physicists``.
    """
    # (feature column, term searched in categories, term searched elsewhere)
    subfields = (
        ('theoretical_physicist', 'Theoretical physicists',
         'theoretical physic'),
        ('experimental_physicist', 'Experimental physicists',
         'experimental physic'),
        ('astronomer', 'astronomers', 'astronom'),
    )

    for column, category_term, other_term in subfields:
        terms = {'categories': category_term, 'others': other_term}
        features[column] = _build_physics_subfield(
            physicists.categories, physicists.field,
            physicists.description, physicists.comment,
            search_terms=terms)


def _build_num_laureates_features(features, physicists, nobel_physicists,
                                  nobel_chemists):
    """Add the ``num_<prize>_laureate_<relation>`` columns to ``features``.

    For every relation column of ``physicists`` (advisors, students,
    children, spouses, influences) two counts are written in place: the
    number of physics laureates and the number of chemistry laureates named
    in that column. Right after the ``children`` counts, the known
    parent/child laureate pairs are imputed, which also creates the two
    ``..._laureate_parents`` columns.
    """
    # (attribute on `physicists`, relation part of the feature name),
    # listed in the order the columns must appear in `features`.
    relations = (
        ('academicAdvisor', 'academic_advisors'),
        ('doctoralAdvisor', 'doctoral_advisors'),
        ('doctoralStudent', 'doctoral_students'),
        ('notableStudent', 'notable_students'),
        ('child', 'children'),
        ('spouse', 'spouses'),
        ('influenced', 'influenced'),
        ('influencedBy', 'influenced_by'),
    )
    prizes = (('physics', nobel_physicists), ('chemistry', nobel_chemists))

    for attribute, relation in relations:
        related_people = getattr(physicists, attribute)
        for prize, nobel in prizes:
            column = 'num_{}_laureate_{}'.format(prize, relation)
            features[column] = _build_num_laureates(
                related_people, nobel.Laureate, nobel.name)
        if relation == 'children':
            # impute missing values
            _impute_num_laureate_children_parents(features)

    
def _build_places_features(features, physicists, places):
    """Add the place-derived columns to ``features`` in place.

    For each place-like column of ``physicists`` this writes the list of
    country alpha-3 codes, the list of continent codes and the length of
    each list. For ``almaMater`` and ``workplaces`` the raw ``|``-separated
    values are additionally stored as a list together with its length.
    """
    # (feature-name prefix, attribute on `physicists`, also keep raw list?)
    # listed in the order the columns must appear in `features`.
    sources = (
        ('birth', 'birthPlace', False),
        ('death', 'deathPlace', False),
        ('residence', 'residence', False),
        ('citizenship', 'citizenship', False),
        ('nationality', 'nationality', False),
        ('alma_mater', 'almaMater', True),
        ('workplaces', 'workplaces', True),
    )
    # (feature-name suffix, code column of `places`)
    code_kinds = (
        ('country_alpha_3_codes', places.countryAlpha3Code),
        ('continent_codes', places.continentCode),
    )

    def _split_cell(cell):
        # Missing cells are floats (NaN) and become an empty list.
        if isinstance(cell, float):
            return list()
        return cell.split('|')

    for prefix, attribute, keep_raw_list in sources:
        raw_values = getattr(physicists, attribute)

        if keep_raw_list:
            features[prefix] = raw_values.apply(_split_cell)
            features['num_' + prefix] = features[prefix].apply(len)

        for suffix, codes in code_kinds:
            column = prefix + '_' + suffix
            features[column] = _build_places_codes(
                raw_values, places.fullName, codes)
            features['num_' + column] = features[column].apply(len)
    



def _build_gender(gender):
    return gender.map({'male': 1, 'female': 0})


def _build_years_lived(birth_date, death_date):
    """Return the whole number of years between birth and death.

    Dates are parsed with ``_date_no_nan``, which substitutes today's date
    for a missing value. A physicist with no death date is therefore
    credited with the years lived so far.

    Args:
        birth_date: Series of 'YYYY-MM-DD' strings (or NaN).
        death_date: Series of 'YYYY-MM-DD' strings (or NaN).

    Returns:
        An int64 Series of years, truncated towards zero.
    """
    death_date_no_nan = death_date.apply(_date_no_nan)
    birth_date_no_nan = birth_date.apply(_date_no_nan)
    # pd.to_timedelta(1, 'Y') relied on the ambiguous 'Y' unit, which newer
    # pandas versions reject. Spell out the mean Gregorian year it stood for.
    one_year = pd.Timedelta(days=365.2425)
    years_lived = (death_date_no_nan - birth_date_no_nan) / one_year
    return years_lived.astype('int64')


def _build_physics_subfield(categories, field, description, comment, search_terms):
    cat_theoretical_physicist = categories.apply(
        lambda cat: search_terms['categories'] in cat)
    field_theoretical_physicist = field.apply(
        lambda fld: search_terms['others'] in fld.lower() if isinstance(fld, str)
        else False)
    desc_theoretical_physicist = description.apply(
        lambda desc: search_terms['others'] in desc.lower() if isinstance(desc, str)
        else False)
    comm_theoretical_physicist = description.apply(
        lambda comm: search_terms['others'] in comm.lower() if isinstance(comm, str)
        else False)
    return (cat_theoretical_physicist |
            field_theoretical_physicist |
            desc_theoretical_physicist |
            comm_theoretical_physicist).astype('int64')



def _build_num_laureates(series, laureates, names):
    """Count, for every cell of ``series``, the Nobel laureates it names.

    Args:
        series: Series of ``|``-separated person names (or NaN).
        laureates: Series of laureate names.
        names: Series of ``|``-separated alternative laureate names.

    Returns:
        A Series with the number of laureates found in each cell.
    """
    def _count_laureates(cell):
        return len(_get_nobel_laureates(cell, laureates, names))

    return series.apply(_count_laureates)


def _build_places_codes(places_in_physicists, full_name_in_places, places_codes):
    """Translate every cell of place names into its list of place codes.

    Args:
        places_in_physicists: Series of ``|``-separated place names (or NaN).
        full_name_in_places: Series of known place names.
        places_codes: Series of codes sharing the index of
            ``full_name_in_places``.

    Returns:
        A Series whose cells are the code lists from ``_get_places_codes``.
    """
    def _codes_for(cell):
        return _get_places_codes(cell, full_name_in_places, places_codes)

    return places_in_physicists.apply(_codes_for)



def _impute_num_laureate_children_parents(features):
    
    features['num_physics_laureate_parents'] = 0
    features['num_chemistry_laureate_parents'] = 0 # none for chemistry
    
    # https://www.nobelprize.org/prizes/facts/facts-on-the-nobel-prize-in-physics/
    # William Bragg and Lawrence Bragg, 1915
    features.loc[features.fullName == 'William Henry Bragg',
                 'num_physics_laureate_children'] = 1
    features.loc[features.fullName == 'William Lawrence Bragg',
                 'num_physics_laureate_parents'] = 1
    # Niels Bohr, 1922 and Aage N. Bohr, 1975
    features.loc[features.fullName == 'Aage Bohr',
                 'num_physics_laureate_parents'] = 1
    # Manne Siegbahn, 1924 and Kai M. Siegbahn, 1981
    features.loc[features.fullName == 'Kai Siegbahn',
                 'num_physics_laureate_parents'] = 1
    # J. J. Thomson, 1906 and George Paget Thomson, 1937
    features.loc[features.fullName == 'J. J. Thomson',
                 'num_physics_laureate_children'] = 1
    features.loc[features.fullName == 'George Paget Thomson',
                 'num_physics_laureate_parents'] = 1



def _get_nobel_laureates(cell, laureates, names):
    laureates_in_cell = set()
    
    # flatten the alternative laureate names
    alt_laureate_names = np.concatenate(names.apply(
        lambda nam: str(nam).split('|'))).ravel()
    
    if isinstance(cell, str):
        # assume the same name if only differs by a hyphen
        # or whitespace at front or end of string
        values = cell.strip().replace('-', ' ').split('|')
        for value in values:
            if value in laureates.values:
                laureates_in_cell.add(value)
            if value in alt_laureate_names:
                laureates_in_cell.add(value)
                    
    laureates_in_cell = list(laureates_in_cell)
    
    return laureates_in_cell

    
def _get_places_codes(cell, full_name_in_places, places_codes):
    codes = set()

    if isinstance(cell, str):
        places = cell.split('|')
        for place in places:
            code_indices = full_name_in_places[
                full_name_in_places == place].index
            assert(len(code_indices) <= 1)
            if len(code_indices) == 1:
                code_index = code_indices[0]
                code = places_codes[code_index]
                if code:
                    codes.add(code)
                
    codes = list(sorted(codes, key=locale.strxfrm))
    return codes
    

def _date_no_nan(date):
    if isinstance(date, str):
        return datetime.strptime(date, '%Y-%m-%d').date()
    return datetime.now().date()

In [None]:
# Build the feature table for the physicists loaded above, using the
# laureate and places tables prepared in the earlier cells.
train_physicists_features = build_features(train_physicists, nobel_physicists,
                                           nobel_chemists, places)

In [None]:
# Preview the first 50 rows of the resulting features.
train_physicists_features.head(50)