# Build Features

In [None]:
from datetime import datetime

import numpy as np
import pandas as pd

In [None]:
# Load the processed training set of notable physicists and preview it.
train_physicists = pd.read_csv('../data/processed/train_notable_physicists_from_1901.csv')
train_physicists.head()

In [None]:
# Load the raw list of Nobel Prize in Physics laureates and preview it.
nobel_physicists = pd.read_csv('../data/raw/nobel-physics-prize-laureates.csv')
nobel_physicists.head()

In [None]:
# Left-join the physics laureates onto the training physicists by full name
# and keep only the columns listed in `nobel_columns`. The `name` column is
# used later as the laureates' alternative names.
nobel_columns = ['Year', 'Laureate', 'name', 'Country', 'Rationale']
nobel_physicists = nobel_physicists.merge(
    train_physicists,
    how='left',
    left_on='Laureate',
    right_on='fullName',
)[nobel_columns]
nobel_physicists.head()

In [None]:
# Load the raw list of Nobel Prize in Chemistry laureates and preview it.
nobel_chemists = pd.read_csv('../data/raw/nobel-chemistry-prize-laureates.csv')
nobel_chemists.head()

In [None]:
# Left-join the chemistry laureates onto the training physicists by full name
# and keep only the columns listed in `nobel_columns` (defined in the physics
# laureates cell).
nobel_chemists = nobel_chemists.merge(
    train_physicists,
    how='left',
    left_on='Laureate',
    right_on='fullName',
)[nobel_columns]
nobel_chemists.head()

In [None]:
def build_features(physicists, nobel_physicists, nobel_chemists):
    """Build the feature table for a set of physicists.

    Args:
        physicists (pandas.DataFrame): Physicists data. The `fullName`,
            `name`, `gender`, `birthDate` and `deathDate` columns are read
            here; the frame is also passed on to the subfield and laureate
            feature helpers.
        nobel_physicists (pandas.DataFrame): Physics laureates, passed on to
            the laureate feature helper.
        nobel_chemists (pandas.DataFrame): Chemistry laureates, passed on to
            the laureate feature helper.

    Returns:
        pandas.DataFrame: A new frame holding `fullName`, `name`, `gender`
        and `years_lived`, plus the columns added by the subfield and
        laureate feature helpers. `physicists` itself is not modified.
    """
    # Select the two identifier columns first and copy only those; copying
    # the whole frame just to discard most of its columns is wasted work.
    features = physicists[['fullName', 'name']].copy()
    features['gender'] = _build_gender(physicists.gender)
    features['years_lived'] = _build_years_lived(physicists.birthDate,
                                                 physicists.deathDate)

    # Both helpers add their columns to `features` in place.
    _build_physics_subfield_features(features, physicists)
    _build_num_laureates_features(features, physicists, nobel_physicists,
                                  nobel_chemists)
    return features


def _build_physics_subfield_features(features, physicists):
    """Add the physics subfield indicator columns to `features` in place.

    One column is added per subfield (`theoretical_physicist`,
    `experimental_physicist`, `astronomer`), each computed by
    `_build_physics_subfield` from the `categories`, `field`, `description`
    and `comment` columns of `physicists`.

    Args:
        features (pandas.DataFrame): Frame that receives the new columns.
        physicists (pandas.DataFrame): Physicists data to search.
    """
    # (feature column, search terms) in the order the columns are added.
    subfield_search_terms = (
        ('theoretical_physicist', {'categories': 'Theoretical physicists',
                                   'others': 'theoretical physic'}),
        ('experimental_physicist', {'categories': 'Experimental physicists',
                                    'others': 'experimental physic'}),
        ('astronomer', {'categories': 'astronomers',
                        'others': 'astronom'}),
    )
    for feature_name, terms in subfield_search_terms:
        features[feature_name] = _build_physics_subfield(
            physicists.categories, physicists.field,
            physicists.description, physicists.comment,
            search_terms=terms)


def _build_num_laureates_features(features, physicists, nobel_physicists,
                                  nobel_chemists):
    """Add the laureate-count columns to `features` in place.

    For each relationship column of `physicists` (academic advisors,
    doctoral advisors, doctoral students, notable students, spouses) two
    columns are added: the number of physics laureates and the number of
    chemistry laureates found in that relationship, as counted by
    `_build_num_laureates`.

    Args:
        features (pandas.DataFrame): Frame that receives the new columns.
        physicists (pandas.DataFrame): Physicists data holding the
            relationship columns.
        nobel_physicists (pandas.DataFrame): Physics laureates; its
            `Laureate` and `name` columns are used.
        nobel_chemists (pandas.DataFrame): Chemistry laureates; its
            `Laureate` and `name` columns are used.
    """
    # (relationship column, physics feature, chemistry feature), listed in
    # the order the feature columns are added.
    relation_features = (
        ('academicAdvisor',
         'num_physics_laureate_academic_advisors',
         'num_chemistry_laureate_academic_advisors'),
        ('doctoralAdvisor',
         'num_physics_laureate_doctoral_advisors',
         'num_chemistry_laureate_doctoral_advisors'),
        ('doctoralStudent',
         'num_physics_laureate_doctoral_students',
         'num_chemistry_laureate_doctoral_students'),
        ('notableStudent',
         'num_physics_laureate_notable_students',
         'num_chemistry_laureate_notable_students'),
        ('spouse',
         'num_physics_laureate_spouses',
         'num_chemistry_laureate_spouses'),
    )
    for column, physics_feature, chemistry_feature in relation_features:
        features[physics_feature] = _build_num_laureates(
            physicists[column],
            nobel_physicists.Laureate, nobel_physicists.name)
        features[chemistry_feature] = _build_num_laureates(
            physicists[column],
            nobel_chemists.Laureate, nobel_chemists.name)
    


def _build_gender(gender):
    """Encode gender numerically: 'male' becomes 1 and 'female' becomes 0.

    Args:
        gender (pandas.Series): Gender labels.

    Returns:
        pandas.Series: The encoded values, produced with `Series.map`.
    """
    gender_codes = {'male': 1, 'female': 0}
    return gender.map(gender_codes)


def _build_years_lived(birth_date, death_date):
    """Compute the number of whole years each physicist has lived.

    Dates are parsed with `_date_no_nan`, so a missing date is replaced by
    today's date (a physicist without a death date is measured up to today).

    Args:
        birth_date (pandas.Series): Birth dates as 'YYYY-MM-DD' strings.
        death_date (pandas.Series): Death dates as 'YYYY-MM-DD' strings.

    Returns:
        pandas.Series: Years lived as `int64`, truncated towards zero.
    """
    # Length of the year that the timedelta unit 'Y' used to stand for.
    # The unit itself is not used because recent pandas versions reject it
    # as an ambiguous duration.
    days_per_year = 365.2425

    deaths = death_date.apply(_date_no_nan)
    births = birth_date.apply(_date_no_nan)
    days_lived = (deaths - births).apply(lambda lived: lived.days)
    return (days_lived / days_per_year).astype('int64')


def _build_physics_subfield(categories, field, description, comment, search_terms):
    """Flag the physicists that belong to a physics subfield.

    A physicist is flagged when `search_terms['categories']` occurs in the
    categories text (case-sensitive), or when `search_terms['others']`
    occurs in the lower-cased field, description or comment text.
    Non-string cells (e.g. NaN for a missing value) never match.

    Args:
        categories (pandas.Series): Category text per physicist.
        field (pandas.Series): Field text per physicist.
        description (pandas.Series): Description text per physicist.
        comment (pandas.Series): Comment text per physicist.
        search_terms (dict): Holds the key 'categories' (term searched in
            `categories`) and the key 'others' (term searched in the other
            three series).

    Returns:
        pandas.Series: 1 where any of the four texts matched, otherwise 0,
        as `int64`.
    """
    def _mentions(series, term, ignore_case):
        # Boolean mask of the cells that contain `term`.
        def _cell_mentions(cell):
            if not isinstance(cell, str):
                return False
            return term in (cell.lower() if ignore_case else cell)
        return series.apply(_cell_mentions).astype(bool)

    in_categories = _mentions(categories, search_terms['categories'],
                              ignore_case=False)
    in_field = _mentions(field, search_terms['others'], ignore_case=True)
    in_description = _mentions(description, search_terms['others'],
                               ignore_case=True)
    # Search the comment series itself (not the description a second time).
    in_comment = _mentions(comment, search_terms['others'], ignore_case=True)

    return (in_categories | in_field | in_description | in_comment).astype('int64')



def _build_num_laureates(series, laureates, names):
    """Count the Nobel laureates named in each cell of `series`.

    Args:
        series (pandas.Series): Cells to search, passed one at a time to
            `_get_nobel_laureates`.
        laureates (pandas.Series): Laureate names.
        names (pandas.Series): Alternative laureate names.

    Returns:
        pandas.Series: The number of laureates found in each cell.
    """
    matches_per_cell = series.apply(_get_nobel_laureates,
                                    args=(laureates, names))
    return matches_per_cell.apply(len)



def _get_nobel_laureates(cell, laureates, names):
    """Return the Nobel laureates named in a '|'-separated cell.

    Names are compared after normalisation: a name is assumed to be the same
    if it only differs by a hyphen (treated as a space) or by whitespace at
    the front or end. The normalisation is applied to the values in `cell`
    and to the known laureate names alike, so a hyphenated laureate name
    still matches itself.

    Args:
        cell: The cell to search; '|'-separated names if it is a string.
            Any non-string value (e.g. NaN) yields an empty list.
        laureates (pandas.Series): Laureate names.
        names (pandas.Series): Alternative laureate names, '|'-separated
            within each cell. Non-string cells (e.g. NaN) are ignored.

    Returns:
        list: The distinct normalised names from `cell` that match a
        laureate name or an alternative laureate name, sorted.
    """
    if not isinstance(cell, str):
        return []

    def _normalize(name):
        return name.replace('-', ' ').strip()

    known_names = {_normalize(laureate) for laureate in laureates
                   if isinstance(laureate, str)}
    # flatten the alternative laureate names
    for alternatives in names:
        if isinstance(alternatives, str):
            known_names.update(_normalize(alternative)
                               for alternative in alternatives.split('|'))
    # An empty name carries no information and must never count as a match.
    known_names.discard('')

    candidates = {_normalize(value) for value in cell.split('|')}
    return sorted(candidates & known_names)


    
def _date_no_nan(date):
    """Parse a 'YYYY-MM-DD' string into a date, defaulting to today.

    Args:
        date: A date string in '%Y-%m-%d' format, or any non-string value
            (e.g. NaN for a missing date).

    Returns:
        datetime.date: The parsed date, or today's date when `date` is not
        a string.
    """
    if not isinstance(date, str):
        return datetime.now().date()
    parsed = datetime.strptime(date, '%Y-%m-%d')
    return parsed.date()

In [None]:
# Build the features for the training physicists and preview them.
train_physicists_features = build_features(
    physicists=train_physicists,
    nobel_physicists=nobel_physicists,
    nobel_chemists=nobel_chemists,
)
train_physicists_features.head()

## Cleaning Up

A few clean-up steps are needed:

- Convert the notebook to an HTML file with all the output.
- Convert the notebook to another notebook with the output removed.

In [None]:
!jupyter nbconvert --output-dir html_output --to html 3.0-build-features.ipynb

In [None]:
!jupyter nbconvert --ClearOutputPreprocessor.enabled=True --to notebook 3.0-build-features.ipynb