# Build Features

In [None]:
from datetime import datetime

import numpy as np
import pandas as pd

In [None]:
# Load the processed `train_notable_physicists_from_1901.csv` file into a
# DataFrame and preview its first rows as the cell output.
train_physicists = pd.read_csv(
    '../data/processed/train_notable_physicists_from_1901.csv')
train_physicists.head()

In [None]:
# Load the raw Nobel physics laureates CSV; `build_features` reads the
# `Laureate` column of this frame to count laureate academic advisors.
nobel_physicists = pd.read_csv(
    '../data/raw/nobel-physics-prize-laureates.csv')
nobel_physicists.head()

In [None]:
# Load the raw Nobel chemistry laureates CSV; `build_features` reads the
# `Laureate` column of this frame to count laureate academic advisors.
nobel_chemists = pd.read_csv(
    '../data/raw/nobel-chemistry-prize-laureates.csv')
nobel_chemists.head()

In [None]:
def build_features(physicists):
    """Build the feature table for a DataFrame of physicists.

    Args:
        physicists: DataFrame providing the columns `fullName`, `name`,
            `gender`, `birthDate`, `deathDate`, `categories`, `field`,
            `description`, `comment` and `academicAdvisor`.

    Returns:
        A new DataFrame holding `fullName` and `name` followed by the derived
        columns `gender`, `years_lived`, `theoretical_physicist`,
        `experimental_physicist`, `astronomer`,
        `num_physics_laureate_academic_advisors` and
        `num_chemistry_laureate_academic_advisors`. The input is not modified.

    Note:
        The laureate names are read from the module-level `nobel_physicists`
        and `nobel_chemists` DataFrames, which must be loaded beforehand.
    """
    # Select the two identifier columns first and copy only those, instead of
    # copying the whole input frame just to discard most of it.
    features = physicists[['fullName', 'name']].copy()
    features['gender'] = _build_gender(physicists.gender)
    features['years_lived'] = _build_years_lived(physicists.birthDate,
                                                 physicists.deathDate)

    # Output column -> search terms. The 'categories' term is matched
    # case-sensitively against the categories column; the 'others' term is
    # matched against the lower-cased free-text columns.
    subfield_search_terms = {
        'theoretical_physicist': {'categories': 'Theoretical physicists',
                                  'others': 'theoretical physic'},
        'experimental_physicist': {'categories': 'Experimental physicists',
                                   'others': 'experimental physic'},
        'astronomer': {'categories': 'astronomers',
                       'others': 'astronom'},
    }
    for column, search_terms in subfield_search_terms.items():
        features[column] = _build_physics_subfield(
            physicists.categories, physicists.field,
            physicists.description, physicists.comment,
            search_terms=search_terms)

    # Output column -> Series of laureate names to look advisors up in.
    laureates_by_column = {
        'num_physics_laureate_academic_advisors': nobel_physicists.Laureate,
        'num_chemistry_laureate_academic_advisors': nobel_chemists.Laureate,
    }
    for column, laureates in laureates_by_column.items():
        features[column] = _build_num_academic_advisors_laureate(
            physicists.academicAdvisor, laureates)

    return features


def _build_gender(gender):
    return gender.map({'male': 1, 'female': 0})


def _build_years_lived(birth_date, death_date):
    """Return the whole number of years between birth and death dates.

    Args:
        birth_date: Series of 'YYYY-MM-DD' strings; non-string cells are
            replaced by today's date (see `_date_no_nan`).
        death_date: Series of 'YYYY-MM-DD' strings; non-string cells are
            replaced by today's date, so a missing death date yields the
            age as of today.

    Returns:
        An int64 Series of elapsed years, truncated towards zero.
    """
    death_date_no_nan = death_date.apply(_date_no_nan)
    birth_date_no_nan = birth_date.apply(_date_no_nan)
    # The subtraction yields datetime.timedelta objects; convert them to a
    # proper timedelta64 Series so the division below is vectorised.
    elapsed = pd.to_timedelta(death_date_no_nan - birth_date_no_nan)
    # pandas 2.0 removed the ambiguous 'Y' unit, so `pd.to_timedelta(1, 'Y')`
    # now raises. Spell out the mean Gregorian year (365.2425 days), which is
    # the length the old 'Y' unit represented.
    one_year = pd.Timedelta(days=365.2425)
    years_lived = elapsed / one_year
    return years_lived.astype('int64')


def _build_physics_subfield(categories, field, description, comment, search_terms):
    cat_theoretical_physicist = categories.apply(
        lambda cat: search_terms['categories'] in cat)
    field_theoretical_physicist = field.apply(
        lambda fld: search_terms['others'] in fld.lower() if isinstance(fld, str)
        else False)
    desc_theoretical_physicist = description.apply(
        lambda desc: search_terms['others'] in desc.lower() if isinstance(desc, str)
        else False)
    comm_theoretical_physicist = description.apply(
        lambda comm: search_terms['others'] in comm.lower() if isinstance(comm, str)
        else False)
    return (cat_theoretical_physicist |
            field_theoretical_physicist |
            desc_theoretical_physicist |
            comm_theoretical_physicist).astype('int64')



def _build_num_academic_advisors_laureate(academic_advisors, laureates):
    """Count, per row, how many academic advisors are Nobel laureates.

    Each cell of `academic_advisors` is passed to `_get_nobel_laureates`
    together with `laureates`; the length of the returned list is the count.
    """
    advisors_who_are_laureates = academic_advisors.apply(
        lambda cell: _get_nobel_laureates(cell, laureates))
    counts = advisors_who_are_laureates.apply(len)
    return counts



def _get_nobel_laureates(cell, laureates):
    laureates_in_cell = []
    if isinstance(cell, str):
        names = cell.split('|')
        for name in names:
            if name in laureates.values:
                laureates_in_cell.append(name)
    return laureates_in_cell


    
def _date_no_nan(date):
    if isinstance(date, str):
        return datetime.strptime(date, '%Y-%m-%d').date()
    return datetime.now().date()

In [None]:
# Build the feature table for the training physicists and preview its first
# rows as the cell output.
train_physicists_features = build_features(train_physicists)
train_physicists_features.head()

## Cleaning Up

A few clean-up steps are needed:

- Convert the notebook to an HTML file with all the output.
- Convert the notebook to another notebook with the output removed.

In [None]:
# Shell command: render 3.0-build-features.ipynb to HTML in html_output/.
!jupyter nbconvert --output-dir html_output --to html 3.0-build-features.ipynb

In [None]:
# Shell command: convert 3.0-build-features.ipynb to a notebook with the
# ClearOutputPreprocessor enabled, i.e. with cell outputs removed.
!jupyter nbconvert --ClearOutputPreprocessor.enabled=True --to notebook 3.0-build-features.ipynb