# Build Features

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import numpy as np
import pandas as pd

In [None]:
# Load the processed training split of the physicists data set and preview
# its first rows (the trailing expression is what the notebook cell displays).
train_physicists = pd.read_csv(
    filepath_or_buffer='../data/processed/train_notable_physicists_from_1901.csv',
)
train_physicists.head(n=5)

In [None]:
def build_features(physicists):
    """Build the feature table for a DataFrame of physicists.

    Parameters
    ----------
    physicists : pandas.DataFrame
        Must provide the columns ``fullName``, ``name``, ``gender``,
        ``birthDate`` and ``deathDate``. Dates are ``'YYYY-MM-DD'`` strings;
        any non-string value (e.g. NaN for a missing date) is treated as
        "today" (see ``_date_no_nan``).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the columns
        ``fullName``, ``name``, ``gender`` and ``years_lived``.
    """
    # Select the identifying columns first and copy only those: this avoids
    # deep-copying every column of the input and guarantees `features` owns
    # its data before new columns are assigned to it.
    features = physicists[['fullName', 'name']].copy()
    features['gender'] = _build_gender(physicists.gender)
    features['years_lived'] = _build_years_lived(physicists.birthDate,
                                                 physicists.deathDate)
    return features


def _build_gender(gender):
    """Encode gender as 1 for ``'male'`` and 0 for ``'female'``.

    Values outside the mapping become NaN (``Series.map`` semantics).
    """
    return gender.map({'male': 1, 'female': 0})


def _build_years_lived(birth_date, death_date):
    """Return the whole number of years between birth and death as int64.

    Both arguments are Series of ``'YYYY-MM-DD'`` strings. A non-string entry
    is replaced by today's date, so a physicist without a death date gets
    their current age. Note that a missing *birth* date is also replaced by
    today, which yields zero or a negative value.

    Years are counted on the calendar (completed anniversaries) rather than
    by dividing by an average year length: the ``'Y'`` timedelta unit is
    ambiguous and rejected by current pandas, and the division was off by one
    on exact anniversaries.

    The two Series are paired positionally and the result carries
    ``birth_date``'s index.
    """
    # Take "today" once so every row of one call uses the same reference date.
    today = datetime.now().date()
    years = [
        _completed_years(_date_no_nan(born, today), _date_no_nan(died, today))
        for born, died in zip(birth_date, death_date)
    ]
    return pd.Series(years, index=birth_date.index, dtype='int64')


def _completed_years(start, end):
    """Return the number of full calendar years from ``start`` to ``end``.

    The result is negative (truncated toward zero) when ``end`` precedes
    ``start``.
    """
    if end < start:
        return -_completed_years(end, start)
    years = end.year - start.year
    # The anniversary has not been reached yet in the final year.
    if (end.month, end.day) < (start.month, start.day):
        years -= 1
    return years


def _date_no_nan(date, today=None):
    """Parse a ``'YYYY-MM-DD'`` string into a ``datetime.date``.

    Any non-string input (such as NaN) yields ``today``; when ``today`` is
    not supplied the current local date is used.

    Raises
    ------
    ValueError
        If ``date`` is a string that does not match ``'%Y-%m-%d'``.
    """
    if isinstance(date, str):
        return datetime.strptime(date, '%Y-%m-%d').date()
    return datetime.now().date() if today is None else today

In [None]:
# Derive the feature table for the training split.
train_physicists_features = build_features(physicists=train_physicists)

In [None]:
# Preview the first rows of the generated features.
train_physicists_features.head(n=5)

## Cleaning Up

A few cleanup steps are needed:

- Convert the notebook to an HTML file with all the output.
- Convert the notebook to another notebook with the output removed.

In [None]:
# Shell escape: export this notebook to HTML, writing into the html_output directory.
!jupyter nbconvert --output-dir html_output --to html 3.0-build-features.ipynb

In [None]:
# Shell escape: export this notebook in notebook format with the
# ClearOutputPreprocessor enabled, i.e. with cell outputs removed.
!jupyter nbconvert --ClearOutputPreprocessor.enabled=True --to notebook 3.0-build-features.ipynb