# Data Preprocessing: Feature Engineering

## Import Modules

In [66]:
import numpy as np
import pandas as pd
from datetime import datetime

In [74]:
# Sample people, one tuple per person, in the order given by `fields`.
fields = ("name", "day_of_birth", "day_of_death", "salery")
people = [
    ("Peter", "1970-01-12", np.nan, 55000),
    ("Paul", "1983-05-05", np.nan, 43000),
    ("Mary", "2000-03-17", np.nan, 52000),
    ("Linda", "1997-10-13", np.nan, 83500),
    ("Erna", "1920-05-13", "2008-08-15", 0),
]

# Transpose the per-person rows into the column-oriented dict that pandas consumes.
data = {field: list(values) for field, values in zip(fields, zip(*people))}
df = pd.DataFrame(data)
df

Unnamed: 0,name,day_of_birth,day_of_death,salery
0,Peter,1970-01-12,,55000
1,Paul,1983-05-05,,43000
2,Mary,2000-03-17,,52000
3,Linda,1997-10-13,,83500
4,Erna,1920-05-13,2008-08-15,0


## Set column datatypes

In [75]:
# Parse the two date columns from strings into datetime values, one column at a time.
date_columns = ["day_of_birth", "day_of_death"]
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

# Print the column dtypes and non-null counts to confirm the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   name          5 non-null      object        
 1   day_of_birth  5 non-null      datetime64[ns]
 2   day_of_death  1 non-null      datetime64[ns]
 3   salery        5 non-null      int64         
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 292.0+ bytes


## Calculate age until "day_of_death" or today

In [76]:
# Reference date for rows without a day_of_death: today at midnight.
today = pd.Timestamp.today().normalize()

birth = df["day_of_birth"]
end_date = df["day_of_death"].fillna(today)

# True where the birthday has not yet been reached in the end date's year.
# Comparing (month, day) directly avoids the off-by-one errors that a
# "days / 365.25" approximation produces on or near birthdays.
before_birthday = (end_date.dt.month < birth.dt.month) | (
    (end_date.dt.month == birth.dt.month) & (end_date.dt.day < birth.dt.day)
)

# Age in completed calendar years.
df["age"] = (end_date.dt.year - birth.dt.year - before_birthday.astype(int)).astype(int)
df

Unnamed: 0,name,day_of_birth,day_of_death,salery,age
0,Peter,1970-01-12,NaT,55000,54
1,Paul,1983-05-05,NaT,43000,41
2,Mary,2000-03-17,NaT,52000,24
3,Linda,1997-10-13,NaT,83500,27
4,Erna,1920-05-13,2008-08-15,0,88


## Guess a value with a callback
Attention: the callback is a plain Python function that is called once per value, so it will probably slow down processing on large datasets.

In [79]:
# Lookup table from first name to sex; built once instead of on every call.
_SEX_BY_NAME = {
    "Peter": "male",
    "Paul": "male",
    "Mary": "female",
    "Linda": "female",
    "Erna": "female",
}


def sex_guesser(v):
    """Guess a person's sex from their first name.

    Parameters
    ----------
    v : hashable
        The first name to look up (exact, case-sensitive match).

    Returns
    -------
    str or float
        "male" or "female" for a known name, otherwise ``np.nan``.
    """
    return _SEX_BY_NAME.get(v, np.nan)

# Apply the callback to each name. Series.map is used rather than DataFrame.map
# (which only exists in pandas >= 2.1) so the cell runs on older pandas as well
# and assigns a Series rather than a one-column DataFrame.
df["sex"] = df["name"].map(sex_guesser)
df

Unnamed: 0,name,day_of_birth,day_of_death,salery,age,sex
0,Peter,1970-01-12,NaT,55000,54,male
1,Paul,1983-05-05,NaT,43000,41,male
2,Mary,2000-03-17,NaT,52000,24,female
3,Linda,1997-10-13,NaT,83500,27,female
4,Erna,1920-05-13,2008-08-15,0,88,female
