In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory layout used by this notebook: input CSVs are read from `raw`,
# the combined frames are written to `processed`, and per-feature extracts
# are written to `features`.
raw = 'raw'
processed = 'processed'
features = 'features'

# Create the output directories from the variables above rather than from
# repeated literals, so renaming a directory here cannot drift from the paths
# later cells write to. exist_ok=True mirrors `mkdir -p` (no error on re-run).
os.makedirs(processed, exist_ok=True)
os.makedirs(features, exist_ok=True)

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Load both splits and tag every row with its origin so the combined frame
# can be separated again once the features have been added.
df_train = pd.read_csv(f'{raw}/train.csv')
df_test = pd.read_csv(f'{raw}/test.csv')

df_train['src'] = 'train'
df_test['src'] = 'test'

# NOTE: concat keeps each frame's own row labels, so index values repeat
# across the train and test rows. Later cells assign columns from frames
# derived from `union`, which share this exact index.
union = pd.concat([df_train, df_test], sort=False)

# One indicator column per category, named '<field_name>_<category>'.
# `field_name` drives both the column selection and the prefix, so changing it
# in one place is enough. The dtype is pinned because pandas >= 2.0 returns
# bool indicators by default; uint8 keeps them as 0/1 integers.
field_name = 'Sex'
dummies = pd.get_dummies(union[field_name], prefix=field_name, dtype=np.uint8)
colnames = list(dummies.columns)

In [4]:
# Missing-age indicator: 1 where Age is NaN, 0 where a value is present.
union['Age_na'] = union['Age'].isna().astype(int)

In [5]:
# Copy the indicator columns built earlier onto the combined frame.
union[colnames] = dummies.loc[:, colnames]

In [6]:
# Hand-picked age brackets; pd.cut builds right-closed intervals (lo, hi].
bin_col = pd.cut(union['Age'], bins=[0, 2, 6, 14, 16, 18, 25, 35, 40, 45, 60, 80])

# One indicator per bracket. Rows whose Age is missing or outside the bins
# get zeros in every column. The dtype is pinned because pandas >= 2.0 returns
# bool indicators by default; uint8 keeps the 0/1 values shown in the output.
bin_ind = pd.get_dummies(bin_col, dtype=np.uint8)
bin_ind.columns = [f'Age_is({cat.left}..{cat.right}]' for cat in bin_ind.columns.values]
union[bin_ind.columns] = bin_ind
union.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Age_is(2..6],Age_is(6..14],Age_is(14..16],Age_is(16..18],Age_is(18..25],Age_is(25..35],Age_is(35..40],Age_is(40..45],Age_is(45..60],Age_is(60..80]
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,1,0,0,0,0,0


In [7]:
# Split Age into 10 quantile-based buckets.
bin_col = pd.qcut(union['Age'], 10)

# One indicator per bucket; rows with a missing Age get zeros in every column.
# The dtype is pinned because pandas >= 2.0 returns bool indicators by
# default; uint8 keeps them as 0/1 integers.
bin_ind = pd.get_dummies(bin_col, dtype=np.uint8)
bin_ind.columns = [f'Age_q_is({cat.left}..{cat.right}]' for cat in bin_ind.columns.values]
union[bin_ind.columns] = bin_ind

In [8]:
# Recover the two splits from the combined frame via the 'src' tag.
df_train = union.loc[union['src'].eq('train')]
df_test = union.loc[union['src'].eq('test')]

In [9]:
# Row counts of the train split, the test split, and the combined frame.
df_train.shape[0], df_test.shape[0], union.shape[0]

(891, 418, 1309)

In [10]:
# Persist both splits with all added columns. to_csv writes the row index
# as the first (unnamed) column by default.
df_train.to_csv(path_or_buf=f'{processed}/train.csv')
df_test.to_csv(path_or_buf=f'{processed}/test.csv')

In [11]:
# Columns produced by the quantile binning step carry the 'Age_q_is' marker.
quantile_age_features = list(filter(lambda name: 'Age_q_is' in name, df_train.columns))

# Export those indicators keyed by PassengerId, one file per split.
df_train[['PassengerId', *quantile_age_features]].to_csv(f'{features}/age_quantiles_train.csv')
df_test[['PassengerId', *quantile_age_features]].to_csv(f'{features}/age_quantiles_test.csv')

In [12]:
# Columns produced by the hand-picked binning step carry the 'Age_is' marker
# (the quantile columns use 'Age_q_is', which does not contain it).
human_age_features = list(filter(lambda name: 'Age_is' in name, df_train.columns))

# Export those indicators keyed by PassengerId, one file per split.
df_train[['PassengerId', *human_age_features]].to_csv(f'{features}/age_human_train.csv')
df_test[['PassengerId', *human_age_features]].to_csv(f'{features}/age_human_test.csv')