In [17]:
import os

import numpy as np
import pandas as pd

In [18]:
# Directory layout used by this notebook: inputs are read from `raw`, the
# full processed frames are written to `processed`, and the per-feature CSVs
# are written to `features`.
raw = 'raw'
processed = 'processed'
features = 'features'

# Create the output directories from the variables above rather than from
# repeated literals, so renaming a directory needs only one edit.
# exist_ok=True keeps the cell safe to re-run.
for output_dir in (processed, features):
    os.makedirs(output_dir, exist_ok=True)

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Load the raw train and test splits.
df_train = pd.read_csv(f'{raw}/train.csv')
df_test = pd.read_csv(f'{raw}/test.csv')

# Tag every row with the split it came from so the two frames can be
# separated again after features have been derived on the combined data.
df_train['src'] = 'train'
df_test['src'] = 'test'

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each split keeps its own 0-based labels, so the labels of the test rows
# duplicate labels of train rows and label-based alignment on `union` is
# ambiguous.
union = pd.concat([df_train, df_test], sort=False, ignore_index=True)

# 🧠 Derive features
## Sex: dummies

In [20]:
# One-hot encode the passenger's sex into `Sex_<value>` indicator columns
# (Sex_female / Sex_male) and attach them to the combined frame.
field_name = 'Sex'

# Read the column through field_name (instead of repeating the literal) and
# let get_dummies build the `<field>_<value>` names via `prefix`, so the
# source column and the generated names cannot drift apart.
dummies = pd.get_dummies(union[field_name], prefix=field_name)
colnames = list(dummies.columns)
union[colnames] = dummies[colnames]

## Age: missing-value flag and human age-bin dummies

In [21]:
# Flag rows whose Age is missing: 1 when Age is NaN, 0 otherwise.
age_is_missing = union['Age'].isna()
union['Age_na'] = age_is_missing.astype(int)

In [22]:
# Hand-picked age boundaries. pd.cut builds right-closed intervals from them:
# (0, 2], (2, 6], ..., (60, 80].
age_bin_edges = [0, 2, 6, 14, 16, 18, 25, 35, 40, 45, 60, 80]
bin_col = pd.cut(union['Age'], bins=age_bin_edges)

# One indicator column per interval. Rows with a missing Age get no bin and
# therefore no indicator set.
bin_ind = pd.get_dummies(bin_col)

# Replace the Interval column labels with readable string names such as
# 'Age_is(0..2]'.
bin_ind.columns = [
    f'Age_is({interval.left}..{interval.right}]'
    for interval in bin_ind.columns
]
union[bin_ind.columns] = bin_ind

## Age: quantiles

In [23]:
# Cut Age into 10 quantile-based bins (deciles) computed over the combined
# train + test rows. The interval edges come from the data, so the generated
# column names depend on the data as well.
bin_col = pd.qcut(union['Age'], q=10)

# One indicator column per decile interval.
bin_ind = pd.get_dummies(bin_col)

# Replace the Interval column labels with readable string names prefixed
# 'Age_q_is'.
bin_ind.columns = [
    f'Age_q_is({interval.left}..{interval.right}]'
    for interval in bin_ind.columns
]
union[bin_ind.columns] = bin_ind

In [24]:
# One-hot encode Pclass into Pclass_is_<class> columns. get_dummies with
# `columns=` replaces the encoded column, so after this cell `union` no longer
# has a 'Pclass' column. The guard turns a second execution of the cell into a
# no-op instead of a KeyError on the already-consumed column.
if 'Pclass' in union.columns:
    union = pd.get_dummies(union, columns=['Pclass'], prefix='Pclass_is')

In [25]:
# Display the column set of the combined frame after all derived features
# have been added.
union.columns

Index(['PassengerId', 'Survived', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'src', 'Sex_female', 'Sex_male',
       'Age_na', 'Age_is(0..2]', 'Age_is(2..6]', 'Age_is(6..14]',
       'Age_is(14..16]', 'Age_is(16..18]', 'Age_is(18..25]', 'Age_is(25..35]',
       'Age_is(35..40]', 'Age_is(40..45]', 'Age_is(45..60]', 'Age_is(60..80]',
       'Age_q_is(0.169..14.0]', 'Age_q_is(14.0..19.0]', 'Age_q_is(19.0..22.0]',
       'Age_q_is(22.0..25.0]', 'Age_q_is(25.0..28.0]', 'Age_q_is(28.0..31.0]',
       'Age_q_is(31.0..36.0]', 'Age_q_is(36.0..42.0]', 'Age_q_is(42.0..50.0]',
       'Age_q_is(50.0..80.0]', 'Pclass_is_1', 'Pclass_is_2', 'Pclass_is_3'],
      dtype='object')

In [26]:
# Split the enriched frame back into its two parts using the `src` tag that
# was attached before the concat.
is_train_row = union['src'] == 'train'
is_test_row = union['src'] == 'test'
df_train = union[is_train_row]
df_test = union[is_test_row]

In [27]:
# Every row was tagged 'train' or 'test' before the concat, so the two splits
# together must account for every row of the combined frame. Assert it rather
# than relying on eyeballing the displayed counts.
assert len(df_train) + len(df_test) == len(union), 'rows lost or duplicated when splitting union'
len(df_train), len(df_test), len(union)

(891, 418, 1309)

In [28]:
# Persist the full enriched splits (all original and derived columns) without
# the DataFrame index.
processed_train_path = f'{processed}/train.csv'
processed_test_path = f'{processed}/test.csv'
df_train.to_csv(processed_train_path, index=False)
df_test.to_csv(processed_test_path, index=False)

# 💾 Output
## Age: dummies(quantiles)

In [29]:
# Collect the decile indicator columns by their shared 'Age_q_is' name
# fragment and write them, keyed by PassengerId, for each split.
quantile_age_features = [name for name in df_train.columns if 'Age_q_is' in name]
quantile_output_columns = ['PassengerId'] + quantile_age_features

df_train[quantile_output_columns].to_csv(f'{features}/age_quantiles_train.csv', index=False)
df_test[quantile_output_columns].to_csv(f'{features}/age_quantiles_test.csv', index=False)

## Age: dummies(human bins)

In [30]:
# Collect the hand-binned age indicator columns. The 'Age_is' fragment does
# not occur in the 'Age_q_is...' or 'Age_na' column names, so only the human
# bins are selected.
human_age_features = [label for label in df_train.columns if 'Age_is' in label]
human_output_columns = ['PassengerId', *human_age_features]

# Write one CSV per split, keyed by PassengerId, without the index.
df_train.loc[:, human_output_columns].to_csv(f'{features}/age_human_train.csv', index=False)
df_test.loc[:, human_output_columns].to_csv(f'{features}/age_human_test.csv', index=False)

## Pclass: dummies

In [31]:
# Collect the Pclass one-hot columns by their 'Pclass_is_' name fragment.
pclass_features = [column for column in df_train.columns if 'Pclass_is_' in column]

# Export the indicators keyed by PassengerId, one file per split.
pclass_train = df_train[['PassengerId'] + pclass_features]
pclass_test = df_test[['PassengerId'] + pclass_features]
pclass_train.to_csv(f'{features}/pclass_dummy_train.csv', index=False)
pclass_test.to_csv(f'{features}/pclass_dummy_test.csv', index=False)

In [32]:
# Display the columns of the train split; it carries the same columns as the
# combined frame it was sliced from.
df_train.columns

Index(['PassengerId', 'Survived', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'src', 'Sex_female', 'Sex_male',
       'Age_na', 'Age_is(0..2]', 'Age_is(2..6]', 'Age_is(6..14]',
       'Age_is(14..16]', 'Age_is(16..18]', 'Age_is(18..25]', 'Age_is(25..35]',
       'Age_is(35..40]', 'Age_is(40..45]', 'Age_is(45..60]', 'Age_is(60..80]',
       'Age_q_is(0.169..14.0]', 'Age_q_is(14.0..19.0]', 'Age_q_is(19.0..22.0]',
       'Age_q_is(22.0..25.0]', 'Age_q_is(25.0..28.0]', 'Age_q_is(28.0..31.0]',
       'Age_q_is(31.0..36.0]', 'Age_q_is(36.0..42.0]', 'Age_q_is(42.0..50.0]',
       'Age_q_is(50.0..80.0]', 'Pclass_is_1', 'Pclass_is_2', 'Pclass_is_3'],
      dtype='object')