# 2.7. Feature Engineering with ColumnTransformers

## Challenge :: Feature Engineer your data

In [1]:
import pandas as pd

In [2]:
# Load the Titanic training data; the path is relative to the notebook's folder.
df = pd.read_csv("../data/train.csv", sep=",")

# Number of passengers whose age is unknown. This is the cell's last expression,
# so it is the only value Jupyter displays.
df['Age'].isna().sum()

177

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer

from sklearn.pipeline import make_pipeline

# Feature columns handed to the pipeline, and the 'Survived' column as the target.
X, y = (
    df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'Name']],
    df['Survived'],
)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=151,
)

# Show the training rows ordered by age (rows with a missing age are listed last).
X_train.sort_values(by="Age")

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin,Name
469,3,female,0.75,2,1,19.2583,C,,"Baclini, Miss. Helene Barbara"
644,3,female,0.75,2,1,19.2583,C,,"Baclini, Miss. Eugenie"
831,2,male,0.83,1,1,18.7500,S,,"Richards, Master. George Sibley"
386,3,male,1.00,5,2,46.9000,S,,"Goodwin, Master. Sidney Leonard"
788,3,male,1.00,1,2,20.5750,S,,"Dean, Master. Bertram Vere"
...,...,...,...,...,...,...,...,...,...
82,3,female,,0,0,7.7875,Q,,"McDermott, Miss. Brigdet Delia"
415,3,female,,0,0,8.0500,S,,"Meek, Mrs. Thomas (Annie Louise Rowley)"
711,1,male,,0,0,26.5500,S,C124,"Klaber, Mr. Herman"
859,3,male,,0,0,7.2292,C,,"Razi, Mr. Raihed"


In [4]:
X_train.isna().sum()

Pclass        0
Sex           0
Age         137
SibSp         0
Parch         0
Fare          0
Embarked      1
Cabin       554
Name          0
dtype: int64

In [5]:
df.shape

(891, 12)

In [6]:
df.groupby(by=['Sex', 'Pclass'])['Age'].transform('mean').isna().sum()

0

In [7]:
# Fill the missing ages with the mean age of the passenger's (Sex, Pclass) group.
# The right-hand side holds a group mean for every row; the .loc assignment aligns
# on the index, so only the rows selected by the mask are overwritten.
# NOTE(review): this changes `df` in place, so every cell executed after this one
# sees the imputed ages instead of the raw ones.
df.loc[df['Age'].isna(), 'Age'] = df.groupby(by=['Sex', 'Pclass'])['Age'].transform('mean')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.75,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.0000,C148,C


In [8]:
def age_by_group(df):
    """Fill missing ``Age`` values with the mean age of the row's (Sex, Pclass) group.

    The work is done on a deep copy, so the frame passed in is never modified.
    This matters inside a ``FunctionTransformer``: the pipeline hands over the
    caller's ``X`` directly, and writing into it would silently alter the
    caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Sex'``, ``'Pclass'`` and ``'Age'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` in which missing
        ages are replaced by the group mean. If a group has no known age at
        all, the mean of all known ages in ``df`` is used instead. Ages stay
        missing only when ``df`` contains no known age whatsoever.

    Notes
    -----
    The means are computed from the frame passed in, not remembered from an
    earlier call, so each data set is imputed with its own statistics.
    """
    imputed = df.copy(deep=True)

    # One value per row: the mean of the known ages in that row's group.
    group_means = imputed.groupby(by=['Sex', 'Pclass'])['Age'].transform('mean')
    # Fallback for groups that consist solely of missing ages.
    overall_mean = imputed['Age'].mean()

    # fillna aligns on the index and touches only the missing entries.
    imputed['Age'] = imputed['Age'].fillna(group_means).fillna(overall_mean)
    return imputed

In [9]:
def cabin_to_deck(df):
    """Reduce each known ``Cabin`` value to its first character (the deck letter).

    The work is done on a deep copy, so the frame passed in is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Cabin'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``. Every non-missing
        cabin is replaced by the first character of its string form, e.g.
        ``'C85'`` becomes ``'C'``. Missing cabins stay missing.
    """
    decked = df.copy(deep=True)
    known = decked['Cabin'].notna()
    # Only touch rows with a cabin; astype(str) keeps non-string values usable.
    decked.loc[known, 'Cabin'] = decked.loc[known, 'Cabin'].astype(str).str[0]
    return decked

In [10]:
def add_family_size(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``family_tshirt_size`` column derived from the ``SibSp`` count.

    The work is done on a deep copy, so the frame passed in does not gain the
    new column as a side effect.

    Size labels by ``SibSp`` value:

    * ``1``        -> ``'S'``
    * ``2``        -> ``'M'``
    * ``3`` or 4   -> ``'L'``
    * ``5`` and up -> ``'XL'``

    Rows that match none of these rules (e.g. ``SibSp == 0``) get a missing
    value in the new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'SibSp'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus the ``family_tshirt_size`` column.

    Notes
    -----
    Despite the name, only ``SibSp`` is used; ``Parch`` does not influence the
    label.
    """
    sized = df.copy(deep=True)
    siblings = sized['SibSp']

    # Start with "no label" everywhere, then fill in the buckets.
    labels = pd.Series(float('nan'), index=sized.index, dtype=object)
    labels.loc[siblings == 1] = 'S'
    labels.loc[siblings == 2] = 'M'
    labels.loc[(siblings >= 3) & (siblings < 5)] = 'L'
    labels.loc[siblings >= 5] = 'XL'

    sized['family_tshirt_size'] = labels
    return sized


# add_family_size(X_train)

In [11]:
# X_train['Fare'].isna().value_counts()

In [12]:
# Column-wise encoders that run after the frame-level feature functions.
# Columns not listed here are dropped (see remainder='drop' below).
# Earlier experiments (surname extraction, Fare imputation, Fare scaling,
# dropping unused columns) are deliberately not part of the pipeline.
# NOTE(review): scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
# `sparse_output`; switch the keyword when the environment is upgraded.
column_encoders = [
    # Age and Fare are each cut into three quantile-based bins.
    ('quantile_age', KBinsDiscretizer(n_bins=3, strategy='quantile'), ['Age']),
    ('quantile_fare', KBinsDiscretizer(n_bins=3, strategy='quantile'), ['Fare']),
    # Categorical columns are one-hot encoded; categories unseen during fit are ignored.
    ('ohe_and_sex', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Sex']),
    ('ohe_deck', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Cabin']),
    (
        'ohe_family_tshirt_size',
        OneHotEncoder(sparse=False, handle_unknown='ignore'),
        ['family_tshirt_size'],
    ),
    # Pclass is forwarded unchanged.
    ('do_nothing', 'passthrough', ['Pclass']),
    # Binarizer with its default threshold maps Parch > 0 to 1, everything else to 0.
    ('family_with_kids', Binarizer(), ['Parch']),
]

# Frame-level steps run first (age imputation, deck letter, family size label);
# the column encoders then turn the frame into a numeric feature matrix.
pipeline = make_pipeline(
    FunctionTransformer(age_by_group),
    FunctionTransformer(cabin_to_deck),
    FunctionTransformer(add_family_size),
    ColumnTransformer(column_encoders, remainder='drop'),
)

In [13]:
# Fit the pipeline on the training split only.
pipeline.fit(X_train)
# Apply the fitted pipeline to both splits so they share the same feature layout.
# NOTE(review): the FunctionTransformer steps keep no fitted state; e.g. the age
# imputation recomputes its group means from whichever frame is passed in.
X_train_fe = pipeline.transform(X_train)
X_test_fe = pipeline.transform(X_test)

# Wrap the transformed training features in a DataFrame for display.
pd.DataFrame(X_train_fe)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0
708,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
709,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
710,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [14]:
from sklearn.linear_model import LogisticRegression

# Instantiate the classifier with scikit-learn's default settings.
m = LogisticRegression()
# Train on the engineered training features.
m.fit(X_train_fe, y_train)
# Mean accuracy on the same data the model was trained on.
print('score train data: %0.4f' % m.score(X_train_fe, y_train))
# Mean accuracy on the held-out split (20% of the rows, see train_test_split).
print('score test data: %0.4f' % m.score(X_test_fe, y_test))

score train data: 0.8132
score test data: 0.7933


In [15]:
m.predict(X_test_fe)

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0])

## Fuzzy string matching like a boss

https://github.com/seatgeek/thefuzz

In [16]:
pip install --no-input thefuzz

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install --no-input python-Levenshtein

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
# pip freeze > requirements.txt

In [19]:
from thefuzz import fuzz
from thefuzz import process

In [20]:
fuzz.ratio("this is a test", "this is a test!")

97

In [21]:
df[df['Name'].str.contains(".*Sage.*")]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
159,160,0,3,"Sage, Master. Thomas Henry",male,26.507589,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,21.75,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,26.507589,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,26.507589,8,2,CA. 2343,69.55,,S
641,642,1,1,"Sagesser, Mlle. Emma",female,24.0,0,0,PC 17477,69.3,B35,C
792,793,0,3,"Sage, Miss. Stella Anna",female,21.75,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,26.507589,8,2,CA. 2343,69.55,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,21.75,8,2,CA. 2343,69.55,,S


In [22]:
# df['Embarked'].value_counts()

## Style

In [23]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


## Challenge :: Submit predictions to Kaggle

In [24]:
# Load the Kaggle test file; the path is relative to the notebook's folder.
X_data_kaggle = pd.read_csv("../data/test.csv", sep=",")
# Keep the same feature columns, in the same order, that the training features use.
X_test_kaggle = X_data_kaggle.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'Name']]
# Count the missing values per column before the data goes through the pipeline.
X_test_kaggle.isna().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Embarked      0
Cabin       327
Name          0
dtype: int64

In [25]:
# Replace the missing fare with the mean fare of the Kaggle test data.
fare_mean = X_test_kaggle['Fare'].mean()
# Assign the filled column back to the frame. Calling fillna(inplace=True) on the
# column selection is a chained in-place operation: it triggers a warning in
# pandas 2.x and does not update the frame once Copy-on-Write is enabled.
X_test_kaggle['Fare'] = X_test_kaggle['Fare'].fillna(fare_mean)

In [26]:
X_test_kaggle.isna().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          0
Embarked      0
Cabin       327
Name          0
dtype: int64

In [27]:
X_test_kaggle_fe = pipeline.transform(X_test_kaggle)
# X_test_kaggle_fe = pd.DataFrame(X_test_kaggle_fe)

In [28]:
X_test_kaggle_fe

array([[0., 0., 1., ..., 1., 3., 0.],
       [0., 0., 1., ..., 0., 3., 0.],
       [0., 0., 1., ..., 1., 2., 0.],
       ...,
       [0., 0., 1., ..., 1., 3., 0.],
       [0., 1., 0., ..., 1., 3., 0.],
       [0., 1., 0., ..., 0., 3., 1.]])

In [29]:
# is_NaN = X_test_kaggle_fe.isnull()
# row_has_NaN = is_NaN.any(axis=1)
# rows_with_NaN = X_test_kaggle_fe[row_has_NaN]
# rows_with_NaN

In [30]:
X_data_kaggle['PassengerId']

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [31]:
y_test_kaggle = m.predict(X_test_kaggle_fe)
y_test_kaggle

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [32]:
# Pair every passenger id from the Kaggle file with its predicted 'Survived' label.
# The predictions were computed from the same rows in the same order, so the two
# columns line up row by row.
df_kaggle = pd.DataFrame({
    "PassengerId": X_data_kaggle['PassengerId'],
    "Survived": y_test_kaggle
})
df_kaggle

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [33]:
from pathlib import Path

# Write the predictions without the index column. The output folder is created
# first, because to_csv fails when the target directory does not exist.
submission_path = Path("./output/titanic_predictions.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
df_kaggle.to_csv(submission_path, index=False)