# 2.7. Feature Engineering with ColumnTransformers

## Challenge :: Feature Engineer your data

In [1]:
import pandas as pd

In [88]:
# Load the training data; read_csv already splits on commas by default.
df = pd.read_csv("../data/train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [116]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from feature_engineering import AgeImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline

# Model inputs: the raw passenger attributes that get feature-engineered later.
X = df[[
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]]

# Target: the Survived column.
y = df['Survived']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.2,
)

X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
733,2,male,23.0,0,0,13.0000,S
857,1,male,51.0,0,0,26.5500,S
81,3,male,29.0,0,0,9.5000,S
319,1,female,40.0,1,1,134.5000,C
720,2,female,6.0,0,1,33.0000,S
...,...,...,...,...,...,...,...
575,3,male,19.0,0,0,14.5000,S
838,3,male,32.0,0,0,56.4958,S
337,1,female,41.0,0,0,134.5000,C
523,1,female,44.0,0,1,57.9792,C


In [126]:
# Column-wise preprocessing. Columns of X_train that are not listed in a step
# (Pclass, SibSp, Parch) are passed through unchanged and appended after the
# transformed columns.
transformer = ColumnTransformer([

    ('fillna_and_split_age', make_pipeline(
        # impute the missing values in the Age column with the median
        SimpleImputer(strategy='median'),

        # bin the imputed Age column into 3 quantile bins (young, middle-aged, old);
        # KBinsDiscretizer one-hot encodes the bins by default
        KBinsDiscretizer(n_bins=3, strategy='quantile')
    ), ['Age']),

    # one-hot-encode the Embarked and Sex columns (the step name is kept as-is
    # so lookups by step name keep working).
    # The encoder is left at its default (sparse) output: the `sparse=` keyword
    # was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4,
    # so neither spelling works on every version. Dense output is requested
    # once, on the ColumnTransformer, via sparse_threshold=0 below.
    ('ohe_embarked', OneHotEncoder(handle_unknown='ignore'), ['Embarked', 'Sex']),

    # scale the Fare column with min-max scaling
    ('scale_fare', MinMaxScaler(), ['Fare'])

], remainder='passthrough', sparse_threshold=0)  # 0 => always return a dense array

# Fit on the training split only, then apply the fitted transformer to the test
# split, so the preprocessing never learns anything from the test rows.
X_train_fe = transformer.fit_transform(X_train)
X_test_fe = transformer.transform(X_test)
pd.DataFrame(X_train_fe)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.025374,2.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.051822,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.018543,3.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.262527,1.0,1.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.064412,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.028302,3.0,0.0,0.0
708,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.110272,3.0,0.0,0.0
709,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.262527,1.0,0.0,0.0
710,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.113168,1.0,0.0,1.0


In [118]:
from sklearn.linear_model import LogisticRegression

# instantiate the model: a logistic-regression classifier created with
# scikit-learn's default hyperparameters (no arguments are overridden here)
m = LogisticRegression()

In [119]:
# Sanity check: (rows, columns) of the feature-engineered training matrix.
X_train_fe.shape

(712, 13)

In [120]:
# Sanity check: the number of training labels should equal the number of rows
# in X_train_fe before the two are passed to fit together.
y_train.shape

(712,)

In [121]:
# Display the training labels (the Survived values for the training rows).
y_train

733    0
857    1
81     1
319    1
720    1
      ..
575    0
838    1
337    1
523    1
863    0
Name: Survived, Length: 712, dtype: int64

In [122]:
# train the model on the feature-engineered training split.
# Re-run this cell whenever the feature-engineering cell is re-executed, so the
# fitted model always matches the current X_train_fe.
m.fit(X_train_fe, y_train)

LogisticRegression()

In [124]:
# Accuracy on the data the model was trained on (an optimistic estimate).
m.score(X_train_fe, y_train)

0.8061797752808989

In [127]:
# Accuracy on the held-out test split, transformed with the transformer that
# was fitted on the training split.
m.score(X_test_fe, y_test)

0.8100558659217877

In [128]:
# Predicted Survived labels (0 or 1) for every row of the held-out test split.
m.predict(X_test_fe)

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0])