# Exercise 2b: Feature engineering

In [232]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [233]:
# Load the separate train/test feature and label CSV files (paths are relative to the
# notebook's working directory). The label files are read as DataFrames; later cells
# flatten y_train with `.values.ravel()` before fitting.
X_train = pd.read_csv("ex2_train.csv")
y_train = pd.read_csv("ex2_class_train.csv")
X_test = pd.read_csv("ex2_test.csv")
y_test = pd.read_csv("ex2_class_test.csv")

In [234]:
# Deprecated utility to print out the prediction performance; prefer evaluate_result.
def evaluate_result_deprecated(y_test, y_pred, clf, X_test=None):
    """Print accuracy, precision, recall, F1 and ROC-AUC for a fitted classifier.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Hard class predictions for the same rows.
        clf: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix that ``clf.predict_proba`` is scored on for the
            ROC-AUC. When omitted, the notebook-level ``X_test_processed`` is
            used, which is what this function always did before the parameter
            existed.

    Raises:
        NameError: If ``X_test`` is omitted and ``X_test_processed`` has not
            been defined yet.
    """
    if X_test is None:
        # Legacy behaviour: read the baseline test matrix from notebook state.
        # Only valid for a classifier trained on those same columns; pass
        # X_test explicitly for any other model.
        X_test = X_test_processed

    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print(f'Precision: {precision_score(y_test, y_pred):.4f}')
    print(f'Recall: {recall_score(y_test, y_pred):.4f}')
    print(f'F1-score: {f1_score(y_test, y_pred):.4f}')
    # ROC-AUC is scored on the second predict_proba column.
    print(f'AUC-ROC: {roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]):.4f}')

In [None]:
# Utility that prints the prediction performance and hands the scores back
def evaluate_result(y_test, y_pred, clf, X_test):
    """Print and return the evaluation metrics of a fitted classifier.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Hard class predictions for the same rows.
        clf: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix passed to ``clf.predict_proba`` for the ROC-AUC.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    # (label, score) pairs in the order they are printed and returned;
    # ROC-AUC is scored on the second predict_proba column.
    scores = (
        ('Accuracy', accuracy_score(y_test, y_pred)),
        ('Precision', precision_score(y_test, y_pred)),
        ('Recall', recall_score(y_test, y_pred)),
        ('F1-score', f1_score(y_test, y_pred)),
        ('AUC-ROC', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])),
    )

    for label, score in scores:
        print(f'{label}: {score:.4f}')

    return tuple(score for _, score in scores)

## Prototyping (without feature engineering)

In [236]:
def preprocess(data_in):
    """Minimal preprocessing: drop ``Name``, impute missing values, one-hot encode.

    Args:
        data_in: DataFrame containing at least the columns ``Name``, ``Age``,
            ``Fare``, ``Embarked`` and ``Sex``. It is not modified.

    Returns:
        A new DataFrame without ``Name``, where missing ``Age``/``Fare`` are
        replaced by the column median and missing ``Embarked`` by the column
        mode (all computed from ``data_in`` itself), and ``Sex``/``Embarked``
        are replaced by dummy columns with the first level dropped.
    """
    data = data_in.drop(columns=['Name'])

    fill_values = {
        'Age': data['Age'].median(),
        'Fare': data['Fare'].median(),
    }
    # mode() is empty when every Embarked value is missing; skip the fill then
    # instead of failing on mode()[0].
    embarked_mode = data['Embarked'].mode()
    if not embarked_mode.empty:
        fill_values['Embarked'] = embarked_mode[0]

    # One non-inplace, frame-level fillna. The previous
    # `data[col].fillna(..., inplace=True)` acts on a temporary column object:
    # it warns in pandas 2.x and leaves `data` untouched under Copy-on-Write.
    data = data.fillna(value=fill_values)

    # Convert categorical variables to dummy/indicator variables
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

    return data

In [237]:
# Baseline: run the plain preprocessing on each split.
# NOTE(review): the two splits are preprocessed independently, so the median/mode fill
# values and the set of dummy columns are derived from each split on its own — confirm
# that train and test end up with identical columns.
X_train_processed = preprocess(X_train)
X_test_processed = preprocess(X_test)

# y_train was read as a DataFrame; ravel() flattens it to the 1-D array passed to fit().
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_processed, y_train.values.ravel())
y_pred = clf.predict(X_test_processed)

print('Random Forest Model without Feature Engineering')
# Keep the baseline scores; a later cell subtracts them from the engineered model's scores.
base_acc, base_prec, base_rec, base_f1, base_auc = evaluate_result(y_test, y_pred, clf, X_test_processed)

Random Forest Model without Feature Engineering
Accuracy: 0.8101
Precision: 0.7778
Recall: 0.7568
F1-score: 0.7671
AUC-ROC: 0.8736


## Feature engineering

The classification using simple preprocessed data gives only mediocre performance.

**TODO: You should make use of the insights from your EDA (ex2a) to complete the following feature engineering function below.** Later the function will replace the simple preprocessing.

You will pass the exercise if your feature engineering can improve the performance (i.e., winning in three or more metrics).

In [248]:
def feature_engineering(data_in):
    """Add title, age-band, fare-band and family features, then run ``preprocess``.

    Args:
        data_in: Raw passenger DataFrame with at least ``Name``, ``Age``,
            ``Fare``, ``SibSp`` and ``Parch`` plus whatever ``preprocess``
            requires. It is not modified.

    Returns:
        The frame returned by ``preprocess`` with these 0/1 or count columns added:
        ``ImportantWoman``, ``Mr``, ``Child``, ``Adult``, ``Senior``,
        ``LowFare``, ``MediumFare``, ``HighFare``, ``FamilySize``, ``IsAlone``.

    Note:
        The age and fare bands are derived before ``preprocess`` is called, so a
        row whose ``Age`` (or ``Fare``) is missing at that point gets 0 in all
        three of the corresponding band columns.
    """
    # Titles collapsed into one group (avoids one sparse feature per rare title)
    grouped_titles = ["Lady", "Mlle", "Mme", "Ms", "the Countess"]

    # The title is the text between the comma and the first period in "Name"
    honorific = (
        data_in["Name"]
        .str.extract(r",\s*([^\.]*)\.", expand=False)
        .str.strip()
        .replace(grouped_titles, "ImportantWoman")
    )

    # "Title" is only a scratch column: set first, removed once the flags exist
    engineered = data_in.assign(
        Title=honorific,
        ImportantWoman=honorific.eq("ImportantWoman").astype(int),
        Mr=honorific.eq("Mr").astype(int),
    ).drop(columns=["Title"])

    # Age bands: below 10, 10 to 60 inclusive, above 60
    age = engineered["Age"]
    engineered = engineered.assign(
        Child=age.lt(10).astype(int),
        Adult=(age.ge(10) & age.le(60)).astype(int),
        Senior=age.gt(60).astype(int),
    )

    # Fare bands: below 10, 10 up to (not including) 30, 30 and above
    fare = engineered["Fare"]
    engineered = engineered.assign(
        LowFare=fare.lt(10).astype(int),
        MediumFare=(fare.ge(10) & fare.lt(30)).astype(int),
        HighFare=fare.ge(30).astype(int),
    )

    engineered = preprocess(engineered)

    # Family features: the passenger plus siblings/spouses plus parents/children
    family_size = engineered["SibSp"] + engineered["Parch"] + 1
    return engineered.assign(
        FamilySize=family_size,
        IsAlone=family_size.eq(1).astype(int),
    )

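I extracted each passenger's title from `Name` and added two binary features: `ImportantWoman` (Lady, Mlle, Mme, Ms, the Countess) and `Mr`.
I grouped passengers by age (`Child`: under 10, `Adult`: 10 to 60, `Senior`: over 60), since age seemed to be an important feature for predicting the survival rate.
I also added fare groups (`LowFare`: under 10, `MediumFare`: 10 to under 30, `HighFare`: 30 and above), `FamilySize` (`SibSp` + `Parch` + 1), and `IsAlone` (family size of 1).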

In [249]:
# Build the engineered feature matrices by running the same function on each split.
X_train_processed_engineered = feature_engineering(X_train)
X_test_processed_engineered = feature_engineering(X_test)

In [240]:
# Preview the first rows of the engineered training features.
# NOTE(review): the saved output of this cell lists a `YoungAdult` column and no `Senior`
# column, which does not match what feature_engineering currently creates — re-run the
# notebook top to bottom to refresh it.
X_train_processed_engineered.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,ImportantWoman,Mr,Child,YoungAdult,Adult,LowFare,MediumFare,HighFare,Sex_male,Embarked_Q,Embarked_S,FamilySize,IsAlone
0,1,45.5,0,0,28.5,0,1,0,0,1,0,1,0,1,0,1,1,1
1,2,23.0,0,0,13.0,0,1,0,1,0,0,1,0,1,0,1,1,1
2,3,32.0,0,0,7.925,0,1,0,1,0,1,0,0,1,0,1,1,1
3,3,26.0,1,0,7.8542,0,1,0,1,0,1,0,0,1,0,1,2,0
4,3,6.0,4,2,31.275,0,0,1,0,0,0,0,1,0,0,1,7,0


In [250]:
# Same model settings as the baseline (n_estimators=100, random_state=42), so the
# comparison differs only in the input features.
clf_engineered = RandomForestClassifier(n_estimators=100, random_state=42)
clf_engineered.fit(X_train_processed_engineered, y_train.values.ravel())
y_pred_eng = clf_engineered.predict(X_test_processed_engineered)

print('Random Forest Model with Feature Engineering')
# Keep the engineered scores for the per-metric comparison with the baseline.
eng_acc, eng_prec, eng_rec, eng_f1, eng_auc = evaluate_result(y_test, y_pred_eng, clf_engineered, X_test_processed_engineered)

Random Forest Model with Feature Engineering
Accuracy: 0.8324
Precision: 0.8143
Recall: 0.7703
F1-score: 0.7917
AUC-ROC: 0.8955


In [251]:
# Per-metric difference between the engineered model and the baseline
# (engineered minus baseline, so a positive value means the engineered model scored higher).
acc_improvement = eng_acc - base_acc
print(f'Accuracy Improvement: {acc_improvement:.4f}')
prec_improvement = eng_prec - base_prec
print(f'Precision Improvement: {prec_improvement:.4f}')
rec_improvement = eng_rec - base_rec
print(f'Recall Improvement: {rec_improvement:.4f}')
f1_improvement = eng_f1 - base_f1
print(f'F1 Improvement: {f1_improvement:.4f}')
auc_improvement = eng_auc - base_auc
print(f'AUC-ROC Improvement: {auc_improvement:.4f}')

Accuracy Improvement: 0.0223
Precision Improvement: 0.0365
Recall Improvement: 0.0135
F1 Improvement: 0.0245
AUC-ROC Improvement: 0.0219


All metrics improved!

In [243]:
# Tally the misclassified test rows of the base and the engineered model
wrong_base, wrong_engineered = (
    np.sum(predictions != y_test.values.ravel())
    for predictions in (y_pred, y_pred_eng)
)
print(f'Wrong predictions in base model: {wrong_base}')
print(f'Wrong predictions in engineered model: {wrong_engineered}')
print(f"Total predictions: {len(y_test)}")

Wrong predictions in base model: 34
Wrong predictions in engineered model: 30
Total predictions: 179


We also got fewer wrong predictions (30 instead of 34 out of 179).