# Exercise 2b: Feature engineering

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
X_train = pd.read_csv("ex2_train.csv")
y_train = pd.read_csv("ex2_class_train.csv")
X_test = pd.read_csv("ex2_test.csv")
y_test = pd.read_csv("ex2_class_test.csv")

In [16]:
# define a utility function to print out the prediction performance
def evaluate_result(y_test, y_pred, clf):
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print(f'Precision: {precision_score(y_test, y_pred):.4f}')
    print(f'Recall: {recall_score(y_test, y_pred):.4f}')
    print(f'F1-score: {f1_score(y_test, y_pred):.4f}')
    print(f'AUC-ROC: {roc_auc_score(y_test, clf.predict_proba(X_test_processed)[:, 1]):.4f}')

## Prototyping (without feature engineering)

In [19]:
def preprocess(data_in):
    data = data_in.drop(columns=['Name'])
    
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)

    # Convert categorical variables to dummy/indicator variables
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)
    
    return data

In [20]:
X_train_processed = preprocess(X_train)
X_test_processed = preprocess(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_processed, y_train.values.ravel())
y_pred = clf.predict(X_test_processed)

print('Random Forest Model without Feature Engineering')
evaluate_result(y_test, y_pred, clf)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

Random Forest Model without Feature Engineering
Accuracy: 0.8101
Precision: 0.7778
Recall: 0.7568
F1-score: 0.7671
AUC-ROC: 0.8736


## Feature engineering

The classification using simple preprocessed data gives only mediocre performance.

**TODO: You should make use of the insights from your EDA (ex2a) to complete the following feature engineering function below.** Later the function will replace the simple preprocessing.

You will pass the exercise if your feature engineering can improve the performance (i.e., winning in three or more metrics).

In [None]:
# data['Title'] = data_in['Name'].apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))
# data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',  'Dr', 'Major', 'Rev'], 'Noble')
# data['Title'] = data['Title'].replace(['Mlle', 'Ms', 'Mme'], 'Miss') 
# data['Title'] = data['Title'].replace(['Lady', 'Countess'], 'Mrs') 

In [117]:
def preprocess(data_in):
    data = data_in.drop(columns=['Name'])
    
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['Fare'] = np.log1p(data['Fare'])  # log(1 + Fare) s


    data['AgeGroup'] = pd.cut(data['Age'], bins=[0, 12, 18, 35, 60, 100], labels=['Child', 'Teenager', 'Adult', 'Middle Aged', 'Elderly'])

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['FamilyCategory'] = pd.cut(data['FamilySize'], bins=[0, 1, 4, 11], labels=['Single', 'SmallFamily', 'LargeFamily'])

    data['FarePerPerson'] = data['Fare'] / data['FamilySize']

    data['Title'] = data_in['Name'].apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1) if pd.notnull(x ) else 'None')
    
    # feature interaction
    data['Sex*Title'] = data['Sex'] + "_" + data['Title']
    # data['Pclass*Fare'] = data['Pclass'] * data['Fare']
    data['Pclass*Age'] = data['Pclass'] * data['Age']
    data['Fare*FamilySize'] = data['Fare'] / (data['FamilySize'])
    data['FarePerPerson*Pclass'] = data['FarePerPerson'] * data['Pclass']
    

    
    # Convert categorical variables to dummy/indicator variables
    data = pd.get_dummies(data, columns=['Sex', 'Embarked', "AgeGroup", "FamilyCategory", "Title", "Sex*Title"], drop_first=True)

    return data

In [118]:
def feature_engineering(data_in):
    df = data_in.copy()
    
    # Write your feature engineering code here. The following line is just a placeholder. You can remove it if needed.
    df = preprocess(df)
    
    return df

In [120]:
X_train_processed = feature_engineering(X_train)
X_test_processed = feature_engineering(X_test)
X_train_processed, X_test_processed = X_train_processed.align(X_test_processed, join='left', axis=1, fill_value=0)

# X_train_processed.isnull().sum()
# X_test_processed.isnull().sum()

In [121]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_processed, y_train.values.ravel())
y_pred = clf.predict(X_test_processed)

print('Random Forest Model with Feature Engineering')
evaluate_result(y_test, y_pred, clf)

Random Forest Model with Feature Engineering
Accuracy: 0.8380
Precision: 0.7922
Recall: 0.8243
F1-score: 0.8079
AUC-ROC: 0.8947


In [122]:
## All scores above is better than the previous results