# Starters

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [104]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [105]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Strategy

- Title of each passenger
- Ticket -> separate the ticket into two groups: numeric and alphanumerical
- Treat null values for Cabin
- Encoding non numerical features (OHE):
  - sex
  - embarked
  - title
- Scaling numerical features
  - age
  - sibsp
  - parch
  - fare

# Data Cleaning

## Check if it's balanced

In [106]:
train['Survived'].value_counts(normalize=True) * 100

Survived
0    61.616162
1    38.383838
Name: proportion, dtype: float64

# Preprocessing Layer

In [107]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## Functions

In [108]:
def treat_name(X):
  X = X.copy()
  X['Title'] = X['Name'].apply(lambda x: x.split(' ')[1].replace('.',''))
  # after the value counts to see the most frequent ones
  most_common = ['Mr', 'Miss', 'Mrs', 'Master']
  X['Title'] = X['Title'].apply(lambda x: x if x in most_common else 'Other')

  return X

def binary_ticket(X):
  X = X.copy()
  X['Ticket_binary'] = X['Ticket'].apply(lambda x: 1 if x.isalnum() else 0)

  return X

def cabin_label(X):
  X = X.copy()
  X['Cabin_labeled'] = X['Cabin'].fillna(0).apply(lambda x: 0 if x == 0 else 1)

  return X

binary_ticket_transf = FunctionTransformer(binary_ticket)

treat_name_transf = FunctionTransformer(treat_name)

cabin_label_transf = FunctionTransformer(cabin_label)

## Imputer, OHE, Scaler and Dropper

In [109]:
imputer = ColumnTransformer(
    transformers=[
        ('imp_embarked', SimpleImputer(strategy='most_frequent'), ['Embarked']),
        ('imp_age', SimpleImputer(strategy='median'), ['Age', 'Fare'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform='pandas')

ohe = ColumnTransformer(
    transformers=[
        ('ohe',
         OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore'),
         ['Sex', 'Embarked', 'Title'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform='pandas')

scaler = ColumnTransformer(
    transformers=[
        ('min_max', MinMaxScaler(), ['Age', 'SibSp', 'Parch']),
        ('rob', RobustScaler(), ['Fare'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform='pandas')

def cols_dropper(X):
  X = X.copy()
  cols_to_drop = ['PassengerId', 'Pclass', 'Name', 'Ticket', 'Cabin']
  return X.drop(columns=cols_to_drop)

cols_dropper_transf = FunctionTransformer(cols_dropper)

## Final Preproc Pipeline

In [110]:
preproc_pipe = Pipeline([
    ('binary_ticket', binary_ticket_transf.set_output(transform='pandas')),
    ('treat_name', treat_name_transf.set_output(transform='pandas')),
    ('cabin_label', cabin_label_transf.set_output(transform='pandas')),
    ('imputer', imputer.set_output(transform='pandas')),
    ('ohe', ohe.set_output(transform='pandas')),
    ('scaler', scaler.set_output(transform='pandas')),
    ('cols_dropper', cols_dropper_transf.set_output(transform='pandas')),
])

preproc_pipe

## Preprocessing Pipeline

In [111]:
from sklearn.model_selection import train_test_split

X = train.drop(columns='Survived')
y = train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42)

X_train_preproc = preproc_pipe.fit_transform(X_train)
X_val_preproc = preproc_pipe.transform(X_val)

# Modeling

## Logistic Regression

### Ridge

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV

log = LogisticRegression()

cv_results = cross_validate(log, X_train_preproc, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

log.fit(X_train_preproc, y_train)

print(f"Val score: {log.score(X_val_preproc, y_val)}")

Train score: 0.8314606741573034
Val score: 0.7982062780269058


### Lasso

In [113]:
lasso = LogisticRegression(penalty='l1', solver='liblinear')

cv_results = cross_validate(lasso, X_train_preproc, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

lasso.fit(X_train_preproc, y_train)

print(f"Val score: {lasso.score(X_val_preproc, y_val)}")

Train score: 0.8337078651685393
Val score: 0.8004484304932735


### Elastic Net

In [114]:
enet = LogisticRegression(penalty='elasticnet', l1_ratio=0.5 , solver='saga')

cv_results = cross_validate(enet, X_train_preproc, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

enet.fit(X_train_preproc, y_train)

print(f"Val score: {enet.score(X_val_preproc, y_val)}")

Train score: 0.8247191011235955
Val score: 0.8026905829596412


## SVM

### Linear SVM

In [115]:
from sklearn.svm import LinearSVC

lin_svm = LinearSVC(loss='hinge')

params_grid = [
  {'C':[0.001, 0.01, 1, 10]}
]

grid_search = GridSearchCV(
  lin_svm, param_grid=params_grid
)

grid_search.fit(X_train_preproc, y_train)

print(f"Best params: {grid_search.best_params_}")

best_lin_svm = grid_search.best_estimator_

cv_results = cross_validate(best_lin_svm, X_train_preproc, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

best_lin_svm.fit(X_train_preproc, y_train)

print(f"Val score: {best_lin_svm.score(X_val_preproc, y_val)}")

Best params: {'C': 1}
Train score: 0.8179775280898877
Val score: 0.820627802690583


### Non Linear

In [116]:
from sklearn.svm import SVC

svm = SVC()

params_grid = {
    'C': [0.001, 0.01, 1, 10],
    'kernel': ['rbf', 'poly'],
    'gamma': [0.1, 1, 5, 20]
}

grid_search = GridSearchCV(
  svm, param_grid=params_grid, n_jobs=-1
)

grid_search.fit(X_train_preproc, y_train)

best_svm = grid_search.best_estimator_

print(f"Best params: {grid_search.best_params_}")

best_svm = grid_search.best_estimator_

cv_results = cross_validate(best_svm, X_train_preproc, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

best_svm.fit(X_train_preproc, y_train)

print(f"Val score: {best_svm.score(X_val_preproc, y_val)}")

Best params: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Train score: 0.8179775280898877
Val score: 0.8161434977578476


## KNN

In [117]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

params_grid = {'n_neighbors' : np.arange(1,30,1)}

grid_search = GridSearchCV(knn, param_grid=params_grid, n_jobs=-1,  cv = 5)

grid_search.fit(X_train_preproc, y_train)

best_knn = grid_search.best_estimator_

print(f"Best params: {grid_search.best_params_}")

best_knn = grid_search.best_estimator_

cv_results = cross_validate(best_knn, X_train_preproc, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

best_knn.fit(X_train_preproc, y_train)

print(f"Val score: {best_knn.score(X_val_preproc, y_val)}")

Best params: {'n_neighbors': 9}
Train score: 0.804494382022472
Val score: 0.7959641255605381


In [118]:
X_test_preproc = preproc_pipe.transform(test)

y_pred = best_lin_svm.predict(X_test_preproc)

sub = pd.Series(y_pred, index=test['PassengerId'], name='Survived')

In [119]:
sub.to_csv('linear_svm_model.csv', header=True)

# Modeling for Decision Trees

For Decision Trees, we don't need to scale the numerical features, so let's cut this part of the preprocessing pipeline.

In [120]:
trees_pre_proc_pipe = Pipeline([
    ('binary_ticket', binary_ticket_transf.set_output(transform='pandas')),
    ('treat_name', treat_name_transf.set_output(transform='pandas')),
    ('cabin_label', cabin_label_transf.set_output(transform='pandas')),
    ('imputer', imputer.set_output(transform='pandas')),
    ('ohe', ohe.set_output(transform='pandas')),
    ('cols_dropper', cols_dropper_transf.set_output(transform='pandas')),
])

trees_pre_proc_pipe

In [121]:
X_train_preproc_trees = trees_pre_proc_pipe.fit_transform(X_train)
X_val_preproc_trees = trees_pre_proc_pipe.transform(X_val)


## Decision Trees

In [122]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42)

params_grid = {'max_depth': np.arange(1, 11), 'min_samples_split': np.arange(1, 100, 10)}

grid_search = GridSearchCV(tree, param_grid=params_grid, n_jobs=-1,  cv = 5)

grid_search.fit(X_train_preproc_trees, y_train)

best_tree = grid_search.best_estimator_

print(f"Best params: {grid_search.best_params_}")

best_tree = grid_search.best_estimator_

cv_results = cross_validate(best_tree, X_train_preproc_trees, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

best_tree.fit(X_train_preproc_trees, y_train)

print(f"Val score: {best_tree.score(X_val_preproc_trees, y_val)}")

Best params: {'max_depth': 9, 'min_samples_split': 11}
Train score: 0.802247191011236
Val score: 0.7533632286995515


## Random Forest

In [123]:
from sklearn.ensemble import RandomForestClassifier

random = RandomForestClassifier()

params_grid = {
    'n_estimators': np.arange(100, 500, 100),
    'max_depth': np.arange(1, 11),
    'min_samples_split': np.arange(2, 100, 10)
}

grid_search = GridSearchCV(random, param_grid=params_grid, n_jobs=-1,  cv = 5)

grid_search.fit(X_train_preproc_trees, y_train)

best_random = grid_search.best_estimator_

print(f"Best params: {grid_search.best_params_}")

best_random = grid_search.best_estimator_

cv_results = cross_validate(best_random, X_train_preproc_trees, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

best_random.fit(X_train_preproc_trees, y_train)

print(f"Val score: {best_random.score(X_val_preproc_trees, y_val)}")

Best params: {'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 300}
Train score: 0.8269662921348315
Val score: 0.8318385650224215


## AdaBoost

In [124]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(estimator=DecisionTreeClassifier())

params_grid = {
    'n_estimators': np.arange(100, 500, 100),                 # do AdaBoost
    'estimator__max_depth': np.arange(1, 11),                 # da árvore base
    'estimator__min_samples_split': np.arange(2, 100, 10)     # da árvore base (não pode ser 1!)
}

grid_search = GridSearchCV(ada, param_grid=params_grid, n_jobs=-1,  cv = 5)

grid_search.fit(X_train_preproc_trees, y_train)

best_ada = grid_search.best_estimator_

print(f"Best params: {grid_search.best_params_}")

best_ada = grid_search.best_estimator_

cv_results = cross_validate(best_ada, X_train_preproc_trees, y_train, cv=5)
print(f"Train score: {cv_results['test_score'].mean()}")

best_ada.fit(X_train_preproc_trees, y_train)

print(f"Val score: {best_ada.score(X_val_preproc_trees, y_val)}")

Best params: {'estimator__max_depth': 2, 'estimator__min_samples_split': 12, 'n_estimators': 300}
Train score: 0.8471910112359551
Val score: 0.8183856502242153


# Predictions for test set

In [125]:
X_test_preproc_trees = trees_pre_proc_pipe.transform(test)

y_pred = best_ada.predict(X_test_preproc_trees)

sub = pd.Series(y_pred, index=test['PassengerId'], name='Survived')
sub.shape

(418,)

In [128]:
sub.to_csv('adaboost_model.csv', header=True)