In [237]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Clean the data

In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train['Male'] = train['Sex'] == 'male'

In [5]:
train['Female'] = train['Sex'] == 'female'

In [6]:
train['Cherbourg'] = train['Embarked'] == 'C'
train['Queestown'] = train['Embarked'] == 'Q'
train['Southampton'] = train['Embarked'] == 'S'

In [7]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Male', 'Female',
       'Cherbourg', 'Queestown', 'Southampton'],
      dtype='object')

In [8]:
train[train.duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Male,Female,Cherbourg,Queestown,Southampton


In [9]:
cleaned_train = train[['Pclass', 'Age', 'SibSp',
       'Parch', 'Fare', 'Male', 'Female',
       'Cherbourg', 'Queestown', 'Southampton','Survived']]

In [10]:
cleaned_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,891.0,714.0,891.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208,0.383838
std,0.836071,14.526497,1.102743,0.806057,49.693429,0.486592
min,1.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104,0.0
50%,3.0,28.0,0.0,0.0,14.4542,0.0
75%,3.0,38.0,1.0,0.0,31.0,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [11]:
# clean age
cleaned_train['imputed_age'] = cleaned_train['Age'].isna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train['imputed_age'] = cleaned_train['Age'].isna()


In [12]:
cleaned_train['Age'].fillna(29.7,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_train['Age'].fillna(29.7,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train['Age'].fillna(29.7,inplace=True)


In [239]:
# normalize age
scaler = MinMaxScaler()
cleaned_train['normalized_age'] = scaler.fit_transform(cleaned_train[['Age']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train['normalized_age'] = scaler.fit_transform(cleaned_train[['Age']])


In [244]:
# normalize fare
# fare is very skewed to the right. use log
cleaned_train['normalized_fare'] = np.log1p(cleaned_train['Fare'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train['normalized_fare'] = np.log1p(cleaned_train['Fare'])


In [245]:
cleaned_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,normalized_age,normalized_fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.699293,0.523008,0.381594,32.204208,0.383838,0.367923,2.962246
std,0.836071,13.002015,1.102743,0.806057,49.693429,0.486592,0.163383,0.969048
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.9104,0.0,0.271174,2.187218
50%,3.0,29.7,0.0,0.0,14.4542,0.0,0.367932,2.737881
75%,3.0,35.0,1.0,0.0,31.0,1.0,0.434531,3.465736
max,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,6.240917


In [246]:
X = cleaned_train[['Pclass', 'normalized_age', 'SibSp',
       'Parch', 'normalized_fare', 'Male', 'Female',
       'Cherbourg', 'Queestown', 'Southampton','imputed_age']]

In [247]:
y = train['Survived']

In [248]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, train_size=0.8)

In [277]:
model = MLPClassifier(learning_rate_init=0.05,activation='relu'
                      ,hidden_layer_sizes=(128,64)
                     )

In [278]:
model.fit(X_train, y_train)

In [279]:
y_probs = model.predict_proba(X_test)[:, 1]

threshold = 0.5
y_pred_custom = (y_probs >= threshold).astype(int)
print(classification_report(y_test, y_pred_custom))

auc = roc_auc_score(y_test, y_probs)
print(f"AUC: {auc:.2f}")

              precision    recall  f1-score   support

           0       0.80      0.95      0.87       110
           1       0.90      0.62      0.74        69

    accuracy                           0.83       179
   macro avg       0.85      0.79      0.80       179
weighted avg       0.84      0.83      0.82       179

AUC: 0.83


In [280]:
y_probs = model.predict_proba(X_test)[:, 1]

threshold = 0.4
y_pred_custom = (y_probs >= threshold).astype(int)
print(classification_report(y_test, y_pred_custom))

auc = roc_auc_score(y_test, y_probs)
print(f"AUC: {auc:.2f}")

              precision    recall  f1-score   support

           0       0.80      0.94      0.87       110
           1       0.86      0.64      0.73        69

    accuracy                           0.82       179
   macro avg       0.83      0.79      0.80       179
weighted avg       0.83      0.82      0.81       179

AUC: 0.83


In [253]:
y_probs = model.predict_proba(X_test)[:, 1]

threshold = 0.3
y_pred_custom = (y_probs >= threshold).astype(int)
print(classification_report(y_test, y_pred_custom))

auc = roc_auc_score(y_test, y_probs)
print(f"AUC: {auc:.2f}")

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       110
           1       0.72      0.72      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

AUC: 0.84


In [228]:
final_test = pd.read_csv('data/test.csv')

In [230]:
final_test['Male'] = final_test['Sex'] == 'male'

final_test['Female'] = final_test['Sex'] == 'female'

final_test['Cherbourg'] = final_test['Embarked'] == 'C'
final_test['Queestown'] = final_test['Embarked'] == 'Q'
final_test['Southampton'] = final_test['Embarked'] == 'S'

In [232]:
cleaned_final_test = final_test[['Pclass', 'Age', 'SibSp',
       'Parch', 'Fare', 'Male', 'Female',
       'Cherbourg', 'Queestown', 'Southampton']]

In [233]:
cleaned_final_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,418.0,332.0,418.0,418.0,417.0
mean,2.26555,30.27259,0.447368,0.392344,35.627188
std,0.841838,14.181209,0.89676,0.981429,55.907576
min,1.0,0.17,0.0,0.0,0.0
25%,1.0,21.0,0.0,0.0,7.8958
50%,3.0,27.0,0.0,0.0,14.4542
75%,3.0,39.0,1.0,0.0,31.5
max,3.0,76.0,8.0,9.0,512.3292
