## Titanic Survivors

In [1]:
import zipfile
from pathlib import Path

p = Path("train.csv")
if not p.exists():
    with zipfile.ZipFile("data/titanic.zip", "r") as z:
        z.extractall("data")

In [2]:
import pandas as pd

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

print("Train Data Overview:")
print(train.head())
print(train.info())
print(train.describe())

Train Data Overview:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.050

In [3]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
train["Age"] = train["Age"].fillna(train["Age"].median())
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [6]:
train = pd.get_dummies(train, columns=["Sex", "Embarked"], drop_first=True)
print(train.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   
2                             Heikkinen, Miss. Laina  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   
4                           Allen, Mr. William Henry  35.0      0      0   

             Ticket     Fare Cabin  Sex_male  Embarked_Q  Embarked_S  
0         A/5 21171   7.2500   NaN      True       False        True  
1          PC 17599  71.2833   C85     False       False       False  
2  STON/O2. 3101282   7.9250   NaN     False       False        True  
3            113803  53.1000  C123     Fal

In [7]:
X = train.drop(["Survived", "Name", "PassengerId", "Cabin", "Ticket"], axis=1)
y = train["Survived"]

print("Features:")
print(X.head())
print("Target:")
print(y.head())

Features:
   Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  Embarked_S
0       3  22.0      1      0   7.2500      True       False        True
1       1  38.0      1      0  71.2833     False       False       False
2       3  26.0      0      0   7.9250     False       False        True
3       1  35.0      1      0  53.1000     False       False        True
4       3  35.0      0      0   8.0500      True       False        True
Target:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_val, y_val)

0.8156424581005587

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score

y_pred_all_died = np.zeros_like(y_val)
print("All died:", accuracy_score(y_val, y_pred_all_died))

y_pred_female_survived = (X_val["Sex_male"] == 0).astype(int)
print("Female survived:", accuracy_score(y_val, y_pred_female_survived))

All died: 0.5865921787709497
Female survived: 0.7821229050279329


In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")

print(scores)
print("mean:", scores.mean())
print("std:", scores.std())

[0.79888268 0.81460674 0.84831461 0.79213483 0.81460674]
mean: 0.813709120582512
std: 0.01941352293771137


In [11]:
print("\nTest Data Overview:")
print(test.head())
print(test.info())
print(test.describe())


Test Data Overview:
   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ----

In [12]:
y = train["Survived"]
train_features = train.drop(columns=["Survived"])

In [13]:
full = pd.concat([train_features, test], axis=0, ignore_index=True)

In [14]:
full["Age"] = full["Age"].fillna(full["Age"].median())
full["Embarked"] = full["Embarked"].fillna(full["Embarked"].mode()[0])
full["Fare"] = full["Fare"].fillna(full["Fare"].median())

titles = full["Name"].str.extract(r",\s*([^\.]+)\.", expand=False)
full = full.drop(columns=["Cabin", "Ticket", "Name"])

# full = pd.get_dummies(full, columns=["Sex", "Embarked"], drop_first=True)

In [15]:
#X = full.iloc[: len(train), :]
#model.fit(X, y)

In [16]:
# X_test = full.iloc[len(train) :, :]
# preds = model.predict(X_test)

# submission = pd.DataFrame(
#     {"PassengerId": test["PassengerId"],
#      "Survived": preds.astype(int)}
# )

# submission.to_csv("data/submission.csv", index=False)

In [17]:
full["FamilySize"] = full["SibSp"] + full["Parch"] + 1
full["IsAlone"] = (full["FamilySize"] == 1).astype(int)

In [18]:
titles = titles.replace(["Mlle", "Ms"], "Miss")
titles = titles.replace("Mme", "Mrs")
titles = titles.replace(
    [
        "Lady",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ],
    "Rare",
)
full["Title"] = titles

In [19]:
full = full.drop(columns=["Sex_male", "Embarked_Q", "Embarked_S"])
full = pd.get_dummies(full, columns=["Sex", "Embarked", "Title"], drop_first=True)

print(full.head())
print(full.isnull().sum())

   PassengerId  Pclass   Age  SibSp  Parch     Fare  FamilySize  IsAlone  \
0            1       3  22.0      1      0   7.2500           2        0   
1            2       1  38.0      1      0  71.2833           2        0   
2            3       3  26.0      0      0   7.9250           1        1   
3            4       1  35.0      1      0  53.1000           2        0   
4            5       3  35.0      0      0   8.0500           1        1   

   Sex_male  Embarked_Q  Embarked_S  Title_Miss  Title_Mr  Title_Mrs  \
0     False       False        True       False      True      False   
1     False       False        True       False     False       True   
2     False       False        True        True     False      False   
3     False       False        True       False     False       True   
4     False       False        True       False      True      False   

   Title_Rare  Title_the Countess  
0       False               False  
1       False               False  
2 

In [20]:
y = train["Survived"]
n_train = len(train)

X = full.iloc[:n_train, :].copy()
X_test = full.iloc[n_train:, :].copy()

test_passenger_ids = test["PassengerId"]

print(X.shape, X_test.shape)

(891, 16) (418, 16)


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

model = GradientBoostingClassifier(random_state=42)

scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
print(scores)
print("mean:", scores.mean(), "std:", scores.std())

[0.61452514 0.81460674 0.83146067 0.82022472 0.8258427 ]
mean: 0.7813319942250957 std: 0.08359242402024637


In [22]:
model.fit(X, y)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [23]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
print(scores, scores.mean(), scores.std())

[0.75977654 0.79775281 0.85393258 0.80898876 0.83707865] 0.8115058690603227 0.03263556256641751


In [24]:
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X, y)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
preds = model.predict(X_test)

submission = pd.DataFrame(
    {"PassengerId": test_passenger_ids, "Survived": preds.astype(int)}
)
submission.to_csv("data/submission.csv", index=False)

In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [300, 600],
    "max_depth": [4, 6, 8, None],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 3],
}

grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 600}
0.8171112924486851


In [27]:
best_model = grid.best_estimator_

print(best_model)

RandomForestClassifier(min_samples_leaf=3, n_estimators=600, random_state=42)


In [28]:

best_model.fit(X, y)
preds = best_model.predict(X_test)

submission = pd.DataFrame(
    {"PassengerId": test_passenger_ids,
     "Survived": preds.astype(int)}
)
submission.to_csv("data/submission.csv", index=False)

In [29]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
)

scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
print(scores, scores.mean(), scores.std())

[0.78212291 0.81460674 0.8258427  0.79213483 0.81460674] 0.8058627832527776 0.016150226198288393


In [30]:
model.fit(X, y)
preds = model.predict(X_test)

submission = pd.DataFrame(
    {"PassengerId": test_passenger_ids,
     "Survived": preds.astype(int)}
)
submission.to_csv("data/submission.csv", index=False)