In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split

# configuration: everything a reader may want to tune lives here
random_seed = 42      # passed to train_test_split as random_state
write_csv = False     # stays False here, so the cell itself writes nothing
train_data_path = "data/train.csv"
test_data_path = "data/test.csv"

# load both CSV files (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# hold out 33% of the training rows; the seed keeps the split reproducible
split_train_data, split_test_data = train_test_split(
    train_data,
    test_size=0.33,
    random_state=random_seed,
)

In [85]:
# naive model: predict "survived" (1) for every female passenger, 0 otherwise
d = pd.DataFrame(split_test_data, columns=["Survived", "Sex"]).assign(
    # encode Sex as the prediction itself: 1 for "female", 0 for anything else
    Sex=lambda frame: (frame["Sex"] == "female").astype(int)
)

# share of held-out rows where the prediction equals the true label
hits = (d["Survived"] == d["Sex"]).astype(int)
accuracy = hits.sum() / len(d)
print(f"accuracy: {accuracy:.4f}")

if write_csv:
    # apply the same rule to the test set and save it as a submission file
    is_female = test_data["Sex"] == "female"
    output = pd.DataFrame({
        'PassengerId': test_data["PassengerId"],
        'Survived': is_female.astype(int),
    })
    output.to_csv('submission_naive.csv', index=False)

accuracy: 0.7966


In [86]:
# random forest model
from sklearn.ensemble import RandomForestClassifier

features = ["Pclass", "Sex", "SibSp", "Parch"]

# One-hot encode the selected features. The columns produced for the training
# split define the layout every frame passed to the model must follow.
X = pd.get_dummies(split_train_data[features])
y = split_train_data["Survived"]
model = RandomForestClassifier(n_estimators=300, max_depth=4, random_state=random_seed)
model.fit(X, y)

# Evaluate on the held-out split. get_dummies only creates columns for the
# categories present in the frame it receives, so align the result to the
# training columns (missing dummies become 0, unseen ones are dropped) instead
# of assuming both splits happen to contain the same categories.
A = pd.get_dummies(split_test_data[features]).reindex(columns=X.columns, fill_value=0)
b = split_test_data["Survived"]
predictions = model.predict(A)
comparison = pd.DataFrame({
    'predicted': predictions,
    'expected': b,
})
# fraction of held-out rows where the prediction matches the true label
accuracy = (comparison["predicted"] == comparison["expected"]).mean()
print(f"accuracy: {accuracy:.4f}")

if write_csv:
    # Same column alignment for the test set used for the submission.
    submission_features = pd.get_dummies(test_data[features]).reindex(
        columns=X.columns, fill_value=0
    )
    output = pd.DataFrame({
        'PassengerId': test_data["PassengerId"],
        'Survived': model.predict(submission_features)
    })
    # Dedicated file name: writing to 'submission_naive.csv' here would
    # overwrite the submission produced by the naive model cell.
    output.to_csv('submission_random_forest.csv', index=False)

accuracy: 0.8169
