In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [26]:
# Load the training and submission CSVs, impute missing Age/Fare values with the
# column mean, and create the train/validation split used by the model cells.
from sklearn.preprocessing import OneHotEncoder

features = ['Sex','Parch','Age','Pclass', 'Embarked', 'Fare']

# --- training data ---
train_df = pd.read_csv("data/data.csv")

# filling missing values with mean
age_avg = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(age_avg)
fare_avg = train_df['Fare'].mean()
train_df['Fare'] = train_df['Fare'].fillna(fare_avg)

X = train_df[features]
y = train_df['Survived']
# random_state is fixed so the split is the same on every run
X_train, X_valid, y_train, y_val = train_test_split(X, y, random_state = 0)


# --- submission data ---
submission_df = pd.read_csv("data/submission_data.csv")
passenger_id = submission_df['PassengerId'] # needs to be saved later for submission
# Explicit copy: the fills below must write to an independent frame, not to a
# slice of the frame returned by read_csv.
submission_df = submission_df[features].copy()
age_avgs = submission_df['Age'].mean()
submission_df['Age'] = submission_df['Age'].fillna(age_avgs)
fare_avgs = submission_df['Fare'].mean()
# Bug fix: the filled fares used to be assigned to train_df['Fare'], which left
# the submission frame's missing fares untouched and overwrote the training column.
submission_df['Fare'] = submission_df['Fare'].fillna(fare_avgs)

In [27]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only and then re-used for the validation and submission frames.
object_cols = ['Sex', 'Embarked']
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Build each encoded frame directly on its source frame's index so that the
# column-wise concat below lines the rows up correctly.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]), index=X_train.index
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]), index=X_valid.index
)
OH_cols_sub = pd.DataFrame(
    OH_encoder.transform(submission_df[object_cols]), index=submission_df.index
)

# Numeric part of each frame: everything except the categorical columns.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)
num_sub = submission_df.drop(columns=object_cols)

# Numeric features followed by the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_sub = pd.concat([num_sub, OH_cols_sub], axis=1)

# The encoded columns carry integer labels; cast every label to str so each
# frame has uniformly string column names.
for encoded_frame in (OH_X_train, OH_X_valid, OH_sub):
    encoded_frame.columns = encoded_frame.columns.astype(str)

In [28]:
# Random forest classifier. random_state is fixed (same seed as the train/validation
# split) so the validation score and the submission predictions are reproducible
# across kernel restarts; scores recorded from earlier unseeded runs may differ.
forest_model = RandomForestClassifier(random_state=0)
forest_model.fit(OH_X_train, y_train)
predictions = forest_model.predict(OH_X_valid)

In [29]:
def accuracy(actual, pred):
    """Return the fraction of positions where ``pred`` equals ``actual``.

    Both inputs are converted to NumPy arrays and compared position by
    position, so lists, arrays and pandas Series are all accepted and any
    pandas index labels are ignored.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    numpy.float64
        Share of matching positions in [0, 1]; NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual_arr = np.asarray(actual)
    pred_arr = np.asarray(pred)
    # Fail loudly on mismatched inputs instead of relying on broadcasting rules.
    if actual_arr.shape != pred_arr.shape:
        raise ValueError(
            f"shape mismatch: actual {actual_arr.shape} vs pred {pred_arr.shape}"
        )
    return (actual_arr == pred_arr).mean()

# Validation accuracy of the random forest predictions.
forest_accuracy = accuracy(y_val, predictions)
print(forest_accuracy)

0.8340807174887892


In [30]:
# Logistic regression (iteration cap set to 500), scored on the validation split.
log_reg_model = LogisticRegression(max_iter=500).fit(OH_X_train, y_train)
preds_log = log_reg_model.predict(OH_X_valid)
log_reg_accuracy = accuracy(y_val, preds_log)
print(log_reg_accuracy)

0.7982062780269058


In [31]:
# Gaussian naive Bayes, scored on the validation split.
gnb_model = GaussianNB().fit(OH_X_train, y_train)
preds_gnb = gnb_model.predict(OH_X_valid)
gnb_accuracy = accuracy(y_val, preds_gnb)
print(gnb_accuracy)

0.7399103139013453


In [32]:
# Single decision tree, scored on the validation split. random_state is fixed
# (same seed as the train/validation split) so the score is reproducible;
# scores recorded from earlier unseeded runs may differ.
decision_model = DecisionTreeClassifier(random_state=0)
decision_model.fit(OH_X_train, y_train)
decision_p = decision_model.predict(OH_X_valid)
print(accuracy(y_val, decision_p))

0.7713004484304933


In [33]:
# Predict on the encoded submission frame with the random forest, which had the
# highest validation accuracy of the four models compared above.
final_preds = forest_model.predict(OH_sub)


In [34]:
# Display the predicted labels for the submission set.
final_preds

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [48]:
# Assemble the submission table (index: PassengerId, column: Survived) and write
# it to predictions.csv.
# The frame is built positionally from final_preds, so it does not depend on
# passenger_id carrying a default 0..n-1 index. The previous column-wise concat
# aligned on index labels and would have produced NaN rows otherwise.
# NOTE: this rebinds `predictions`, which earlier held the forest's validation
# predictions; re-run the forest cell before re-running its accuracy cell.
predictions = pd.DataFrame(
    {'Survived': final_preds},
    index=pd.Index(passenger_id, name='PassengerId'),
)
predictions.to_csv('predictions.csv')