# Titanic Survival Classification

Connect to Google Drive.

In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive are reachable from this notebook.
from google.colab import drive

drive.mount("/content/drive")


Import pandas and the scikit-learn modules used for modeling and evaluation.

In [None]:
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

Read the train data.  

In [None]:
# Load the training split; the first CSV column is used as the row index.
train_csv_path = "/content/drive/MyDrive/320/train.csv"
train_data = pd.read_csv(
    train_csv_path,
    sep=",",      # delimiter
    header=0,     # header in first row
    index_col=0,  # ids in first column
)

Read the test data.

In [None]:
# Load the test split; the first CSV column is used as the row index.
test_csv_path = "/content/drive/MyDrive/320/test.csv"
test_data = pd.read_csv(
    test_csv_path,
    sep=",",      # delimiter
    header=0,     # header in first row
    index_col=0,  # ids in first column
)

Check the train data for missingness, levels, etc.

In [None]:
# Summary statistics for every training column (include="all" covers
# non-numeric columns as well); the bare last expression is displayed.
train_summary = train_data.describe(include="all")
train_summary

Check the test data for missingness, levels, etc.

In [None]:
# Summary statistics for every test column (include="all" covers
# non-numeric columns as well); the bare last expression is displayed.
test_summary = test_data.describe(include="all")
test_summary

Select columns and create dummies.


In [None]:
# Feature columns fed to the models; "Survived" is the target and is kept
# separately in y_train.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# One-hot encode the object/category-typed predictors. drop_first=True drops
# the first level of each encoded column so the dummies are not redundant.
X_train = pd.get_dummies(train_data[predictors], drop_first=True)
y_train = train_data["Survived"]

# Encode the test split with *all* levels, then keep exactly the training
# columns in the training order. Encoding the two splits independently with
# drop_first=True only lines up when both splits contain the same category
# levels; otherwise the dropped baseline level or the column set can differ
# and the fitted models would receive missing, extra, or mis-encoded columns.
# Dummy columns that never occur in the test split are created and set to 0;
# existing missing values (NaN) are left untouched by the reindex.
X_test = (
    pd.get_dummies(test_data[predictors])
    .reindex(columns=X_train.columns, fill_value=0)
)

Replace missing values with the means from the training data.

In [None]:
# Column means come from the training features only, and the same values
# are used to fill gaps in both splits.
train_means = X_train.mean()

X_train, X_test = (
    features.fillna(train_means) for features in (X_train, X_test)
)

Fit logistic regression model and calculate accuracy using the train data.

In [None]:
# Build the logistic regression (iteration cap of 400), fit it on the
# training features, then score its predictions on that same training data.
my_lr = LogisticRegression(max_iter=400)
my_lr.fit(X_train, y_train)

lr_pred_train = my_lr.predict(X_train)
metrics.accuracy_score(y_true=y_train, y_pred=lr_pred_train)

Create confusion matrix for the train data.

In [None]:
# Raw-count confusion matrix of the logistic regression on the training data.
metrics.confusion_matrix(y_true=y_train, y_pred=lr_pred_train)

Make predictions using test data.

In [None]:
# Predicted class labels for the test features from the fitted logistic regression.
lr_pred_test = my_lr.predict(X=X_test)

Format predictions for output and write to csv.

In [None]:
# Single-column frame of predictions, indexed like the test features,
# written to Drive as CSV.
lr_output = pd.DataFrame({"Survived": lr_pred_test}, index=X_test.index)

lr_output_path = "/content/drive/MyDrive/320/lr_pred.csv"
lr_output.to_csv(lr_output_path)

Fit decision tree model and calculate accuracy on the train data.

In [None]:
# Fit a decision tree on the training features and score it on that same data.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed, equally good splits can be resolved
# differently from run to run, so the fitted tree (and the predictions written
# to tree_pred.csv) would not be reproducible under Restart & Run All.
my_tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
tree_pred_train = my_tree.predict(X_train)

# NOTE(review): this is accuracy on the data the tree was fit on, so it is an
# optimistic estimate of performance on unseen passengers.
metrics.accuracy_score(y_train, tree_pred_train)

Create confusion matrix for the train data.

In [None]:
# Confusion matrix of the tree on the training data, normalized over the
# true labels (each row sums to 1), wrapped in a DataFrame for display.
tree_cm_train = metrics.confusion_matrix(
    y_train,
    tree_pred_train,
    normalize="true",
)
pd.DataFrame(tree_cm_train)

Make predictions using test data.

In [None]:
# Predicted class labels for the test features from the fitted decision tree.
tree_pred_test = my_tree.predict(X=X_test)

Format predictions for output and write to csv.

In [None]:
# Single-column frame of predictions, indexed like the test features,
# written to Drive as CSV.
tree_output = pd.DataFrame({"Survived": tree_pred_test}, index=X_test.index)

tree_output_path = "/content/drive/MyDrive/320/tree_pred.csv"
tree_output.to_csv(tree_output_path)