<a href="https://colab.research.google.com/github/clyde2020/ML_Portfolio/blob/main/Titanic/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [None]:
import pandas as pd
import numpy as np
import os
import shutil
from shutil import copyfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop, Adam
import matplotlib.pyplot as plt
import random
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from keras.callbacks import ReduceLROnPlateau
import cv2

import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Mount Google Drive at /content/gdrive (Colab-only); the CSVs are read from
# a folder under this mount point further down.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Initialize variables

In [None]:
# Seed handed to the train/validation split and to every estimator.
state = 42

# Drive folder that contains train.csv and test.csv.
home_dir = '/content/gdrive/MyDrive/Titanic'  # Mandatory update

df_train = pd.read_csv(f'{home_dir}/train.csv')
df_test = pd.read_csv(f'{home_dir}/test.csv')

# Keep the ids aside: PassengerId is dropped from the features later but is
# needed again for the submission file.
test_ids = df_test['PassengerId'].copy()

# Process data

In [None]:
# Histogram of each numeric training column, 50 bins per plot.
df_train.hist(figsize=(15, 12), bins=50)
plt.show()

In [None]:
# Column groups used by the cleaning loop and the ColumnTransformer:
#   del_cols     - dropped once the engineered features have been derived
#   scale_cols   - standard-scaled numeric features
#   ordinal_cols - encoded with OrdinalEncoder
#   one_hot_cols - encoded with OneHotEncoder
del_cols = ['PassengerId', 'Ticket', 'Name', 'Cabin']
scale_cols = ['Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'fam_total']
ordinal_cols = ['Sex', 'has_cabin']
one_hot_cols = ['Embarked']

# Fill in null values
# Missing embarkation ports in the training data are set to 'S'.
df_train['Embarked'] = df_train['Embarked'].fillna('S')
# Replace missing test fares with the mean test fare. The whole column is
# reassigned because chained indexing (df['Fare'][mask] = ...) may write to a
# temporary copy and leave the NaNs in place.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())

In [None]:
# Hold out 10% of the labelled rows for validation; the split is stratified
# on 'Sex' and seeded with `state`.
df_train, df_valid = train_test_split(
    df_train,
    test_size=0.1,
    shuffle=True,
    stratify=df_train['Sex'],
    random_state=state,
)

In [None]:
# Copy the target column out of each split, then remove it from the feature
# frames in place.
y_train, y_valid = df_train['Survived'].copy(), df_valid['Survived'].copy()

for labelled in (df_train, df_valid):
  labelled.drop(columns='Survived', inplace=True)

In [None]:
df_full = [df_train, df_valid, df_test]

# Age statistics come from the training rows only and are computed once,
# before any imputation, so all three frames are filled from the same range.
age_avg = df_train['Age'].mean()
age_std = df_train['Age'].std()
age_low = int(age_avg - age_std)
age_high = int(age_avg + age_std)

# Seeded generator so the imputed ages are the same on every fresh run.
age_rng = np.random.default_rng(state)

for dataset in df_full:
  # Fill in missing values for Age with whole-year draws from [age_low, age_high).
  # .loc assigns into the frame itself; chained indexing
  # (dataset['Age'][mask] = ...) may write to a temporary copy instead.
  age_missing = dataset['Age'].isna()
  dataset.loc[age_missing, 'Age'] = age_rng.integers(low=age_low,
                                                     high=age_high,
                                                     size=age_missing.sum())
  # Set if cabin info is known (1 when a Cabin value is present, 0 when it is NaN)
  dataset['has_cabin'] = dataset['Cabin'].notna().astype(int)
  # Count total family members
  dataset['fam_total'] = dataset['SibSp'] + dataset['Parch']

  # Dropped in place so df_train / df_valid / df_test and the entries of
  # df_full keep referring to the same cleaned frames.
  dataset.drop(del_cols, axis=1, inplace=True)

In [None]:
# Short names for the three cleaned frames (same order as df_full).
train, valid, test = df_full

In [None]:
# Pairwise correlations between the numeric features only. 'Sex' and
# 'Embarked' are still strings at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so they are excluded explicitly.
corr_matrix = train.select_dtypes(include='number').corr()
corr_matrix

In [None]:
# Column-wise preprocessing: scale the numeric features, one-hot encode the
# embarkation port, ordinal-encode the remaining categorical columns.
full_pipeline = ColumnTransformer([
    ('std_scale', StandardScaler(), scale_cols),
    ('one_hot', OneHotEncoder(), one_hot_cols),
    ('ordinal', OrdinalEncoder(), ordinal_cols),
])

# Fit on the training split only; validation and test reuse the fitted transformers.
train_tr = full_pipeline.fit_transform(train)
valid_tr, test_tr = (full_pipeline.transform(part) for part in (valid, test))

In [None]:
# Preview the first 10 rows of the transformed training matrix.
train_tr[:10]

# Models

In [None]:
# Compare each base classifier with a soft-voting ensemble on the validation split
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=state)
log_clf = LogisticRegression(solver='lbfgs', random_state=state)
# probability=True gives SVC a predict_proba, which soft voting averages.
svm_clf = SVC(gamma='scale', probability=True, random_state=state)
dec_clf = DecisionTreeClassifier(random_state=state)

voting_clf = VotingClassifier(estimators=[('rf', rnd_clf),
                                          ('log', log_clf),
                                          ('svc', svm_clf),
                                          ('dec', dec_clf)],
                              voting='soft')

for clf in (rnd_clf, svm_clf, log_clf, dec_clf, voting_clf):
  clf.fit(train_tr, y_train)
  # predict() returns class labels taken from y_train, so the result can be
  # scored directly without rounding or casting.
  y_pred = clf.predict(valid_tr)
  print(clf.__class__.__name__, accuracy_score(y_valid, y_pred))

RandomForestClassifier 0.8111111111111111
SVC 0.8
LogisticRegression 0.7555555555555555
DecisionTreeClassifier 0.8
VotingClassifier 0.8111111111111111


In [None]:
# Compare each base classifier with a hard-voting (majority vote) ensemble on
# the validation split. voting_clf from this cell is the model used for the
# test-set predictions.
rnd_clf = RandomForestClassifier(n_estimators=70, random_state=state)
log_clf = LogisticRegression(solver='lbfgs', random_state=state)
svm_clf = SVC(gamma='scale', random_state=state)
dec_clf = DecisionTreeClassifier(random_state=state)

voting_clf = VotingClassifier(estimators=[('rf', rnd_clf),
                                          ('log', log_clf),
                                          ('svc', svm_clf),
                                          ('dec', dec_clf)],
                              voting='hard')

for clf in (rnd_clf, svm_clf, log_clf, dec_clf, voting_clf):
  clf.fit(train_tr, y_train)
  # predict() returns class labels taken from y_train, so the result can be
  # scored directly without rounding or casting.
  y_pred = clf.predict(valid_tr)
  print(clf.__class__.__name__, accuracy_score(y_valid, y_pred))

RandomForestClassifier 0.8444444444444444
SVC 0.8
LogisticRegression 0.7555555555555555
DecisionTreeClassifier 0.8
VotingClassifier 0.8222222222222222


In [None]:
# Predict the test set with the voting ensemble and store the labels as ints.
preds = np.round(voting_clf.predict(test_tr)).astype(int)

In [None]:
# Set up submit file to Kaggle: one row per test passenger with its predicted label
output = pd.DataFrame({'PassengerId': test_ids,
                       'Survived': preds})

# Build the path from home_dir instead of repeating the Drive folder, so
# pointing home_dir elsewhere also moves the output.
output.to_csv('{}/gender_submission.csv'.format(home_dir), index=False)