In [None]:
# libraries used 
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from scipy.stats.stats import pearsonr
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
import tensorflow as tf
from sklearn.metrics import accuracy_score, recall_score, precision_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sns

In [None]:
# Load the train and test CSV files. Both frames are kept in one list so the
# same preprocessing step can be applied to each of them in turn.
training = pd.read_csv(r'datasets/train.csv')
test = pd.read_csv(r'datasets/test.csv')
full_sets = [training, test]

# Keep the test-set passenger ids; every submission file is built from them.
submission = pd.DataFrame(full_sets[1]["PassengerId"])

# Columns that are not used as model inputs are removed from both frames.
unused_columns = ["Cabin", "Ticket", "Name", "PassengerId"]
for dataset in full_sets:
    dataset.drop(columns=unused_columns, inplace=True)
    

In [None]:
# Label-encode Sex (female -> 0, male -> 1; LabelEncoder orders its classes).
# The encoder is fitted once on the training set and reused for the test set,
# so both frames are guaranteed to share the same mapping.
sexLabelEncoder = LabelEncoder().fit(full_sets[0]["Sex"])
for dataset in full_sets:
    dataset["Sex"] = sexLabelEncoder.transform(dataset["Sex"])

In [None]:
# One-hot encode the port of embarkation into the indicator columns S, C and Q.
# The indicators are derived from the Embarked column itself; a row whose
# Embarked value is missing gets 0 in all three columns.
for dataset in full_sets:
    for port in ("S", "C", "Q"):
        dataset[port] = (dataset["Embarked"] == port).astype(int)
    dataset.drop("Embarked", axis = 1, inplace = True) # raw column is no longer needed

In [None]:
# Missing values
# Fill the gaps (e.g. missing ages) with a KNN imputer.
# The imputer is fitted on the training features only: the Survived label is
# left out so it cannot influence the imputed values, and the test set is
# transformed with the already-fitted imputer instead of being refitted.
knnImputer = KNNImputer(n_neighbors = 5)

featureColumns = [column for column in training.columns if column != "Survived"]

trainingSetResult = knnImputer.fit_transform(training[featureColumns])
testSetResult = knnImputer.transform(test[featureColumns])

# KNNImputer returns plain NumPy arrays; rebuild the DataFrames and store them
# back in full_sets, which is what every later cell reads.
imputedTraining = pd.DataFrame(data = trainingSetResult, columns = featureColumns, index = training.index)
# Put the label back at its original position, as float like the imputed columns.
imputedTraining.insert(training.columns.get_loc("Survived"), "Survived", training["Survived"].astype(float))

full_sets[0] = imputedTraining
full_sets[1] = pd.DataFrame(data = testSetResult, columns = featureColumns, index = test.index)

In [None]:
# Reducing dimensions
# Merge the sibling/spouse count and the parent/child count into a single
# FamilySize feature, then drop the two source columns.
for dataset in full_sets:
    familySize = dataset["SibSp"] + dataset["Parch"]  # vectorised column sum, no row loop
    dataset.drop(["SibSp", "Parch"], axis = 1, inplace = True)
    dataset["FamilySize"] = familySize

In [None]:
# First rows of the preprocessed training set
trainingPreview = full_sets[0].head()
print(trainingPreview)

In [None]:
# Visualizing the Train Dataset
# Female vs male passenger counts
# The returned Axes is bound to `ax` rather than `plt`, so the name `plt`
# keeps referring to matplotlib.pyplot.
ax = full_sets[0].Sex.value_counts().sort_index().plot(kind='bar')
ax.set_xlabel('Sex')
ax.set_ylabel('Passenger count')

In [None]:
# Mean of Survived per Sex value, i.e. the survival rate of each group.
# The returned Axes is bound to `ax` rather than `plt`, so the name `plt`
# keeps referring to matplotlib.pyplot.
ax = full_sets[0][['Sex', 'Survived']].groupby('Sex').mean().Survived.plot(kind='bar')
ax.set_xlabel('Sex')
ax.set_ylabel('Survival Probability')

In [None]:
# Age histogram, one panel per Survived value.
# The imports repeat those of the first cell; binding `plt` again here makes
# sure plt.hist below resolves to matplotlib.pyplot.
import matplotlib.pyplot as plt
import seaborn as sns

g = sns.FacetGrid(data=full_sets[0], col='Survived')
g.map(plt.hist, 'Age', bins=40)

In [None]:
# Age histograms split by Pclass (rows) and Survived (columns)
# `height` is the current name of the per-facet size argument; the former
# `size` keyword is no longer accepted by recent seaborn releases.
grid = sns.FacetGrid(full_sets[0], col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=40)

In [None]:
# Mean of Survived for every passenger class, drawn as a point plot
trainingFrame = full_sets[0]
plt.figure(figsize=(10, 10))
classAxes = plt.subplot(313)  # bottom slot of a 3-row layout
sns.pointplot(x=trainingFrame["Pclass"], y='Survived', data=trainingFrame, ax=classAxes)
plt.xlabel("P Class", fontsize=14)
plt.ylabel('Survived Percentage', fontsize=14)
plt.show()

In [None]:
# Fare density curves, one for each value of Survived
fig = plt.figure(figsize=(15, 9))
trainingFrame = full_sets[0]
curveSpecs = ((0, 'black', 'not survived'), (1, 'g', 'survived'))
for survivedFlag, curveColor, curveLabel in curveSpecs:
    fares = trainingFrame.loc[trainingFrame['Survived'] == survivedFlag, 'Fare']
    ax = sns.kdeplot(fares, color=curveColor, shade=True, label=curveLabel)
plt.title('Fare Distribution Survived vs Non Survived', fontsize=25, pad=40)
plt.legend(loc="upper right")
plt.ylabel("Frequency of Passenger Survived", fontsize=15, labelpad=20)
plt.xlabel("Fare", fontsize=15, labelpad=20)

In [None]:
# Family size bar chart: number of passengers per FamilySize value
# The returned Axes is bound to `ax` rather than `plt`, so the name `plt`
# keeps referring to matplotlib.pyplot.
ax = full_sets[0].FamilySize.value_counts().sort_index().plot(kind='bar')
ax.set_xlabel('FamilySize')  # the plotted column is FamilySize, not SibSp
ax.set_ylabel('Passenger count')

In [None]:
# Mean of Survived per FamilySize value, drawn as a bar chart
# The returned Axes is bound to `ax` rather than `plt`, so the name `plt`
# keeps referring to matplotlib.pyplot.
ax = full_sets[0][['FamilySize', 'Survived']].groupby('FamilySize').mean().Survived.plot(kind='bar')
ax.set_xlabel('FamilySize')
ax.set_ylabel('Survival Probability')

In [None]:
# Correlation between the columns of the training set, shown as a heatmap.
# The import repeats the one in the first cell; binding `plt` again here makes
# sure plt.figure below resolves to matplotlib.pyplot.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
corr_matrix = full_sets[0].corr()
sns.heatmap(corr_matrix, cmap='CMRmap', annot=True, linewidths=0.2)

In [None]:
# Data cleaning
# One-hot encode Pclass into A_Class / B_Class / C_Class (classes 1 / 2 / 3)
# and drop the original column afterwards.
classIndicators = ((1, "A_Class"), (2, "B_Class"), (3, "C_Class"))
for dataset in full_sets:
    for _, indicatorName in classIndicators:
        dataset[indicatorName] = 0
    for classNumber, indicatorName in classIndicators:
        dataset.loc[dataset.Pclass == classNumber, indicatorName] = 1
    dataset.drop(columns="Pclass", inplace=True)

In [None]:
# Min-max normalization of Age, Fare and FamilySize.
# Each scaler is fitted on the training set only and then applied to both
# sets, so a given raw value maps to the same scaled value in train and test.
# Test values outside the training range can therefore fall outside [0, 1].
for column in ("Age", "Fare", "FamilySize"):
    scaler = MinMaxScaler().fit(full_sets[0][[column]].to_numpy())
    for dataset in full_sets:
        dataset[column] = scaler.transform(dataset[[column]].to_numpy()).ravel()

# Input columns shared by all models below
features = ["Sex", "Age", "Fare", "A_Class", "B_Class", "C_Class", "FamilySize", "S", "C", "Q"]

In [None]:
# Machine Learning Models
# ANN (Artificial Neural Network)

trainingFrame = full_sets[0]
dfMainX = trainingFrame[features]    # model inputs
dfMainY = trainingFrame["Survived"]  # labels

# Hold out 25% of the labelled data for evaluation; the remaining 75% is used to train
x_train, x_test, y_train, y_test = train_test_split(
    dfMainX,
    dfMainY,
    test_size=0.25,
    random_state=1,
)

In [None]:
# TensorFlow consumes tensors, so convert the four splits in one pass
x_train, x_test, y_train, y_test = map(
    tf.convert_to_tensor, (x_train, x_test, y_train, y_test)
)

In [None]:
# Set up the artificial neural network
# Feed-forward binary classifier: three 128-64-32 dense stacks (elu, relu, elu),
# each followed by 20% dropout, and a single sigmoid output unit.

hiddenLayers = []
for activation in ('elu', 'relu', 'elu'):
    hiddenLayers.extend(
        tf.keras.layers.Dense(units, activation=activation) for units in (128, 64, 32)
    )
    hiddenLayers.append(tf.keras.layers.Dropout(0.2))

ANN = tf.keras.Sequential(
    # input layer: one input per feature column, so it follows the features list
    [tf.keras.layers.Flatten(input_shape=(len(features),))]
    + hiddenLayers
    + [tf.keras.layers.Dense(1, activation="sigmoid")]  # output layer with sigmoid
)
ANN.compile(optimizer="Adam", loss='binary_crossentropy', metrics=["binary_accuracy"])
ANN.fit(x_train, y_train, epochs= 100)

# evaluate() returns the loss followed by the compiled metric (binary accuracy)
val_loss, val_acc = ANN.evaluate(x_test,y_test)
print(val_loss, val_acc)

In [None]:
# Prediction using the artificial neural network

dfToPredict = full_sets[1][features]
predictionFromANN = ANN.predict(dfToPredict)

# The network outputs one sigmoid value per passenger:
# above 0.5 counts as survived (1), anything else as died (0).
survivedPredictions = [1 if output[0] > 0.5 else 0 for output in predictionFromANN]

ANNsubmission = submission.copy(deep=True)
ANNsubmission.insert(1, "Survived", survivedPredictions, True)
ANNsubmission.to_csv('results/ANNSubmission.csv', index=False)  # CSV file for submission

In [None]:
# Random Forest Classifier
# 90% of the labelled data is used for the search, 10% is held out
x_train, x_test, y_train, y_test = train_test_split(dfMainX, dfMainY, test_size=0.1, random_state=2)

# Hyper-parameter values explored by the grid search
param_grid = {
    "n_estimators": [10, 19, 20, 24, 25, 26, 27],
    "max_depth": [4, 5, 6, 7],
    "min_samples_leaf": [1, 2, 3, 4],
}

# Base RF model; only the random seed is fixed
RF = RandomForestClassifier(random_state=2)

# 3-fold cross-validated search over the grid, using all available cores
grid = GridSearchCV(estimator=RF, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_result = grid.fit(x_train, y_train)

print(grid_result.best_score_, "Was achieved using:", grid_result.best_params_)

In [None]:
# Take the best estimator found by the grid search, fit it on the training
# split and predict the held-out split
RF = grid_result.best_estimator_.fit(x_train, y_train)
y_pred = RF.predict(x_test)

In [None]:
# Error of the random forest predictions on the held-out split
meanAbsoluteError = metrics.mean_absolute_error(y_test, y_pred)
meanSquaredError = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', meanAbsoluteError)
print('Mean Squared Error:', meanSquaredError)
print('Root Mean Squared Error:', np.sqrt(meanSquaredError))

In [None]:
# Prediction using the RF classifier

dfToPredict = full_sets[1][features]
predictionFromRF = list(map(int, RF.predict(dfToPredict)))  # plain Python ints

RFsubmission = submission.copy(deep=True)
RFsubmission.insert(1, "Survived", predictionFromRF, True)
RFsubmission.to_csv('results/RFSubmission.csv', index=False)  # CSV file for submission

In [None]:
# Logistic regression, trained on the complete training set
X_train = full_sets[0].drop(columns="Survived")
y_train = full_sets[0]["Survived"]
X_test = full_sets[1]

logRegModel = LogisticRegression(penalty='l2', solver="sag", random_state=0)
logRegModel.fit(X_train, y_train)

predictions = logRegModel.predict(X_test)
predictions_ints = predictions.astype(int)

# This writes into the shared `submission` frame, so the Survived column
# stays there for the cells that follow.
submission["Survived"] = predictions_ints.tolist()
submission.to_csv('results/LogisticRegressionSubmission.csv', index=False)

In [None]:
# Support Vector Machine (default SVC), trained on the same X_train / y_train
# that the logistic regression cell prepared

svc = SVC()
svc.fit(X_train, y_train)

Y_pred = svc.predict(X_test)
Y_pred_ints = Y_pred.astype(int)

# Overwrites the Survived column of the shared `submission` frame
submission["Survived"] = Y_pred_ints.tolist()
submission.to_csv('results/SVMSubmission.csv', index=False)

In [None]:
# function to be used to evaluate models
def cross_validate(estimator, train, validation):
    """Print accuracy, recall and precision of a fitted estimator on two splits.

    Parameters
    ----------
    estimator : object
        Already-fitted classifier exposing ``predict``. The metrics are computed
        for this object, not for any module-level model.
    train : sequence
        ``(X, y)`` pair for the training split (read as ``train[0]``, ``train[1]``).
    validation : sequence
        ``(X, y)`` pair for the validation split.

    Returns
    -------
    None
        The scores are printed, formatted to two decimals.
    """
    def _scores(split):
        # Metrics take the ground truth first and the prediction second;
        # swapping them would exchange recall and precision.
        y_true = split[1]
        y_pred = estimator.predict(split[0])
        return (
            accuracy_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            precision_score(y_true, y_pred),
        )

    t_accuracy, t_recall, t_precision = _scores(train)
    val_accuracy, val_recall, val_precision = _scores(validation)

    print('Model stats')
    print('Accuracy  Train: %.2f, Validation: %.2f' % (t_accuracy, val_accuracy))
    print('Recall    Train: %.2f, Validation: %.2f' % (t_recall, val_recall))
    print('Precision Train: %.2f, Validation: %.2f' % (t_precision, val_precision))

In [None]:
# Gaussian Naive Bayes
trainingFrame = full_sets[0]
X = trainingFrame[features]
Y = trainingFrame["Survived"]

# 80/20 train/validation split; the training part is then split again 70/30
# so the classifier can be fitted in two steps
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=1
)
X_train1, X_train2, Y_train1, Y_train2 = train_test_split(
    X_train, Y_train, test_size=0.3, random_state=12
)

In [None]:
# First fit: only the 30% part of the training split
classifier = GaussianNB().fit(X_train2, Y_train2)

# Scores on the whole training split and on the validation split
print('30% of train data')
cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))

In [None]:
# Second fit: update the same classifier with the remaining 70% part
trainSplit = (X_train, Y_train)
validationSplit = (X_val, Y_val)

classifier.partial_fit(X_train1, Y_train1)
print('70% of train data')
cross_validate(classifier, trainSplit, validationSplit)

print("note: improved results with second fit")

In [None]:
# Predicting using Gaussian Naive Bayes
dfToPredict = full_sets[1][features]
test_predictions = classifier.predict(dfToPredict)

# Start from a copy of the submission frame without any Survived column.
# Whether the shared frame already carries one depends on which earlier cells
# were run, so the column is dropped only if it is present.
NBSubmission = submission.drop(columns="Survived", errors="ignore")
NBSubmission.insert(1, "Survived", test_predictions.astype('int'), True)
NBSubmission.to_csv('results/submissionNB.csv', index=False) # Save as csv for submission

In [None]:
# K-Nearest Neighbors, compared for k = 5, 6 and 29
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)


def _fit_and_report_knn(k, accuracy_label):
    """Fit a k-NN classifier on the training split and print its test accuracy and confusion matrix.

    Returns the fitted model, its predictions for X_test and the confusion matrix.
    """
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predicted = model.predict(X_test)
    confusion = metrics.confusion_matrix(y_test, predicted)
    print(accuracy_label, metrics.accuracy_score(y_test, predicted))
    print(confusion)
    return model, predicted, confusion


n5, y5, cm5 = _fit_and_report_knn(5, 'Test set Accuracy of K=5:')
n6, y6, cm6 = _fit_and_report_knn(6, 'Test set Accuracy of K=6: ')
n29, y29, cm29 = _fit_and_report_knn(29, 'Test set Accuracy of K=29 : ')

In [None]:
# Submission from the k=5 model (noted as the best of the k values tried so far).
# The preprocessed test features are selected here directly, with their column
# names intact, so this cell does not depend on a variable left over from
# another model's cell.
X_submit = full_sets[1][features]
y_submit = n5.predict(X_submit)

# Build the output on a fresh frame of passenger ids so the shared
# `submission` frame is not modified.
KNNsubmission = submission[["PassengerId"]].copy()
KNNsubmission["Survived"] = y_submit.astype('int')
KNNsubmission.to_csv('results/submissionKNN.csv', index=False)