In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the entries of ../input, the directory the CSV files are read from further down.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Reading in the data as a Pandas DataFrame.

In [None]:
# Load the training CSV into a DataFrame; every later training step derives from train_df.
train_df = pd.read_csv('../input/train.csv')

A brief overview of the data.

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

A look at the summary statistics of the continuous (numeric) columns.

In [None]:
# With no arguments, describe() summarises the numeric columns only.
train_df.describe()

The same summary for the categorical (object-typed) variables:

In [None]:
# include='object' restricts the summary to object-dtype (string) columns.
train_df.describe(include = 'object')

It can be seen that "Age" variable has missing values, similarly in categorical variables, "Cabin" and "Embarked" have missing values.

The shape of the training data is

In [None]:
# (number of rows, number of columns) of the raw training frame.
train_df.shape

In [None]:
# Number of rows per distinct Ticket value, most frequent first.
print("Value Counts of Tickets: {}".format(train_df['Ticket'].value_counts()))

In [None]:
# Number of rows per distinct Cabin value (value_counts() skips missing entries by default).
print("Value Counts of Cabin: {}".format(train_df['Cabin'].value_counts()))

In [None]:
import matplotlib.pyplot as plt

# One bar per distinct Ticket value; bar height is the number of rows carrying it.
train_df['Ticket'].value_counts().plot.bar()
plt.show()

In [None]:
# One bar per distinct Cabin value; bar height is the number of rows carrying it.
train_df['Cabin'].value_counts().plot.bar()
plt.show()

In [None]:
# Bar chart of the Fare column: one bar per row of train_df, in row order.
train_df['Fare'].plot.bar()

As can be seen, the Cabin and Ticket variables have the same kind of distribution. But because each of them has a very large number of distinct values, turning them into one-hot encoded categorical variables is not feasible.

Let's get rid of some of the unnecessary features. For now, the "Name" and "Ticket" features are not that important. The "Cabin" feature has a lot of missing values (~600), so let's drop it as well. Finally, "PassengerId" is not essential for the analysis, so I'll drop that too.

In [None]:
# Working copy of the training data without the identifier-like / sparse columns.
train_df_1 = train_df.drop(columns=["Name", "Ticket", "PassengerId", "Cabin"])

Now, let's separate out features and output variables. 

In [None]:
# Feature matrix: everything except the target column.
X_train = train_df_1.drop(columns=["Survived"])
# Target as a 1-D Series rather than a one-column DataFrame: scikit-learn estimators
# expect y of shape (n_samples,) and emit a DataConversionWarning on every fit /
# cross-validation call when handed a column vector.
y_train = train_df_1["Survived"]

Now, let's fill in the missing values of the variables mentioned above. For the "Age" variable, the missing values would be filled in by mean of all the ages.

In [None]:
# Mean-impute Age; passing a dict to fillna limits the fill to the named column.
X_train = X_train.fillna({"Age": X_train["Age"].mean()})

Now, let's look at the statistical description.

In [None]:
# Summary of Age after imputation; the count row shows how many values are non-missing.
X_train["Age"].describe()

Thus, the 'NaN' values are filled in. Now, "Embarked" has 2 missing values; let's fill them with the mode. `Series.mode()` returns a Series (there can be more than one mode), so `.iloc[0]` is used to take the first one.

In [None]:
# mode() returns a Series labelled 0..k-1, so label 0 is its first (most frequent) entry.
X_train["Embarked"] = X_train["Embarked"].fillna(value=X_train["Embarked"].mode()[0])

In [None]:
# Summary of Embarked after imputation (count, number of distinct values, top value and its frequency).
X_train["Embarked"].describe()

Now, let's analyze X_train statistically,

In [None]:
# Numeric summary of the feature matrix after imputation.
X_train.describe()

In [None]:
# Summary of the object-dtype (string) feature columns after imputation.
X_train.describe(include = 'object')

Now, the missing values are cleared.

Correlation is one of the important metrics for selecting features, so let's look at it.

In [None]:
# train_df_1 still contains the string columns Sex and Embarked. pandas >= 2.0 no
# longer drops non-numeric columns silently in DataFrame.corr() and raises instead,
# so restrict the frame to numeric columns explicitly (works on every pandas version).
corr_mat = train_df_1.select_dtypes(include="number").corr()
corr_mat

In [None]:
import seaborn as sns

# Colour-coded view of the correlation matrix computed in the previous cell.
sns.heatmap(data=corr_mat)

In [None]:
# One-hot encode Embarked: build the indicator columns, then swap them in for the original column.
dum_df1 = pd.get_dummies(X_train["Embarked"])
X_train = pd.concat([X_train.drop(columns=["Embarked"]), dum_df1], axis=1)


In [None]:
# One-hot encode Sex: build the indicator columns, then swap them in for the original column.
dum_df2 = pd.get_dummies(X_train["Sex"])
X_train = pd.concat([X_train.drop(columns=["Sex"]), dum_df2], axis=1)
X_train.head()

In [None]:
# One-hot encode Pclass. Pclass holds integers, so un-prefixed dummies would be
# labelled 1, 2, 3 (ints) next to the string-labelled columns. scikit-learn >= 1.2
# raises a TypeError for frames whose column names mix str and int, so give the
# indicator columns string labels: Pclass_1, Pclass_2, Pclass_3.
dum_df3 = pd.get_dummies(X_train["Pclass"], prefix="Pclass")
X_train = X_train.drop(["Pclass"], axis = 1)
X_train = pd.concat([X_train, dum_df3], axis = 1)
X_train.head()

Now the data has no missing values and is completely numeric. Great. Let's standardize the data now. I'll standardize only the continuous columns for now.

In [None]:
# Z-score the two continuous columns: subtract the column mean, divide by the column std.
X_train = X_train.assign(
    Age=lambda frame: (frame['Age'] - frame['Age'].mean()) / frame['Age'].std(),
    Fare=lambda frame: (frame['Fare'] - frame['Fare'].mean()) / frame['Fare'].std(),
)
X_train.head()

1. Let's start modelling. First, a quick PCA shows how many components are needed to explain 90% of the variance; after that, let's try logistic regression with cross validation.

In [None]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep the smallest number of components whose
# cumulative explained variance reaches that fraction (here 90%).
pca = PCA(0.9)
X_train_pca = pca.fit_transform(X_train)
no_of_components = pca.n_components_
var_ratio = pca.explained_variance_ratio_
# The x axis indexes principal components (linear combinations of the inputs),
# not the original features, so label the plot accordingly.
plt.plot(var_ratio)
plt.xlabel('Principal component (index)')
plt.ylabel('Proportion of variance explained')
plt.title('Components kept to explain 90% of the variance: {}'.format(no_of_components))
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression fitted on the full training set.
clf_l2 = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    random_state=0,
)
clf_l2.fit(X=X_train, y=y_train)

Thus, the model is trained, now let's evaluate the performance using 3-fold cross validation.


In [None]:
# One accuracy value per fold (3 folds) for the L2 model.
scores_l2 = cross_val_score(estimator=clf_l2, X=X_train, y=y_train, cv=3)
scores_l2

The scores all lie between (78% - 80%). Let's try l1 regularisation.

In [None]:
# Same model with an L1 penalty; liblinear is the solver chosen for it here.
clf_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=0,
)
clf_l1.fit(X=X_train, y=y_train)
scores_l1 = cross_val_score(estimator=clf_l1, X=X_train, y=y_train, cv=3)
scores_l1

In [None]:
# Average the per-fold accuracies so the two penalties can be compared with one number each.
print("L1 Regularisation Cross Validation Score is: " + str(np.mean(scores_l1)))
print("L2 Regularisation Cross Validation Score is: " + str(np.mean(scores_l2)))

Thus, these are the results obtained by Logistic Regression. Let's try several different classification algorithms to see if we get better results through cross validation.

Let's try K-nearest neighbours classification. I'll use GridSearchCV to decide which 'k' gives the best score.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search k = 1..10 with 3-fold cross validation.
parameters = dict(n_neighbors=np.arange(1, 11))
knn = KNeighborsClassifier()
clf_knn = GridSearchCV(estimator=knn, param_grid=parameters, cv=3)
clf_knn.fit(X=X_train, y=y_train)

In [None]:
# Full per-candidate results of the KNN grid search (scores and timings for every k tried).
clf_knn.cv_results_

Thus from the above results, we get a 'best estimator'. Let's store this in 'knn_best_estimator'.

In [None]:
# Keep the refitted best KNN model and its mean cross-validated score for later comparison.
knn_best_estimator, knn_best_score = clf_knn.best_estimator_, clf_knn.best_score_
print("The best estimator is: ", knn_best_estimator, sep="")
print("The best score is: ", knn_best_score, sep="")

That was k-nearest neighbours. It's a fairly simple algorithm, but it is known to give excellent results in some cases; let's compare its results with other algorithms. Now let's move on to Support Vector Machines.

In [None]:
from sklearn.svm import SVC

# The same logarithmic range is used for C and, with the RBF kernel, for gamma.
params_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
parameters = [
    dict(C=params_range, kernel=['linear']),
    dict(C=params_range, kernel=['rbf'], gamma=params_range),
]
svm = SVC(probability=True, random_state=0)
clf_svm = GridSearchCV(svm, parameters, scoring='accuracy', cv=3, n_jobs=-1)
clf_svm.fit(X=X_train, y=y_train)

In [None]:
# Keep the refitted best SVM and its mean cross-validated accuracy.
best_estimator_svm, best_score_svm = clf_svm.best_estimator_, clf_svm.best_score_
print(best_estimator_svm, best_score_svm, sep="\n")

Thus, the SVM classifier is giving a better accuracy than K-neighbours classifier. Let's move now to Decision trees.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Import the class directly so the name `tree` is bound only once (to the estimator)
# instead of first to the sklearn.tree module and then to an instance of its class.
# random_state is fixed like for the LogisticRegression and SVC models above: a
# decision tree breaks ties between equally good splits randomly, so without a seed
# the selected depth and score can change from run to run.
tree = DecisionTreeClassifier(random_state=0)
params_range = np.arange(1, 21)
# Depths 1..20 for each split criterion.
parameters_tree = [
    {'criterion': ['gini'], 'max_depth': params_range},
    {'criterion': ['entropy'], 'max_depth': params_range},
]
clf_tree = GridSearchCV(estimator=tree, param_grid=parameters_tree, cv=3, scoring='accuracy')
clf_tree.fit(X_train, y_train)
tree_best_estimator = clf_tree.best_estimator_
tree_best_score = clf_tree.best_score_
print(tree_best_estimator)
print(tree_best_score)

Thus, the cross validation result obtained with the decision tree is better than K-Nearest Neighbours, but a little worse than support vector machines. Next, let's try Random Forests.

In [None]:
"""from sklearn.ensemble import RandomForestClassifier
params_range_n_estimators = np.arange(1, 21)
params_range_max_depth = np.arange(1, 21)
forest = RandomForestClassifier()
parameters_forest = [{'criterion':['gini'], 'n_estimators': params_range_n_estimators, 'max_depth':params_range_max_depth, 'oob_score':['True', 'False']}, {'criterion':['entropy'], 'n_estimators': params_range_n_estimators, 'max_depth':params_range_max_depth, 'oob_score':['True', 'False']}]
clf_forest = GridSearchCV(estimator = forest, param_grid = parameters_forest, cv = 3, scoring = 'accuracy')
clf_forest.fit(X_train, y_train)"""

In [None]:
"""clf_forest_best_estimator = clf_forest.best_estimator_
clf_forest_best_score = clf_forest.best_score_
print(clf_forest_best_estimator)
print(clf_forest_best_score)""""

Let's try a voting classifier that combines the classifiers above, and compare the cross-validation accuracy of every model side by side. The voting classifier is based on majority (hard) voting.

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# The random-forest search is disabled (its cells are string literals), so
# `clf_forest_best_estimator` does not exist and referencing it raises NameError.
# The tuned decision tree takes its place as the fourth voter.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf_l2),
        ('svc', best_estimator_svm),
        ('knn', knn_best_estimator),
        ('tree', tree_best_estimator),
    ],
    voting='hard',
)

# np.ravel hands scikit-learn a 1-D target whether y_train is a Series or a
# one-column DataFrame.
y_flat = np.ravel(y_train)

# Flat list alternating estimator, score, estimator, score, ...
cross_validation_score = []
for clf in (clf_l2, clf_l1, best_estimator_svm, tree_best_estimator, knn_best_estimator, voting_clf):
    # Fit on the full training set so each estimator is usable after this cell.
    clf.fit(X_train, y_flat)
    cross_validation_score.append(clf)
    # Report the MEAN of the fold accuracies, as done for the logistic-regression
    # scores and by GridSearchCV.best_score_; the best single fold overstates accuracy.
    cross_validation_score.append(cross_val_score(clf, X_train, y_flat, cv=3).mean())
print(cross_validation_score)

Let's submit the results now. But first, the testing data needs to be in the form of training data.

In [None]:
# Load the test CSV; test_df itself is kept untouched because its PassengerId column
# is needed when the submission frame is built.
test_df = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
# Summary of the object-dtype (string) columns of the test frame.
test_df.describe(include = 'object')

In [None]:
# Drop the same four columns that were removed from the training frame.
test_df_1 = test_df.drop(columns=["Name", "Ticket", "PassengerId", "Cabin"])

In [None]:
# Check the remaining columns after the drop.
test_df_1.head()

In [None]:
# Mean-impute Age; passing a dict to fillna limits the fill to the named column.
test_df_1 = test_df_1.fillna({"Age": test_df_1["Age"].mean()})

In [None]:
# Mean-impute Fare and put the filled column back under the same name.
test_df_1 = test_df_1.assign(Fare=test_df_1["Fare"].fillna(test_df_1["Fare"].mean()))

In [None]:
# Numeric summary after imputation; the count row shows whether any values are still missing.
test_df_1.describe()

In [None]:
# One-hot encode Embarked: build the indicator columns, then swap them in for the original column.
dum_df1_test = pd.get_dummies(test_df_1["Embarked"])
test_df_1 = pd.concat([test_df_1.drop(columns=["Embarked"]), dum_df1_test], axis=1)


In [None]:
# One-hot encode Sex: build the indicator columns, then swap them in for the original column.
dum_df1_test = pd.get_dummies(test_df_1["Sex"])
test_df_1 = pd.concat([test_df_1.drop(columns=["Sex"]), dum_df1_test], axis=1)

In [None]:
# One-hot encode Pclass. Pclass holds integers, so un-prefixed dummies would be
# labelled 1, 2, 3 (ints) next to the string-labelled columns. scikit-learn >= 1.2
# raises a TypeError for frames whose column names mix str and int, so give the
# indicator columns string labels: Pclass_1, Pclass_2, Pclass_3.
dum_df1_test = pd.get_dummies(test_df_1["Pclass"], prefix="Pclass")
test_df_1 = test_df_1.drop(["Pclass"], axis = 1)
test_df_1 = pd.concat([test_df_1, dum_df1_test], axis = 1)

In [None]:
# Check the encoded test frame: the categorical columns are now indicator columns.
test_df_1.head()

In [None]:
# Standardise the test columns with the TRAINING mean/std. The models were fitted
# on features scaled by the training statistics, so the test set has to go through
# exactly the same transformation; scaling it by its own statistics would shift it
# relative to what the models learned.
# train_df_1 still holds the raw training values. Age is mean-imputed here the same
# way it was for X_train, so the statistics match the ones used to scale X_train.
train_age = train_df_1["Age"].fillna(train_df_1["Age"].mean())
train_fare = train_df_1["Fare"]
test_df_1['Age'] = (test_df_1['Age'] - train_age.mean()) / train_age.std()
test_df_1['Fare'] = (test_df_1['Fare'] - train_fare.mean()) / train_fare.std()

In [None]:
# Check the standardised Age and Fare columns.
test_df_1.head()

In [None]:
# Project the test features with the PCA that was fitted on the training features.
# Note: the prediction cell below scores test_df_1 directly and does not use X_test_pca.
X_test_pca = pca.transform(test_df_1)

In [None]:
# Score the prepared test features with the tuned SVM and pair each prediction
# with the passenger id taken from the untouched test frame.
predictions = best_estimator_svm.predict(X=test_df_1)
submission = pd.DataFrame(
    dict(PassengerId=test_df['PassengerId'], Survived=predictions)
)

In [None]:
# Sanity-check the two submission columns before writing the file.
submission.head()

In [None]:
# Write the submission without the DataFrame index, then report the file name.
filename = 'Titanic-Notebook-for-pratice-preds.csv'
submission.to_csv(path_or_buf=filename, index=False)
print('Saved file: ', filename, sep='')