# Load Packages

In [None]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import random as rnd

# visualization
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, f1_score

# Get the Data

In [None]:
# Read the training and test CSV files into DataFrames.
train_path = '../input/train.csv'
test_path = '../input/test.csv'
train_titan = pd.read_csv(train_path)
test_titan = pd.read_csv(test_path)

In [None]:
# train_titan.sample(20)

In [None]:
# Print the column/dtype/non-null summary of each frame, separated by a rule.
separator = '_' * 40
train_titan.info()
print(separator)
test_titan.info()

In [None]:
# train_titan.describe()

In [None]:
# train_titan.describe(include=['O'])

In [None]:
# Print (rows, columns) for the training frame, then the test frame.
for frame in (train_titan, test_titan):
    print(frame.shape)

# Analyze Datasets

## Select Columns

In [None]:
# Drop the columns that are not used further. PassengerId stays on the test
# frame because it is read again when the submission file is written.
train_titan = train_titan.drop(columns=['PassengerId', 'Ticket', 'Cabin'])
test_titan = test_titan.drop(columns=['Ticket', 'Cabin'])
for frame in (train_titan, test_titan):
    print(frame.shape)

In [None]:
# print(train_titan[['Pclass', 'Survived']].groupby(['Pclass']).mean().sort_values(by='Survived'))
# print('-'*40)
# print(train_titan[['Sex', 'Survived']].groupby(['Sex']).mean().sort_values(by='Survived'))
# print('-'*40)
# print(train_titan[['Embarked', 'Survived']].groupby(['Embarked']).mean().sort_values(by='Survived'))

## SibSp, Parch

In [None]:
# print(train_titan[['SibSp', 'Survived']].groupby(['SibSp']).mean())
# print('-'*40)
# print(train_titan[['Parch', 'Survived']].groupby(['Parch']).mean())

In [None]:
# Family = SibSp + Parch, computed the same way on both frames.
train_titan['Family'] = train_titan['SibSp'].add(train_titan['Parch'])
test_titan['Family'] = test_titan['SibSp'].add(test_titan['Parch'])

# Mean of Survived for each Family value in the training data.
survival_by_family = train_titan.groupby('Family')[['Survived']].mean()
print(survival_by_family)

In [None]:
# plt.hist(train_titan['Family'], bins=range(11))

In [None]:
# train_titan.loc[train_titan['SibSp'] >= 3, 'SibSp'] = 3
# train_titan.loc[train_titan['Parch'] >= 3, 'Parch'] = 3
# train_titan.loc[train_titan['Family'] >= 3, 'Family'] = 3
# test_titan.loc[test_titan['SibSp'] >= 3, 'SibSp'] = 3
# test_titan.loc[test_titan['Parch'] >= 3, 'Parch'] = 3
# test_titan.loc[test_titan['Family'] >= 3, 'Family'] = 3
# print(test_titan.sample(20))

In [None]:
# SibSp and Parch are removed now that Family has been derived from them.
family_sources = ['SibSp', 'Parch']
train_titan = train_titan.drop(columns=family_sources)
test_titan = test_titan.drop(columns=family_sources)

## Others

In [None]:
# plt.subplot(1, 2, 1)
# plt.hist(train_titan['Age'].dropna())
# plt.subplot(1, 2, 2)
# plt.hist(train_titan['Fare'].dropna(), bins = 50)

In [None]:
# g = sns.FacetGrid(train_titan, row='Pclass', col='Embarked', hue='Sex', size=2, aspect=2)
# g.map(plt.hist, 'Fare', alpha=.5, bins=5)
# g.add_legend()

In [None]:
# sns.heatmap(train_titan.corr())

## Name

In [None]:
data = [train_titan, test_titan]
# Integer code assigned to each title group.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr":5, "Rare": 6}
# Titles that are collapsed into the single "Rare" group.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings folded into the title they are treated as.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# train_titan['Title'] = train_titan.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# train_titan.groupby(train_titan.Title).count()

for dataset in data:
    # Extract the title: the run of letters that follows a space and ends at a
    # period. A raw string keeps `\.` a regex escape instead of an invalid
    # Python string escape.
    extracted = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Collapse rare titles, then fold the alternative spellings.
    normalized = extracted.replace(rare_titles, 'Rare').replace(title_synonyms)
    # Convert titles to their codes; anything not in `titles` (or with no
    # title found) becomes 0 so the column has no missing values.
    dataset['Title'] = normalized.map(titles).fillna(0).astype(int)

train_titan = train_titan.drop(['Name'], axis=1)
test_titan = test_titan.drop(['Name'], axis=1)

# Missing Data

## Embarked

In [None]:
# plt.hist(train_titan['Embarked'])
# plt.hist(train_titan['Embarked'])

In [None]:
# Fill missing Embarked values in both frames with the most frequent value of
# the training data. The result is assigned back to the column instead of using
# `inplace=True` on the attribute access: that chained in-place form operates
# on an intermediate object and does not update the frame under pandas
# copy-on-write.
embarked_mode = train_titan['Embarked'].mode().iloc[0]
train_titan['Embarked'] = train_titan['Embarked'].fillna(embarked_mode)
test_titan['Embarked'] = test_titan['Embarked'].fillna(embarked_mode)

In [None]:
combine_titan = [train_titan, test_titan]

# Replacement codes, applied in the listed order.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
sex_codes = {'male': 1, 'female': 2}

# Overwrite the string labels of Embarked and Sex with their integer codes,
# in place, on both frames.
for frame in combine_titan:
    for port, code in embarked_codes.items():
        frame.loc[frame['Embarked'] == port, 'Embarked'] = code
    for label, code in sex_codes.items():
        frame.loc[frame['Sex'] == label, 'Sex'] = code

## Fare

In [None]:
# Median training fare for each Pclass value.
guess_fare = train_titan.groupby('Pclass')['Fare'].median()

# Fill missing test fares with the training median of the passenger's class.
for pclass in (1, 2, 3):
    needs_fare = test_titan['Fare'].isnull() & (test_titan['Pclass'] == pclass)
    test_titan.loc[needs_fare, 'Fare'] = guess_fare[pclass]

## Age

In [None]:
# g = sns.FacetGrid(train_titan, row='Pclass', col='Sex', size=2, aspect=2)
# g.map(plt.hist, 'Age', alpha=.5, bins=20)
# g.add_legend()

In [None]:
# Mean training age (rounded to 2 decimals) for every (Pclass, Sex) group.
guess_age = train_titan.groupby(['Pclass', 'Sex'])[['Age']].mean().round(2)

# Fill missing ages in both frames with the training mean of the matching
# (Pclass, Sex) group. Sex is compared against the integer codes 1 and 2.
for sex_code in (1, 2):
    for pclass in (1, 2, 3):
        group_age = guess_age.loc[(pclass, sex_code), 'Age']
        for frame in (train_titan, test_titan):
            needs_age = (
                frame['Age'].isnull()
                & (frame['Sex'] == sex_code)
                & (frame['Pclass'] == pclass)
            )
            frame.loc[needs_age, 'Age'] = group_age

In [None]:
# kmeans = KMeans(n_clusters=6)
# kmeans.fit(train_titan.Age[:, np.newaxis])
# kmeans
# kmeans.cluster_centers_

In [None]:
# for dataset in combine_titan:
#     dataset.loc[dataset.Age <= 9, 'AgeBin'] = 1
#     dataset.loc[(dataset.Age > 9) & (dataset.Age <= 23), 'AgeBin'] = 2
#     dataset.loc[(dataset.Age > 23) & (dataset.Age <= 32), 'AgeBin'] = 3
#     dataset.loc[(dataset.Age > 32) & (dataset.Age <= 42), 'AgeBin'] = 4
#     dataset.loc[(dataset.Age > 42) & (dataset.Age <= 55), 'AgeBin'] = 5
#     dataset.loc[dataset.Age > 55, 'AgeBin'] = 6
#     dataset.drop('Age', axis=1)

# train_titan[['Survived', 'AgeBin']].groupby(['AgeBin']).mean()

In [None]:
# kmeans_fare = KMeans(n_clusters=6)
# kmeans_fare.fit(train_titan.Fare[:, np.newaxis])
# kmeans_fare.cluster_centers_

# Build Algorithms

In [None]:
# Split the training frame into features and target, and build the matching
# test feature matrix (PassengerId is an identifier, not a feature).
X_train = train_titan.drop("Survived", axis=1)
Y_train = train_titan["Survived"]
# Select the test columns in the training column order so both matrices are
# guaranteed to line up feature-for-feature.
X_test  = test_titan.drop("PassengerId", axis=1)[X_train.columns].copy()

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit "None None").
X_train.info()
X_test.info()

In [None]:
# # Logistic Regression

# logreg = LogisticRegression()
# logreg.fit(X_train, Y_train)
# Y_pred = logreg.predict(X_test)
# acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
# acc_log

In [None]:
# coeff_df = pd.DataFrame(train_titan.columns.delete(0))
# coeff_df.columns = ['Feature']
# coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

# coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# # Support Vector Machines

# svc = SVC()
# svc.fit(X_train, Y_train)
# Y_pred = svc.predict(X_test)
# acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
# acc_svc

In [None]:
# knn = KNeighborsClassifier(n_neighbors = 3)
# knn.fit(X_train, Y_train)
# Y_pred = knn.predict(X_test)
# acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
# acc_knn

In [None]:
# # Gaussian Naive Bayes

# gaussian = GaussianNB()
# gaussian.fit(X_train, Y_train)
# Y_pred = gaussian.predict(X_test)
# acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
# acc_gaussian

In [None]:
# # Perceptron

# perceptron = Perceptron()
# perceptron.fit(X_train, Y_train)
# Y_pred = perceptron.predict(X_test)
# acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
# acc_perceptron

In [None]:
# # Linear SVC

# linear_svc = LinearSVC()
# linear_svc.fit(X_train, Y_train)
# Y_pred = linear_svc.predict(X_test)
# acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
# acc_linear_svc

In [None]:
# # Stochastic Gradient Descent

# sgd = SGDClassifier()
# sgd.fit(X_train, Y_train)
# Y_pred = sgd.predict(X_test)
# acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
# acc_sgd

In [None]:
# # Decision Tree

# decision_tree = DecisionTreeClassifier()
# decision_tree.fit(X_train, Y_train)
# Y_pred = decision_tree.predict(X_test)
# acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
# acc_decision_tree

In [None]:
# # Random Forest

# random_forest = RandomForestClassifier()
# random_forest.fit(X_train, Y_train)
# Y_pred = random_forest.predict(X_test)
# acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
# acc_random_forest

In [None]:
# 10-fold cross-validated accuracy of a 100-tree random forest.
# random_state is fixed so the scores are reproducible across runs; the value
# matches the one used for the final model later in the notebook.
rf_cv = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
scores = cross_val_score(rf_cv, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

# Evaluate the Algorithm

In [None]:
# Fit the forest on the full training set and list its feature importances
# (rounded to 3 decimals), highest first.
rf_cv.fit(X_train, Y_train)
rounded_importance = np.round(rf_cv.feature_importances_, 3)
importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rounded_importance})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
print(importances)

## Hyperparameter Tuning

In [None]:
# param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10, 25], "min_samples_split" : [2, 4, 10, 16, 25], "n_estimators": [20, 100, 250]}
# rf = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
# clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
# clf.fit(X_train, Y_train)
# clf.best_params_

In [None]:
# Baseline forest: report training accuracy and the out-of-bag estimate.
# random_state is fixed so the reported numbers are reproducible.
rf_cv = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
rf_cv.fit(X_train, Y_train)
# The training accuracy was previously computed and discarded; show it.
print("train accuracy:", round(rf_cv.score(X_train, Y_train) * 100, 2), "%")
# Scale to percent before rounding: multiplying an already-rounded float by
# 100 can reintroduce noise such as 81.82000000000001.
print("oob score:", round(rf_cv.oob_score_ * 100, 2), "%")

In [None]:
# Final model, using the tuned hyperparameters.
# max_features='sqrt' is the explicit spelling of what 'auto' meant for
# classifiers; 'auto' was deprecated and later removed from scikit-learn,
# where it now raises an error.
random_forest = RandomForestClassifier(criterion = "gini",
                                       min_samples_leaf = 1,
                                       min_samples_split = 16,
                                       n_estimators=250,
                                       max_features='sqrt',
                                       oob_score=True,
                                       random_state=1,
                                       n_jobs=-1)
random_forest.fit(X_train, Y_train)
# Predictions for the test set; written to the submission file later.
Y_prediction = random_forest.predict(X_test)
# The training accuracy was previously computed and discarded; show it.
print("train accuracy:", round(random_forest.score(X_train, Y_train) * 100, 2), "%")
# Scale to percent before rounding to avoid float noise in the printed value.
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

In [None]:
# Out-of-fold predictions from 3-fold cross-validation, compared against the
# true labels; the matrix is the cell's displayed result.
predictions = cross_val_predict(estimator=random_forest, X=X_train, y=Y_train, cv=3)
confusion_matrix(y_true=Y_train, y_pred=predictions)

# f1_score(Y_train, predictions)

In [None]:
# Pair each test PassengerId with its predicted Survived value and write the
# result to submission.csv without the index column.
passenger_ids = test_titan["PassengerId"]
submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": Y_prediction})
submission.to_csv('submission.csv', index=False)