## <a name="abstract">Titanic Machine Learning From Disaster</a>

**Abstract.** The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.

In [142]:
using DataFrames
using DecisionTree
using Iterators
using GLM
using Distances
using MLBase
using Gadfly

# Scikit
using PyCall
@pyimport sklearn.neighbors.nearest_centroid as nearest_centroid
@pyimport numpy as np
@pyimport sklearn.linear_model as sklm
@pyimport sklearn.svm as svm
@pyimport sklearn.naive_bayes as naive_bayes
@pyimport sklearn.ensemble as ensemble

### Load Preprocessed Data

In [182]:
# Predictor columns used by every model below, and the target column.
features = Symbol[
    :Pclass, :Sex, :Age, :SibSp, :Parch, :Fare, :Embarked,
    :Title, :FamilySize, :IsMother, :IsChild, :IsSmallFamily,
    :CabinPrefix, :CabinNumber,
]
label = :Survived;

In [195]:
# Load the tab-separated, pre-enriched tables.
# `[features; label]` concatenates explicitly; the comma form `[features, label]`
# depends on deprecated auto-concatenation and stops yielding a flat column list
# on later Julia versions.
train = readtable("./data/train_enriched.tsv", separator='\t')[:, [features; label]]

# Parse the test file once: keep the passenger ids for the submission files,
# then narrow the frame to the model's feature columns.
test = readtable("./data/test_enriched.tsv", separator='\t')
test_passengers_id = test[:, :PassengerId]
test = test[:, features];

In [254]:
"""
    binarize(y)

Threshold predictions at 0.5.

For a vector, return an `Array{Int8,1}` holding 1 where the element is
`>= 0.5` and 0 elsewhere. For a single real number, return the `Int` 1 or 0
using the same threshold. A value of exactly 0.5 maps to 1.
"""
binarize(y::AbstractVector) = Array{Int8,1}(y .>= 0.5)
# Accept any real scalar (not only integers) so probabilities can be thresholded
# one at a time, mirroring what the vector method already allows.
binarize(y::Real) = y >= 0.5 ? 1 : 0

"""
    calc_accuracy(y_real, y_pred)

Return the fraction of positions `i` where `y_real[i] == binarize(y_pred[i])`,
rounded to two decimal places.

Both arguments are integer vectors; their element types may differ, so callers
do not have to cast both sides to a common type first.

Throws `DimensionMismatch` when the two vectors have different lengths.
"""
function calc_accuracy{T <: Integer, S <: Integer}(y_real::Array{T,1}, y_pred::Array{S,1})
    nrows = length(y_real)
    # Without this check a longer y_pred would be silently truncated and a
    # shorter one would fail with an unrelated BoundsError.
    length(y_pred) == nrows ||
        throw(DimensionMismatch("y_real has $nrows elements but y_pred has $(length(y_pred))"))
    hits = 0
    for i in 1:nrows
        if y_real[i] == binarize(y_pred[i])
            hits += 1
        end
    end
    return round(hits / nrows, 2)
end

# Accuracy of several prediction vectors against the same ground truth.
# Returns one accuracy per entry of `y_preds`; both sides are converted to
# Int8 arrays before being handed to the two-vector method.
function calc_accuracy{T <: Integer}(y_real::Array{T,1}, y_preds::Array{Array{T,1},1})
    return map(y_preds) do y_pred
        calc_accuracy(Array{Int8}(y_real), Array{Int8}(y_pred))
    end
end

"""
    predictions_to_matrix(predictions)

Stack a collection of equally long prediction vectors side by side.

The result has one row per sample and one column per entry of `predictions`
(column `j` is `predictions[j]`).

Throws `ArgumentError` when `predictions` is empty; vectors of unequal length
are rejected by `hcat` instead of being reshaped into misaligned columns.
"""
function predictions_to_matrix(predictions)
    isempty(predictions) &&
        throw(ArgumentError("predictions must contain at least one prediction vector"))
    # hcat validates that every vector has the same length; reshaping the
    # concatenated data would accept ragged input whenever the totals match.
    return hcat(predictions...)
end

"""
    average_predictions(predictions_matrix)

Combine per-model predictions into one vote per row.

Each row of `predictions_matrix` is averaged across its columns; the result is
1 when that mean is `>= 0.5` (so an exact tie counts as 1) and 0 otherwise.
Returns an `Array{Int8,1}` with one entry per row.
"""
function average_predictions(predictions_matrix)
    nrows = size(predictions_matrix, 1)
    ncols = size(predictions_matrix, 2)
    # Typed, pre-sized output instead of growing an untyped `[]` and converting.
    final_predictions = zeros(Int8, nrows)
    for row in 1:nrows
        if sum(predictions_matrix[row, :]) / ncols >= 0.5
            final_predictions[row] = 1
        end
    end
    return final_predictions
end

average_predictions (generic function with 1 method)

### Logistic Regression

In [51]:
# Fit a GLM with a Binomial response on the training frame, then threshold its
# predictions on the same frame at 0.5 via `binarize`.
# NOTE(review): the formula leaves out :SibSp and :IsChild, which are part of
# `features` and are used by the other models -- confirm this is intentional.
linear_model = glm(Survived ~ Pclass + Sex + Age + Parch + Fare + 
                              Embarked + Title + FamilySize + IsMother + 
                              IsSmallFamily + CabinPrefix + CabinNumber, 
                   train, Binomial());
linear_predictions_training = binarize(array(predict(linear_model, train)));

In [52]:
# Accuracy of the GLM on the rows it was fitted on (training accuracy, not a
# held-out estimate).
calc_accuracy(Array{Int8}(train[label]), linear_predictions_training)

0.81

### Random Forest

In [53]:
# Build a random forest from the training labels and feature matrix, then
# predict on the same training rows.
# NOTE(review): the positional arguments 5 and 500 are presumably the number of
# features sampled per split and the number of trees -- confirm against the
# DecisionTree.jl version in use.
rf_model = build_forest(array(train[label]), array(train[:, features]), 5, 500)
rf_predictions_training = apply_forest(rf_model, array(train[:, features]));

In [54]:
# Training accuracy of the random forest; predictions are cast to Int8 so both
# arguments share the element type calc_accuracy expects.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(rf_predictions_training))

0.94

### kNN (Nearest Centroid classifier)

In [167]:
# Fit scikit-learn's NearestCentroid classifier (through PyCall) on the training
# features/labels and predict on the same rows.
# NOTE(review): despite the `knn_` prefix this is NearestCentroid, not a
# k-nearest-neighbours classifier.
knn_model = nearest_centroid.NearestCentroid()
knn_model[:fit](array(train[:, features]), array(train[label]))
knn_predictions_training = knn_model[:predict](array(train[:, features]));

In [111]:
# Training accuracy of the nearest-centroid model.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(knn_predictions_training))

0.67

### Lasso

In [168]:
# Fit a scikit-learn Lasso model (alpha = 0.1) on the training data. Its
# predictions on the training rows are thresholded at 0.5 by `binarize` to get
# 0/1 labels.
lasso_model = sklm.Lasso(alpha = 0.1)
lasso_model[:fit](array(train[:, features]), array(train[label]))
lasso_predictions_training = binarize(lasso_model[:predict](array(train[:, features])));

In [123]:
# Training accuracy of the thresholded Lasso predictions.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(lasso_predictions_training))

0.7

### Support Vector Machine

In [176]:
# Fit a scikit-learn SVC with its default settings on the training data and
# predict on the same rows; the predictions are passed through `binarize`.
svm_model = svm.SVC()
svm_model[:fit](array(train[:, features]), array(train[label])) 
svm_predictions_training = binarize(svm_model[:predict](array(train[:, features])));

In [177]:
# Training accuracy of the SVC predictions.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(svm_predictions_training))

0.93

### Stochastic Gradient Descent

In [174]:
# Fit a scikit-learn SGDClassifier with hinge loss and an L2 penalty on the
# training data, then predict on the same rows (passed through `binarize`).
sgd_model = sklm.SGDClassifier(loss="hinge", penalty="l2")
sgd_model[:fit](array(train[:, features]), array(train[label]))
sgd_predictions_training = binarize(sgd_model[:predict](array(train[:, features])));

In [175]:
# Training accuracy of the SGD classifier.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(sgd_predictions_training))

0.74

### Gaussian Naive Bayes

In [172]:
# Fit a scikit-learn GaussianNB model on the training data and predict on the
# same rows (passed through `binarize`).
naive_bayes_model = naive_bayes.GaussianNB()
naive_bayes_model[:fit](array(train[:, features]), array(train[label]))
naive_bayes_predictions_training = binarize(naive_bayes_model[:predict](array(train[:, features])));

In [173]:
# Training accuracy of the Gaussian Naive Bayes model.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(naive_bayes_predictions_training))

0.78

### Extremely Randomized Trees

In [178]:
# Fit scikit-learn's ExtraTreesClassifier (10 trees, fixed random_state for
# repeatability) on the training data and predict on the same rows.
# min_samples_split is set to 2, the smallest value scikit-learn documents as
# valid; newer releases reject 1, and a node holding a single sample cannot be
# split anyway.
# (The `tress` spelling in the variable name is kept because other cells
# reference it.)
extremely_rand_tress_models = ensemble.ExtraTreesClassifier(n_estimators=10, min_samples_split=2, random_state=0)
extremely_rand_tress_models[:fit](array(train[:, features]), array(train[label]))
extremely_rand_trees_predictions_training = binarize(extremely_rand_tress_models[:predict](array(train[:, features])));

In [179]:
# Training accuracy of the Extra Trees model.
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(extremely_rand_trees_predictions_training))

0.98

###  Combine Models Training Data

In [166]:
# Majority vote over all eight models' training-set predictions: stack them as
# columns (one per model), vote per row, then score the vote against the labels.
# The last entry is `extremely_rand_trees_predictions_training`, the name the
# Extra Trees cell actually defines; the previous `..._bayes_...` spelling was
# never assigned anywhere.
mtx = predictions_to_matrix(Array[linear_predictions_training, rf_predictions_training,
                                  knn_predictions_training, lasso_predictions_training,
                                  svm_predictions_training, sgd_predictions_training,
                                  naive_bayes_predictions_training,
                                  extremely_rand_trees_predictions_training])
combined_predictions_training = average_predictions(mtx)
calc_accuracy(Array{Int8}(train[label]), Array{Int8}(combined_predictions_training))

0.87

### Make Final Prediction

Before choosing classifiers, let's recall the previous scores on the training set: 

#### Solo Classifier
1. LR: 0.81 (Kaggle: 0.76077)
2. RF: 0.94 (Kaggle: 0.78469)
3. kNN: 0.67 (Kaggle: 0.65072)
4. Lasso: 0.70  (Kaggle: 0.66507)
5. SVM: 0.93 (Kaggle: 0.66507)
6. SGD: 0.74  (Kaggle: 0.68900)
7. Naive Bayes: 0.78 (Kaggle: 0.75120)
8. Extremely Randomized Trees: 0.98 (Kaggle: 0.74163)

#### Combined Classifier with low correlation
RF + SVM + Extremely Randomized Trees: (Kaggle: 0.76077)  
LR + RF Cor($P_1,P_2$: 0.69, $F,P_1$: 0.88 $F,P_2$: 0.84) (Kaggle: 0.77033)  
LR + Naive Bayes Cor($P_1,P_2$: 0.79, $F,P_1$: 0.85 $F,P_2$: 0.94) (Kaggle: )  
RF + Naive Bayes Cor($P_1,P_2$: 0.62, $F,P_1$: 0.77 $F,P_2$: 0.89) (Kaggle: 0.76077)  
LR + RF + Naive Bayes Cor($F,P_1$: 0.93, $F,P_2$: 0.76 $F,P_7$: 0.85) (Kaggle: )  
LR + RF + Lasso Cor($F,P_1$: 0.80, $F,P_2$: 0.89 $F,P_4$: 0.42) (Kaggle: )

In [330]:
# Enumerate every non-empty subset of the eight test-set prediction vectors
# (the first subset produced is dropped via [2:end]), majority-vote each subset,
# and correlate the vote with p2.
labels = collect(subsets([:p1, :p2, :p3, :p4, :p5, :p6, :p7, :p8]))[2:end]
outputs = collect(subsets(Array[p1,p2,p3,p4,p5,p6,p7,p8]))[2:end]
preds = Array[average_predictions(predictions_to_matrix(out)) for out in outputs]
comb = map(i -> (labels[i], round(cor(preds[i], p2), 2)), vcat(1:length(labels)))

# Print only ensembles of three or more models whose correlation with p2 lies
# strictly between 0.85 and 0.95.
for (members, rho) in comb
    if length(members) > 2 && 0.85 < rho < .95
        println((members, rho))
    end
end

([:p1,:p2,:p4],0.89)
([:p1,:p2,:p5],0.88)
([:p1,:p2,:p6],0.86)
([:p2,:p3,:p8],0.93)
([:p1,:p2,:p3,:p8],0.91)
([:p2,:p4,:p8],0.93)
([:p1,:p2,:p4,:p8],0.93)
([:p2,:p5,:p8],0.93)
([:p1,:p2,:p5,:p8],0.86)
([:p1,:p2,:p3,:p5,:p8],0.9)
([:p1,:p2,:p4,:p5,:p8],0.91)
([:p2,:p6,:p8],0.94)
([:p1,:p2,:p6,:p8],0.87)
([:p1,:p2,:p3,:p6,:p8],0.87)
([:p1,:p2,:p4,:p6,:p8],0.89)
([:p1,:p2,:p5,:p6,:p8],0.87)
([:p2,:p4,:p7,:p8],0.87)
([:p1,:p2,:p4,:p7,:p8],0.87)
([:p1,:p2,:p5,:p7,:p8],0.86)
([:p1,:p2,:p6,:p7,:p8],0.86)


In [340]:
# LR + RF + Lasso ensemble: majority vote over p1, p2 and p4, written out as a
# submission file. p1..p8 are defined in the test-set prediction cell (In [187]).
final_predictions = average_predictions(predictions_to_matrix(Array[p1, p2, p4]))
# NOTE(review): this correlation is computed but neither stored nor printed.
round(cor(final_predictions, p4),2)
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=final_predictions)
writetable("./data/prediction_v8_ensemble_learning_lr+rf+lasso.csv", test_predictions_df)

In [187]:
# Test-set predictions of every fitted model (p1..p8), followed by a majority
# vote over all eight. Later cells overwrite `final_predictions` with votes over
# smaller subsets.
p1 = binarize(array(predict(linear_model, test))) # Logistic Regression Model
p2 = apply_forest(rf_model, array(test)) # Random Forest
p3 = knn_model[:predict](array(test)) # kNN (the NearestCentroid model)
p4 = binarize(lasso_model[:predict](array(test))) # Lasso Model
p5 = binarize(svm_model[:predict](array(test))) # SVM
p6 = binarize(sgd_model[:predict](array(test))) # SGD
p7 = binarize(naive_bayes_model[:predict](array(test))) # Naive Bayes
p8 = binarize(extremely_rand_tress_models[:predict](array(test))) # Extremely Randomized Trees
# One column per model, one row per test passenger; then vote per row.
final_predictions = predictions_to_matrix(Array[p1, p2, p3, p4, p5, p6, p7, p8])
final_predictions = average_predictions(final_predictions);

In [196]:
# RF + SVM + Extremely Randomized Trees - Kaggle Results: 0.76077
# Majority vote over p2 (Random Forest), p5 (SVM) and p8 (Extra Trees), written
# out as a submission file.
final_predictions = average_predictions(predictions_to_matrix(Array[p2, p5, p8]))
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=final_predictions)
writetable("./data/prediction_v8_ensemble_learning_rf+svm+extrees.csv", test_predictions_df)

In [253]:
# LR + RF + Naive Bayes
# Majority vote over p1 (Logistic Regression), p2 (Random Forest) and p7 (Naive
# Bayes), written out as a submission file.
final_predictions = average_predictions(predictions_to_matrix(Array[p1, p2, p7]))
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=final_predictions)
writetable("./data/prediction_v8_ensemble_learning_lr+rf+naive_bayes.csv", test_predictions_df)

In [202]:
# Logistic Regression
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p1)
writetable("./data/prediction_v8_logistic_regression.csv", test_predictions_df)

# Random Forest
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p2)
writetable("./data/prediction_v8_random_forest.csv", test_predictions_df)

# kNN
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p3)
writetable("./data/prediction_v8_knn.csv", test_predictions_df)

# Lasso
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p4)
writetable("./data/prediction_v8_lasso.csv", test_predictions_df)

# SVM
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p5)
writetable("./data/prediction_v8_svm.csv", test_predictions_df)

# SGD
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p6)
writetable("./data/prediction_v8_sgd.csv", test_predictions_df)

# Naive Bayes
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p7)
writetable("./data/prediction_v8_naive_bayes.csv", test_predictions_df)

# Extremely Randomized Trees
test_predictions_df = DataFrame(PassengerId=test_passengers_id, Survived=p8)
writetable("./data/prediction_v8_extremely_randomized_trees.csv", test_predictions_df)