In [41]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Activate seaborn's default theme for matplotlib figures drawn after this point.
sns.set()

In [47]:
# Location of the pre-cleaned Titanic training data, relative to the notebook.
cleaned_train_csv = './dataset/titanic/cleaned_train.csv'

dataset = pd.read_csv(cleaned_train_csv)

# Preview the first rows as a sanity check on the loaded columns.
dataset.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_cat,Sex_cat
0,0,3,22.0,1,0,7.25,2,1
1,1,1,38.0,1,0,71.2833,0,0
2,1,3,26.0,0,0,7.925,2,0
3,1,1,35.0,1,0,53.1,2,0
4,0,3,35.0,0,0,8.05,2,1


Splitting the data into train and test sets

In [66]:
# The first column (position 0) is used as the label vector and every
# remaining column as a feature.
# NOTE(review): presumably position 0 is `Survived` -- confirm against the CSV
# layout, since the selection is positional rather than by column name.
label_column = dataset.iloc[:, 0]
feature_columns = dataset.iloc[:, 1:]

X = feature_columns.values
Y = label_column.values


In [73]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# Report how many rows landed in each partition.
for message, partition in (("Length train set: {}", X_train),
                           ("Length test set: {}", X_test)):
    print(message.format(len(partition)))

Length train set: 569
Length test set: 143


Now we can train several machine learning models on our dataset and compare their training accuracy

In [54]:
#Create a function with many Machine Learning Models
def models(X_train,Y_train):
  """Fit seven scikit-learn classifiers on the same data and print each training accuracy.

  Args:
    X_train: feature matrix handed unchanged to every estimator's ``fit`` and ``score``.
    Y_train: labels aligned with the rows of ``X_train``.

  Returns:
    A 7-tuple of the fitted estimators, always in this order: logistic
    regression, k-nearest neighbours, linear SVC, RBF SVC, Gaussian naive
    Bayes, decision tree, random forest.
  """
  from sklearn.linear_model import LogisticRegression
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.svm import SVC
  from sklearn.naive_bayes import GaussianNB
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.ensemble import RandomForestClassifier

  # One (printed caption, unfitted estimator) pair per model, in return order.
  lineup = (
    ('[0]Logistic Regression Training Accuracy:',
     LogisticRegression(random_state = 0, solver='liblinear')),
    ('[1]K Nearest Neighbor Training Accuracy:',
     KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ('[2]Support Vector Machine (Linear Classifier) Training Accuracy:',
     SVC(kernel = 'linear', random_state = 0)),
    ('[3]Support Vector Machine (RBF Classifier) Training Accuracy:',
     SVC(kernel = 'rbf', random_state = 0, gamma='auto')),
    ('[4]Gaussian Naive Bayes Training Accuracy:',
     GaussianNB()),
    ('[5]Decision Tree Classifier Training Accuracy:',
     DecisionTreeClassifier(criterion = 'entropy', random_state = 0)),
    ('[6]Random Forest Classifier Training Accuracy:',
     RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)),
  )

  # Train every model before reporting anything, so scores are only printed
  # once all seven fits have succeeded.
  for _, estimator in lineup:
    estimator.fit(X_train, Y_train)

  # Accuracy is measured on the very data the models were fitted on.
  for caption, estimator in lineup:
    print(caption, estimator.score(X_train, Y_train))

  return tuple(estimator for _, estimator in lineup)


# Train every classifier on the training split. Despite the singular name,
# `model` is the tuple of seven fitted estimators returned by `models`.
model = models(X_train, Y_train)

[0]Logistic Regression Training Accuracy: 0.8031634446397188
[1]K Nearest Neighbor Training Accuracy: 0.789103690685413
[2]Support Vector Machine (Linear Classifier) Training Accuracy: 0.7768014059753954
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.9173989455184535
[4]Gaussian Naive Bayes Training Accuracy: 0.8031634446397188
[5]Decision Tree Classifier Training Accuracy: 0.9929701230228472
[6]Random Forest Classifier Training Accuracy: 0.984182776801406


Now we can evaluate our models on the test set

In [64]:
from sklearn.metrics import classification_report, accuracy_score
models_name = ['Logistic Regression', 'knn', 'svc_lin', 'svc_rbf', 'gauss', 'tree', 'forest']

# Names are listed in the same order as the estimators in `model`, so the two
# sequences can be walked together.
for name, estimator in zip(models_name, model):

    predictions = estimator.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    report = classification_report(Y_test, predictions)

    print('Model: {}'.format(name))
    print('Accuracy: {}'.format(accuracy))
    print("Classification report")
    print(report)
    # Visual separator between consecutive models.
    print('#####################################################')
    
    

Model: Logistic Regression
Accuracy: 0.8041958041958042
Classification report
              precision    recall  f1-score   support

           0       0.79      0.89      0.84        82
           1       0.82      0.69      0.75        61

   micro avg       0.80      0.80      0.80       143
   macro avg       0.81      0.79      0.79       143
weighted avg       0.81      0.80      0.80       143

#####################################################
Model: knn
Accuracy: 0.6643356643356644
Classification report
              precision    recall  f1-score   support

           0       0.68      0.78      0.73        82
           1       0.63      0.51      0.56        61

   micro avg       0.66      0.66      0.66       143
   macro avg       0.66      0.64      0.65       143
weighted avg       0.66      0.66      0.66       143

#####################################################
Model: svc_lin
Accuracy: 0.7902097902097902
Classification report
              precision    recal

Cross-validation example

In [93]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import make_scorer


from sklearn.ensemble import RandomForestClassifier

# 10 shuffled folds; the seed keeps the fold assignment reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall' : make_scorer(recall_score)}

forest = RandomForestClassifier(n_estimators=20, criterion='entropy', random_state=0)
tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

# Pass the estimators unfitted: cross_validate works on a fresh clone for every
# fold, so a prior fit on the full data has no effect on the scores and only
# makes the evaluation look as if it had seen the held-out rows.
results_kfold_forest = cross_validate(forest, X, Y, cv=kf, scoring=scoring)
results_kfold_tree = cross_validate(tree, X, Y, cv=kf, scoring=scoring)

# Kept as an explicit, separate step so `forest` and `tree` are still fitted on
# the full data set after this cell, exactly as before.
forest.fit(X, Y)
tree.fit(X, Y)


def report_cv_scores(label, results):
    """Print mean and standard deviation (as percentages) of each cross-validated metric.

    Args:
        label: model name placed at the start of every printed line.
        results: dict returned by ``cross_validate``; the ``test_accuracy``,
            ``test_precision`` and ``test_recall`` arrays are read from it.
    """
    # NOTE(review): the printed text says "Training", but the values come from
    # the `test_*` keys, i.e. the held-out fold of each split. The wording is
    # left untouched to match the recorded output; consider renaming it.
    for metric in ('Accuracy', 'Precision', 'Recall'):
        fold_scores = results['test_' + metric.lower()]
        print('{} Training {}: {}% ({}%)'.format(
            label, metric, fold_scores.mean() * 100.0, fold_scores.std() * 100.0))


# Report the cross-validated (held-out fold) scores of both models.
report_cv_scores('Random Forest Classifier', results_kfold_forest)

print('##################################################################')
print()

report_cv_scores('Decision Tree Classifier', results_kfold_tree)



Random Forest Classifier Training Accuracy: 80.06064162754302% (5.608055856387823%)
Random Forest Classifier Training Precision: 77.8193564355967% (8.3944425357888%)
Random Forest Classifier Training Recall: 70.12711312092871% (9.803484981326035%)
##################################################################

Decision Tree Classifier Training Accuracy: 76.40062597809077% (5.425554737429664%)
Decision Tree Classifier Training Precision: 70.64792893604701% (8.35740062008114%)
Decision Tree Classifier Training Recall: 71.02120972342112% (7.781584002116166%)
