<a href="https://colab.research.google.com/github/wildautumnwind/ml_notebooks/blob/master/homework_titanic_knn_and_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Homework 2 Light: Titanic Dataset (KNN and Bayes)**

https://www.kaggle.com/c/titanic/data

# Load libs

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.model_selection import KFold
import sklearn
import numpy as np
from google.colab import files
import pandas as pd
import matplotlib.pyplot as plt

# User defined functions

In [0]:
# Select best metrics from the given.
# Custom method for KNN
def select_best_metrics(X_train, y_train, X, y):
  best_metrics_selected = {}

  metrics = ['euclidean', 'manhattan', 'minkowski']
  weights = ['uniform', 'distance']
  k_nums = range(2, 100)
  best_accuracy = 0.0

  for weight in weights:
    for metric in metrics:
      for k_num in k_nums:  
        model = KNeighborsClassifier(n_neighbors = k_num, metric = metric, weights = weight)
        model.fit(X_train, y_train)

        y_pred = model.predict(X)
        current_accuracy = accuracy_score(y, y_pred)
        if current_accuracy > best_accuracy:
          best_accuracy = current_accuracy
          best_metrics_selected['metric'] = metric
          best_metrics_selected['n_neighbors'] = k_num
          best_metrics_selected['weights'] = weight

  return best_metrics_selected

In [0]:
# Fit models on Grid SearchCV
def grid_search_fitting(models, parameters, X_train, y_train, X_test, y_test): 
  scores = ['accuracy', 'precision', 'recall']
  
  for score in scores:
    clf = GridSearchCV(estimator = models, scoring = score, param_grid = parameters, cv = 5, iid = False)

    clf.fit(X_train, y_train)
    
    score_test = clf.best_estimator_.score(X_test, y_test) 
        

    print(score)
    print(clf.best_estimator_)
    print(clf.best_score_)
    print("\n")

  print('accuracy_on_test: ', score_test)
  print("\n")
  
  return clf

# Load dataset

In [3]:
!pip install kaggle



In [4]:
file = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle

In [6]:
!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 22.5MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 24.6MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.80MB/s]


# Data exploration

In [0]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [8]:
# Test data (we will use it for prediction)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
# Shape of the train dataset
train.shape

(891, 12)

In [10]:
# Train dataset info
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
# Describe train dataset
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
# Data for model training
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
# Check empty values
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Features transformation

In [0]:
X = train.drop(['PassengerId','Survived', 'Name','Ticket', 'Cabin'], axis = 1)
y = train.Survived

In [0]:
X = X.fillna({'Age': X.Age.median()})

In [46]:
X.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [0]:
MaxPassEmbarked = train.groupby('Embarked').count()['PassengerId']

In [48]:
MaxPassEmbarked

Embarked
C    168
Q     77
S    644
Name: PassengerId, dtype: int64

In [0]:
X = X.fillna({'Embarked': MaxPassEmbarked[MaxPassEmbarked == MaxPassEmbarked.max()].index[0]})

In [50]:
X.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [0]:
X = pd.get_dummies(X)

In [52]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,1


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [54]:
print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(712, 10) (712,) (179, 10) (179,)


# Training

In [67]:
MultinomialNB_param = {'alpha': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0], 'fit_prior': ['True', 'False']}
grid_search_fitting(MultinomialNB(), MultinomialNB_param, X_train, y_train, X_test, y_test)

accuracy
MultinomialNB(alpha=1e-09, class_prior=None, fit_prior='True')
0.6825505929491545


precision
MultinomialNB(alpha=1e-09, class_prior=None, fit_prior='True')
0.6076733649545438


recall
MultinomialNB(alpha=1e-09, class_prior=None, fit_prior='True')
0.44388539482879114


accuracy_on_test:  0.7262569832402235




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid=False, n_jobs=None,
             param_grid={'alpha': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001,
                                   0.001, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0],
                         'fit_prior': ['True', 'False']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [68]:
GaussianNB_param = {'var_smoothing':  [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0]}
grid_search_fitting(GaussianNB(), GaussianNB_param, X_train, y_train, X_test, y_test)

accuracy
GaussianNB(priors=None, var_smoothing=1e-05)
0.7934956862136964


precision
GaussianNB(priors=None, var_smoothing=0.0001)
0.7950187969924812


recall
GaussianNB(priors=None, var_smoothing=1e-09)
0.7049615653389238


accuracy_on_test:  0.776536312849162




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GaussianNB(priors=None, var_smoothing=1e-09), iid=False,
             n_jobs=None,
             param_grid={'var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
                                           0.0001, 0.001, 0.01, 0.1, 0.2, 0.5,
                                           0.75, 0.9, 1.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [69]:
ComplementNB_param = {'alpha': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0], 'fit_prior': ['True', 'False'], 'norm': ['True', 'False']}
grid_search_fitting(ComplementNB(), ComplementNB_param, X_train, y_train, X_test, y_test)

accuracy
ComplementNB(alpha=0.75, class_prior=None, fit_prior='True', norm='True')
0.6853673546511425


precision
ComplementNB(alpha=0.75, class_prior=None, fit_prior='True', norm='True')
0.5916489173112025


recall
ComplementNB(alpha=1e-09, class_prior=None, fit_prior='True', norm='True')
0.5336128581411601


accuracy_on_test:  0.7486033519553073




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True,
                                    norm=False),
             iid=False, n_jobs=None,
             param_grid={'alpha': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001,
                                   0.001, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0],
                         'fit_prior': ['True', 'False'],
                         'norm': ['True', 'False']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [70]:
BernoulliNB_param = {'alpha': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0], 
                     'binarize': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0],
                     'fit_prior': ['True', 'False']}
grid_search_fitting(BernoulliNB(), BernoulliNB_param, X_train, y_train, X_test, y_test)

accuracy
BernoulliNB(alpha=1e-09, binarize=0.9, class_prior=None, fit_prior='True')
0.7922158353084334


precision
BernoulliNB(alpha=1e-09, binarize=0.9, class_prior=None, fit_prior='True')
0.7418962523505939


recall
BernoulliNB(alpha=1e-09, binarize=0.9, class_prior=None, fit_prior='True')
0.6865828092243186


accuracy_on_test:  0.7877094972067039




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid=False, n_jobs=None,
             param_grid={'alpha': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001,
                                   0.001, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0],
                         'binarize': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001,
                                      0.001, 0.01, 0.1, 0.2, 0.5, 0.75, 0.9,
                                      1.0],
                         'fit_prior': ['True', 'False']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [71]:
KNeighborsClassifier_param = {'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
                              'n_neighbors': [k for k in range (1, 11)],
                              'p': [p for p in range (1, 11, 2)]
                             }
grid_search_fitting(KNeighborsClassifier(), KNeighborsClassifier_param, X_train, y_train, X_test, y_test)

accuracy
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=1,
                     weights='uniform')
0.7583915036117613


precision
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=1,
                     weights='uniform')
0.762989875011989


recall
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=1,
                     weights='uniform')
0.6308176100628932


accuracy_on_test:  0.7430167597765364




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid=False, n_jobs=None,
             param_grid={'metric': ['euclidean', 'manhattan', 'chebyshev',
                                    'minkowski'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'p': [1, 3, 5, 7, 9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

# Submission

In [75]:
y = train["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])

model = BernoulliNB(alpha = 1.0, binarize = 0.0, class_prior = None, fit_prior = True)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index = False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [76]:
!kaggle competitions submit -c titanic -f my_submission.csv -m "BernoulliNB submition from Colab"

100% 2.77k/2.77k [00:04<00:00, 600B/s]
Successfully submitted to Titanic: Machine Learning from Disaster

https://www.kaggle.com/c/titanic/leaderboard#score

pos 7639

# Useful links

* https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/