<a href="https://colab.research.google.com/github/wildautumnwind/ml_notebooks/blob/master/homework_wine_titanic_knn_v_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Homework 2 Light: Titanic Dataset (KNN and Bayes)

https://www.kaggle.com/c/titanic/data

# Load libraries

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import sklearn
import numpy as np
from google.colab import files
import pandas as pd
import matplotlib.pyplot as plt

# User-defined functions

In [0]:
# Select the best KNN hyperparameters from the given candidates
def select_best_metrics(X_train, y_train, X, y, metrics = None, weights = None, k_nums = None):
  """Grid-search KNeighborsClassifier settings and return the best-scoring one.

  For every combination of weighting scheme, distance metric and neighbour
  count, a classifier is fitted on (X_train, y_train) and scored with
  accuracy_score on (X, y). Ties keep the combination that was tried first.

  Args:
    X_train: Feature matrix used to fit each candidate model.
    y_train: Labels matching X_train.
    X: Feature matrix the candidates are scored on. Pass held-out data here;
      scoring on rows that are also in X_train gives an over-optimistic
      estimate.
    y: Labels matching X.
    metrics: Optional iterable of distance metric names. Defaults to
      ['euclidean', 'manhattan', 'minkowski'].
    weights: Optional iterable of weighting schemes. Defaults to
      ['uniform', 'distance'].
    k_nums: Optional iterable of neighbour counts. Defaults to range(2, 100).
      Values larger than the number of rows in X_train are skipped.

  Returns:
    dict with the keys 'metric', 'n_neighbors' and 'weights' describing the
    best combination found.

  Raises:
    ValueError: if there is no candidate combination left to evaluate.
  """
  # NOTE(review): with sklearn's default p=2, 'minkowski' should be the same
  # distance as 'euclidean', so it looks redundant in the default grid - confirm.
  metrics = ['euclidean', 'manhattan', 'minkowski'] if metrics is None else list(metrics)
  weights = ['uniform', 'distance'] if weights is None else list(weights)
  k_nums = range(2, 100) if k_nums is None else k_nums

  # A model cannot look up more neighbours than it has fitted samples, so drop
  # those candidates instead of letting predict() fail half-way through the search.
  n_fit_samples = np.shape(X_train)[0]
  k_nums = [k_num for k_num in k_nums if k_num <= n_fit_samples]

  if not (metrics and weights and k_nums):
    raise ValueError('No KNN parameter combination to evaluate: check metrics, '
                     'weights, k_nums and the number of training samples')

  best_metrics_selected = {}
  # Start below any achievable accuracy so the first candidate is always
  # recorded, even if every candidate scores exactly 0.0.
  best_accuracy = -1.0

  for weight in weights:
    for metric in metrics:
      for k_num in k_nums:
        model = KNeighborsClassifier(n_neighbors = k_num, metric = metric, weights = weight)
        model.fit(X_train, y_train)

        y_pred = model.predict(X)
        current_accuracy = accuracy_score(y, y_pred)
        # Strict comparison: on ties the earlier combination is kept
        if current_accuracy > best_accuracy:
          best_accuracy = current_accuracy
          best_metrics_selected['metric'] = metric
          best_metrics_selected['n_neighbors'] = k_num
          best_metrics_selected['weights'] = weight

  return best_metrics_selected

# Load dataset

In [11]:
!pip install kaggle



In [12]:
# Open the Colab upload dialog; the recorded output shows kaggle.json being
# uploaded (presumably the Kaggle API credentials file - keep it out of shared outputs).
file = files.upload()

Saving kaggle.json to kaggle.json


In [13]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [0]:
# Download the Titanic competition files with the Kaggle CLI; the recorded output
# shows train.csv, test.csv and gender_submission.csv being saved to /content.
!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 22.7MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 30.2MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.77MB/s]


# Data exploration

In [0]:
# Read the downloaded competition CSVs from the working directory
train = pd.read_csv('train.csv')  # labeled rows
test = pd.read_csv('test.csv')    # rows without the Survived column

In [15]:
# Preview the first rows of the test data (intended for prediction; the output
# has no Survived column). NOTE(review): `test` is not used by any later cell shown here.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [17]:
# Train dataset info: dtypes and non-null counts per column. The recorded output
# shows missing values in Age, Cabin and Embarked.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
# Preview the first rows of the labeled data used for model training
# (includes the Survived target column)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Training

In [0]:
# Target vector: the Survived column of the training data
y = train["Survived"]

In [0]:
# Columns used as model inputs
features = ["Pclass", "Sex", "SibSp", "Parch"]
# One-hot encode the non-numeric column (Sex is the only object-dtype feature);
# the printed shapes below show the result has 5 columns.
X = pd.get_dummies(train[features])

In [0]:
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [0]:
# Sanity-check the sizes of both partitions (features and labels)
print('Train data shape - ', X_train.shape, ', train labels shape - ', y_train.shape, sep = '')
print('Test data shape - ', X_test.shape, ', test labels shape - ', y_test.shape, sep = '')

Train data shape - (712, 5), train labels shape - (712,)
Test data shape - (179, 5), test labels shape - (179,)


In [0]:
# Call udf to get best parameters.
# Tune on a validation split carved out of the training rows only. Scoring the
# candidates on the full X/y would reuse the rows each model was fitted on and
# leak the X_test rows that are used later for the final accuracy estimate.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 42)
best_metrics = select_best_metrics(X_fit, y_fit, X_val, y_val)

In [0]:
# Check best parameters: displays the dict of selected 'metric', 'n_neighbors' and 'weights'
best_metrics

{'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}

In [0]:
# Build the classifier from the previously selected hyperparameters and train it
# on the training partition; the last line displays the fitted estimator.
model = KNeighborsClassifier(
    n_neighbors = best_metrics['n_neighbors'],
    metric = best_metrics['metric'],
    weights = best_metrics['weights'],
).fit(X_train, y_train)
model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [0]:
# Get predictions for the held-out rows; the bare name on the last line
# displays the resulting label array
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1])

In [0]:
# Report hold-out accuracy as a fraction and as the number of correct predictions
# (normalize = False returns the count instead of the ratio)
test_accuracy = accuracy_score(y_test, y_pred)
test_correct = accuracy_score(y_test, y_pred, normalize = False)
print('Accuracy: ', test_accuracy, test_correct)

Accuracy:  0.7877094972067039 141


# Useful links

* https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/