In [15]:
#import all libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [16]:
# Load data
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00528/dataset.csv", header = 1, sep = ";")
df = df.drop(columns=['Green frogs','Brown frogs', 'Common toad', 'Common newt', 'Great crested newt','Tree frog'])
df = df.drop(columns=['ID', 'TR', 'VR', 'SUR1', 'SUR2', 'SUR3', 'UR', 'FR', 'RR', 'BR','MR', 'CR'])
df['Fire-bellied toad'].value_counts()

Fire-bellied toad
0    131
1     58
Name: count, dtype: int64

In [17]:
# Prepare Data
X = df.iloc[:, 1:-1].values
y = df.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
classifier = KNeighborsClassifier(n_neighbors = 5)
classifier.fit(X_train, y_train)

In [19]:
# Sets to experiment with
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size = 0.40)
scaler = StandardScaler()
scaler.fit(X_train2)
X_train2 = scaler.transform(X_train2)
X_test2 = scaler.transform(X_test2)


In [20]:
# Basic Knn
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train2, y_train2)

In [21]:
# Gridsearch
params = {"n_neighbors":[2,3,4,5,6,7,8,9]}
model = GridSearchCV(classifier, params, cv=3)

In [22]:
model.fit(X_train, y_train)
model.best_params_

{'n_neighbors': 6}

In [23]:
model.best_score_

0.7349454717875771

In [24]:
# Knn with 3 neighbors
gridsearch_knn = KNeighborsClassifier(n_neighbors = 3)
gridsearch_knn.fit(X_train2, y_train2)

In [25]:
gridsearch_y_pred = gridsearch_knn.predict(X_test2)
print(gridsearch_y_pred)

[0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0
 1 1]


In [26]:
print(confusion_matrix(y_test2, gridsearch_y_pred))
print(classification_report(y_test2, gridsearch_y_pred))

[[41 10]
 [20  5]]
              precision    recall  f1-score   support

           0       0.67      0.80      0.73        51
           1       0.33      0.20      0.25        25

    accuracy                           0.61        76
   macro avg       0.50      0.50      0.49        76
weighted avg       0.56      0.61      0.57        76



In [27]:
# Random Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params,cv=4,n_iter=8, random_state=5)
rsearch.fit(X_train, y_train)
rsearch_y_pred = rsearch.predict(X_test)

print(rsearch.best_score_)

0.7518472906403941


In [28]:
print(confusion_matrix(y_test2, rsearch_y_pred))
print(classification_report(y_test2, rsearch_y_pred))

[[47  4]
 [25  0]]
              precision    recall  f1-score   support

           0       0.65      0.92      0.76        51
           1       0.00      0.00      0.00        25

    accuracy                           0.62        76
   macro avg       0.33      0.46      0.38        76
weighted avg       0.44      0.62      0.51        76

