In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

## Reading Data

In [14]:
# GALAXY DATASET
def DatasetGALAXY():
    df = pd.read_csv('../../Data/galaxymorphology/dataset1_sydney.csv')
    df.head()
    df['target'].value_counts()
    mapping = {
        'elliptical': 0,
        'spiral':1
    }
    df['target'] = df['target'].map(mapping)
    x = df.sample(df.shape[0])
    df = x.iloc[:, :5]
    df['target'] = x.iloc[:,-1]
    return df
df = DatasetGALAXY()

In [62]:
#CAR DATASET
def DatasetCAR():
    df = pd.read_csv('../../Data/dataset1-car/car_dataset.csv')
    
    df['buying'].replace(['vhigh', 'high', 'med', 'low'],[3,2,1,0], inplace = True)
    df['maintenance'].replace(['vhigh', 'high', 'med', 'low'],[3,2,1,0], inplace = True)
    df['doors'].replace(['2', '3', '4', '5more'],[2,3,4,5], inplace = True)
    df['persons'].replace(['2', '4', 'more'],[2,4,5], inplace = True)
    df['lug_boot'].replace(['small', 'med', 'big'],[1,2,3], inplace = True)
    df['safety'].replace(['high', 'med', 'low'],[3,2,1], inplace = True)
    df['target'].replace(['unacc', 'acc', 'good', 'vgood'],[0,1,2,3], inplace = True)
    df= df[df.target !=3]
    df= df[df.target !=2]
    df = df.sample(df.shape[0])
    return df

df = DatasetCAR()


In [66]:
#IRIS DATASET
def DatasetIRIS():
    df = pd.read_csv('../../Data/iris/iris.data')
    df['target'].replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],[0,1,2], inplace = True)
    df = df.sample(df.shape[0])
    return df
df = DatasetIRIS()


In [69]:
#TITANIC DATASET
def DatasetTITANIC():
    df = pd.read_csv('../../Data/Titanic/train.csv')
    df.drop(columns = ['Cabin', 'Embarked', 'Ticket','Name','PassengerId'], inplace=True)
    df['Sex'].replace(['male', 'female'],[0,1], inplace = True)
    df =df.dropna()
    
    return df
df = DatasetTITANIC()

In [73]:
df

Unnamed: 0,buying,maintenance,doors,persons,lug_boot,safety,target
100,3,3,5,5,1,2,0
1361,0,3,4,4,1,3,1
695,2,1,3,5,1,3,1
1243,1,0,4,2,1,2,0
491,2,3,4,2,2,3,0
...,...,...,...,...,...,...,...
178,3,2,4,4,3,2,0
113,3,2,2,2,2,3,0
1399,0,3,5,5,2,2,1
1410,0,2,2,2,3,1,0


# KNN Algorithm

In [16]:
def distance(x_query, x):
    return np.sum((x_query - x)**2)**0.5

In [52]:
def kNN(X, y, x_test, k=5):

    train_size = X.shape[0]
    
    pred = []
    for x_query in x_test:
        distances = []
        for i in range(train_size):
            dis = distance(x_query, X[i])
            
            distances.append((dis, y[i]))
    
            distances = sorted(distances)
            distances = distances[:k]

        distances = np.array(distances)
        labels = distances[:,1]

        uniq_label, counts = np.unique(labels, return_counts=True)   
        pred.append(int(uniq_label[counts.argmax()]))

    return pred

In [53]:
def get_overall_cm(df):
    classes = df['target'].nunique()
    if classes < 2:
        classes = 2
    overall_cm = []
    for i in range(classes):
        overall_cm.append( [0] * classes)
    return overall_cm

In [60]:
def Kfold(df,n_folds, test_size, overall_cm):
    # Iterate through all folds
    overall_actual = []
    overall_predicted = []
    for i in range(n_folds):
        # split test and train
        test = df.iloc[test_size * i : test_size*i+test_size]
        train = df.iloc[test_size:]

        # remove target label and convert it to numpy for KNN to work
        X_train = test.iloc[:,:-1].to_numpy()
        y_train = test.iloc[:,-1].to_numpy()
        X_test = test.iloc[:,:-1].to_numpy()
        y_test = list(test.iloc[:,-1])

        #start training and testing the dataset
        y_pred = kNN(X_train, y_train, X_test, k=7)
        # generate and append cm 

        for p,a in zip(y_pred,y_test):
            overall_predicted.append(p)
            overall_actual.append(a)
        overall_cm = overall_cm + confusion_matrix(y_test, y_pred)

    print(classification_report( overall_predicted, overall_actual))
    return overall_cm

# TRAIN + TEST

In [71]:
if __name__ == "__main__":
    # KFOLD take in n folds and the size of dataset
    n_folds = 2
    test_size = int(df.shape[0]/n_folds)

    # get overall cm returns a Confusion matrix with 0's shape depends on (classes)
    final_confusion_matrix= Kfold(df,n_folds, test_size, get_overall_cm(df))

    print(f"\n\n\n Confusion Matrix\n{final_confusion_matrix}")

              precision    recall  f1-score   support

           0       0.82      0.76      0.79       671
           1       0.62      0.71      0.66       373

    accuracy                           0.74      1044
   macro avg       0.72      0.73      0.72      1044
weighted avg       0.75      0.74      0.74      1044




 Confusion Matrix
[[508 109]
 [163 264]]


In [65]:
#from sklearn.model_selection import KFold, cross_val_score
# from sklearn.metrics import make_scorer
#  kf = KFold(n_splits=n_folds)
# overall_cm = [[0, 0], [0, 0]]
# cross_val_score(kNN, X_train, y_train, scoring=make_scorer(classification_report_with_accuracy_score))
# overall_cm = overall_cm + confusion_matrix(originalclass, predictedclass)