In [58]:
import numpy as np
from scipy.spatial import KDTree
from statistics import mode
import pickle
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
class NearestNeighbor:
    """k-nearest-neighbour classifier backed by a SciPy ``KDTree``.

    ``fit`` indexes the training points; ``predict`` returns the most frequent
    label among the ``k`` training points closest to each query.
    """

    def __init__(self):
        self.tree = None     # KDTree over the training points, built by fit()
        self.y_train = None  # training labels, aligned with the rows passed to fit()

    def fit(self, X_train, y_train):
        """Index the training points and remember their labels.

        Args:
            X_train: array-like of shape (n_samples, n_features).
            y_train: array-like of n_samples labels, in the same order as X_train.
        """
        self.tree = KDTree(X_train)
        self.y_train = np.array(y_train)

    def predict(self, X_test, k = 10):
        """Predict by majority vote among the ``k`` nearest training points.

        Args:
            X_test: a single query point (1-D array-like) or a batch of query
                points (2-D array-like, one row per query).
            k: number of neighbours to consult (integer). Values larger than
                the number of training points are clamped to that number.

        Returns:
            The winning label for a single query point, or a NumPy array with
            one label per row for a batch. Ties go to the label met first when
            walking the neighbours from nearest to farthest (``statistics.mode``).
        """
        # KDTree pads missing neighbours with the out-of-range index ``n``;
        # clamping k keeps every returned index valid for y_train.
        k = min(int(k), self.tree.n)
        _, index = self.tree.query(X_test, k)

        # query() drops the neighbour axis when k == 1 and the query axis for a
        # single point; normalise to one row of k neighbour indices per query.
        neighbor_rows = np.asarray(index).reshape(-1, k)
        votes = [mode(list(self.y_train[row])) for row in neighbor_rows]

        if np.ndim(X_test) == 1:
            return votes[0]
        return np.array(votes)

## EDA

In [43]:
# Load the training label table from the working directory.
# A plain relative path resolves on every OS; the former r'.\train.csv' only
# worked on Windows (elsewhere the backslash is part of the file name).
df_train_labels = pd.read_csv('train.csv')
df_train_labels.head()

Unnamed: 0,idx,class
0,0,NoWhale
1,1,RightWhale
2,2,NoWhale
3,3,NoWhale
4,4,NoWhale


In [44]:
# Summarise the label table: column dtypes, non-null counts and memory usage.
df_train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10934 entries, 0 to 10933
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   idx     10934 non-null  int64 
 1   class   10934 non-null  object
dtypes: int64(1), object(1)
memory usage: 171.0+ KB


In [45]:
# List the distinct values present in the 'class' column.
df_train_labels['class'].unique()

array(['NoWhale', 'RightWhale'], dtype=object)

In [46]:
# Replace the textual class names with integer codes.
# Resulting mapping (see the preview below):
#   0 -> NoWhale
#   1 -> RightWhale
label_encoder = preprocessing.LabelEncoder()
encoded_classes = label_encoder.fit_transform(df_train_labels['class'])
df_train_labels['class'] = encoded_classes
df_train_labels.head()

Unnamed: 0,idx,class
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0


In [55]:
# Pull the encoded class column out of the frame as a standalone NumPy array.
y_train_labels = df_train_labels['class'].to_numpy(copy=True)

## Feature processing

In [47]:
# Load the pickled feature array (presumably MFCC features, going by the file name).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open this file if it comes from a trusted source.
with open ('train_features_mfcc.pkl', 'rb') as file:
    features_mfcc = pickle.load(file)

In [48]:
# Check which container type the unpickled features arrive in.
print(type(features_mfcc))

<class 'numpy.ndarray'>


In [49]:
# Flatten every sample into one 1-D feature vector.
# One np.concatenate call per sample (axis=None flattens each piece) replaces
# the former pattern of re-copying a growing vector for every local feature.
# The leading empty list keeps the float64 promotion the old loop applied.
features_mfcc_flat = [
    np.concatenate(([], *feature), axis=None)
    for feature in features_mfcc
]

In [50]:
# Turn the list of per-sample vectors into a single array and report its shape.
features_mfcc_flat = np.array(features_mfcc_flat)
print(features_mfcc_flat.shape)

(10934, 1740)


## Normalization

In [51]:
# Min-max scale every feature column to [0, 1].
feature_min = features_mfcc_flat.min(axis=0)
feature_range = features_mfcc_flat.max(axis=0) - feature_min
# A constant column has zero range; dividing by it would give 0/0 = NaN and
# break the PCA step. Dividing by 1 instead leaves such columns at 0.
feature_range = np.where(feature_range == 0, 1, feature_range)
features_mfcc_flat = (features_mfcc_flat - feature_min) / feature_range

## PCA

In [52]:
# Exploratory PCA: find how many leading components are needed to explain at
# least 90% of the variance.
# random_state pins the randomized SVD solver that scikit-learn may select for
# inputs of this size, so the result is reproducible across runs.
# PCA cannot extract more components than min(n_samples, n_features), hence the cap.
max_components = min(200, *features_mfcc_flat.shape)
pca = PCA(n_components=max_components, random_state=0)
pca.fit(features_mfcc_flat)
explained_variance_cumsum = np.cumsum(pca.explained_variance_ratio_)

reaches_target = explained_variance_cumsum >= 0.9
if reaches_target.any():
    # first position where the cumulative ratio crosses 0.9, as a 1-based count
    n_components_above_90 = int(np.argmax(reaches_target)) + 1
else:
    # np.argmax on an all-False mask returns 0, which would silently select a
    # single component; keep every fitted component instead.
    n_components_above_90 = len(explained_variance_cumsum)
print(n_components_above_90)

103


In [53]:
# Refit PCA with the selected number of components and project the features.
# random_state makes the projection reproducible when scikit-learn picks its
# randomized SVD solver.
pca = PCA(n_components=n_components_above_90, random_state=0)
features_mfcc_flat = pca.fit_transform(features_mfcc_flat)

In [54]:
# Shape of the feature matrix after the PCA projection.
print(features_mfcc_flat.shape)

(10934, 103)


In [56]:
# Build the KD-tree classifier over the PCA-reduced features and encoded labels.
nn = NearestNeighbor()
nn.fit(features_mfcc_flat, y_train_labels)

In [57]:
# Predict a label for every sample with a 10-neighbour vote.
# NOTE(review): the queries are the same points the classifier was fitted on,
# so each sample is among its own neighbours and the metrics computed from
# these predictions are optimistic.
y_preds = [nn.predict(sample, 10) for sample in features_mfcc_flat]


In [59]:
# Macro-averaged classification metrics for the k-NN predictions.
# Variable names are left as they were in case later cells read them; only the
# misspelled "Presicion: : " label in the printed report is corrected.
accuracy = accuracy_score(y_train_labels, y_preds)
presicion = precision_score(y_train_labels, y_preds, average='macro')
recall = recall_score(y_train_labels, y_preds, average='macro')
score = f1_score(y_train_labels, y_preds, average='macro')

print("Precision: ", presicion)
print("Recall: ", recall)
print("F1_Score: ", score)
print("Accuracy: ", accuracy)

Presicion: :  0.7501369939730691
Recall:  0.7498628132430949
F1_Score:  0.7497942492191343
Accuracy:  0.7498628132430949
