## Imports & setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Columns used as the feature space for the nearest-neighbor search
# (the linear features are intentionally left out):
feature_names = [
    'mean', 'weightedMean', 'std', 'median',
    'amplitude', 'beyond1Std', 'cusum', 'IPR10',
    'kurtosis', 'MPR40_5', 'MPR20_10', 'maxSlope',
    'medianAbsDev', 'medianBRP10', 'percentAmplitude', 'meanVariance',
    'andersonDarlingNorm', 'chi2', 'skew', 'stetsonK',
]

## Data preparation

### Getting data

In [2]:
# Directory that holds the cleaned class CSV files. Change this single path to match your
# setup; the files can be saved as csv from the clean_data.ipynb notebook.
DATA_DIR = '../../data/clean_data'

# The first CSV column is used as the DataFrame index (index_col=0):
positive_g = pd.read_csv(f'{DATA_DIR}/positive_class_g.csv', index_col=0)
positive_r = pd.read_csv(f'{DATA_DIR}/positive_class_r.csv', index_col=0)
negative_g = pd.read_csv(f'{DATA_DIR}/negative_class_g.csv', index_col=0)
negative_r = pd.read_csv(f'{DATA_DIR}/negative_class_r.csv', index_col=0)

# Adding labels for each class (the 'class' column is what the neighbor statistics are computed on):
positive_g['class'] = 'positive'
positive_r['class'] = 'positive'
negative_g['class'] = 'negative'
negative_r['class'] = 'negative'

### Organizing samples & train-test split

In [3]:
# Fix the seed of NumPy's global random number generator. None of the calls below pass
# `random_state`, so the repeatability of this cell presumably relies on this global seed
# (NOTE(review): confirm for the installed pandas / scikit-learn versions). The order of the
# random calls below therefore matters: reordering them changes the drawn samples.
np.random.seed(42)

# Taking a random sample of the negative class with the same size as the positive class:
negative_g_sample1 = negative_g.sample(frac=len(positive_g)/len(negative_g))
# Taking a random sample of the negative class with twice the size of the positive class:
negative_g_sample2 = negative_g.sample(frac=2*len(positive_g)/len(negative_g))
# Taking a random sample of the negative class with thrice the size of the positive class:
negative_g_sample3 = negative_g.sample(frac=3*len(positive_g)/len(negative_g))

# Splitting the data into training and testing sets. No negative test set is needed, since we
# only look up the nearest neighbors of positive class objects; the held-out 30 % of each
# negative sample is therefore discarded (`_`). Using the same train_size for both classes
# keeps the 1:1, 1:2 and 1:3 positive-negative ratios (up to rounding) in the training sets.
positive_g_train, positive_g_test = train_test_split(positive_g, train_size=0.7)
negative_g_train1, _ = train_test_split(negative_g_sample1, train_size=0.7)
negative_g_train2, _ = train_test_split(negative_g_sample2, train_size=0.7)
negative_g_train3, _ = train_test_split(negative_g_sample3, train_size=0.7)

# Concatenating the training data in one dataframe (positive rows first, then negative rows):
train_sample1 = pd.concat([positive_g_train, negative_g_train1])
train_sample2 = pd.concat([positive_g_train, negative_g_train2])
train_sample3 = pd.concat([positive_g_train, negative_g_train3])
train_sample4 = pd.concat([positive_g_train, negative_g]) # Using the whole negative class in training

## Finding nearest neighbors

### Wrapper

In [4]:
def NN_wrapper(train: pd.DataFrame, test: pd.DataFrame, k: int = 1) -> pd.DataFrame:
    """Wrapper for the NearestNeighbors class from scikit-learn.

    Fits a brute-force Euclidean nearest-neighbor model on the ``feature_names`` columns of
    the training set, looks up the ``k`` nearest training objects of every test object, and
    prints the class counts of the retrieved neighbors together with the proportion of them
    that belong to the positive class.

    Note: the feature columns are taken from the module-level ``feature_names`` list.

    Args:
        train (pd.DataFrame): Training set (a combination of positive and negative class objects).
            Must contain the ``feature_names`` columns and a ``class`` column.
        test (pd.DataFrame): Testing set (positive class objects only, and different from those
            of the training set). Must contain the ``feature_names`` columns.
        k (int, optional): Number of nearest neighbors to look for for each object in the test set.
            Defaults to 1.

    Returns:
        neighbors (pd.DataFrame): The rows of ``train`` that are neighbors of the test set objects:
            ``k`` consecutive rows per test object, i.e. ``len(test) * k`` rows in total.
    """
    # Fitting the NearestNeighbors model:
    neigh = NearestNeighbors(algorithm='brute', metric='euclidean')
    neigh.fit(train[feature_names])

    # Finding the nearest neighbors of the test set objects. kneighbors returns positional
    # indices into the fitted data (one row of k indices per test object), hence the iloc lookup:
    neighbors_indices = neigh.kneighbors(test[feature_names], k, return_distance=False)
    neighbors = train.iloc[neighbors_indices.flatten()]
    print(neighbors['class'].value_counts())

    # Share of positive class objects among all retrieved neighbors:
    positive_share = (neighbors['class'] == 'positive').sum() / len(neighbors)
    if k == 1:
        # With a single neighbor per test object this share is exactly the fraction of test
        # objects whose nearest neighbor is of the positive class.
        print(f'\nPercentage of positive class samples in the test set that have a positive class nearest neighbor: {positive_share*100:.0f} %')
    else:
        # With k > 1 the share is taken over all len(test) * k neighbors, so it must not be
        # reported as a per-test-sample statistic.
        print(f'\nPercentage of positive class objects among the {k} nearest neighbors of the test set samples: {positive_share*100:.0f} %')

    return neighbors

### Results

#### Train sample positive-negative ratio: 1-1

In [5]:
# 1:1 training sample; k is left at its default of 1, so only the single nearest
# neighbor of each positive test object is inspected. The trailing `;` hides the returned frame.
NN_wrapper(train=train_sample1, test=positive_g_test);

class
positive    984
negative    188
Name: count, dtype: int64

Percentage of positive class samples in the test set that have a positive class nearest neighbor: 84 %


#### Train sample positive-negative ratio: 1-2

In [6]:
# 1:2 training sample (twice as many negative as positive objects), default k = 1.
# The trailing `;` hides the returned frame.
NN_wrapper(train=train_sample2, test=positive_g_test);

class
positive    922
negative    250
Name: count, dtype: int64

Percentage of positive class samples in the test set that have a positive class nearest neighbor: 79 %


#### Train sample positive-negative ratio: 1-3

In [7]:
# 1:3 training sample (three times as many negative as positive objects), default k = 1.
# The trailing `;` hides the returned frame.
NN_wrapper(train=train_sample3, test=positive_g_test);

class
positive    891
negative    281
Name: count, dtype: int64

Percentage of positive class samples in the test set that have a positive class nearest neighbor: 76 %


#### Train sample positive-negative ratio: 1-full (entire negative class)

In [8]:
# Training sample built from the positive training split plus the whole negative class, default k = 1.
# The trailing `;` hides the returned frame.
NN_wrapper(train=train_sample4, test=positive_g_test);

class
positive    623
negative    549
Name: count, dtype: int64

Percentage of positive class samples in the test set that have a positive class nearest neighbor: 53 %
