##### Imports & setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Feature names, excluding the linear features.
# One entry per line so that additions and removals diff cleanly.
feature_names = [
    'mean',
    'weightedMean',
    'std',
    'median',
    'amplitude',
    'beyond1Std',
    'cusum',
    'IPR10',
    'kurtosis',
    'MPR40_5',
    'MPR20_10',
    'maxSlope',
    'medianAbsDev',
    'medianBRP10',
    'percentAmplitude',
    'meanVariance',
    'andersonDarlingNorm',
    'chi2',
    'skew',
    'stetsonK',
]

## Data preparation

### Getting data

In [2]:
# Read the four cleaned tables (positive / negative class, g / r filter) and tag every
# row with its class label in the same step.
# NOTE: the paths below should be changed to match your setup. The CSV files can be
# saved from the clean_data.ipynb notebook.
# 'class' is a Python keyword, so the new column is passed to .assign() via a dict.
positive_g = (
    pd.read_csv('../../data/clean_data/positive_class_g.csv', index_col=0)
    .assign(**{'class': 'positive'})
)
positive_r = (
    pd.read_csv('../../data/clean_data/positive_class_r.csv', index_col=0)
    .assign(**{'class': 'positive'})
)
negative_g = (
    pd.read_csv('../../data/clean_data/negative_class_g.csv', index_col=0)
    .assign(**{'class': 'negative'})
)
negative_r = (
    pd.read_csv('../../data/clean_data/negative_class_r.csv', index_col=0)
    .assign(**{'class': 'negative'})
)

### Organizing samples

In [3]:
# Distinct object IDs per class and filter. np.unique returns them sorted, which keeps
# the input to any later seeded sampling independent of the row order in the tables.
positive_g_IDs = np.unique(positive_g['objectId'].to_numpy())
positive_r_IDs = np.unique(positive_r['objectId'].to_numpy())
negative_g_IDs = np.unique(negative_g['objectId'].to_numpy())
negative_r_IDs = np.unique(negative_r['objectId'].to_numpy())

# Report how many distinct objects each class/filter combination contains:
print(f'There are {positive_g_IDs.size} unique objects in the positive class (g filter).')
print(f'There are {positive_r_IDs.size} unique objects in the positive class (r filter).')
print(f'There are {negative_g_IDs.size} unique objects in the negative class (g filter).')
print(f'There are {negative_r_IDs.size} unique objects in the negative class (r filter).')

There are 69 unique objects in the positive class (g filter).
There are 67 unique objects in the positive class (r filter).
There are 101035 unique objects in the negative class (g filter).
There are 93551 unique objects in the negative class (r filter).


In [4]:
# Seed the global NumPy generator so the draws below are reproducible. Every
# np.random.choice call advances the same global stream, so the order of the calls
# (g then r, for ratios 1, 2 and 3) must not change.
np.random.seed(666)

# Each ratio is drawn independently from the full list of negative IDs (without
# replacement within a draw), so the three samples are not nested subsets of each other.

# --- Ratio 1:1 — as many negative objects as positive objects ---
negative_g_sample_IDs1 = np.random.choice(negative_g_IDs, size=len(positive_g_IDs), replace=False)
negative_r_sample_IDs1 = np.random.choice(negative_r_IDs, size=len(positive_r_IDs), replace=False)
# Keep every alert that belongs to one of the sampled objects:
negative_g_sample1 = negative_g.loc[negative_g['objectId'].isin(negative_g_sample_IDs1)]
negative_r_sample1 = negative_r.loc[negative_r['objectId'].isin(negative_r_sample_IDs1)]

# --- Ratio 2:1 — twice as many negative objects as positive objects ---
negative_g_sample_IDs2 = np.random.choice(negative_g_IDs, size=2 * len(positive_g_IDs), replace=False)
negative_r_sample_IDs2 = np.random.choice(negative_r_IDs, size=2 * len(positive_r_IDs), replace=False)
negative_g_sample2 = negative_g.loc[negative_g['objectId'].isin(negative_g_sample_IDs2)]
negative_r_sample2 = negative_r.loc[negative_r['objectId'].isin(negative_r_sample_IDs2)]

# --- Ratio 3:1 — three times as many negative objects as positive objects ---
negative_g_sample_IDs3 = np.random.choice(negative_g_IDs, size=3 * len(positive_g_IDs), replace=False)
negative_r_sample_IDs3 = np.random.choice(negative_r_IDs, size=3 * len(positive_r_IDs), replace=False)
negative_g_sample3 = negative_g.loc[negative_g['objectId'].isin(negative_g_sample_IDs3)]
negative_r_sample3 = negative_r.loc[negative_r['objectId'].isin(negative_r_sample_IDs3)]

In [5]:
# Total number of alerts (rows) in the positive class, g filter:
positive_g.shape[0]

3917

In [6]:
# Number of negative objects drawn for the 1:1 sample, g filter:
negative_g_sample_IDs1.size

69

In [7]:
# Number of alerts (rows) belonging to the 1:1 negative sample, g filter:
negative_g_sample1.shape[0]

196

In [8]:
# Number of alerts (rows) belonging to the 2:1 negative sample, g filter:
negative_g_sample2.shape[0]

361

In [9]:
# Number of alerts (rows) belonging to the 3:1 negative sample, g filter:
negative_g_sample3.shape[0]

633

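For the same number of objects, we have far fewer alerts in the negative class than in the positive one: in the g filter, the 69 sampled negative objects yield only 196 alerts, whereas the 69 positive objects account for 3917.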