In [9]:
## import necessary modules

import os
import numpy as np
import pylab as plt
import pandas as pd

import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.ecod import ECOD
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.cd import CD
from pyod.models.copod import COPOD
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.inne import INNE
from pyod.models.kde import KDE
from pyod.models.loci import LOCI
from pyod.models.pca import PCA

In [6]:
## load in the data, and establish some definitions

## Directory that holds both the simulation catalogue and the saved GMM.
## The default is the original location; set the LSST_DATA_DIR environment
## variable to run the notebook elsewhere without editing this cell.
data_dir = os.environ.get("LSST_DATA_DIR", "/home/ellie/research/lsst")

infile_name = os.path.join(data_dir, "LSST_sim.csv")
df_initial = pd.read_csv(infile_name)

## initialize name of file that the GMM is saved to
gmm_fname = os.path.join(data_dir, "gmm.pkl")

## set a value for the expected fraction of outliers, aka the contamination
## (This choice was arbitrary, should probably be tweaked)
contamination = 0.005

In [14]:
# select fields of interest
subspace = ["a*", "i-z", "a", "sini", "e", "v-vk", "r"]

# Keep only the selected columns and drop every row that has a missing value
# in any of them. The explicit .copy() makes df an independent frame, so
# columns can be added to it later without pandas' SettingWithCopyWarning.
df = df_initial[subspace].dropna().copy()

# normalize the data so that values fall in a range from 0-1
# (fit_transform learns each column's min/max and applies the scaling in one
# call, so a separate scaler.fit() beforehand is redundant)
scaler = MinMaxScaler()
norm_data = scaler.fit_transform(df.to_numpy()) ## note "norm_data" is the same as X in Brian's code

#print(norm_data)
#print(norm_data[0])

[[0.05494381 0.90483069 0.73278017 ... 0.12610062 0.5413601  0.06213448]
 [0.05494381 0.90483069 0.73278017 ... 0.12610062 0.53845063 0.06062557]
 [0.05494381 0.90483069 0.73278017 ... 0.12610062 0.55504017 0.06109883]
 ...
 [0.07300705 0.90026414 0.74741902 ... 0.07891649 0.53010546 0.06423747]
 [0.07300705 0.90026414 0.74741902 ... 0.07891649 0.53953043 0.05520661]
 [0.07300705 0.90026414 0.74741902 ... 0.07891649 0.53010655 0.06423745]]
[0.05494381 0.90483069 0.73278017 0.17357213 0.12610062 0.5413601
 0.06213448]


In [16]:
## Load the previously-saved Gaussian mixture model.
## NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
## code -- only load a gmm.pkl that you created yourself or otherwise trust.
gmm = joblib.load(gmm_fname)

# Predict one label per row of norm_data using the trained model.
# (Presumably gmm is a scikit-learn GaussianMixture, whose predict() returns
# for each sample the index of the mixture component with the highest
# posterior probability -- TODO confirm the model type.)
labels = gmm.predict(norm_data)

## Add the labels to the pandas dataframe as a new column. assign() returns a
## new frame rather than writing into df in place, which avoids pandas'
## SettingWithCopyWarning on a frame derived by slicing and keeps this cell
## safe to re-run.
df = df.assign(labels=labels)

In [23]:
## split the data into training and test datasets
##
## NOTE(review): the features passed here are the raw (un-normalized) columns
## of df, not norm_data. Distance-based detectors are sensitive to feature
## scale -- confirm whether the scaled values should be used instead.
X_train, X_test, y_train, y_test = train_test_split(
    df[subspace],  # the same feature columns that were selected above
    df["labels"],  # GMM cluster labels, split alongside the features
    test_size=.3,
    random_state=42,  # fixed seed so the split is identical on every run
    stratify=df["labels"],  # keep each GMM cluster's share equal in train and test
    shuffle=True
)

#print(X_test)

              a*       i-z         a      sini         e      v-vk         r
109248  0.090100 -0.062071  1.888813  0.398239  0.087706  0.000009  1.873713
182701  0.085854 -0.066548  2.684560  0.068681  0.225277  0.000009  2.860738
408882  0.125065 -0.041721  2.892423  0.231767  0.060799  0.000008  3.057419
220097  0.103178 -0.055671  2.347273  0.086732  0.143675  0.000008  2.328561
275050 -0.106551  0.064984  2.628937  0.188710  0.131200  0.000003  2.963394
...          ...       ...       ...       ...       ...       ...       ...
72277   0.092759 -0.056627  2.367557  0.080546  0.143822  0.000011  2.213702
428072  0.096613 -0.032858  3.204422  0.157824  0.067351  0.000006  3.004406
3117    0.106546 -0.041149  2.572446  0.135607  0.041630  0.000006  2.488952
343581  0.095042 -0.058189  2.565701  0.111044  0.298417  0.000022  3.138079
66661  -0.093855  0.011141  2.699426  0.078290  0.071763  0.000010  2.868295

[150000 rows x 7 columns]


In [21]:
## Try K nearest neighbors (KNN) first...note this bit of code is
## directly borrowed from https://pyod.readthedocs.io/en/latest/example.html

# train kNN detector
# Pass the contamination chosen in the configuration cell; without it PyOD
# falls back to its own default fraction when thresholding the scores into
# inlier/outlier labels. The raw outlier scores themselves do not depend on it.
clf_name = 'KNN'
clf = KNN(contamination=contamination)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_ # raw outlier scores
print(y_train_scores.shape)

(350000,)


In [22]:
# get the raw outlier scores on the test data
y_test_scores = clf.decision_function(X_test) # outlier scores
print(y_test_scores.shape)

# get the predicted labels together with the prediction confidence in a single
# call (a separate clf.predict(X_test) beforehand would only repeat the
# expensive neighbour search and have its result overwritten here)
y_test_pred, y_test_pred_confidence = clf.predict(X_test, return_confidence=True) # outlier labels (0 or 1) and confidence in the range of [0,1]

(150000,)
