In [86]:
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.iforest import IForest
from pyod.models.pca import PCA
from pyod.models.abod import ABOD
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the truthing subset and keep only the rows that have an HMTruth value;
# rows where HMTruth is missing are discarded.
alldata = (
    pd.read_csv('classification_data_ts_truthing_subset.csv')
    .dropna(subset=['HMTruth'])
)

In [None]:
# Pre-create one label column and one score column per outlier detection
# method. Every new column starts out filled with None; the label columns are
# created first, followed by the score columns.
label_columns = ['LOF', 'KNN', 'MCD', 'IForest', 'PCA', 'ABOD']
score_columns = [
    'LOF_scores',
    'KNN_scores',
    'MCD_scores',
    'IForest_scores',
    'PCA_scores',
    'ABOD_scores',
]
for new_column in label_columns + score_columns:
    alldata[new_column] = None

In [None]:
# Suppress warnings
# NOTE: this filter is global -- it silences every warning for the rest of the
# kernel session, not only the ones raised inside this cell.
import warnings
warnings.filterwarnings('ignore')

# Settings shared by every detector.
CONTAMINATION = 0.25  # expected outlier fraction handed to each pyod detector
RANDOM_STATE = 0      # fixed seed so the randomised detectors are repeatable on re-run

# Column name -> zero-argument factory. A fresh, unfitted detector is built for
# every RepID so that no fitted state is carried from one replicate to the next.
detector_factories = {
    'LOF': lambda: LOF(contamination=CONTAMINATION),
    'KNN': lambda: KNN(contamination=CONTAMINATION),
    'MCD': lambda: MCD(contamination=CONTAMINATION, random_state=RANDOM_STATE),
    'IForest': lambda: IForest(contamination=CONTAMINATION, random_state=RANDOM_STATE),
    'PCA': lambda: PCA(contamination=CONTAMINATION),
    'ABOD': lambda: ABOD(contamination=CONTAMINATION),
}
detector_names = list(detector_factories)

# Iterate through the data one RepID at a time, fit each pyod detector on that
# replicate, and write the labels and scores back into the matching rows of
# `alldata` (columns '<name>' and '<name>_scores').
rep_ids = alldata['RepID'].unique()
for rep_id in rep_ids:
    # Row mask for this replicate, computed once and reused for every write.
    rep_mask = alldata['RepID'] == rep_id

    # Two features per observation: the flux value and its 0-based position
    # within the replicate. The explicit copy keeps the new column off `alldata`.
    flux = alldata.loc[rep_mask, ['N2OFlux']].copy()
    flux['index'] = np.arange(len(flux))

    for name, make_detector in detector_factories.items():
        detector = make_detector()
        detector.fit(flux)
        # Use the training-data results (`labels_`, `decision_scores_`) so the
        # stored label and the stored score describe the same computation.
        # `predict()` is intended for new samples; for neighbour-based
        # detectors it can disagree with `decision_scores_` on the fitted
        # points because each point is then counted as its own neighbour.
        alldata.loc[rep_mask, name] = detector.labels_
        alldata.loc[rep_mask, f'{name}_scores'] = detector.decision_scores_

    # Get median of flux
    median = np.median(flux['N2OFlux'])
    # Rows of this replicate at or below the median are never reported as
    # outliers: reset every detector's label to 0 there (scores are untouched).
    at_or_below_median = rep_mask & (alldata['N2OFlux'] <= median)
    alldata.loc[at_or_below_median, detector_names] = 0


In [None]:
def _accuracy(predicted, truth):
    """Return the fraction of rows in which `predicted` equals `truth`.

    Both arguments are pandas Series sharing the same index. Missing or None
    predictions never compare equal, so they count as misses. Raises
    ZeroDivisionError when `truth` is empty.
    """
    return int(predicted.eq(truth).sum()) / len(truth)


truth = alldata['HMTruth']

# Report label -> column of `alldata` holding that method's 0/1 prediction.
prediction_columns = {
    '1.5 IQRHM': 'IQRHM',
    'LOF': 'LOF',
    'KNN': 'KNN',
    'MCD': 'MCD',
    'IForest': 'IForest',
    'PCA': 'PCA',
    'ABOD': 'ABOD',
}

# Print the accuracy of each method's predictions against HMTruth
for label, column in prediction_columns.items():
    print(f"{label} Accuracy:", _accuracy(alldata[column], truth))

# Baseline: the accuracy obtained by predicting 0 for every row.
print("Blind Accuracy:", _accuracy(pd.Series(0.0, index=truth.index), truth))

In [88]:
# Clear the figure
plt.clf()
%matplotlib inline

# For a random RepID, plot the N2OFlux as a scatterplot for each outlier detection method using the scores as the color
rep_id = np.random.choice(rep_ids)
flux = alldata[alldata['RepID'] == rep_id][['N2OFlux']].values.tolist()
exp_day = np.arange(len(flux))

# MCD
mcd_scores = alldata[alldata['RepID'] == rep_id][['MCD_scores']].values.tolist()
plt.scatter(exp_day, flux, c=mcd_scores, cmap='viridis')
plt.title('MCD')

In [87]:
# NOTE(review): prints a fixed string and uses nothing from the analysis above;
# looks like a leftover kernel smoke test -- confirm and consider deleting the cell.
print("hi")