***
# <font color=blue>UNSUPERVISED LEARNING</font>
# <font color=blue>Practice with Anomaly Detection</font>
# <font color=blue>(student version)</font>
<div style="text-align: right"><font color=magenta>Andrea De Simone</font></div>
***

In [None]:
import numpy as np  
import pandas as pd  
from scipy.stats import multivariate_normal  
import matplotlib.pyplot as plt  
%matplotlib inline
from sklearn.neighbors import NearestNeighbors

***
# 1. Model-Based Anomaly Detection

## 1.1 Load Dataset 2 (training data)

In [None]:
# Load the training data from a comma-separated text file into a NumPy array
X_tr = np.loadtxt('dataset2_tr.csv', delimiter=',')
# Print the array dimensions as a quick check of what was loaded
print(X_tr.shape)

In [None]:
def scatter_plot(X):
    """
    Show a scatter plot of the first two columns of X.

    Column 0 is drawn on the horizontal axis (labelled x_1) and column 1
    on the vertical axis (labelled x_2). The figure is displayed
    immediately; nothing is returned.
    """
    figure, axes = plt.subplots()

    # Semi-transparent crosses keep overlapping points distinguishable
    axes.scatter(X[:, 0], X[:, 1], c='blue', alpha=0.5, marker='x')

    for set_label, text in ((axes.set_xlabel, '$x_1$'),
                            (axes.set_ylabel, '$x_2$')):
        set_label(text, size=16)

    plt.show()

In [None]:
# Visualise the training points before fitting any model
scatter_plot(X_tr)

## 1.2 Gaussian Model

In [None]:
# Gaussian parameters estimated from the training data
# Mean of each column (axis=0 averages over the rows)
mu = np.mean(X_tr, axis=0)
# np.cov treats each row as a variable by default, hence the transpose
sigma = np.cov(X_tr.T)

print("mean =", mu)
print("covariance = \n",sigma)

# Density of every training point under the fitted Gaussian
p = multivariate_normal.pdf(X_tr,mean=mu, cov=sigma)

In [None]:
# Plot gaussian levels
fig,ax=plt.subplots()

# Regular grid over the hard-coded square [4, 25) x [4, 25) with spacing 0.1
xx, yy = np.meshgrid(np.arange(4, 25, .1),
                     np.arange(4, 25, .1))
# Flatten the grid into one (x, y) pair per row so the pdf can be evaluated
XYpoints = np.c_[xx.ravel(), yy.ravel()] 
Z =  multivariate_normal.pdf(XYpoints,mean=mu, cov=sigma)
# Back to the grid shape expected by contour()
Z = Z.reshape(xx.shape)

# Three iso-density lines, one line style per level
ax.contour(xx, yy, Z, levels=[1e-7,1e-5,1e-3], 
           colors='green',
           linestyles= ['dotted','dashed','solid'])
plt.show()

## 1.3  Outliers with $p<\epsilon$

### <font color='magenta'>>>> Q1: Complete the function 'gaussian_outliers'. Find number and coordinates of outliers with $p<10^{-5}$</font>

In [None]:
def gaussian_outliers(X, p, epsilon):
    """
    Find coordinates of outliers with p<epsilon
    (where p contains the probability density of each point of the set X)

    Parameters
    ----------
    X : array whose rows are the points.
    p : array with one density value per row of X, in the same order.
    epsilon : density threshold; points with p strictly below it are
        reported as outliers.

    Returns
    -------
    Array containing the rows of X whose density is below epsilon
    (zero rows if no point qualifies).

    The number of outliers and their coordinates are also printed.
    """
    # Start Edit
    # Boolean mask over the points: True where the density is below threshold
    is_outlier = np.asarray(p) < epsilon
    outliers = np.asarray(X)[is_outlier]

    print("Number of outliers = ", outliers.shape[0])
    print("Coordinates of outliers = \n", outliers)
    # End Edit

    return outliers

In [None]:
# Find outliers: training points whose density p is below epsilon
epsilon = 1e-5
outliers = gaussian_outliers(X_tr, p, epsilon)


In [None]:
def plot_outliers_with_contour(X, outliers, eps):
    """
    Plot the points of X, mark the outliers and draw the density contour
    at level eps.

    A Gaussian is fitted to X itself (mean and covariance of its columns)
    and evaluated on a regular grid that spans the range of the first two
    columns of X, extended by 1 on every side, with spacing 0.1.

    X        : array of points; columns 0 and 1 are plotted (blue crosses).
    outliers : array of points drawn on top as red squares.
    eps      : density value of the single dashed green contour line.
    """
    margin = 1
    spacing = .1

    # Gaussian fitted to the data being plotted
    mean_vec = np.mean(X, axis=0)
    cov_mat = np.cov(X.T)

    figure, axes = plt.subplots()

    first, second = X[:, 0], X[:, 1]
    ticks_1 = np.arange(first.min() - margin, first.max() + margin, spacing)
    ticks_2 = np.arange(second.min() - margin, second.max() + margin, spacing)
    grid_1, grid_2 = np.meshgrid(ticks_1, ticks_2)

    # Evaluate the density on the flattened grid, then restore the grid shape
    grid_points = np.c_[grid_1.ravel(), grid_2.ravel()]
    density = multivariate_normal.pdf(grid_points, mean=mean_vec, cov=cov_mat)
    density = density.reshape(grid_1.shape)

    axes.contour(grid_1, grid_2, density, levels=[eps],
                 colors='green', linestyles='dashed')
    axes.plot(first, second, marker='x', c='blue', linestyle='None')
    axes.plot(outliers[:, 0], outliers[:, 1],
              marker='s', c='red', linestyle='None')
    axes.set_xlabel('$x_1$', size=16)
    axes.set_ylabel('$x_2$', size=16)
    plt.show()

In [None]:
# Show the training points, the detected outliers and the p = epsilon contour
plot_outliers_with_contour(X_tr, outliers, epsilon)

### <font color='magenta'>>>> Q2: find number and coordinates of outliers with $p<10^{-2}$ and $p<10^{-8}$</font>

In [None]:
# Start Edit
# Looser threshold: every training point with density below 1e-2 is an outlier
epsilon = 1e-2

# p holds the density of each training point under the fitted Gaussian
outliers = gaussian_outliers(X_tr, p, epsilon)
# End Edit

plot_outliers_with_contour(X_tr, outliers, epsilon)

In [None]:
# Start Edit
# Stricter threshold: only training points with density below 1e-8 are outliers
epsilon = 1e-8

# p holds the density of each training point under the fitted Gaussian
outliers = gaussian_outliers(X_tr, p, epsilon)
# End Edit

plot_outliers_with_contour(X_tr, outliers, epsilon)

## 1.4 Load Dataset 2 (validation data)

In [None]:
# Load CV (cross-validation) data; this is a different set from X_tr
X_cv = np.loadtxt('dataset2_cv.csv', delimiter=',')
# Labels for the CV points, read from a separate file
Y_cv = np.loadtxt('dataset2_cv_labels.csv')
# Peek at the first labels and check that both arrays have matching lengths
print(Y_cv[:10])
print(X_cv.shape)
print(Y_cv.shape)

## 1.5 Optimal threshold $\epsilon$

In [None]:
# Mean and covariance of the validation set, kept for inspection only
mu_cv = np.mean(X_cv, axis=0)
sigma_cv = np.cov(X_cv.T)

# Score the validation points under the Gaussian fitted on the training data
# (mu, sigma). The threshold chosen from these densities is later applied to
# densities computed with that same training model, so both must share one
# model; re-fitting on the labelled validation set would also let its
# anomalies distort the estimated mean and covariance.
p_cv = multivariate_normal.pdf(X_cv, mean=mu, cov=sigma)

### <font color='magenta'>>>> Q3: complete function 'select_epsilon' by defining the variables 'tp' (True Positives), 'fp' (False Positives) and 'fn' (False Negatives) </font>

In [None]:
def select_epsilon(prob_cv, y_cv):
    """
    Select the density threshold epsilon that maximises the F1-score.

    Candidate thresholds are scanned from the smallest to the largest value
    of prob_cv in steps of 1/1000 of that range. For each candidate, points
    with density strictly below it are predicted to be anomalies and the
    predictions are scored against y_cv.

    Parameters
    ----------
    prob_cv : array with the probability density of each validation point.
    y_cv : array of true labels in the same order as prob_cv
        (1 = anomaly, 0 = normal).

    Returns
    -------
    (best_epsilon, best_f1) : the first threshold reaching the highest
        F1-score, and that score. (0, 0) is returned when no candidate
        yields a positive F1-score, including the case where all densities
        are equal and there is no range to scan.
    """
    prob_cv = np.asarray(prob_cv)
    labels = np.asarray(y_cv)
    # The label masks do not depend on the threshold: build them once
    is_anomaly = labels == 1
    is_normal = labels == 0

    best_epsilon = 0
    best_f1 = 0

    stepsize = (prob_cv.max() - prob_cv.min()) / 1000
    if stepsize == 0:
        # All densities are identical; np.arange cannot work with a zero step
        return (best_epsilon, best_f1)

    for epsilon in np.arange(prob_cv.min(), prob_cv.max(), stepsize):

        preds = (prob_cv < epsilon)

        # Start Edit
        # True Positives (predicted 1, actual 1)
        tp = np.count_nonzero(preds & is_anomaly)
        # False Positives (predicted 1, actual 0)
        fp = np.count_nonzero(preds & is_normal)
        # False Negatives (predicted 0, actual 1)
        fn = np.count_nonzero(~preds & is_anomaly)
        # End Edit

        # Precision (defined as 0 when nothing is predicted positive)
        if (tp + fp) != 0:
            precision = tp / (tp + fp)
        else:
            precision = 0

        # Recall (defined as 0 when there are no actual positives)
        if (tp + fn) != 0:
            recall = tp / (tp + fn)
        else:
            recall = 0

        # F1-score
        if (precision + recall) != 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0

        # Strict comparison: on ties the earliest (smallest) threshold is kept
        if f1 > best_f1:
            best_epsilon = epsilon
            best_f1 = f1

    return (best_epsilon, best_f1)

In [None]:
# Choose the threshold that gives the best F1-score on the labelled CV data
optimal_epsilon, f1score = select_epsilon(p_cv, Y_cv)
print("optimal epsilon=",optimal_epsilon)
print("optimal f1-score =",f1score)

In [None]:
# Apply optimal epsilon to unlabelled (training) data

# Re-fit the Gaussian on the training set and recompute the training densities
mu = np.mean(X_tr, axis=0)
sigma = np.cov(X_tr.T)
p = multivariate_normal.pdf(X_tr,mean=mu, cov=sigma)

# Training points whose density falls below the selected threshold
outliers = gaussian_outliers(X_tr, p, optimal_epsilon)

plot_outliers_with_contour(X_tr, outliers, optimal_epsilon)

***
# 2. Distance-Based Anomaly Detection

## 2.1 Compute Nearest Neighbors distances

In [None]:
# Number of neighbours used for the distance-based score
K=5

nbrs = NearestNeighbors(n_neighbors=K,algorithm='kd_tree').fit(X_tr)

# K+1 neighbours are requested because the query points are the fitted points
# themselves, so each point is expected to come back as its own nearest
# neighbour (distance 0). The neighbour indices are not needed here.
dist, _ = nbrs.kneighbors(X_tr,  n_neighbors = K+1, return_distance=True)

# Distances for the first five points
print(dist[:5])

In [None]:
# Last of the K+1 returned columns; with the point itself in the first column
# this is the distance to the Kth other point
dist_Kth = dist[:,-1] # distances to the Kth NN

In [None]:
# Append kth-NN distance to point coordinates (3rd dimension)
# dist_Kth is turned into a column vector so it can be joined along axis 1
X_ext = np.concatenate((X_tr, dist_Kth[:,np.newaxis]),axis=1)
# First five rows: coordinates followed by the kth-NN distance
print(X_ext[:5])

In [None]:
def scatter_plot_with_distances(X, dist):
    """
    Scatter plot of the first two columns of X, coloured by dist.

    X    : coordinate array; column 0 goes on the horizontal axis and
           column 1 on the vertical axis.
    dist : array of distances to the Kth nearest neighbour, one per point,
           mapped to colours with the 'brg' colormap and explained by a
           colorbar.

    The figure is displayed immediately; nothing is returned.
    """
    figure, axes = plt.subplots()

    # Identical tick styling on both axes
    tick_style = dict(labelsize=12, direction='inout', length=6, width=1,
                      color='gray')
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_tick_params(**tick_style)

    axes.set_xlabel('$x_1$', size=16)
    axes.set_ylabel('$x_2$', size=16)

    points = axes.scatter(X[:, 0], X[:, 1], c=dist, cmap='brg', alpha=.5,
                          marker='x', s=100)
    colour_bar = plt.colorbar(points)
    colour_bar.set_label('distance to $k^{th}$ nearest neighbor')
    plt.show()

In [None]:
# Colour each training point by the last column of dist (its Kth-NN distance)
scatter_plot_with_distances(X_tr, dist[:,-1])

In [None]:
# Sort from smallest to largest kNN distance
# argsort on the last column gives the row order; whole rows are reordered
X_ext_sorted = X_ext[X_ext[:,-1].argsort()]
# Five rows with the smallest kNN distance ...
print(X_ext_sorted[:5])
print("")
# ... and five rows with the largest
print(X_ext_sorted[-5:])


print("\nhighest kNN distance:             ", X_ext_sorted[-1])
print("second to highest kNN distance: ", X_ext_sorted[-2])

## 2.2 Top n outliers

### <font color='magenta'>>>> Q4: Find coordinates of top 5 outliers (the 5 points with the largest kNN distances). Find the coordinates of the 'normal' (not anomalous) points. </font>

In [None]:
# Select top n outliers
n_outliers = 5

# Start Edit
# X_ext_sorted is in ascending order of kNN distance, so the last n_outliers
# rows are the most isolated points. An explicit split index is used because
# a slice of the form [-n:] would select every row when n is 0.
split = X_ext_sorted.shape[0] - n_outliers

# Keep only the coordinate columns (the last column is the kNN distance)
outliers = X_ext_sorted[split:, :-1]
normal = X_ext_sorted[:split, :-1]
# End Edit

In [None]:
def plot_outliers(normal, outliers):
    """
    Plot normal points and outliers on the same axes.

    normal   : array of points drawn as blue crosses.
    outliers : array of points drawn as red squares.

    Only columns 0 and 1 of each array are used. The figure is displayed
    immediately; nothing is returned.
    """
    figure, axes = plt.subplots()

    # Normal points first, outliers second, so the outliers end up on top
    groups = ((normal, 'x', 'blue'), (outliers, 's', 'red'))
    for pts, symbol, colour in groups:
        axes.plot(pts[:, 0], pts[:, 1], marker=symbol, c=colour,
                  linestyle='None')

    axes.set_xlabel('$x_1$', size=16)
    axes.set_ylabel('$x_2$', size=16)
    plt.show()

In [None]:
# Report how many points were selected as outliers, then plot both groups
print("Number of outliers =", outliers.shape[0])
plot_outliers(normal,outliers)

## 2.3 Outliers with KNN distance > threshold

### <font color='magenta'>>>> Q5: Complete the function 'KNN_outliers'. Find coordinates of points whose Kth-NN distance > threshold (outliers) and Kth-NN distance $\leq$ threshold (normal points).</font>

In [None]:
def KNN_outliers(x_sorted, threshold):
    """
    Given array 'x_sorted' of coordinates and KNN-distances 
    (sorted in ascending order of KNN distance),
    find coordinates of outliers with KNN-distance > threshold
    and coordinates of normal points with KNN-distance <= threshold

    Parameters
    ----------
    x_sorted : 2-D array; the last column holds the KNN distance and the
        preceding columns hold the point coordinates.
    threshold : distance above which a point is considered an outlier.

    Returns
    -------
    (normal, outliers) : two arrays of coordinates (distance column
        dropped). Either one has zero rows if no point falls in that group.

    The number of outliers and their coordinates are also printed.
    """

    # Start Edit
    x_sorted = np.asarray(x_sorted)

    # A boolean mask on the distance column; it gives the right split
    # whether or not the rows are actually sorted
    is_outlier = x_sorted[:, -1] > threshold

    outliers = x_sorted[is_outlier, :-1]
    normal = x_sorted[~is_outlier, :-1]

    print("Number of outliers = ", outliers.shape[0])
    print("Coordinates of outliers = \n", outliers)
    # End Edit

    return (normal, outliers)

In [None]:
# Select outliers with KNN-distance > 1
threshold = 1

# Split the training points (sorted by kNN distance) at the threshold
normal, outliers = KNN_outliers(X_ext_sorted, threshold)

plot_outliers(normal,outliers)

## 2.4 Optimal threshold $thr$

In [None]:
# Use CV dataset
# NOTE(review): distances here are measured among the CV points themselves,
# while the threshold chosen from them is later applied to distances measured
# among the training points - confirm that this is intended.

K=5

nbrs = NearestNeighbors(n_neighbors=K,algorithm='kd_tree').fit(X_cv)

# K+1 neighbours are requested because the query points are the fitted points
# themselves, so each point is expected to come back as its own nearest
# neighbour (distance 0)
dist, _ = nbrs.kneighbors(X_cv,  n_neighbors = K+1, return_distance=True)
dist_Kth = dist[:,-1] # distances to the Kth NN

# Append kth-NN distance to point coordinates (3rd dimension)
X_cv_ext = np.concatenate((X_cv, dist_Kth[:,np.newaxis]),axis=1)

# Sort from smallest to largest kNN distance
# (this reorders the rows, so they no longer follow the order of Y_cv)
X_cv_ext_sorted = X_cv_ext[X_cv_ext[:,-1].argsort()]

### <font color='magenta'>>>> Q6: complete function 'select_threshold' by defining the variables 'tp' (True Positives), 'fp' (False Positives) and 'fn' (False Negatives). [Same as you did for 'select_epsilon' in Q3] </font>

In [None]:
def select_threshold(x_sorted, y_cv):
    """
    Select the KNN-distance threshold that maximises the F1-score.

    Candidate thresholds are scanned from the smallest to the largest value
    of the last column of x_sorted in steps of 1/1000 of that range. For
    each candidate, points whose distance is strictly above it are
    predicted to be anomalies and the predictions are scored against y_cv.

    Parameters
    ----------
    x_sorted : 2-D array whose last column holds the KNN distance of each
        point. Row i must describe the same point as y_cv[i]; the rows do
        not need to be sorted, and an array that was reordered without
        reordering the labels in the same way gives meaningless scores.
    y_cv : array of true labels (1 = anomaly, 0 = normal).

    Returns
    -------
    (best_threshold, best_f1) : the first threshold reaching the highest
        F1-score, and that score. (0, 0) is returned when no candidate
        yields a positive F1-score, including the case where all distances
        are equal and there is no range to scan.
    """
    distances = np.asarray(x_sorted)[:, -1]
    labels = np.asarray(y_cv)
    # The label masks do not depend on the threshold: build them once
    is_anomaly = labels == 1
    is_normal = labels == 0

    best_threshold = 0
    best_f1 = 0

    stepsize = (distances.max() - distances.min()) / 1000
    if stepsize == 0:
        # All distances are identical; np.arange cannot work with a zero step
        return (best_threshold, best_f1)

    for threshold in np.arange(distances.min(), distances.max(), stepsize):

        preds = (distances > threshold)

        # Start Edit
        # True Positives (predicted 1, actual 1)
        tp = np.count_nonzero(preds & is_anomaly)
        # False Positives (predicted 1, actual 0)
        fp = np.count_nonzero(preds & is_normal)
        # False Negatives (predicted 0, actual 1)
        fn = np.count_nonzero(~preds & is_anomaly)
        # End Edit

        # Precision (defined as 0 when nothing is predicted positive)
        if (tp + fp) != 0:
            precision = tp / (tp + fp)
        else:
            precision = 0

        # Recall (defined as 0 when there are no actual positives)
        if (tp + fn) != 0:
            recall = tp / (tp + fn)
        else:
            recall = 0

        # F1-score
        if (precision + recall) != 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0

        # Strict comparison: on ties the earliest (smallest) threshold is kept
        if f1 > best_f1:
            best_threshold = threshold
            best_f1 = f1

    return (best_threshold, best_f1)

In [None]:
# Pass the unsorted array: its rows are in the same order as the labels in
# Y_cv. X_cv_ext_sorted has been reordered by kNN distance, so its rows no
# longer line up with Y_cv and would be scored against the wrong labels.
optimal_threshold, f1score = select_threshold(X_cv_ext, Y_cv)
print("optimal threshold=", optimal_threshold)
print("optimal f1-score =", f1score)

In [None]:
# Select outliers with KNN-distance > optimal threshold
threshold = optimal_threshold

# Apply the threshold selected on the CV data to the training points
normal, outliers = KNN_outliers(X_ext_sorted, threshold)

plot_outliers(normal,outliers)