In [1]:
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from scipy.stats import gaussian_kde

from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

## Simulated data

In each dataset, the first 100 rows are normal grades and the last 10 rows are abnormal grades.

In [2]:
def make_simple_1(): #with obvious cheating
  np.random.seed(17)
  d = 50 + np.random.randint(10, size=(20, 6))
  c = 60 + np.random.randint(10, size=(20, 6))
  b = 70 + np.random.randint(10, size=(20, 6))
  a = 80 + np.random.randint(10, size=(20, 6))

  incr = 50 + np.random.randint(30, size=(20, 6))
  incr = np.sort(incr)

  quizzes = 50 + np.random.randint(10, size=(10, 5))
  final = 85 + np.random.randint(10, size=(10, 1))
  cheat = np.hstack([quizzes, final])

  grades = np.vstack([d, c, b, a, incr, cheat])
  return pd.DataFrame(grades, columns=['Q1', 'Q2', 'Midterm', 'Q3', 'Q4', 'Final'])

def make_simple_2(): #with less obvious cheating
  np.random.seed(17)
  d = 50 + np.random.randint(10, size=(20, 6))
  c = 60 + np.random.randint(10, size=(20, 6))
  b = 70 + np.random.randint(10, size=(20, 6))
  a = 80 + np.random.randint(10, size=(20, 6))

  incr = 50 + np.random.randint(30, size=(20, 6))
  incr = np.sort(incr )

  quizzes = 60 + np.random.randint(10, size=(10, 5))
  final = 80 + np.random.randint(10, size=(10, 1))
  cheat = np.hstack([quizzes, final])

  grades = np.vstack([d, c, b, a, incr, cheat])
  return pd.DataFrame(grades, columns=['Q1', 'Q2', 'Midterm', 'Q3', 'Q4', 'Final'])



def make_simple_3():# with less obvious cheating and incremental increase of grades
  np.random.seed(17)
  d = 50 + np.random.randint(10, size=(20, 6))
  c = 60 + np.random.randint(10, size=(20, 6))
  b = 70 + np.random.randint(10, size=(20, 6))
  a = 80 + np.random.randint(10, size=(20, 6))

  incr = 50 + np.random.randint(30, size=(10, 6))
  incr = np.sort(incr )

  increment = np.array([0, 8, 16, 24, 32, 36])
  base_grades = 50+np.random.randint(10, size=10)
  incr_2 = np.array([increment+grade for grade in base_grades])

  quizzes = 60 + np.random.randint(10, size=(10, 5))
  final = 80 + np.random.randint(10, size=(10, 1))
  cheat = np.hstack([quizzes, final])

  grades = np.vstack([d, c, b, a, incr, incr_2, cheat])
  return pd.DataFrame(grades, columns=['Q1', 'Q2', 'Midterm', 'Q3', 'Q4', 'Final'])

def make_simple_4():# with less obvious cheating and incremental increase of grades
  np.random.seed(17)
  d = 50 + np.random.randint(10, size=(25, 5))
  c = 60 + np.random.randint(10, size=(25, 5))
  b = 70 + np.random.randint(10, size=(25, 5))
  a = 80 + np.random.randint(10, size=(25, 5))

  final_a = 10 + a.mean(axis=1)
  final_b = 10 + b.mean(axis=1)
  final_c = 10 + c.mean(axis=1)
  final_d = 10 + d.mean(axis=1)

  quizzes_cheat = 60 + np.random.randint(10, size=(10, 5))
  final_cheat = 85 + np.random.randint(10, size=(10, 1))

  quizzes_all = np.vstack([d, c, b, a, quizzes_cheat])
  final_all = np.hstack([final_d, final_c, final_b, final_a, final_cheat.flatten()])

  grades = np.hstack([quizzes_all, final_all.reshape(-1,1)])

  return pd.DataFrame(grades, columns=['Q1', 'Q2', 'Midterm', 'Q3', 'Q4', 'Final'])

In [3]:
make_simple_1()

Unnamed: 0,Q1,Q2,Midterm,Q3,Q4,Final
0,51,56,56,59,50,56
1,54,57,54,57,51,51
2,59,58,52,53,56,56
3,59,59,51,55,51,50
4,55,56,56,52,56,59
...,...,...,...,...,...,...
105,53,50,59,54,51,92
106,55,57,54,57,51,86
107,58,57,54,54,56,85
108,56,57,56,56,58,93


##Anomaly detection

In [4]:
df_anomalies = pd.DataFrame(index=['DS1', 'DS2', 'DS3', 'DS4'], columns=['RobustCov', '1-SVM', 'IsoForest', 'LOF'])
outliers_fraction = 0.09
anomaly_algorithms = [
    ("RobustCov", EllipticEnvelope(contamination=outliers_fraction)),
    ("1-SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("IsoForest", IsolationForest(behaviour='new',
                                         contamination=outliers_fraction,
                                         random_state=42)),
    ("LOF", LocalOutlierFactor(
        n_neighbors=5, contamination=outliers_fraction))]


dataset_list = [('DS1', make_simple_1()), 
                ('DS2', make_simple_2()), 
                ('DS3', make_simple_3()),
                ('DS4', make_simple_4())]
for set_name, dataset in dataset_list:
  tp_list=[]
  df_grades = dataset        
  for name, algorithm in anomaly_algorithms:
    if name == "LOF":
      y_pred = algorithm.fit_predict(df_grades)
    else:
      y_pred = algorithm.fit(df_grades).predict(df_grades)

    lowest_10 = df_grades[y_pred==-1] # y_pred has 1 for an inlier and -1 for an outlier
    true_positive = np.sum([1 for i in lowest_10.index if i in range(100, 110)])
    tp_list.append(true_positive)

  df_anomalies.loc[set_name, :] = tp_list



In [5]:
df_anomalies

Unnamed: 0,RobustCov,1-SVM,IsoForest,LOF
DS1,10,3,2,0
DS2,4,6,0,0
DS3,0,5,0,0
DS4,0,4,0,2


###Illustrated example using LOF and make_simple_1

In [6]:
dataset = make_simple_1()
algorithm = LocalOutlierFactor(n_neighbors=5, contamination=0.09)
y_pred = algorithm.fit_predict(df_grades)

In [7]:
y_pred

array([-1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1, -1,  1,  1])

In [8]:
lowest_10 = df_grades[y_pred==-1]
lowest_10

Unnamed: 0,Q1,Q2,Midterm,Q3,Q4,Final
0,51.0,56.0,56.0,59.0,50.0,64.4
2,51.0,51.0,59.0,58.0,52.0,64.2
21,59.0,55.0,59.0,52.0,50.0,65.0
22,52.0,59.0,51.0,58.0,51.0,64.2
64,79.0,77.0,79.0,77.0,77.0,87.8
80,89.0,80.0,80.0,80.0,82.0,92.2
89,80.0,81.0,80.0,89.0,81.0,92.2
93,89.0,84.0,89.0,88.0,84.0,96.8
104,63.0,60.0,67.0,67.0,67.0,94.0
107,67.0,69.0,60.0,60.0,64.0,91.0


In [9]:
true_positive = np.sum([1 for i in lowest_10.index if i in range(100, 110)])
true_positive

2