In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Outlier Detection

In [None]:
# Load the student-performance table from the repo-relative datasets/ folder.
df = pd.read_csv('datasets/student_performance.csv')

# Preview five randomly drawn rows (unseeded, so the rows differ between runs).
df.sample(n=5)

In [None]:
# Scatter plot of the raw data. Everything is drawn through the Axes object
# created here, so the points, labels and grid share one explicit interface.
fig, ax = plt.subplots(figsize=(20, 10))

ax.scatter(df['Hours Studied'],
           df['Score Obtained'],
           color='black',
           s=100)

ax.set(xlabel='Hours Studied',
       ylabel='Score Obtained',
       title='Students Performance')

ax.grid()
plt.show()

In [None]:
# Number of rows for each distinct value of the 'Outliers' column.
df.groupby(by='Outliers').size()

In [None]:
# Feature matrix: every column of df except 'Outliers'.
X = df.drop(columns='Outliers')
X.head(n=5)

In [4]:
# Local Outlier Factor Method

from sklearn.neighbors import LocalOutlierFactor

In [3]:
# Local Outlier Factor detector; `contamination` is the proportion of samples
# the estimator is told to treat as outliers.
lof_settings = {'n_neighbors': 20, 'contamination': 0.2}
lof = LocalOutlierFactor(**lof_settings)

In [None]:
# Fit LOF on X and label every row in one step. The cells that follow treat
# a label of -1 as "outlier".
y_pred = lof.fit_predict(X)
y_pred

In [None]:
# Count the rows LOF labelled as outliers (-1).
n_outliers = np.sum(y_pred == -1)
n_outliers

In [None]:
# Colour lookup indexed by (label + 1) // 2: -1 -> index 0 -> 'r', +1 -> index 1 -> 'b'.
colors = np.array(['r', 'b'])
point_colors = colors[(y_pred + 1) // 2]

plt.figure(figsize=(12, 10))
plt.title('Local Outlier Factor', size=18)
plt.scatter(x=df['Hours Studied'], y=df['Score Obtained'], s=100, color=point_colors)
plt.show()

In [None]:
# Per-sample LOF scores stored on the fitted estimator. Per the scikit-learn
# docs these are the negated LOF values, so lower means more anomalous.
X_scores = lof.negative_outlier_factor_

X_scores

In [None]:
# Min-max rescale the scores, flipped so the lowest score maps to 1 and the
# highest score maps to 0.
score_max = X_scores.max()
score_span = score_max - X_scores.min()
radius = (score_max - X_scores) / score_span

In [None]:
# Draw a hollow red circle around every sample, sized by its rescaled LOF
# score (`radius`): the larger the circle, the lower (more anomalous) the score.
plt.figure(figsize=(12, 10))

plt.title('Local Outlier Factor Scores', size=18)

plt.scatter(df['Hours Studied'],
            df['Score Obtained'],
            s=1000*radius,
            edgecolors='r',
            facecolors='none',
            label='Outlier scores')

# The `label` above is only rendered when a legend is drawn; markerscale
# shrinks the legend marker, which would otherwise inherit the large circles.
plt.legend(loc='upper left', markerscale=0.3)

plt.show()

In [6]:
# Isolation Forest Method

from sklearn.ensemble import IsolationForest

In [8]:
# Seed the forest: its trees are built from random splits, so without a fixed
# random_state the flagged points (and the plots derived from them) can change
# on every run.
isf = IsolationForest(contamination=0.19, random_state=42)

In [None]:
# Fit the Isolation Forest on X and label every row. This overwrites the
# earlier y_pred; the cells that follow treat -1 as "outlier".
y_pred = isf.fit_predict(X)
y_pred

In [None]:
# Count the rows the Isolation Forest labelled as outliers (-1).
n_outliers = np.sum(y_pred == -1)
n_outliers

In [None]:
# Colour lookup indexed by (label + 1) // 2: -1 -> index 0 -> 'r', +1 -> index 1 -> 'b'.
colors = np.array(['r', 'b'])
point_colors = colors[(y_pred + 1) // 2]

plt.figure(figsize=(12, 10))
plt.title('Isolation Forest Predictions', size=18)
plt.scatter(x=df['Hours Studied'], y=df['Score Obtained'], s=100, color=point_colors)
plt.show()

In [None]:
# Build a 500 x 500 evaluation grid: x spans [0, 1010], y spans [0, 102].
x_axis = np.linspace(0, 1010, 500)
y_axis = np.linspace(0, 102, 500)
xx, yy = np.meshgrid(x_axis, y_axis)

# Show the flattened coordinate arrays that are later paired up into grid points.
print('xx = ', xx.ravel())
print('yy = ', yy.ravel())

In [None]:
# Pair the flattened grids column-wise: one (x, y) row per grid point.
grid_x = xx.ravel()
grid_y = yy.ravel()
np.c_[grid_x, grid_y]

In [None]:
# Classify every grid point. The grid is wrapped in a DataFrame carrying X's
# column labels because the forest was fitted on a DataFrame; passing a bare
# ndarray makes recent scikit-learn versions emit a feature-name warning.
# Column order is positional, exactly as in the ndarray version.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X.columns)
Z = isf.predict(grid)
Z = Z.reshape(xx.shape)  # back to the 2-D grid layout expected by plt.contour
Z

In [None]:
# Isolation Forest predictions with the inlier/outlier boundary overlaid.
plt.figure(figsize=(12, 10))
plt.title('IsolationForest', size=18)

# Same colour mapping as the earlier prediction plots (relies on `colors`).
point_colors = colors[(y_pred + 1) // 2]
plt.scatter(x=df['Hours Studied'], y=df['Score Obtained'], s=100, color=point_colors)

# Z holds -1/+1 labels on the grid, so the 0 level traces where the label flips.
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
plt.show()

In [9]:
# Elliptic Envelope Method
from sklearn.covariance import EllipticEnvelope

In [10]:
# Elliptic Envelope detector configured with a support fraction of 1.0 and a
# contamination (proportion of outliers) of 0.19.
ee_settings = {'support_fraction': 1., 'contamination': 0.19}
ee = EllipticEnvelope(**ee_settings)

In [None]:
# Fit the Elliptic Envelope on X and label every row. This overwrites the
# earlier y_pred; the cells that follow treat -1 as "outlier".
y_pred = ee.fit_predict(X)
y_pred

In [None]:
# Count the rows the Elliptic Envelope labelled as outliers (-1).
n_outliers = np.sum(y_pred == -1)
n_outliers

In [None]:
# Evaluation grid for the Elliptic Envelope boundary: x spans [0, 1010],
# y spans [-5, 120], 500 points per axis.
xx, yy = np.meshgrid(np.linspace(0, 1010, 500),
                     np.linspace(-5, 120, 500))

# The envelope was fitted on a DataFrame, so the grid is given X's column
# labels; a bare ndarray makes recent scikit-learn versions emit a
# feature-name warning. Column order is positional, as in the ndarray version.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X.columns)
Z = ee.predict(grid)
Z = Z.reshape(xx.shape)  # back to the 2-D grid layout expected by plt.contour

In [None]:
# Elliptic Envelope predictions with the inlier/outlier boundary overlaid.
plt.figure(figsize=(12, 10))
plt.title('Elliptic Envelope', size=18)

# Same colour mapping as the earlier prediction plots (relies on `colors`).
point_colors = colors[(y_pred + 1) // 2]
plt.scatter(x=df['Hours Studied'], y=df['Score Obtained'], s=100, color=point_colors)

# Z holds -1/+1 labels on the grid, so the 0 level traces where the label flips.
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
plt.show()

## Novelty Detection