# Detecting anomalies in data

Models to implement:
1. Local Outlier Factor (LOF)
2. Isolation Forest
3. Autoencoders


In [None]:
def plot_X_ytest_ypred(X_test, y_test, y_pred):
    """Scatter-plot every column of ``X_test`` and highlight flagged rows.

    One subplot is drawn per column on a three-column grid. Each subplot
    shows all rows against the frame's index, then overlays the rows whose
    prediction equals 1 (orange) and the rows whose true label equals 1 (red).

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature frame; one subplot is drawn per column.
    y_test : array-like
        True labels, positionally aligned with the rows of ``X_test``.
        Entries equal to 1 are highlighted.
    y_pred : array-like
        Predicted labels, same convention as ``y_test``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    import matplotlib.gridspec as gridspec

    columns = X_test.columns
    if len(columns) == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    cols = 3
    rows = int(np.ceil(len(columns) / cols))
    grid = gridspec.GridSpec(rows, cols)

    # Width follows the number of grid columns and height the number of grid
    # rows, so a frame with few features is not squeezed into a narrow figure.
    plt.figure(figsize=(cols * 6, rows * 3))

    # The positions of the positive rows are the same for every feature, so
    # they are looked up once instead of once per subplot.
    pos_pred = np.flatnonzero(np.asarray(y_pred) == 1)
    pos_test = np.flatnonzero(np.asarray(y_test) == 1)
    pred_rows = X_test.iloc[pos_pred]
    test_rows = X_test.iloc[pos_test]

    for n, c in enumerate(columns):
        ax = plt.subplot(grid[n])
        sns.scatterplot(x=X_test.index, y=X_test[c], alpha=0.5, ax=ax, label='test_data')
        sns.scatterplot(x=pred_rows.index, y=pred_rows[c], color='orange', alpha=0.7, ax=ax, label='pred')
        sns.scatterplot(x=test_rows.index, y=test_rows[c], color='r', s=10, ax=ax, label='test')
        ax.set_title(str(c))
        ax.set_xlabel('')
        ax.set_ylabel('')
        if n > 0:
            # Only the first subplot keeps its legend; the others would repeat it.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()
    plt.tight_layout()
    plt.show()

In [None]:
def get_metrics(clf, y_true, y_pred):
    """Collect classification metrics for one model into a dictionary.

    Parameters
    ----------
    clf : object
        The model the predictions belong to; stored unchanged under ``"model"``.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"model"``, ``"accuracy_score"``, ``"confusion_matrix"``,
        ``"precision"``, ``"recall_score"``, ``"f1_score"`` and
        ``"classification_report"`` (the latter as a nested dict). Every
        metric is computed with scikit-learn's default arguments.
    """
    # Only the metrics that are actually reported are imported.
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    return {
        "model": clf,
        "accuracy_score": accuracy_score(y_true, y_pred),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall_score": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "classification_report": classification_report(y_true, y_pred, output_dict=True),
    }

## 1. Local Outlier Factor

- https://towardsdatascience.com/local-outlier-factor-lof-algorithm-for-outlier-identification-8efb887d9843
- https://towardsdatascience.com/novelty-detection-with-local-outlier-factor-4867e89e4f91

##### LOF
- LOF(k) ≈ 1: similar density to its neighbors.
- LOF(k) < 1: higher density than its neighbors (inlier, not an outlier).
- LOF(k) > 1: lower density than its neighbors (outlier).

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
%%time
# Build a Local Outlier Factor model whose contamination is set to PercFraud.
LOF = LocalOutlierFactor(contamination = PercFraud)
# Fit on the feature columns of data1 and label the same rows in one call;
# the next cell treats a label of -1 as "outlier".
y_pred = LOF.fit_predict(data1[X_cols])


In [None]:
# Store the LOF verdict as a 0/1 flag column: 1 where the label is -1, else 0.
data1["LocalOutlierFactor"] = [int(label == -1) for label in y_pred]

In [None]:
# Show the LocalOutlierFactor estimator's repr as the cell output.
LOF

In [None]:
# Recode the LOF labels to the 0/1 convention used for y_train:
# -1 (outlier) becomes 1, everything else (inlier) becomes 0.
y_pred = np.where(np.asarray(y_pred) == -1, 1, 0)
# Number of rows where the LOF label disagrees with the ground truth.
n_errors = (y_pred != y_train).sum()
# Read the scores from `LOF`, the estimator that produced y_pred above;
# `mod_lof` is not defined in any visible cell of this notebook.
X_scores = LOF.negative_outlier_factor_


In [None]:
# Class balance of the ground truth first, then of the LOF predictions.
print(
    *(dict(pd.DataFrame(labels).value_counts()) for labels in (y_train, y_pred)),
    sep="\n",
)

In [None]:
%%time
plot_X_ytest_ypred(X_test=pd.DataFrame(X_train,columns=X_cols), y_test=y_train, y_pred=y_pred)

In [None]:
# Attach the labels by position. Assigning plain arrays avoids the index
# alignment that pd.concat(axis=1) performs, which would misalign the rows if
# X_train carries a non-default index, and avoids renaming by integer column
# keys, which could also hit feature columns named 0 or 1.
X_ = pd.DataFrame(X_train, columns=X_cols)
X_["y_true"] = np.asarray(y_train)
X_["y_pred"] = np.asarray(y_pred)
# Confusion table: true labels in rows, LOF predictions in columns.
pd.crosstab(X_["y_true"], X_["y_pred"])

## 2. Isolation Forest

In [None]:
# Imported here, next to its first use, mirroring the LocalOutlierFactor cell;
# no visible cell of this notebook imports IsolationForest.
from sklearn.ensemble import IsolationForest

# Fixed seed so the fitted forest is reproducible between runs.
mod_iforest = IsolationForest(random_state=123)
mod_iforest.fit(X_train)
# Labels for the training rows; the next cell treats -1 as "anomaly".
anom = mod_iforest.predict(X_train)

In [None]:
# Feature frame plus the true label and a 0/1 IsolationForest flag
# (1 where the model returned -1, else 0).
X_ = pd.DataFrame(X_train, columns=X_cols).assign(
    y_true=y_train,
    IsolationForest=[int(label == -1) for label in anom],
)
# Confusion table: true labels in rows, IsolationForest flags in columns.
pd.crosstab(X_["y_true"], X_["IsolationForest"])

In [None]:
%%time
plot_X_ytest_ypred(X_test=pd.DataFrame(X_train,columns=X_cols), y_test=y_train, y_pred=y_pred)

In [None]:
# Novelty-mode LOF: fit on the normal rows only, then label both groups.
# NOTE(review): this rebinds LOF, replacing the estimator fitted earlier.
LOF = LocalOutlierFactor(novelty=True).fit(normal_data)
normal_lof, fraud_lof = (LOF.predict(rows) for rows in (normal_data, fraud_data))

In [None]:
def calc_accuracy(normal, fraud):
    """Return the per-class hit rate of a one-class detector.

    Parameters
    ----------
    normal : array-like
        Predictions for the rows known to be normal; a value of 1 counts
        as a correct prediction.
    fraud : array-like
        Predictions for the rows known to be fraud; a value of -1 counts
        as a correct prediction.

    Returns
    -------
    dict
        ``{0: share of `normal` equal to 1, 1: share of `fraud` equal to -1}``.

    Raises
    ------
    ZeroDivisionError
        If either input is empty.
    """
    import numpy as np

    # Converting up front lets plain lists and pandas Series be passed as
    # well as NumPy arrays.
    normal = np.asarray(normal)
    fraud = np.asarray(fraud)
    return {
        0: np.count_nonzero(normal == 1) / len(normal),
        1: np.count_nonzero(fraud == -1) / len(fraud),
    }

In [None]:
# Per-class hit rates of both detectors, one line per model.
print(
    f"ACC IsolationForest:    {calc_accuracy(normal_isf,fraud_isf)}",
    f"ACC LocalOutlierFactor: {calc_accuracy(normal_lof,fraud_lof)}",
    sep="\n",
)

In [None]:
# Count how many IsolationForest predictions for the normal rows equal 1.
values = normal_isf
tp = sum(1 for pred in values if pred == 1)
tp

In [None]:
# Keep the row count in `total`: the accuracy cell below divides by it, and
# no other visible cell assigns it.
total = values.shape[0]
total


In [None]:
# Share of matching predictions, rounded to four decimal places.
accuracy = np.round(tp / total, decimals=4)

In [None]:

# NOTE(review): normal_accuracy / fraud_accuracy are not defined in the
# visible cells of this notebook; confirm where they come from.
in_accuracy_isf, out_accuracy_isf = normal_accuracy(normal_isf), fraud_accuracy(fraud_isf)
print("Accuracy in Detecting Normal Cases:", in_accuracy_isf)
print("Accuracy in Detecting Fraud Cases:", out_accuracy_isf)

