In [3]:
import pandas as pd
import numpy as np

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's 'ggplot' style sheet to the figures drawn after this point.
plt.style.use('ggplot')
from collections import Counter
import os

# Working directory of the kernel at the moment this cell is executed.
home_dir = os.getcwd()

In [4]:
def normalize_data(x):
    """Min-max scale ``x`` along axis 0 so each column spans ``[0, 1]``.

    The range is computed as ``max - min`` instead of calling the ``.ptp``
    method: NumPy 2.0 removed ``ndarray.ptp`` and current pandas objects do
    not offer it, so the method call fails on modern installations.

    Args:
        x: A NumPy array or pandas DataFrame/Series of numeric values.

    Returns:
        An object of the same kind as ``x`` holding
        ``(x - min) / (max - min)`` computed per column. Columns whose values
        are all identical (zero range) are mapped to ``0`` rather than
        producing NaN from a 0/0 division.
    """
    lowest = x.min(0)
    span = x.max(0) - lowest
    # Substitute 1 for a zero range: the numerator is already 0 there, so the
    # column becomes 0 instead of NaN.
    safe_span = np.where(span == 0, 1, span)
    return (x - lowest) / safe_span

In [5]:
def preprocess_data(data):
    """Recode the string markers in ``data`` as integers.

    ``"na"`` and ``"neg"`` are both replaced by ``0`` and ``"pos"`` by ``1``
    wherever they occur as whole values. ``replace`` returns a new object, so
    the caller's ``data`` is left untouched.

    Args:
        data: Object exposing a pandas-style ``replace`` method
            (a DataFrame in this notebook).

    Returns:
        The recoded copy of ``data``.
    """
    recoded = (
        data
        .replace("na", 0)    # missing-value marker -> 0
        .replace("neg", 0)   # negative class label -> 0
        .replace("pos", 1)   # positive class label -> 1
    )
    return recoded

In [6]:
def oversample(data, target, random_state=None):
    """Oversample the minority class with SMOTE and report the class counts.

    Written against the current imbalanced-learn API: the ``ratio`` argument
    was renamed to ``sampling_strategy`` and ``fit_sample`` to
    ``fit_resample``; the old spellings have been removed from recent
    releases.

    Args:
        data: Feature matrix, passed to SMOTE as-is.
        target: Class labels, one per row of ``data``.
        random_state: Optional seed forwarded to SMOTE so the synthetic
            samples can be reproduced. The default ``None`` leaves the
            sampler unseeded, as before.

    Returns:
        tuple: ``(X_sm, y_sm)``, the resampled features and labels.
    """
    smote = SMOTE(sampling_strategy='minority', random_state=random_state)
    X_sm, y_sm = smote.fit_resample(data, target)
    # Print the per-class sample counts after resampling, sorted by label.
    print(sorted(Counter(y_sm).items()))

    return X_sm, y_sm

In [7]:
def undersample(data, target):
    """Undersample the majority class with Tomek links and report the result.

    Written against the current imbalanced-learn API: ``ratio`` became
    ``sampling_strategy``, ``fit_sample`` became ``fit_resample``, and the
    ``return_indices`` flag was replaced by the fitted ``sample_indices_``
    attribute.

    Args:
        data: Feature matrix, passed to TomekLinks as-is.
        target: Class labels, one per row of ``data``.

    Returns:
        tuple: ``(X_tl, y_tl)``, the resampled features and labels.
    """
    tl = TomekLinks(sampling_strategy='majority')
    X_tl, y_tl = tl.fit_resample(data, target)
    # Print the per-class sample counts after resampling, sorted by label.
    print(sorted(Counter(y_tl).items()))
    # sample_indices_ holds the positions of the rows that were KEPT; the
    # removed rows are every other position in the original input.
    removed = np.setdiff1d(np.arange(len(target)), tl.sample_indices_)
    print('Removed indexes:', removed)

    return X_tl, y_tl

In [8]:
def combine_samplings(data, target, random_state=None):
    """Resample with SMOTETomek (SMOTE followed by Tomek-link cleaning).

    Written against the current imbalanced-learn API: the ``ratio`` argument
    was renamed to ``sampling_strategy`` and ``fit_sample`` to
    ``fit_resample``; the old spellings have been removed from recent
    releases.

    Args:
        data: Feature matrix, passed to SMOTETomek as-is.
        target: Class labels, one per row of ``data``.
        random_state: Optional seed forwarded to SMOTETomek so the result can
            be reproduced. The default ``None`` leaves the sampler unseeded,
            as before.

    Returns:
        tuple: ``(X_smt, y_smt)``, the resampled features and labels.
    """
    smt = SMOTETomek(sampling_strategy='auto', random_state=random_state)
    X_smt, y_smt = smt.fit_resample(data, target)
    # Print the per-class sample counts after resampling, sorted by label.
    print(sorted(Counter(y_smt).items()))

    return X_smt, y_smt

In [11]:
def prepare(dataset):
    """Split a raw dataset into its target column and its feature columns.

    The frame is first passed through ``preprocess_data``. The first column
    (by position) of the result is taken as the target; the features are the
    result with the column carrying the first label of ``dataset`` dropped.

    Args:
        dataset: DataFrame whose first column holds the class label.

    Returns:
        tuple: ``(dataset_y, dataset_x)`` with the target first and the
        features second.
    """
    cleaned = preprocess_data(dataset)
    label_name = dataset.columns[0]
    features = cleaned.drop(label_name, axis=1)
    labels = cleaned.iloc[:, 0]
    return labels, features

In [10]:
def plot_classes(data):
    """Draw a bar chart of how many rows fall into each class.

    A column labelled ``0`` is renamed to ``'class'`` before plotting; the
    chart is built from the ``'class'`` column of that renamed frame. The
    frame passed in is plotted directly rather than a notebook-level
    variable, so the function works on a fresh kernel.

    Args:
        data: DataFrame containing the class labels, either in a column
            labelled ``0`` or in one already named ``'class'``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    data = data.rename(columns={0: 'class'})
    # Keyword arguments: recent seaborn releases take ``data`` as the first
    # positional parameter, so a positional 'class' would be read as the data.
    sns.countplot(x='class', data=data)
    plt.title('Dataset: Anomaly (1) vs. Normal (0)')
    plt.show()