In [1]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

# 0.1 - General Purpose Libraries/Modules

# for linear algebra
import numpy as np
# for data processing and file I/O
import pandas as pd
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# for measuring latency
import time

# for data preparation
from sklearn.model_selection import train_test_split
# for scaling
from sklearn.preprocessing import StandardScaler
# for balancing the loss function (the dataset is imbalanced)
from sklearn.utils.class_weight import compute_class_weight
from xgboost import  XGBClassifier
from sklearn.ensemble import RandomForestClassifier
# for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# 0.2 - The CNN architecture is implemented via keras

# basic configuration
import tensorflow as tf
from tensorflow import keras

# for the implementation of the models
from keras import layers, Sequential

# for readability purposes
from keras.models import Model
from keras.layers import Dense, Reshape, Concatenate, Conv2D, GlobalAveragePooling2D
from keras.layers import Input,BatchNormalization, MaxPooling2D, Flatten, Dropout

# for the customization of the model and the training process
from keras.callbacks import EarlyStopping, ReduceLROnPlateau


In [3]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None,
                         val_size=0.2, test_size=0.2):
    """Split a DataFrame into train, validation and test subsets.

    The split is done in two passes of ``train_test_split``: first the
    combined validation+test hold-out is carved off ``df``, then that
    hold-out is divided into validation and test. With the default sizes
    this is a 60/20/20 split.

    Args:
        df: DataFrame to split.
        rstate: Seed forwarded as ``random_state`` to both passes.
        shuffle: Forwarded as ``shuffle`` to both passes.
        stratify: Column label (or list of labels) in ``df`` to stratify on,
            or ``None`` for no stratification.
        val_size: Fraction of ``df`` assigned to the validation set.
        test_size: Fraction of ``df`` assigned to the test set.

    Returns:
        Tuple ``(train_set, val_set, test_set)``.

    Raises:
        ValueError: If ``val_size`` or ``test_size`` is not positive, or if
            together they leave nothing for the training set.
    """
    if not (val_size > 0 and test_size > 0 and val_size + test_size < 1):
        raise ValueError(
            "val_size and test_size must be positive fractions summing to less than 1")

    holdout_size = val_size + test_size

    def _labels(frame):
        # Compare against None explicitly: a falsy column label such as the
        # integer 0 must still enable stratification.
        return frame[stratify] if stratify is not None else None

    train_set, holdout_set = train_test_split(
        df, test_size=holdout_size, random_state=rstate, shuffle=shuffle,
        stratify=_labels(df))
    # The second pass works on the hold-out only, so the test share has to be
    # expressed relative to the hold-out rather than to the full frame.
    val_set, test_set = train_test_split(
        holdout_set, test_size=test_size / holdout_size, random_state=rstate,
        shuffle=shuffle, stratify=_labels(holdout_set))
    return (train_set, val_set, test_set)

In [4]:
def get_feature_importances_rf(_data, _target):
    """Rank features by random-forest importance.

    Fits a ``RandomForestClassifier`` (``random_state=10``, ``n_jobs=1``,
    library-default number of trees) on the given data and reports its
    ``feature_importances_`` rounded to 5 decimals.

    Args:
        _data: Feature table; its ``.columns`` supply the feature names.
        _target: Labels passed to ``fit`` alongside ``_data``.

    Returns:
        DataFrame indexed by ``features`` with a single column
        ``level of importance``, sorted from most to least important.
    """
    forest = RandomForestClassifier(random_state=10, n_jobs=1)
    forest.fit(_data, _target)

    table = pd.DataFrame({
        'features': _data.columns,
        'level of importance': np.round(forest.feature_importances_, 5),
    })
    ranked = table.sort_values('level of importance', ascending=False)
    return ranked.set_index('features')


def get_feature_importances_xgb(_data, _target):
    """Rank features by XGBoost importance.

    Fits an ``XGBClassifier`` (100 estimators, depth 3, learning rate 0.1,
    ``missing=np.inf``) on the given data and reports its
    ``feature_importances_`` rounded to 5 decimals.

    Args:
        _data: Feature table; its ``.columns`` supply the feature names.
        _target: Labels passed to ``fit`` alongside ``_data``.

    Returns:
        DataFrame indexed by ``features`` with a single column
        ``level of importance``, sorted from most to least important.
    """
    booster = XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        missing=np.inf,
    )
    booster.fit(_data, _target)

    table = pd.DataFrame({
        'features': _data.columns,
        'level of importance': np.round(booster.feature_importances_, 5),
    })
    ranked = table.sort_values('level of importance', ascending=False)
    return ranked.set_index('features')

In [5]:
# Custom F1-Score Metric
class F1ScoreMetric(tf.keras.metrics.Metric):
    """Streaming binary F1 score.

    Predictions are thresholded at 0.5 and compared with the labels. True
    positives, false positives and false negatives are accumulated over every
    ``update_state`` call since the last reset, so ``result()`` is the F1
    score over all batches seen so far rather than only the most recent one.

    Args:
        name: Metric name shown in Keras logs.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        # Running confusion-matrix counts; F1 is derived from them in result().
        self.true_positives = self.add_weight(name='true_positives', initializer='zeros')
        self.false_positives = self.add_weight(name='false_positives', initializer='zeros')
        self.false_negatives = self.add_weight(name='false_negatives', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch's counts to the running totals.

        Args:
            y_true: Binary labels; cast to int32 before use.
            y_pred: Predicted scores; values >= 0.5 count as positive.
            sample_weight: Accepted for API compatibility but ignored.
        """
        # Flatten both tensors so that e.g. (batch,) labels and (batch, 1)
        # outputs are compared element-wise instead of being broadcast.
        y_pred = tf.reshape(tf.cast(tf.greater_equal(y_pred, 0.5), tf.float32), [-1])
        y_true = tf.reshape(tf.cast(tf.cast(y_true, tf.int32), tf.float32), [-1])

        tp = tf.reduce_sum(y_true * y_pred)
        fp = tf.reduce_sum((1.0 - y_true) * y_pred)
        fn = tf.reduce_sum(y_true * (1.0 - y_pred))

        # Accumulate instead of overwriting: the metric must reflect the
        # whole epoch / evaluation run, not just the last batch.
        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

    def result(self):
        """Return the F1 score computed from the accumulated counts."""
        eps = tf.keras.backend.epsilon()  # guards against division by zero
        precision = self.true_positives / (self.true_positives + self.false_positives + eps)
        recall = self.true_positives / (self.true_positives + self.false_negatives + eps)
        return 2 * ((precision * recall) / (precision + recall + eps))

    def reset_state(self):
        """Zero the accumulated counts (called by Keras between epochs)."""
        for counter in (self.true_positives, self.false_positives, self.false_negatives):
            counter.assign(0.0)

    def reset_states(self):
        """Alias of ``reset_state`` kept for callers using the older name."""
        self.reset_state()