In [None]:
import json
from time import time

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from preprocessing import DataHandler
from models import Model

# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

from imblearn.over_sampling import SMOTE, ADASYN

# import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Read the experiment parameters from the JSON file next to the notebook.
filename = './parameters.json'
with open(filename) as f:
    parameters_data = json.load(f)

# Positional unpacking: this relies on the JSON object holding exactly three
# top-level entries, in the same order as the three names on the left.
seed, data_config, model_config = parameters_data.values()

In [None]:
# Column Rankings:
# ('ALCOHOL', 1400)
# ('SULPHATES', 1249)
# ('VOLATILE_ACIDITY', 1111)
# ('CITRIC_ACID', 1000)
# ('TOTAL_SULFUR_DIOXIDE', 769)
# ('FIXED_ACIDITY', 747)
# ('CHLORIDES', 542)
# ('DENSITY', 454)
# ('RESIDUAL_SUGAR', 209)
# ('pH', 206)
# ('FREE_SULFUR_DIOXIDE', 13)

In [None]:
# Load the wine-quality CSV into the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer = 'dataset/winequality-red.csv')

In [None]:
# Binarize the target column in place: 1 when the row's value is one of the
# configured values to predict, 0 otherwise.
# NOTE: this overwrites the column in `df`, so running the cell a second time
# binarizes the already-binarized values.
target_column = data_config['target']
positive_labels = data_config['to_predict']
quality_binarize = [int(label in positive_labels) for label in df[target_column]]
df[target_column] = quality_binarize

In [None]:
# Hand the frame to the project's DataHandler, then split it using the
# cross-validation and test sizes from the configuration.
wine_data = DataHandler(random_state = seed)
wine_data.load_data(df, data_config['features'], data_config['target'])

split_sizes = {
    'cross_validation_size': data_config['cv_size'],
    'test_size': data_config['test_size'],
}
wine_data.split_dataset(**split_sizes)

print(wine_data)

In [None]:
# Fraction of the training rows to keep before resampling
# (1.0 keeps every row, in shuffled order).
train_rate_keep = 1.0

# Draw from a generator seeded with the experiment seed so the subset is the
# same after every kernel restart; the visible cells never seed the global
# np.random state.
subset_rng = np.random.RandomState(seed)
subset_index = subset_rng.choice(wine_data.X_train.index,
                                 size = int(len(wine_data.X_train.index) * train_rate_keep),
                                 replace = False)

# The sampled values are index *labels* taken from X_train, so rows are
# selected by label (.loc) rather than by position (.iloc).
# NOTE(review): assumes X_train keeps the row labels of `df` - confirm in DataHandler.
training_set_subset = df.loc[subset_index]
X_train_subset = training_set_subset[data_config['features']]
y_train_subset = training_set_subset[data_config['target']]

In [None]:
# Number of rows labelled 1 in the training subset.
# NOTE(review): treats class 1 as the minority class that SMOTE will oversample - confirm.
positive_cnt = int((np.asarray(y_train_subset) == 1).sum())

# SMOTE searches for k_neighbors neighbours of each minority sample *besides*
# the sample itself, so k_neighbors has to stay strictly below the number of
# minority rows. Cap it at 5 and keep it at least 1; with fewer than two
# positive rows SMOTE cannot interpolate and will still raise.
k_neighbors = max(1, min(5, positive_cnt - 1))

In [None]:
# Oversample the training subset with SMOTE. sampling_strategy = 0.5 is the
# requested minority / majority ratio after resampling (per the
# imbalanced-learn documentation for a float value).
smote_sampler = SMOTE(random_state = seed,
                      k_neighbors = k_neighbors,
                      sampling_strategy = 0.5)
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_subset, y_train_subset)

In [None]:
# Print (labels, counts) for the train, cross-validation and test targets,
# followed by the SMOTE-resampled targets.
for labels in (wine_data.y_train,
               wine_data.y_cross_validation,
               wine_data.y_test,
               y_resampled):
    print(np.unique(labels, return_counts = True))

In [None]:
# Stratified, shuffled K-fold splitter; train_model() reads it as a global.
kfold = StratifiedKFold(
    n_splits = data_config['k_folds'],
    shuffle = True,
    random_state = seed,
)

In [None]:
# models[0] logs under logs/resampled, models[1] under logs/vanilla.
# models_scores collects the per-fold metric rows appended by train_model().
models = []
models_scores = []
tensorboards = [
    tf.keras.callbacks.TensorBoard(log_dir = log_dir_template.format(time()))
    for log_dir_template in ("logs/resampled/{}", "logs/vanilla/{}")
]

# One project Model per TensorBoard callback, each sized to the number of
# feature columns in the training split.
for callback in tensorboards:
    wrapper = Model(tensorboard = callback)
    wrapper.create(input_dim = wine_data.X_train.shape[1])
    models.append(wrapper)

In [None]:
def _final_metric(history, *candidate_keys):
    """Return the last recorded value of the first of `candidate_keys` present in `history`.

    Keras names the accuracy entries 'acc' or 'accuracy' depending on its
    version, so callers pass both spellings.
    """
    for key in candidate_keys:
        if key in history:
            return history[key][-1]
    raise KeyError('none of {} found in training history'.format(candidate_keys))


def train_model(x_to_train, y_to_train, model=None, data_type='vanilla'):
    """Train `model` on stratified K-fold splits and record test metrics per fold.

    For every (training, validation) index pair produced by the module-level
    `kfold`, the model is fit on the training rows with the validation rows
    passed as `validation_data`, then used to predict `wine_data.X_test`.
    One row per fold is appended to the module-level `models_scores` list:
    [data type, training accuracy, validation accuracy, test accuracy,
    precision, recall, F1 score].

    Parameters
    ----------
    x_to_train : pandas.DataFrame or array-like
        Feature rows to split and train on.
    y_to_train : pandas.Series or array-like
        Targets aligned with `x_to_train`.
    model : object
        Must expose `train(epochs=, X_data=, y_data=, validation_data=)`
        returning an object with a `history` dict, and a `.model` attribute
        with `predict`. Required even though the default is None.
    data_type : str, optional
        Label stored in the first column of each score row. Defaults to
        'vanilla', the value that was previously hard-coded.

    Raises
    ------
    ValueError
        If `model` is None.
    """
    if model is None:
        raise ValueError('train_model() requires a model instance')

    # Work on plain arrays so DataFrame/Series and ndarray inputs share one path.
    features = np.asarray(x_to_train)
    targets = np.asarray(y_to_train)

    # NOTE(review): the same `model` instance is trained in every fold; whether
    # weights carry over between folds depends on Model.train - confirm.
    for fold_number, (training_index, cross_validation_index) in enumerate(
            kfold.split(features, targets), start = 1):
        x = features[training_index]
        y = targets[training_index]
        cv = [features[cross_validation_index], targets[cross_validation_index]]

        print('Fold {}: training on {} rows, validating on {} rows'.format(
            fold_number, len(x), len(cv[0])))
        # batch_size is intentionally not passed (left to the Model default).
        current_model = model.train(epochs = model_config['epochs_param'],
                                    X_data = x, y_data = y,
                                    validation_data = cv)
        print('End of Model Training')

        # Round the predicted scores once and reuse the labels for every metric.
        test_labels = np.round(model.model.predict(x = wine_data.X_test))

        # Data Type, Training Data Accuracy, Cross Validation Accuracy, Test Accuracy, Precision, Recall, F1_score
        models_scores.append([
            data_type,
            _final_metric(current_model.history, 'acc', 'accuracy'),
            _final_metric(current_model.history, 'val_acc', 'val_accuracy'),
            accuracy_score(wine_data.y_test, test_labels),
            precision_score(wine_data.y_test, test_labels),
            recall_score(wine_data.y_test, test_labels),
            f1_score(wine_data.y_test, test_labels),
        ])

In [None]:
# Train on the SMOTE-resampled data first, then on the untouched training split.
scores_before = len(models_scores)
train_model(X_resampled, y_resampled, models[0])
# train_model() tags the rows it appends here as 'vanilla'; relabel the rows
# from the resampled run so the two data types can be told apart in the results.
for score_row in models_scores[scores_before:]:
    score_row[0] = 'resampled'

train_model(wine_data.X_train, wine_data.y_train, models[1])

# One array of test-set predictions per trained model, ordered
# [resampled, vanilla]; the confusion-matrix and ROC cells read this list.
predictions = [trained.model.predict(x = wine_data.X_test) for trained in models]

In [None]:
# One row per (model, fold). The index must be 1-D with one entry per score
# row, so number the rows 1..k_folds and restart the count for each model.
k_folds = data_config['k_folds']
index_label = np.arange(len(models_scores)) % k_folds + 1

results_info = pd.DataFrame(data = models_scores,
                            index = index_label,
                            columns = ['Data Type',
                                       'Training Accuracy', 'Cross Validation Accuracy','Test Accuracy',
                                       'Precision', 'Recall', 'F1_Score']
                           )
results_info.index.name = 'KFold Iteration'
results_info

In [None]:
# `predictions` must hold one array of test-set scores per model, ordered
# [upsampled, vanilla]; scores are rounded to hard labels before counting.
cm = [confusion_matrix(wine_data.y_test, np.round(pred)) for pred in predictions]

# Drawn with matplotlib only, because seaborn is not imported in this notebook.
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (14, 6), dpi = 100)
for axis, matrix, title in zip(ax, cm, ('Upsampling', 'Vanilla')):
    image = axis.imshow(matrix, cmap = 'viridis')
    fig.colorbar(image, ax = axis)

    # Write each count in its cell; switch text colour at half the maximum so
    # the number stays readable on both dark and bright cells.
    threshold = matrix.max() / 2
    for (row, col), count in np.ndenumerate(matrix):
        axis.text(col, row, format(count, 'd'),
                  ha = 'center', va = 'center',
                  color = 'black' if count > threshold else 'white')

    axis.set_xticks(np.arange(matrix.shape[1]))
    axis.set_yticks(np.arange(matrix.shape[0]))
    axis.set_title(title)
    axis.set_xlabel('Predicted Label')
    axis.set_ylabel('True Label')

plt.show()

In [None]:
# ROC points and area under the curve for each entry of `predictions`,
# kept as parallel lists in the same order.
roc_points = [roc_curve(wine_data.y_test, pred) for pred in predictions]
fpr = [points[0] for points in roc_points]
tpr = [points[1] for points in roc_points]
roc_auc = [auc(false_rate, true_rate) for false_rate, true_rate in zip(fpr, tpr)]

# One panel per model: index 0 is the upsampled run, index 1 the vanilla run.
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (14, 6), dpi = 100)
panel_titles = ('Receiver Operating Characteristic Upsampling',
                'Receiver Operating Characteristic Vanilla')
for panel, title in enumerate(panel_titles):
    axis = ax[panel]
    axis.set_title(title)
    axis.plot(fpr[panel], tpr[panel], 'b', label = 'ROC curve (area = %0.2f)' % roc_auc[panel])
    axis.legend(loc = 'lower right')
    # Dashed diagonal drawn as the reference line.
    axis.plot([0, 1], [0, 1],'r--')
    axis.set_xlim([0, 1])
    axis.set_ylim([0, 1])
    axis.set_ylabel('True Positive Rate')
    axis.set_xlabel('False Positive Rate')

plt.show()

In [None]:
# fpr_vanilla = []
# tpr_vanilla = []
# roc_auc_vanilla = []
# fpr_vanilla, tpr_vanilla, thresholds_vanilla = roc_curve(y_test, pred_vanilla)
# roc_auc_vanilla = auc(fpr_vanilla, tpr_vanilla)

# plt.title('Receiver Operating Characteristic')
# plt.plot(fpr_vanilla, tpr_vanilla, 'b', label = 'ROC curve (area = %0.2f)' % roc_auc_vanilla)
# plt.legend(loc = 'lower right')
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

In [None]:
# models_history = []
# predictions = []
# model_scores = []

# for training_index, cross_validation_index in kfold.split(wine_data.X_train, wine_data.y_train):
#     for itr, model in enumerate(models):
#         data_for_training = training_data[itr][0]
#         data_target = training_data[itr][1]
#         current_model = model.fit(x = data_for_training[training_index], y = data_for_training[training_index],
#                                   validation_data = (data_for_training[cross_validation_index], data_for_training[cross_validation_index]),
#                                   batch_size = batch_size,
#                                   class_weight = class_weight,
#                                   epochs = epochs_param,
#                                   shuffle = True,
#                                   verbose = 2
#     #                               ,callbacks = [tensorboards[itr]]
#                                  )
#         models_history.append(current_model)
#         print('End of model training')

#         current_model_predictions = model.predict(x = wine_data.X_test)
#         predictions.append(current_model_predictions)

#         # Data Type, Training Data Accuracy, Cross Validation Accuracy, Test Accuracy, Precision, Recall, F1_score
#         model_scores.append([
#             training_data[itr][2],
#             current_model.history['acc'][-1],
#             current_model.history['val_acc'][-1],
#             accuracy_score(wine_data.y_test, np.round(current_model_predictions)),
#             precision_score(wine_data.y_test, np.round(current_model_predictions)),
#             recall_score(wine_data.y_test, np.round(current_model_predictions)),
#             f1_score(wine_data.y_test, np.round(current_model_predictions)),
#         ])

In [None]:
# tf.keras.backend.clear_session()
# # tensorboards = [
# #     tf.keras.callbacks.TensorBoard(log_dir="logs/resampled/{}".format(time())),
# #     tf.keras.callbacks.TensorBoard(log_dir="logs/vanilla/{}".format(time()))
# # ]

# epochs_param = 50
# input_dim = wine_data.X_train.shape[1]
# # optimizer = tf.keras.optimizers.Adam(lr = 0.01)
# optimizer = 'adam'
# batch_size = 128
# # class_weight = {0: 1, 1: 3}
# class_weight = None

# training_data = [
#     [X_resampled, y_resampled, 'resampled']
#     ,
#     [wine_data.X_train, wine_data.y_train, 'vanilla']
# ]

def build_keras_models(n_models, input_dim, optimizer = 'adam', dropout_seed = None):
    """Build `n_models` identical, compiled Keras binary classifiers.

    The settings this experiment relied on (`training_data`, `input_dim`,
    `optimizer`) are only present as commented-out lines in this cell, so the
    model-building loop is wrapped in a function that takes them as arguments
    instead of running at cell level, where it raised NameError and reset the
    `models` list used by the training cells.

    Parameters
    ----------
    n_models : int
        Number of models to build.
    input_dim : int
        Number of input features of the first Dense layer.
    optimizer : str or optimizer instance, optional
        Passed to `compile`; defaults to 'adam', the value in the commented-out
        configuration.
    dropout_seed : int or None, optional
        Seed given to both Dropout layers (the original loop passed the
        experiment seed here).

    Returns
    -------
    list
        The compiled `tf.keras.Sequential` models.
    """
    built_models = []
    for _ in range(n_models):
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(4, activation = tf.nn.elu, input_dim = input_dim, use_bias = True))
        model.add(tf.keras.layers.Dropout(rate = 0.2, seed = dropout_seed))
        model.add(tf.keras.layers.Dense(4, activation = tf.nn.elu, use_bias = True))
        model.add(tf.keras.layers.Dropout(rate = 0.2, seed = dropout_seed))
        model.add(tf.keras.layers.Dense(1, activation = tf.nn.sigmoid))
        model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
        built_models.append(model)
    return built_models

In [None]:
# index_numpy_array = np.array(df.index)
# training_set_indices = np.random.choice(index_numpy_array,
#                                         size = int(index_numpy_array.shape[0] * (1-test_size)),
#                                         replace = False)

# test_set_indices = np.delete(index_numpy_array, training_set_indices)
# assert(len(df) == len(training_set_indices) + len(test_set_indices))

# training_set = df.iloc[training_set_indices]
# test_set = df.iloc[test_set_indices]

# print(np.unique(training_set['QUALITY'], return_counts = True))
# print(np.unique(test_set['QUALITY'], return_counts = True))

# """
# Upsampling
#     - since the wine qualities are imbalanced, upsample the minority class
#       so that the counts of the two classes are the same
#     - this is done by sampling from the minority class with replacement
# """
# majority_size_pct = 0.3

# full_training_set_majority_class = training_set[training_set['QUALITY'] != 1].index
# full_training_set_minority_class = training_set[training_set['QUALITY'] == 1].index

# upsampling_training_majority = np.random.choice(full_training_set_majority_class,
#                                                size = int(len(full_training_set_majority_class) * majority_size_pct),
#                                                replace = False
#                                               )
# upsampling_training_minority = np.random.choice(full_training_set_minority_class,
#                                                size = int((len(full_training_set_majority_class) * majority_size_pct) * 1.0),
#                                                replace = True
#                                               )

# upsampling_training_set = df.iloc[np.append(upsampling_training_majority, upsampling_training_minority)]
# upsampling_cross_validation_set = training_set.drop(np.append(upsampling_training_majority, upsampling_training_minority))

# # assert(len(training_set) == len(cross_validation_set_indices) + len(upsampled_training_majority) + len(upsampled_training_minority))

# """
# Downsampling
#     - since the wine qualities are imbalanced, downsample the majority class
#       so that the class counts are closer to each other
#     - this is done by subsampling from the majority class without replacement
# """

# majority_size_pct = 0.3

# full_training_set_majority_class = training_set[training_set['QUALITY'] != 1].index
# full_training_set_minority_class = training_set[training_set['QUALITY'] == 1].index

# upsampling_training_majority = np.random.choice(full_training_set_majority_class,
#                                                size = len(full_training_set_minority_class) * 3,
#                                                replace = False
#                                               )
# upsampling_training_minority = np.random.choice(full_training_set_minority_class,
#                                                size = len(full_training_set_minority_class),
#                                                replace = False
#                                               )

# upsampling_training_set = df.iloc[np.append(upsampling_training_majority, upsampling_training_minority)]
# upsampling_cross_validation_set = training_set.drop(np.append(upsampling_training_majority, upsampling_training_minority))

# # assert(len(training_set) == len(cross_validation_set_indices) + len(upsampled_training_majority) + len(upsampled_training_minority))

# y_train = upsampling_training_set['QUALITY'].values
# X_train = upsampling_training_set.drop(columns = ['QUALITY']).values

# y_cross_validation = upsampling_cross_validation_set['QUALITY'].values
# X_cross_validation = upsampling_cross_validation_set.drop(columns = ['QUALITY']).values

# y_test = test_set['QUALITY'].values
# X_test = test_set.drop(columns = ['QUALITY']).values

In [None]:
# X = df[columns_to_choose]
# y = df[target]
#
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     test_size = test_size,
#                                                     random_state = seed
#                                                    )
# X_train, X_cross_validation, y_train, y_cross_validation = train_test_split(X_train, y_train,
#                                                                             test_size = cross_validation_size,
#                                                                             random_state = seed
#                                                                            )
#
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_resampled = sc.fit_transform(X_resampled)
# X_cross_validation = sc.fit_transform(X_cross_validation)
# X_test = sc.fit_transform(X_test)