In [None]:
import time
import time
import pandas as pd
import numpy as np
from numpy import array
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from keras.layers import Dense,RNN,LSTM,Activation,Dropout
from keras.models import Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, cohen_kappa_score

In [None]:
# Reproducibility: fix the TensorFlow and NumPy global random seeds
# (each library keeps its own generator, so both are seeded).
tf.random.set_seed(91195003)
np.random.seed(91190530)
# Reset the Keras backend session so re-running the notebook starts from a clean state.
tf.keras.backend.clear_session()

In [None]:
# Read the three source tables; they are inner-joined on `news_id` at the end of this cell.
df1 = pd.read_csv("../input/dataset/newsfeatures_withlabels.csv")
df2 = pd.read_csv("../input/category/topic_cats.csv")
df3 = pd.read_csv("../input/titleef-v2/items_title_EFv2.csv")

# uncomment for Metric 2
#dfM2 = pd.read_csv("../input/metricas/news_M2.csv")
#dfM2 = dfM2[['news_id', 'Toxic_Class']]

# uncomment for Metric 3
#dfM3 = pd.read_csv("../input/metricas/news_M3.csv")
#dfM3 = dfM3[['news_id', 'Toxic_Class']]

# uncomment for Metric 4
#dfM4 = pd.read_csv("../input/metricas/news_M4.csv")
#dfM4 = dfM4[['news_id', 'Toxic_Class']]

# uncomment for Metric 2 (replaces the 'Toxic_Class' column of df1)
#df1 = pd.merge(df1.drop(columns=['Toxic_Class']), dfM2, on='news_id', how='inner')

# uncomment for Metric 3 (replaces the 'Toxic_Class' column of df1)
#df1 = pd.merge(df1.drop(columns=['Toxic_Class']), dfM3, on='news_id', how='inner')

# uncomment for Metric 4 (replaces the 'Toxic_Class' column of df1)
#df1 = pd.merge(df1.drop(columns=['Toxic_Class']), dfM4, on='news_id', how='inner')

# Inner joins: only news items present in all three tables survive.
dfaux = df1.merge(df2, how='inner', on='news_id')
df = dfaux.merge(df3, how='inner', on='news_id')

In [None]:
# Select the feature columns by matching their names against a regular expression.
# The string below lists the alternative patterns used for the other feature sets.
'''
Base + numero comentários + categoria
"^title_|time_of_day|newsoutlet_country|newsoutlet_name|num_comments_article|TC_"

Base + v1 + entidades corpo
"^title_|time_of_day|newsoutlet_country|newsoutlet_name|^freq_|text_"

Base + Keywords + v2
"^title_|time_of_day|newsoutlet_country|newsoutlet_name|[0-9]|noun_freq_"
'''

# Active pattern; note that `[0-9]` keeps every column whose name contains a digit.
feature_regex = "^title_|time_of_day|newsoutlet_country|newsoutlet_name|[0-9]|noun_freq_"
# Copy so later modifications of dX do not touch df.
dX = df.filter(regex=feature_regex).copy()
dX

In [None]:
#split data into training and validation sets
def split_data(training, perc=20):
  """Split the positions ``0 .. len(training)-1`` into train and validation ranges.

  The first ``100 - perc`` percent of the positions go to training and the
  rest to validation, in order (no shuffling).

  Args:
    training: Any sized object; only ``len(training)`` is used.
    perc: Percentage (0-100) of positions reserved for validation.

  Returns:
    Tuple ``(train_idx, val_idx)`` of integer NumPy arrays. They hold
    positions into ``training``, not the values stored in it.
  """
  cut = int(len(training)*(100-perc)/100)
  train_idx = np.arange(0, cut)
  # Validation begins exactly at the cut. Starting at cut + 1 left the sample
  # at position `cut` in neither set (and could leave validation empty).
  val_idx = np.arange(cut, len(training))
  return train_idx, val_idx

In [None]:
def data_normalization(dataX, norm_range=(0, 1)):
    """Min-max scale every column of ``dataX`` in place and return the scaler.

    Args:
        dataX: pandas DataFrame of numeric feature columns. It is modified
            in place: each column is rescaled independently to ``norm_range``.
        norm_range: ``(min, max)`` target range passed to ``MinMaxScaler``.

    Returns:
        The ``MinMaxScaler`` fitted on all columns, so the same transform can
        be applied to new data. It is returned unfitted if ``dataX`` has no
        columns.
    """
    scaler = MinMaxScaler(feature_range=norm_range)

    columns = list(dataX.columns)
    if not columns:
        return scaler  # nothing to scale

    # MinMaxScaler already works column by column, so a single fit gives the
    # same values as refitting per column, and the returned scaler then holds
    # the statistics of every feature rather than only the last one.
    dataX[columns] = scaler.fit_transform(dataX[columns])
    return scaler

In [None]:
#Vizualizing Learning Curves 
def plot_learning_curves(data, approach):
  """Plot the diagnostics collected while training over several splits.

  Args:
    data: For ``approach == 'history'``, a sequence of objects exposing
      ``.epoch`` and a ``.history`` dict with ``'loss'`` and ``'val_loss'``
      (one per split). For ``approach == 'loss'``, a sequence with one scalar
      score per split.
    approach: ``'history'`` draws one train/val loss panel per split;
      ``'loss'`` draws the per-split score as a single line. Any other value
      draws nothing.
  """
  n_items = len(data)
  if n_items == 0:
    return  # nothing to draw

  if approach == 'history':
    # One panel per entry in `data`, so the function no longer depends on a
    # notebook-level `n_splits` matching the number of histories.
    fig, axes = plt.subplots(n_items, 1, figsize=(8, max(6, 2*n_items)), squeeze=False)
    # Figure-level title: a title set before creating the subplots ends up on
    # an axes that the panels cover.
    fig.suptitle('Model train vs val loss per Training Split')
    for i, (hist, ax) in enumerate(zip(data, axes[:, 0])):
      ax.plot(hist.epoch, hist.history['loss'])
      ax.plot(hist.epoch, hist.history['val_loss'])
      ax.set_xlim([0, max(hist.epoch)])
      ax.legend(['Training split ' + str(i+1) + '- train loss', 'Training split ' + str(i+1) + '- val loss' ])
    # Shared axis captions: y label on the middle panel, x label on the bottom one.
    axes[n_items // 2, 0].set_ylabel('Training RMSE (Normalized)')
    axes[-1, 0].set_xlabel('Epoch')
    plt.show()
  elif approach == 'loss':
    plt.figure(figsize=(6,3))
    plt.plot(range(n_items), data)
    plt.title('RMSE value per K Fold')
    plt.ylabel('Evaluation RMSE')
    plt.xlabel('K Folds')
    # Span every fold; a fixed [0, 2] range hid all folds after the third.
    plt.xlim([0, max(n_items - 1, 1)])
    plt.ylim([0,(np.amax(data)+2)])
    plt.show()

In [None]:
def build_model(multivariate, h_layers = 2, h_neurons = 64, activation = 'sigmoid', dropout_rate = 0.5, deep_dense = False):
  """Build and compile a stacked CuDNNLSTM binary classifier.

  Args:
    multivariate: Length of the input sequence; each sample is fed as
      ``(multivariate, 1)``.
    h_layers: Number of stacked CuDNNLSTM layers.
    h_neurons: Base width. With several recurrent layers the first has
      ``h_neurons/2`` units, intermediate ones ``h_neurons`` and the last
      ``h_neurons*2``; a single recurrent layer has ``h_neurons`` units.
    activation: Activation of the hidden Dense layer(s).
    dropout_rate: Dropout rate applied after every non-final recurrent layer
      and after each hidden Dense layer.
    deep_dense: If True, add a second hidden Dense layer of ``h_neurons/2`` units.

  Returns:
    A compiled ``tf.keras`` Sequential model (binary cross-entropy loss,
    Adam optimizer, accuracy metric).
  """
  model = tf.keras.models.Sequential()
  last = h_layers - 1
  for i in range(h_layers):
    # Only the first recurrent layer declares the input shape.
    extra = {'input_shape': (multivariate, 1)} if i == 0 else {}
    if i == last:
      # The final recurrent layer returns only its last output for the Dense head.
      units = h_neurons if i == 0 else h_neurons*2
      model.add(CuDNNLSTM(units, return_sequences = False, **extra))
    else:
      units = int(h_neurons/2) if i == 0 else h_neurons
      model.add(CuDNNLSTM(units, return_sequences = True, **extra))
      model.add(tf.keras.layers.Dropout(dropout_rate))

  model.add(tf.keras.layers.Dense(h_neurons, activation))
  model.add(tf.keras.layers.Dropout(dropout_rate))

  if deep_dense == True:
    model.add(tf.keras.layers.Dense(int(h_neurons/2), activation))
    model.add(tf.keras.layers.Dropout(dropout_rate))
  # 'binary_crossentropy' (from_logits=False) expects a probability in [0, 1],
  # so the output unit must be a sigmoid; a linear unit yields unbounded values.
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(loss = 'binary_crossentropy', optimizer = tf.keras.optimizers.Adam(), metrics = ['accuracy'])
  return model

In [None]:
#Compiling and fit the model
def compile_and_fit(model, epochs, batch_size):
  """Train and score ``model`` with stratified K-fold cross-validation.

  Reads the notebook-level globals ``n_splits``, ``seed``, ``dataX`` and
  ``datay``. For every fold, the fold's training rows are further split into
  train and validation parts with ``split_data`` (last 10% for validation).
  The model is then fitted and evaluated on the fold's held-out test rows.

  Args:
    model: Compiled Keras model. ``metrics[1]`` of ``model.evaluate`` is
      reported as accuracy, so the model is assumed to be compiled with
      accuracy as its first metric.
    epochs: Maximum number of epochs per fold.
    batch_size: Batch size used by ``model.fit``.

  Returns:
    Tuple ``(model, hist_list, loss_list, history)``: the model as trained on
    the last fold, the per-fold History objects, the per-fold test accuracy
    values, and the History of the last fold.
  """
  def fold_callbacks():
    # Fresh callbacks per fold so no best-score/patience state carries over.
    return [
      tf.keras.callbacks.ModelCheckpoint(
        #saving in Keras HDF5 (or h5), a binary data format
        filepath='ckpt/my_model_{epoch}_{val_loss:.3f}.hdf5', #path where to save the model
        save_best_only=True, #overwrite the current checkpoint if and only if
        monitor='val_loss', #the val_loss score has improved
        save_weights_only=False, #if True, only the weights are saved
        verbose=1), #verbosity mode
      tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='auto', patience=20, min_delta=0.00001),
      tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=50, min_lr=0.00005, cooldown=5)
    ]

  hist_list = list()
  acc_list = list()

  # Snapshot of the untrained weights: every fold must start from the same
  # point, otherwise later folds are tested on rows the model already saw.
  # NOTE(review): optimizer state and learning rate are not reset here.
  initial_weights = model.get_weights()

  #K Folds Validation
  # Pass the seed itself: np.random.seed(seed) returns None.
  kfold = StratifiedKFold(n_splits, shuffle=True, random_state=seed)
  for fold_train_idx, test_idx in kfold.split(dataX, datay):
    model.set_weights(initial_weights)

    # split_data returns positions within fold_train_idx; map them back to
    # dataset row indices so train/val rows never overlap the test fold.
    train_pos, val_pos = split_data(fold_train_idx, perc=10)
    train_idx = fold_train_idx[train_pos]
    val_idx = fold_train_idx[val_pos]

    #build data
    X_train, y_train = dataX[train_idx], datay[train_idx]
    X_val, y_val = dataX[val_idx], datay[val_idx]
    X_test, y_test = dataX[test_idx], datay[test_idx]
    print(X_train.shape)

    # Add the trailing feature axis expected by the recurrent layers.
    n_features = X_train.shape[1]
    X_train = np.reshape(X_train, (X_train.shape[0], n_features, 1))
    X_test = np.reshape(X_test, (X_test.shape[0], n_features, 1))
    X_val = np.reshape(X_val, (X_val.shape[0], n_features, 1))

    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, batch_size=batch_size, shuffle=False, callbacks=fold_callbacks())

    metrics = model.evaluate(X_test, y_test)

    hist_list.append(history)
    acc_list.append(metrics[1])

  print(f'ACCURACY LIST {acc_list} MEAN: {np.mean(acc_list)}')


  plot_learning_curves(hist_list, approach='history')
  plot_learning_curves(acc_list, approach='loss')

  return model, hist_list, acc_list, history

In [None]:
#Main Execution
seed = 7        # random_state for the K-fold shuffling
n_splits = 7    # number of cross-validation folds
epochs = 100
batch_size = 64

scaler = data_normalization(dX)
dataX=np.array(dX)
# Labels come from the merged frame `df` (no variable named `d` exists).
# NOTE(review): assumes the merge kept the label column name 'Toxic_Class' - confirm.
datay=np.array(df['Toxic_Class'])

# Number of features used by the model, taken from the data so it always
# matches whichever feature-selection regex is active.
multivariate = dataX.shape[1]

#fitting the model
startTime = time.time()

model = build_model(multivariate, h_layers = 3, h_neurons = 128, activation = 'sigmoid', dropout_rate = 0.2, deep_dense = False)
model.summary()
model, hist_list, loss_list, history = compile_and_fit(model, epochs, batch_size)

finishTime = time.time() - startTime
print('Execution Time:', finishTime)

In [None]:
# One random stand-in sample; its width must equal the feature count the
# model was built with (`multivariate`), not a separate hard-coded number.
Xtest=np.random.rand(1, multivariate)
Xtest

In [None]:
# Append a trailing axis of size 1: (samples, features) -> (samples, features, 1).
n_samples, n_features = Xtest.shape[0], Xtest.shape[1]
Xtest = Xtest.reshape((n_samples, n_features, 1))

In [None]:
#y_pred = model.predict(Xtest)
#score = model.evaluate(X_test, y_test,verbose=1)
#print(score)

In [None]:
# Sequential.predict_classes was removed from Keras. For a single-output
# binary model it was equivalent to thresholding the predicted score at 0.5.
y_pred = (model.predict(Xtest) > 0.5).astype("int32")
print("X=%s, Predicted=%s" % (Xtest[0], y_pred[0]))