# In [None]:
# Deep learning architecture - SE3M

from google.colab import drive
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,Bidirectional,Dropout
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, RMSprop
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.layers import Flatten
from keras.layers import AveragePooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn.metrics import *
from sklearn.model_selection import KFold

# Make the Google Drive files reachable from the Colab runtime.
drive.mount('/content/gdrive/', force_remount=True)

# Training/test texts: header-less CSV; column 0 is taken as the text of each sample.
df = pd.read_csv('/content/gdrive/My Drive/data/reqTxt.csv', header=None)
dfRequire = df.iloc[:, :]
print(dfRequire.shape)
print(dfRequire.columns)

text_column = dfRequire[0]
print(text_column[0])  # show the first text as a quick sanity check
X = np.array(text_column)
print(len(X))

print('Train and test dataset loaded...')

# Training/test labels: header-less file, converted straight to a NumPy array.
y = np.array(pd.read_csv('/content/gdrive/My Drive/data/estiDeep.data', header=None))
print('Shape of label tensor:', y.shape)
print(y.dtype)

# Number of texts in the train and test dataset (per the original note).
# Further down it is also used as the tokenizer's `num_words` cap and as the
# number of rows of the embedding layer.
MAX_LEN = 23313

# 10-fold cross-validation with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1000)
kf.get_n_splits(X)  # returns the number of splitting iterations; the value is not used
print(kf)

# Load the pre-trained embedding table used to initialise the Embedding layer.
# Exactly one `pret_model = ...` line should be active; the others are kept as
# ready-to-use alternatives.
#
# Only `sep` is passed to read_csv: recent pandas versions raise a ValueError
# when both `sep` and `delimiter` are given.

# Word2Vec sentence embeddings - generic
#pret_model = pd.read_csv('/content/gdrive/My Drive/pretrain_model/word2vec_base.csv', sep=',', header=None)  # this one was used as the generic medium Wiki model

# Word2Vec sentence embeddings - fine-tuning
#pret_model = pd.read_csv('/content/gdrive/My Drive/pretrain_model/word2vec_SE.csv', sep=',', header=None)  # word2vec Wiki embedding fine-tuned with the pre-training dataset

# BERT sentence embeddings - generic
#pret_model = pd.read_csv('/content/gdrive/My Drive/pretrain_model/BERT_base.csv', sep=',', header=None)  # new bert-as-service

# BERT sentence embeddings - fine-tuning
pret_model = pd.read_csv('/content/gdrive/My Drive/pretrain_model/BERT_SE.csv', sep=',', header=None)  # new bert-as-service

# Keep as many rows as the embedding layer will have (MAX_LEN), so the table
# and the layer cannot drift apart if MAX_LEN is changed.
embedding_matrix = pret_model.iloc[0:MAX_LEN, :]  # for BERT models
###embedding_matrix = pret_model.iloc[:,1:101] # for word2vec_base
###embedding_matrix = pret_model.iloc[1:,:] # for word2vec_SE

dfEmbedding_mat = pd.DataFrame(embedding_matrix)
# Fill gaps with the number 0 (not the string '0') so the columns stay numeric
# and can be loaded directly as layer weights.
embedding_mat = dfEmbedding_mat.fillna(0)

print('Embedding mat: ' + str(embedding_mat.shape))

# Per-fold metric accumulators: one value is appended for every CV fold.
vetMAE = []
vetR2 = []
vetMSE = []
vetMdae = []

MAX_SEQUENCE_LENGTH = 100  # number of words in each text (pad/truncate length)

# Convert the pre-trained table to a float array once, outside the fold loop,
# and take the embedding width from it instead of hard-coding it, so the same
# code works for any of the pre-trained tables.
embedding_weights = np.asarray(embedding_mat, dtype='float32')
EMBEDDING_DIM = embedding_weights.shape[1]

# Train and evaluate one model per fold; `i` is the 1-based fold counter.
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):

    x_train, test_x = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]

    # get the raw text data
    texts_train = x_train.astype(str)
    texts_test = test_x.astype(str)

    # vectorize the text samples; the tokenizer is fit on the training fold only
    tokenizer = Tokenizer(num_words=MAX_LEN, char_level=False, lower=False)
    tokenizer.fit_on_texts(texts_train)
    encSequences = tokenizer.texts_to_sequences(texts_train)
    encSequences_test = tokenizer.texts_to_sequences(texts_test)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    vocab_size = len(word_index) + 1
    print('Vocab_size: '+ str(vocab_size))

    # Pad/truncate every sequence to the same length (padding added at the end).
    x_train = pad_sequences(encSequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    x_test = pad_sequences(encSequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    print('Shape of data tensor:', x_train.shape)
    print('Shape of data test tensor:', x_test.shape)

    print('train_y: ' + str(train_y.shape))
    print('test_y: ' + str(test_y.shape))

    # Sequential model
    model = Sequential()

    # Embedding layer initialised from the pre-trained table and fine-tuned
    # during training (trainable=True).
    # NOTE(review): rows of the table are looked up by the tokenizer's word
    # index, and the tokenizer is refit on every fold - confirm that the row
    # order of the pre-trained table is meant to match these indices.
    embedding = Embedding(MAX_LEN, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                          name='embedding', trainable=True)
    embedding.build(input_shape=(1,))  # the input_shape here has no effect in the build function
    embedding.set_weights([embedding_weights])
    model.add(embedding)

    # Pool size equals the sequence length, so each text is reduced to the
    # average of its token embeddings.
    model.add(AveragePooling1D(pool_size=MAX_SEQUENCE_LENGTH))
    print(model.output_shape)

    # Enable this line for the model with LSTM
    ##model.add(LSTM(50, dropout=0.3, recurrent_dropout=0.2, return_sequences=False))

    # Disable the line below for the model with the LSTM layer
    model.add(Flatten())

    # Regression head: single linear output.
    model.add(Dense(50, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='linear'))

    adam = Adam(lr = 0.001, beta_1 = 0.99, beta_2 = 0.999, epsilon = None, decay = 0.01, amsgrad = False)

    # Pass the configured optimizer instance. The string 'adam' would build a
    # default Adam and silently ignore the settings above.
    model.compile(loss='mse', optimizer=adam, metrics=['mae'])

    model.summary()

    print('Modelo compilado...')

    # NOTE(review): the held-out fold is also used as validation data for
    # early stopping (with restore_best_weights=True), so the fold metrics
    # below are computed on data that influenced which weights were kept.
    es = EarlyStopping(monitor='val_mae', mode='min', verbose=1, patience=10, restore_best_weights=True)

    model_history = model.fit(x_train, train_y,
                              batch_size=128,
                              epochs=30, callbacks=[es],
                              validation_data=(x_test, test_y))

    # Only test-fold predictions are needed for the metrics.
    y_pred = model.predict(x_test, batch_size=None, verbose=0, steps=None)

    # Metrics
    print("\n")
    mae = mean_absolute_error(test_y, y_pred)
    vetMAE.append(mae)
    print("MAE: %f" % (mae))
    medAE = median_absolute_error(test_y, y_pred)
    vetMdae.append(medAE)
    print("MedAE: %f" % (medAE))
    # Default averaging returns a plain float, which formats cleanly with %f
    # (multioutput='raw_values' would return an array instead).
    r2 = r2_score(test_y, y_pred)
    vetR2.append(r2)
    print("r2: %f" % (r2))
    mse = mean_squared_error(test_y, y_pred)
    vetMSE.append(mse)
    print("MSE: %f" % (mse))
    mErr = max_error(test_y, y_pred)
    print("maxrror: %f" % (mErr))

    print("Concluido " + str(i))

# Aggregate the per-fold metrics: mean of every metric, plus the standard
# deviation of MAE, R2 and MSE across folds.
maeMedio, stdMae = np.mean(vetMAE), np.std(vetMAE)
madAEMedio = np.mean(vetMdae)
r2Medio, stdr2 = np.mean(vetR2), np.std(vetR2)
mseMedio, stdMse = np.mean(vetMSE), np.std(vetMSE)

# Report the aggregates, one "name: value" line each (means first, then stds).
for label, value in (
    ('maeMedio', maeMedio),
    ('madAEMedio', madAEMedio),
    ('r2Medio', r2Medio),
    ('mseMedio', mseMedio),
    ('stdMae', stdMae),
    ('stdr2', stdr2),
    ('stdMse', stdMse),
):
    print(label + ': ' + str(value))




