### Importando Bibliotecas

In [None]:
#importando bibliotecas
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.utils import shuffle
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,Bidirectional,Dropout
from keras.layers import AveragePooling1D
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, RMSprop
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.layers import Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn.metrics import * 
from sklearn.model_selection import KFold 
import os
import matplotlib.pyplot as plt
import seaborn as sns

### Carregando Dados

In [11]:
# Report or file containing the set of training and test texts.
# header=None: the first line of the file is read as data, not as column names.
df = pd.read_csv(filepath_or_buffer='reqTxt.csv', header=None)

In [12]:
# Select every row and every column of the loaded frame.
dfRequire = df.iloc[slice(None), slice(None)]

In [None]:
# Quick sanity checks on the loaded frame.
print(dfRequire.shape)
print(dfRequire.columns)

# Column 0 is used as the model input: show its first entry, then keep the
# whole column as a NumPy array.
print(dfRequire[0][0])
X = np.array(dfRequire[0])

print(len(X))

print('Train and test dataset loaded...')

In [None]:
# File containing the set of training and test labels (read without a header
# row) and converted straight to a NumPy array.
y = np.array(pd.read_csv('estiDeep.data', header=None))
print('Shape of label tensor:', y.shape)
print(y.dtype)

# Originally described as the number of texts in the train and test dataset.
# NOTE(review): elsewhere in this notebook the same value is passed as the
# tokenizer's num_words and as the embedding layer's input dimension.
MAX_LEN = 23313

# 10-fold cross-validation splitter, shuffled with a fixed seed so the folds
# are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1000)
kf.get_n_splits(X)  # returns the number of splitting iterations (value is not used)
print(kf)


### Carregando Embeddings já convertidos
O arquivo "Convert embeddings" mostra a conversão da base de requisitos em embeddings.

In [None]:
# Load the pre-computed embedding table (one row per vocabulary index, one
# column per embedding dimension -- presumably; verify against the CSV).
pret_model = pd.read_csv('embeddings_finetuned_FastText.csv', delimiter=',', header=None)

# Keep only the first MAX_LEN rows so the table matches the input dimension
# of the embedding layer that receives these weights.
embedding_matrix = pret_model.iloc[0:MAX_LEN, :]
dfEmbedding_mat = embedding_matrix  # kept for backward compatibility; already a DataFrame

# Replace missing values with numeric zero. Filling with the string '0'
# (as before) turns float columns into object dtype, which is not a valid
# weight matrix for a Keras layer.
embedding_mat = dfEmbedding_mat.fillna(0)

print('Embedding mat: ' + str(embedding_mat.shape))

i = 0

### Model 


In [None]:
# Accumulators for true/predicted values across all folds.
all_y_test_Deep, all_y_pred_Deep = [], []

# One independent list per metric (MAE, R2, MSE, MdAE, Pred(25)); each gets
# one entry per fold.
vetMAEDeep, vetR2Deep, vetMSEDeep, vetMdaeDeep, vetPred25Deep = (
    [] for _ in range(5)
)

#Pred(25)
def calcular_pred_25(y_true, y_pred):
    """Return Pred(25): the percentage of predictions within 25% of the truth.

    A prediction counts as a hit when ``|y_true - y_pred| <= 0.25 * |y_true|``.
    This is the same criterion as ``|error| / |y_true| <= 0.25`` but avoids
    the division, so a true value of zero no longer produces NaN/inf and a
    RuntimeWarning: it is a hit only when the prediction is exactly zero.

    Args:
        y_true: Array-like of true values (any shape; flattened internally).
        y_pred: Array-like of predicted values (any shape; flattened
            internally). Must have the same number of elements as ``y_true``
            (or a single element, which is broadcast).

    Returns:
        The hit percentage in ``[0, 100]`` as a float, or ``nan`` when the
        inputs are empty.
    """
    # Flatten both inputs so a (n, 1) column vector and a (n,) vector are
    # compared element-wise instead of broadcasting to an (n, n) matrix.
    true_values = np.asarray(y_true, dtype=float).ravel()
    predictions = np.asarray(y_pred, dtype=float).ravel()

    if true_values.size == 0:
        return float('nan')

    abs_errors = np.abs(true_values - predictions)
    within_tolerance = abs_errors <= 0.25 * np.abs(true_values)
    return float(np.mean(within_tolerance) * 100)

# Length (in tokens) every text is padded/truncated to; constant across folds.
MAX_SEQUENCE_LENGTH = 100

# Convert the pre-trained embedding table to a float32 array once, instead of
# handing a DataFrame (possibly object-dtype) to the layer on every fold.
embedding_weights = np.asarray(embedding_mat, dtype='float32')
# Take the embedding width from the loaded matrix rather than hard-coding it
# (the commented-out line at this spot used 300 for FastText, the active one 768 for XLNET), so the layer
# always matches whichever embedding file was loaded.
embedding_dim = embedding_weights.shape[1]

#Cross-validation loop
for train_index, test_index in kf.split(X):
    # Raw texts and labels for this fold.
    x_train, test_x = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]

    texts_train = x_train.astype(str)
    texts_test = test_x.astype(str)

    # The tokenizer is fitted on the training texts only and then applied to
    # both splits.
    tokenizer = Tokenizer(num_words=MAX_LEN, char_level=False, lower=False)
    tokenizer.fit_on_texts(texts_train)
    encSequences = tokenizer.texts_to_sequences(texts_train)
    encSequences_test = tokenizer.texts_to_sequences(texts_test)

    vocab_size = len(tokenizer.word_index) + 1
    print('Vocab_size: ' + str(vocab_size))

    x_train = pad_sequences(encSequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    x_test = pad_sequences(encSequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    print('Shape of data tensor:', x_train.shape)
    print('Shape of data test tensor:', x_test.shape)

    # Define the model (rebuilt from scratch for every fold).
    model = Sequential()

    embedding = Embedding(MAX_LEN, embedding_dim, input_length=MAX_SEQUENCE_LENGTH, trainable=True)

    # Build with a variable batch dimension so the weights exist and can be
    # overwritten with the pre-trained table.
    embedding.build(input_shape=(None,))
    embedding.set_weights([embedding_weights])
    model.add(embedding)

    # NOTE(review): the pool size equals the padded sequence length, so the
    # LSTM below receives a single averaged time step -- confirm this is the
    # intended architecture.
    model.add(AveragePooling1D(pool_size=MAX_SEQUENCE_LENGTH))
    model.add(LSTM(50, dropout=0.3, recurrent_dropout=0.2, return_sequences=False))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='linear'))

    adam = Adam(learning_rate=0.001)
    model.compile(loss='mse', optimizer=adam, metrics=['mae'])

    model.summary()

    es = EarlyStopping(monitor='val_mae', mode='min', verbose=1, patience=10, restore_best_weights=True)

    # Early stopping must not see the test fold: with restore_best_weights
    # the model would be selected on the very data it is then scored on,
    # inflating every metric. Hold out part of the training fold instead.
    model_history = model.fit(x_train, train_y,
              batch_size=128,
              epochs=30, callbacks=[es],
              validation_split=0.1)

    y_pred = model.predict(x_test)

    # Store the raw results of this fold.
    all_y_test_Deep.extend(test_y.flatten())
    all_y_pred_Deep.extend(y_pred.flatten())

    # Compute the per-fold metrics.
    pred_25 = calcular_pred_25(test_y.flatten(), y_pred.flatten())
    vetPred25Deep.append(pred_25)

    mae = mean_absolute_error(test_y, y_pred)
    vetMAEDeep.append(mae)
    medAE = median_absolute_error(test_y, y_pred)
    vetMdaeDeep.append(medAE)
    r2 = r2_score(test_y, y_pred)
    vetR2Deep.append(r2)
    mse = mean_squared_error(test_y, y_pred)
    vetMSEDeep.append(mse)

# Averages over all folds.
maeMedio = np.mean(vetMAEDeep)
madAEMedio = np.mean(vetMdaeDeep)
r2Medio = np.mean(vetR2Deep)
mseMedio = np.mean(vetMSEDeep)



In [None]:
# Format the results: mean over the cross-validation folds, with the
# standard deviation where it is reported.
maeMedio = np.mean(vetMAEDeep)
madAEMedio = np.mean(vetMdaeDeep)
r2Medio = np.mean(vetR2Deep)
mseMedio = np.mean(vetMSEDeep)
# Pred(25) was collected per fold but never summarised; report it as well.
pred25Medio = np.mean(vetPred25Deep)
stdMae = np.std(vetMAEDeep)
stdr2 = np.std(vetR2Deep)
stdMse = np.std(vetMSEDeep)
stdPred25 = np.std(vetPred25Deep)


mae_result = f"{maeMedio:.2f} ± {stdMae:.2f}"
mse_result = f"{mseMedio:.2f} ± {stdMse:.2f}"
r2_result = f"{r2Medio:.2f} ± {stdr2:.2f}"
mad_result = f"{madAEMedio:.2f}"
pred25_result = f"{pred25Medio:.2f} ± {stdPred25:.2f}"


# Print the results
print('MAE:', mae_result)
print('MSE:', mse_result)
print('R2:', r2_result)
print('MdAE:', mad_result)
print('Pred(25):', pred25_result)


In [None]:
# Draw the training and validation loss curves stored in model_history,
# in that order so the legend labels line up.
for history_key in ('loss', 'val_loss'):
    plt.plot(model_history.history[history_key])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()