In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import gensim
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import tensorboard
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
stop_words = set(stopwords.words('portuguese'))
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
file_path = '/content/drive/MyDrive/data-csv/data.csv'

headers = ["numero", "intencao", "pergunta", "resposta"]

df = pd.read_csv(file_path, sep=";")

df.shape

(981, 24)

In [9]:
df.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="all", inplace=True)

df.shape

(505, 4)

In [11]:
df = df.apply(lambda x: x.str.replace("\n", " ", regex=False) if x.dtype == "object" else x)
df = df.apply(lambda x: x.str.replace("\t", " ", regex=False) if x.dtype == "object" else x)
df = df.apply(lambda x: x.str.replace("\r", " ", regex=False) if x.dtype == "object" else x)
df = df.apply(lambda x: x.str.replace("\f", " ", regex=False) if x.dtype == "object" else x)
df = df.apply(lambda x: x.str.replace("\v", " ", regex=False) if x.dtype == "object" else x)

In [12]:
print(df.shape)
df.head()

(505, 4)


Unnamed: 0,No,Intencao,Pergunta,Resposta\n
0,1.0,Como depositar,Boa dia.tudo bem?eu gostaria de saber sobre aq...,"Bom dia! Sim, o sr pode utilizar o cartão de d..."
1,2.0,Como fazer remessa,Como enviar dinheiro do Japão?,"Para se inscrever no serviço de remessa, por f..."
2,3.0,Tempo de remessa,Quanto tempo levará para o beneficiário recebe...,"Via de regra, as remessas serão pagas via PIX ..."
3,4.0,"Pedido de envio via metodo ""ByPhone""",Boa tarde Acabei de fazer a transferência de 2...,iremos processar a sua solicitacao. Muito obri...
4,5.0,"Pedido de envio via metodo ""ByPhone""",Poderia fazer a remessa de 22yenes para o BBB ...,iremos processar a sua solicitacao. Muito obri...


In [19]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    filtered_words = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_words)

df['texto_processado'] = df['Pergunta'].apply(preprocess_text)

le = LabelEncoder()
df['label'] = le.fit_transform(df['Intencao'])

X_train, X_test, y_train, y_test = train_test_split(df['texto_processado'], df['label'], test_size=0.2, random_state=42)

In [20]:
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=50, padding='post', truncating='post')

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=50, padding='post', truncating='post')

In [21]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=50),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dense(32, activation='relu'),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')

history = model.fit(train_padded, y_train, epochs=10, validation_data=(test_padded, y_test), callbacks=[tensorboard_callback])

y_pred = np.argmax(model.predict(test_padded), axis=-1)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")



Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 163ms/step - accuracy: 0.1180 - loss: 2.8422 - val_accuracy: 0.2079 - val_loss: 2.6814
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 97ms/step - accuracy: 0.2445 - loss: 2.5975 - val_accuracy: 0.2079 - val_loss: 2.6222
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 134ms/step - accuracy: 0.2624 - loss: 2.4657 - val_accuracy: 0.2079 - val_loss: 2.5743
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 213ms/step - accuracy: 0.1976 - loss: 2.4991 - val_accuracy: 0.2079 - val_loss: 2.5469
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 319ms/step - accuracy: 0.2781 - loss: 2.3137 - val_accuracy: 0.2178 - val_loss: 2.4492
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 179ms/step - accuracy: 0.3334 - loss: 2.2053 - val_accuracy: 0.3366 - val_loss: 2.3369
Epoch 7/10
[1m13/13[0m [3

In [23]:
sentences = [text.split() for text in df['texto_processado']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def get_w2v_vectors(text, model, size):
    words = text.split()
    vector = np.zeros(size)
    n_words = 0

    for word in words:
        if word in model.wv:
            vector += model.wv[word]
            n_words += 1

    if n_words > 0:
        vector /= n_words

    return vector

X_train_w2v = np.array([get_w2v_vectors(text, w2v_model, 100) for text in X_train])
X_test_w2v = np.array([get_w2v_vectors(text, w2v_model, 100) for text in X_test])

X_train_w2v = np.nan_to_num(X_train_w2v)
X_test_w2v = np.nan_to_num(X_test_w2v)

clf = RandomForestClassifier()
clf.fit(X_train_w2v, y_train)
y_pred_w2v = clf.predict(X_test_w2v)

print(f"Accuracy: {accuracy_score(y_test, y_pred_w2v)}")
print(f"Recall: {recall_score(y_test, y_pred_w2v, average='weighted')}")
print(f"F1-Score: {f1_score(y_test, y_pred_w2v, average='weighted')}")

Accuracy: 0.44554455445544555
Recall: 0.44554455445544555
F1-Score: 0.3879399684473942
