**Autor:** __Tiago Dias__

In [156]:
!pip install wordninja
!pip install textblob



In [157]:
# Importação dos pacotes para Analise
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

import wordninja
import textblob
import collections

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [158]:
# Importação dos dataset
df = pd.read_csv('Train.csv')

In [159]:
print('Tamanho do arquivo de Treino', df.shape)

Tamanho do arquivo de Treino (616, 3)


In [160]:
df.head(2)

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs


In [161]:
# Separando palavras juntas
df['text_split'] = df['text'].apply(wordninja.split)
df['text_new'] = df['text_split'].apply(TreebankWordDetokenizer().detokenize)
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",I feel that it was better I die Am happy
1,9JDAGUV3,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",Why do I get hallucinations


In [162]:
# Corrigindo palavras incorretas
df['text_new'] = df['text_new'].apply(textblob.TextBlob).apply(textblob.TextBlob.correct).apply(str)
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",I feel that it was better I die Am happy
1,9JDAGUV3,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",Why do I get hallucinations


In [163]:
# Verificando quantas classes de cada
df['label'].value_counts()

Depression    352
Alcohol       140
Suicide        66
Drugs          58
Name: label, dtype: int64

In [164]:
# Variáveis importantes
# Palavras para retirar da análise
stop_words = stopwords.words('english')
# Variável tamando da base de teste
test_size = 0.2
# Variáveis do modelo
epochs = 10
batch_size = 128

In [165]:
# Excluindo da descrição texto após os números, informações julgadas irrelevantes para a classificação.
df['text_new'] = df['text_new'].str.replace('[0-9]+', '', regex=True).copy()
# Excluindo da descrição puntuação, informações julgadas irrelevantes para a classificação.
df['text_new'] = df['text_new'].str.replace('[,.:;!?]+', ' ', regex=True).copy()
# Excluindo da descrição caracteres especiais, informações julgadas irrelevantes para a classificação.
df['text_new'] = df['text_new'].str.replace('[/<>()|\+\-\$%&#@\'\"]+', ' ', regex=True).copy()
# Colocando todos os caracteres em caixa baixa.
df['text_new'] = df['text_new'].str.lower().copy()
  
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",i feel that it was better i die am happy
1,9JDAGUV3,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",why do i get hallucinations


In [166]:
# Função para retirar stop words
def tokenize_df(tokenized_words):
  tokenized_words = word_tokenize(tokenized_words)
  stop = [word for word in tokenized_words if word not in stop_words]
  text = TreebankWordDetokenizer().detokenize(stop)
  return text

# Eliminando as stop words
df['text_new'] = df['text_new'].apply(tokenize_df).copy()
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",feel better die happy
1,9JDAGUV3,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",get hallucinations


In [167]:
# Selecionando as unicas palavras da variável text_new
df['text_new_split'] = df['text_new'].apply(word_tokenize).copy()
text = list(df.text_new_split)
list_words = [item for sublist in text for item in sublist]
list_words = sorted(list_words)
only_words = set(list_words)
print('Quantidade de frases:', len(text), '\n')
print('Quantidade de GERAL palavras:', len(list_words), '\n')
print('Quantidade de UNICA palavras:', len(only_words), '\n')
counter = collections.Counter(list_words)

Quantidade de frases: 616 

Quantidade de GERAL palavras: 2293 

Quantidade de UNICA palavras: 711 



In [168]:
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new,text_new_split
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",feel better die happy,"[feel, better, die, happy]"
1,9JDAGUV3,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",get hallucinations,"[get, hallucinations]"


In [169]:
# Max tamanho dos text_new
df['text_new_split'].apply(len).max()

17

In [170]:
# Gerando encoder com o vocabulário das palavras
encoder = tfds.features.text.SubwordTextEncoder(vocab_list=only_words)
print('Vocabulary size(text_new): {}'.format(encoder.vocab_size))

Vocabulary size(text_new): 968


In [171]:
# Exemplo do encode e decode da text_new	
sample_string = 'feel better die happy'
# Encode
encoded_string = encoder.encode(sample_string)
print('Encoded string is {}'.format(encoded_string))
#Decode
original_string = encoder.decode(encoded_string)
print('The original string: "{}"'.format(original_string))

Encoded string is [30, 744, 235, 744, 490, 744, 386]
The original string: "feel better die happy"


In [172]:
# Validando a string original com a string após o decode
original_string == sample_string

True

In [173]:
# Exemplificando a relação de index ----> word
for index in encoded_string:
  print('{} ----> {}'.format(index, encoder.decode([index])))

30 ----> feel
744 ---->  
235 ----> better
744 ---->  
490 ----> die
744 ---->  
386 ----> happy


In [174]:
# Encode do label
label_encode = LabelEncoder()
target = label_encode.fit_transform(df['label'])

In [175]:
# Definindo feature e target
x = df['text_new']
y = target

# Dividindo dataset em treino e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)

In [176]:
# Criação da matriz
def pad_to_size(vec, size):
  zeros = [0] * (size - len(vec))
  vec.extend(zeros)
  return vec

# Encode do input
def encode_input(text_new):
  list_x = []
  for text in text_new:
    text_encode = encoder.encode(text)
    text_encode = pad_to_size(text_encode, 64)
    list_x.append(text_encode)
  # Convertendo x em tensor
  input_encode = tf.cast(list_x, tf.float32)
  return input_encode

# Encode do x_train e x_test
x_train = encode_input(x_train)
x_test = encode_input(x_test)

In [177]:
# Criação do modelo de RNN
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(encoder.vocab_size, 128)(inputs)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "functional_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_10 (Embedding)     (None, None, 128)         123904    
_________________________________________________________________
bidirectional_20 (Bidirectio (None, None, 128)         98816     
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 4)                 516       
Total params: 322,052
Trainable params: 322,052
Non-trainable params: 0
_______________________________________________

In [178]:
# Compilando modelo e configurando o processo de treinamento
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="rmsprop",
              metrics=['accuracy'])

# Treinando o modelo
history = model.fit(x_train, y_train, epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [179]:
# Testando a qualidade do modelo
test_loss, test_acc = model.evaluate(x_test, y_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 0.8578904867172241
Test Accuracy: 0.6612903475761414


In [180]:
# Função para predição
def sample_predict(sample_pred_text):
  encoded_sample_pred_text = encoder.encode(sample_pred_text)
  encoded_sample_pred_text = tf.cast(encoded_sample_pred_text, tf.float32)
  predictions = model.predict(tf.expand_dims(encoded_sample_pred_text, 0))
  return (predictions)

In [181]:
sample_predict('feel better die happy')

array([[0.04718233, 0.7572406 , 0.08205433, 0.11352286]], dtype=float32)

In [182]:
# Predição do exemplo
exemple = 'feel better die happy'
predictions = sample_predict(exemple)
probabilities = [np.argmax(predictions[0])]
# Retornando os labels
new_label = label_encode.inverse_transform(probabilities)
new_label[0]

'Depression'