![Texto](https://dadosaocubo.com/wp-content/uploads/2020/06/DADOS-AO-CUBO-Vers%C3%A3o-04-1.png) 

[__D³__](https://dadosaocubo.com/) by [__Tiago Dias__](https://www.linkedin.com/in/diasctiago/) 

In [1]:
!pip install wordninja
!pip install textblob

Collecting wordninja
[?25l  Downloading https://files.pythonhosted.org/packages/30/15/abe4af50f4be92b60c25e43c1c64d08453b51e46c32981d80b3aebec0260/wordninja-2.0.0.tar.gz (541kB)
[K     |████████████████████████████████| 542kB 2.6MB/s 
[?25hBuilding wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-cp36-none-any.whl size=541553 sha256=57ca6397672c0614e0fba581f53ff08ca585fd34e348941fecc021712ca7598d
  Stored in directory: /root/.cache/pip/wheels/22/46/06/9b6d10ed02c85e93c3bb33ac50e2d368b2586248f192a2e22a
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


# Carregando Bibliotecas e Dataset

In [2]:
# Importação das bibliotecas
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import wordninja
import textblob
import collections
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Importação dos dataset
df = pd.read_csv('https://raw.githubusercontent.com/dadosaocubo/nlp/master/mental_health.csv')

In [4]:
print('Tamanho do arquivo de Treino', df.shape)

Tamanho do arquivo de Treino (616, 3)


In [5]:
df.head(2)

Unnamed: 0,ID,text,label
0,0,I feel that it was better I dieAm happy,Depression
1,1,Why do I get hallucinations?,Drugs


### Variáveis importantes

In [6]:
# Palavras para retirar da análise
stop_words = stopwords.words('english')
# Variável tamando da base de teste
test_size = 0.1
# Variáveis do modelo
epochs = 10
batch_size = 128

# Preparação dos Dados

In [7]:
# Separando palavras juntas
df['text_split'] = df['text'].apply(wordninja.split)
df['text_new'] = df['text_split'].apply(TreebankWordDetokenizer().detokenize)
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,0,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",I feel that it was better I die Am happy
1,1,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",Why do I get hallucinations


In [8]:
# Corrigindo palavras incorretas
df['text_new'] = df['text_new'].apply(textblob.TextBlob).apply(textblob.TextBlob.correct).apply(str)
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,0,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",I feel that it was better I die Am happy
1,1,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",Why do I get hallucinations


In [9]:
# Verificando quantas classes de cada
df['label'].value_counts()

Depression    352
Alcohol       140
Suicide        66
Drugs          58
Name: label, dtype: int64

In [10]:
# Excluindo da descrição os números, informações julgadas irrelevantes para a classificação.
df['text_new'] = df['text_new'].str.replace('[0-9]+', '', regex=True).copy()
# Excluindo da descrição puntuação, informações julgadas irrelevantes para a classificação.
df['text_new'] = df['text_new'].str.replace('[,.:;!?]+', ' ', regex=True).copy()
# Excluindo da descrição caracteres especiais, informações julgadas irrelevantes para a classificação.
df['text_new'] = df['text_new'].str.replace('[/<>()|\+\-\$%&#@\'\"]+', ' ', regex=True).copy()
# Colocando todos os caracteres em caixa baixa.
df['text_new'] = df['text_new'].str.lower().copy()
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,0,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",i feel that it was better i die am happy
1,1,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",why do i get hallucinations


In [11]:
# Função para retirar stop words
def tokenize_df(tokenized_words):
  tokenized_words = word_tokenize(tokenized_words)
  stop = [word for word in tokenized_words if word not in stop_words]
  text = TreebankWordDetokenizer().detokenize(stop)
  return text
# Eliminando as stop words
df['text_new'] = df['text_new'].apply(tokenize_df).copy()
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new
0,0,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",feel better die happy
1,1,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",get hallucinations


In [12]:
# Selecionando as unicas palavras da variável text_new
df['text_new_split'] = df['text_new'].apply(word_tokenize).copy()
text = list(df.text_new_split)
list_words = [item for sublist in text for item in sublist]
list_words = sorted(list_words)
only_words = set(list_words)
print('Quantidade de frases:', len(text), '\n')
print('Quantidade de GERAL palavras:', len(list_words), '\n')
print('Quantidade de UNICA palavras:', len(only_words), '\n')
counter = collections.Counter(list_words)

Quantidade de frases: 616 

Quantidade de GERAL palavras: 2293 

Quantidade de UNICA palavras: 711 



In [13]:
df.head(2)

Unnamed: 0,ID,text,label,text_split,text_new,text_new_split
0,0,I feel that it was better I dieAm happy,Depression,"[I, feel, that, it, was, better, I, die, Am, h...",feel better die happy,"[feel, better, die, happy]"
1,1,Why do I get hallucinations?,Drugs,"[Why, do, I, get, hallucinations]",get hallucinations,"[get, hallucinations]"


In [14]:
# Max tamanho dos text_new
df['text_new_split'].apply(len).max()

17

In [15]:
# Gerando encoder com o vocabulário das palavras
encoder = tfds.features.text.SubwordTextEncoder(vocab_list=only_words)
print('Vocabulary size(text_new): {}'.format(encoder.vocab_size))

Vocabulary size(text_new): 968


In [16]:
# Exemplo do encode e decode da text_new	
sample_string = 'feel better die happy'
# Encode
encoded_string = encoder.encode(sample_string)
print('Encoded string is {}'.format(encoded_string))
#Decode
original_string = encoder.decode(encoded_string)
print('The original string: "{}"'.format(original_string))

Encoded string is [390, 744, 4, 744, 449, 744, 300]
The original string: "feel better die happy"


In [17]:
# Validando a string original com a string após o decode
original_string == sample_string

True

In [18]:
# Exemplificando a relação de index ----> word
for index in encoded_string:
  print('{} ----> {}'.format(index, encoder.decode([index])))

390 ----> feel
744 ---->  
4 ----> better
744 ---->  
449 ----> die
744 ---->  
300 ----> happy


In [19]:
# Encode do label
label_encode = LabelEncoder()
target = label_encode.fit_transform(df['label'])

In [20]:
# Definindo feature e target
x = df['text_new']
y = target
# Dividindo dataset em treino e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)

In [21]:
# Função para criação da matriz
def pad_to_size(vec, size):
  zeros = [0] * (size - len(vec))
  vec.extend(zeros)
  return vec
# Função para encode do input
def encode_input(text_new):
  list_x = []
  for text in text_new:
    text_encode = encoder.encode(text)
    text_encode = pad_to_size(text_encode, 64)
    list_x.append(text_encode)
  # Convertendo x em tensor
  input_encode = tf.cast(list_x, tf.float32)
  return input_encode
# Encode do x_train e x_test
x_train = encode_input(x_train)
x_test = encode_input(x_test)

# Modelo RNN

In [22]:
# Criação do modelo de RNN
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(encoder.vocab_size, 128)(inputs)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         123904    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 4)                 516       
Total params: 322,052
Trainable params: 322,052
Non-trainable params: 0
________________________________________________

# Treinamento do Modelo

In [23]:
# Compilando modelo e configurando o processo de treinamento
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="rmsprop",
              metrics=['accuracy'])
# Treinando o modelo
history = model.fit(x_train, y_train, epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Avaliação do Modelo

In [24]:
# Testando a qualidade do modelo
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 0.7113182544708252
Test Accuracy: 0.725806474685669


# Exemplo de Predição

In [25]:
# Função para predição
def sample_predict(sample_pred_text):
  encoded_sample_pred_text = encoder.encode(sample_pred_text)
  encoded_sample_pred_text = tf.cast(encoded_sample_pred_text, tf.float32)
  predictions = model.predict(tf.expand_dims(encoded_sample_pred_text, 0))
  return (predictions)

In [26]:
sample_predict('feel better die happy')

array([[0.02238215, 0.86019254, 0.03252797, 0.08489731]], dtype=float32)

In [41]:
# Predição do exemplo
example = 'feel better die happy'
predictions = sample_predict(example)
probabilities = [np.argmax(predictions[0])]
# Retornando os labels
new_label = label_encode.inverse_transform(probabilities)
print('O exemplo "{}" foi classificado como "{}"'.format(example, new_label[0]))

O exemplo "feel better die happy" foi classificado como "Depression"


# Bônus - Tradução de texto com python

In [None]:
!pip install googletrans

In [39]:
from googletrans import Translator

translator = Translator()
translation_example = translator.translate(example, dest='pt')
translation_label = translator.translate(new_label[0], dest='pt')
print('Exemplo Original "{}" ---> Label Original "{}"'.format(translation_example.origin, translation_label.origin))
print('Exemplo Traduzido "{}" ---> Label Traduzido "{}"'.format(translation_example.text, translation_label.text))

Exemplo Original "feel better die happy" ---> Label Original "Depression"
Exemplo Traduzido "sinta-se melhor morrer feliz" ---> Label Traduzido "Depressão"
