# LIMPANDO

In [21]:
import pandas as pd

In [31]:
df_binary_classification = pd.read_csv("2019-05-28_portuguese_hate_speech_binary_classification.csv")

In [32]:
df_binary_classification.head(3)

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,1,1,A,1.0,V,0,E
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,0,1,A,0.0,B,0,E


In [33]:
df_hate = df_binary_classification[['text', 'hatespeech_comb']]
df_hate.rename(columns = 
{
    'hatespeech_comb':'hate',
}, inplace = True)

In [34]:
df_hate.head(3)

Unnamed: 0,text,hate
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,1
1,@_carmeloneto Estes incompetentes não cuidam n...,0
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,0


In [35]:
import re, os
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from string import punctuation

In [36]:
def clean_str(string):

#    word_tokens = word_tokenize(string)
    
    string = re.sub(r"@[A-Za-z0-9]+", ' ', string)
    string = re.sub(r"https?://[A-Za-z0-9./]+", ' ', string)
    # Remove as imagens
    string = re.sub(r'pic.twitter.com/\S+', ' ', string)
    string = re.sub(r'(\<.*?\>)', ' ', string, flags=re.UNICODE)
    
    string = re.sub(r"[^a-zA-Z.!?']", ' ', string)
    string = re.sub(r" +", ' ', string)
    
    word_tokens = word_tokenize(string.lower())
    
    stop_words = set(stopwords.words('portuguese') + list(punctuation))
    palavras_sem_stopwords = [palavra for palavra in word_tokens if palavra not in stop_words]
    
    string = ' '.join(palavras_sem_stopwords)

    
    return string.strip()

In [37]:
df_hate['clean_text'] = df_hate['text'].apply(clean_str)

df_hate=df_hate.reindex(columns= ['text', 'clean_text', 'hate'])

In [38]:
df_hate.head(3)

Unnamed: 0,text,clean_text,hate
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,andrea b cara vive outro mundo n mundo real re...,1
1,@_carmeloneto Estes incompetentes não cuidam n...,carmeloneto incompetentes n cuidam povo brasil...,0
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,carmeloneto 'cumpanhero quebraram todas regras,0


# BERT

In [63]:
from keras.layers import Dense, Input
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dropout
from keras.models import Sequential, Model
from keras import metrics
import keras.models

In [53]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased')
#embedder = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [54]:
sentences = df_hate["clean_text"].values
labels = df_hate['hate'].values
sentence_embeddings = sbert_model.encode(sentences)

In [55]:
length_long_sentence = max([len(s) for s in sentences])

In [56]:
def TrainDeepLearningModel(maiorTamanho, tamanhoVocabulario, numDimensions, matriz_embedding = None):

    print("Maior tamanho:", maiorTamanho)
    print("Tamanho vocabulário:", tamanhoVocabulario)
    print("Numero dimensões:", numDimensions)
    
    deep_inputs = Input(shape=(numDimensions,))
    embedding = Embedding(tamanhoVocabulario, numDimensions, input_length=maiorTamanho, weights=[matriz_embedding], trainable=False)(deep_inputs) # line A
    flatten = Flatten()(embedding)
    hidden1 = Dense(20, activation='relu')(flatten)

    h4 = Dense(10, activation='sigmoid')(hidden1)
    hidden3 = Dense(2, activation='softmax')(h4)
    model = Model(inputs=deep_inputs, outputs=hidden3)

    model.compile(loss="mse", optimizer="adam", metrics=[metrics.mean_squared_error, metrics.mean_absolute_error])
    model.summary();
    
    return model

In [64]:
model = TrainDeepLearningModel(
    length_long_sentence, 
    sentence_embeddings.shape[0], 
    sentence_embeddings.shape[1], 
    sentence_embeddings
)

Maior tamanho: 130
Tamanho vocabulário: 5670
Numero dimensões: 512
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 512)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 512, 512)          2903040   
_________________________________________________________________
flatten_2 (Flatten)          (None, 262144)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 20)                5242900   
_________________________________________________________________
dense_7 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 22        
Total params: 8,146,172
Trainable params: 5,243,132
N

In [65]:
model.fit(sentence_embeddings, labels, epochs=2, verbose=1);

Epoch 1/2
Epoch 2/2


In [67]:
teste = sbert_model.encode(['eu odeio pretos']);
resultado = pd.DataFrame(model.predict(teste), columns=["positive", 'negative']);
resultado

Unnamed: 0,positive,negative
0,0.5,0.5


### Não entendi direito o q fiz ai em cima... vou tentar outro modelo da [internet]('https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671')

In [74]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading: 100%|██████████| 433/433 [00:00<00:00, 32.3kB/s]
Downloading: 100%|██████████| 536M/536M [03:10<00:00, 2.81MB/s]
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream t

In [75]:
import tensorflow as tf


In [76]:
df_hate.shape

(5670, 3)

In [77]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=2500, 
    validation_split=0.2, 
    subset='training', 
    seed=123
)

test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=3170, 
    validation_split=0.2,
    subset='validation', 
    seed=123
)


FileNotFoundError: [WinError 3] O sistema não pode encontrar o caminho especificado: 'aclImdb/train'

In [70]:
import numpy as np

In [72]:
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['clean_text', 'hate']
train['clean_text'] = train['clean_text'].str.decode("utf-8")
train.head()


NameError: name 'train' is not defined