# LIMPANDO

In [1]:
import pandas as pd

In [2]:
df_binary_classification = pd.read_csv("2019-05-28_portuguese_hate_speech_binary_classification.csv")

In [3]:
df_binary_classification.head(3)

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,1,1,A,1.0,V,0,E
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,0,1,A,0.0,B,0,E


In [4]:
df_hate = df_binary_classification[['text', 'hatespeech_comb']]
df_hate.rename(columns = 
{
    'hatespeech_comb':'hate',
}, inplace = True)

In [5]:
df_hate.head(3)

Unnamed: 0,text,hate
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,1
1,@_carmeloneto Estes incompetentes não cuidam n...,0
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,0


In [6]:
import re, os
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from string import punctuation

In [7]:
def clean_str(string):

#    word_tokens = word_tokenize(string)
    
    string = re.sub(r"@[A-Za-z0-9]+", ' ', string)
    string = re.sub(r"https?://[A-Za-z0-9./]+", ' ', string)
    # Remove as imagens
    string = re.sub(r'pic.twitter.com/\S+', ' ', string)
    string = re.sub(r'(\<.*?\>)', ' ', string, flags=re.UNICODE)
    
    string = re.sub(r"[^a-zA-Z.!?']", ' ', string)
    string = re.sub(r" +", ' ', string)
    
    word_tokens = word_tokenize(string.lower())
    
    stop_words = set(stopwords.words('portuguese') + list(punctuation))
    palavras_sem_stopwords = [palavra for palavra in word_tokens if palavra not in stop_words]
    
    string = ' '.join(palavras_sem_stopwords)

    
    return string.strip()

In [8]:
df_hate['clean_text'] = df_hate['text'].apply(clean_str)

df_hate=df_hate.reindex(columns= ['text', 'clean_text', 'hate'])

In [9]:
df_hate.head(3)

Unnamed: 0,text,clean_text,hate
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,andrea b cara vive outro mundo n mundo real re...,1
1,@_carmeloneto Estes incompetentes não cuidam n...,carmeloneto incompetentes n cuidam povo brasil...,0
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,carmeloneto 'cumpanhero quebraram todas regras,0


# BERT

In [10]:
from keras.layers import Dense, Input
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dropout
from keras.models import Sequential, Model
from keras import metrics
import keras.models

In [11]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
#sbert_model = SentenceTransformer('distiluse-base-multilingual-cased')
#embedder = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [12]:
sentences = df_hate["clean_text"].values
labels = df_hate['hate'].values
sentence_embeddings = sbert_model.encode(sentences)

In [13]:
length_long_sentence = max([len(s) for s in sentences])

In [14]:
def TrainDeepLearningModel(maiorTamanho, tamanhoVocabulario, numDimensions, matriz_embedding = None):

    print("Maior tamanho:", maiorTamanho)
    print("Tamanho vocabulário:", tamanhoVocabulario)
    print("Numero dimensões:", numDimensions)
    
    deep_inputs = Input(shape=(numDimensions,))
    embedding = Embedding(tamanhoVocabulario, numDimensions, input_length=maiorTamanho, weights=[matriz_embedding], trainable=False)(deep_inputs) # line A
    flatten = Flatten()(embedding)
    hidden1 = Dense(20, activation='relu')(flatten)

    h4 = Dense(10, activation='sigmoid')(hidden1)
    hidden3 = Dense(2, activation='softmax')(h4)
    model = Model(inputs=deep_inputs, outputs=hidden3)

    model.compile(loss="mse", optimizer="adam", metrics=[metrics.mean_squared_error, metrics.mean_absolute_error])
    model.summary();
    
    return model

In [15]:
model = TrainDeepLearningModel(
    length_long_sentence, 
    sentence_embeddings.shape[0], 
    sentence_embeddings.shape[1], 
    sentence_embeddings
)

Maior tamanho: 130
Tamanho vocabulário: 5670
Numero dimensões: 512
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 512)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 512, 512)          2903040   
_________________________________________________________________
flatten (Flatten)            (None, 262144)            0         
_________________________________________________________________
dense (Dense)                (None, 20)                5242900   
_________________________________________________________________
dense_1 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 22        
Total params: 8,146,172
Trainable params: 5,243,132
N

In [16]:
model.fit(sentence_embeddings, labels, epochs=2, verbose=1);

Epoch 1/2
Epoch 2/2


In [17]:
teste = sbert_model.encode(['eu odeio pretos']);
resultado = pd.DataFrame(model.predict(teste), columns=["positive", 'negative']);
resultado

Unnamed: 0,positive,negative
0,0.479165,0.520835


### Não entendi direito o q fiz ai em cima... Uma opção para explorar é essa da [internet]('https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671')

### Outra opção é [essa](https://github.com/thomas-ferraz/FakeNews-BERTimbau/blob/main/Modelo_BERTimbau.ipynb) que usa BERTimbau para analisar Fake News.

Vamos testar ela abaixo.

In [None]:
#Separar treino e test do treinamento

#aleatorizar e separar a 0
df_train=df.loc[df['class'] == 0].sample(frac=0.7,random_state=12) #random state is a seed value
df_val=df.loc[df['class'] == 0].drop(df_train.index)

#aleatorizar e separar a 1
df_train2=df.loc[df['class'] == 1].sample(frac=0.7,random_state=12) #random state is a seed value
df_val2=df.loc[df['class'] == 1].drop(df_train2.index)

#Juntar
df_train = pd.concat([df_train, df_train2])
df_val = pd.concat([df_val, df_val2])

#aleatorizar
from sklearn.utils import shuffle
df_train = shuffle(df_train, random_state=12)
df_val = shuffle(df_val, random_state=12)

In [None]:
X_val = df_val['text'].to_numpy()
X_train = df_train['text'].to_numpy()
Y_val = df_val['class'].to_numpy()
Y_train = df_train['class'].to_numpy()

In [None]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

Tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased')

In [None]:
def tokenize(sentence, max_len=3000):
    tokens = Tokenizer.encode_plus(sentence, max_length=max_len,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_attention_mask=True,
                                   return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

def token_encode(X, max_len):
  input_ids = []
  masks = []
  for text in X:
    text_input_id, text_mask = tokenize(text, max_len=max_len)
    input_ids.append(tf.reshape(text_input_id, [max_len]))
    masks.append(tf.reshape(text_mask, [max_len]))
  return np.array(input_ids), np.array(masks)

In [None]:
def build_model(max_len=512, type_model='OnlyBERT'):
  input_ids = tf.keras.layers.Input(shape=(max_len,), name='input_ids', dtype=tf.int32)
  mask = tf.keras.layers.Input(shape=(max_len,), name='attention_mask', dtype=tf.int32)
  
  # By December 12th Hugging Face's Transformer Library only had the PyTorch version of BERTimbau so we needed to
  # convert the PyTorch model to TensorFlow using the AutoConfig class and adding from_pt (from pytorch) equal to True
  config = AutoConfig.from_pretrained('neuralmind/bert-large-portuguese-cased')
  BERTimbau = TFAutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased', from_pt=True, config=config)

  last_bert_hidden_layer = BERTimbau(input_ids, attention_mask=mask)[0]

  if type_model == 'OnlyBERT':
    net = tf.keras.layers.Dense(64, activation='relu')(last_bert_hidden_layer)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Flatten()(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(net)

  elif type_model == 'BERT_LSTM':
    net = tf.keras.layers.Dense(64, activation='relu')(last_bert_hidden_layer)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Flatten()(net)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(net)

  model = tf.keras.models.Model(inputs=[input_ids, mask], outputs=out)
  model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

  return model

In [None]:
max_len = 100

X_train = token_encode(df_train['text'].to_numpy(), max_len)
X_val = token_encode(df_val['text'].to_numpy(), max_len)

In [None]:
X_train[0].shape


In [None]:
X_train[1].shape


In [None]:
print(X_train[0])


In [None]:
print(X_train[1])


In [None]:
model3 = build_model(max_len=max_len)
model3.summary()

In [None]:
train_history_3 = model3.fit(
    X_train, Y_train, 
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=8,
    verbose=1
)

In [None]:
Y_test = model3.predict(X_val)


In [None]:
def y_ajust(y):
  y_hat = []
  for x in y:
    if x >= 0.5:
      y_hat.append(1)
    else:
      y_hat.append(0)
  return y_hat

In [None]:
from sklearn import metrics
Y_test = y_ajust(Y_test)
conf_matrix = metrics.confusion_matrix(Y_val, Y_test)
print(conf_matrix)
print(metrics.classification_report(Y_val, Y_test, digits=2))

In [None]:
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=[0,1]).plot()


In [None]:
plot_graphs(train_history_3, "accuracy")


In [None]:
plot_graphs(train_history_3, "loss")
