In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle



# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)


2.6.0


In [2]:
# Read the spreadsheet, using rows 0 and 1 as a two-level column header.
excel_path = 'Telegram_mens 07.09.xlsx'
df = pd.read_excel(excel_path, header=[0, 1])

In [3]:
# Keep only the first two columns, give them flat names, and discard every
# row that has a missing value in either of them.
df = df.iloc[:, :2]
df.columns = ['Mensagem', 'Classificação']
df = df.dropna(axis=0, how='any')

In [4]:
# Keep only rows whose label is exactly 0 or 1; rows carrying any other value
# in 'Classificação' are removed.
label_col = df['Classificação']
is_binary_label = (label_col == 0) | (label_col == 1)
df = df.drop(index=df[~is_binary_label].index)

In [5]:
# Renumber the rows from 0. Because drop=False, the previous row labels are
# preserved in a new 'index' column.
df = df.reset_index(drop=False)
df

Unnamed: 0,index,Mensagem,Classificação
0,0,https://bit.ly/3yZpz9A ATENÇÃO: RECADO PARA...,1
1,1,Padre Gabriel mostra a verdade sobre maçona...,0
2,2,"Quando voce se sentir um idiota, veja essa ...",1
3,3,"_Boa noite meus irmãos do BEM, pacíficos e o...",1
4,4,...A fraudemia é um plano arquitetado pelos ...,0
...,...,...,...
3972,4884,Zetinhahttps://youtu.be/vB31fGZ5m6IYouTube#07/...,0
3973,4885,Zetinhahttps://youtu.be/Yhy5KEmOFjEYouTubeVeja...,1
3974,4886,Zetinhahttps://youtu.be/znQIxxKQ_R4YouTubeOLHA...,1
3975,4887,zeyton santiz tudo se transforma em magia .......,0


In [6]:
# Pull the message texts and their labels out of the frame as plain lists.
sentences = df['Mensagem'].tolist()
labels = df['Classificação'].tolist()

In [7]:
# The first `training_size` rows are used for training and the remainder for
# evaluation. The split is purely positional: rows are not shuffled here.
training_size = 3500

training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [8]:
# Text-encoding settings shared by the tokenizer and the padding step.
vocab_size = 5000      # passed to the tokenizer as num_words
max_length = 400       # every sequence is padded/truncated to this length
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"      # token substituted for out-of-vocabulary words

# The vocabulary is learned from the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Return (sequences, padded) for `texts` using the fitted tokenizer."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length,
                           padding=padding_type, truncating=trunc_type)
    return sequences, padded


# Training and testing data go through the same tokenizer and padding.
# `traning_sequences` (sic) keeps its original spelling so that any other
# cell referring to it continues to work.
traning_sequences, training_padded = _encode_and_pad(training_sentences)
testing_sequences, testing_padded = _encode_and_pad(testing_sentences)

In [10]:
# Make sure all four model inputs are NumPy arrays before training.
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [11]:
embedding_dim = 16

# Binary text classifier, assembled layer by layer.
model = tf.keras.Sequential()
# Embedding layer so the network learns a vector for each token id.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Train for 15 epochs, using the testing split as validation data.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=15,
    validation_data=(testing_padded, testing_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [14]:
# Two hand-written sample messages used to spot-check the trained model.
new_sentence = [
    "Eu odeio bolsonaro, aquele filha da puta",
    "Eu apoio todo o brasil",
]

# Encode the samples with the same tokenizer and padding settings that were
# applied to the training data.
new_sequences = tokenizer.texts_to_sequences(new_sentence)
new_padded = np.array(
    pad_sequences(new_sequences, maxlen=max_length,
                  padding=padding_type, truncating=trunc_type)
)

# Print the model's output for each sample.
print(model.predict(new_padded))

[[0.24040264]
 [0.76771903]]


In [15]:
# Show the token ids the tokenizer produced for the two sample sentences.
new_sequences

[[83, 1, 21, 1588, 987, 9, 3820], [83, 241, 122, 3, 13]]

In [25]:
# Persist the trained model to 'Model_NLP.h5', then its weights alone to a
# second file. Each message names what was actually written.
model.save('Model_NLP.h5')
print('Model Saved!')
model.save_weights('Weights_NLP.h5')
print('Weights Saved!')

Model Saved!
Model Saved!


In [19]:
def save_object(obj, filename):
    """Pickle `obj` into the file at `filename`.

    Args:
        obj: The object to serialise; it must be picklable.
        filename: Destination path. It is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with open(filename, 'wb') as sink:
        # Highest protocol available to the running interpreter.
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

In [20]:
# Bundle the fitted tokenizer with the preprocessing settings. The values are
# taken from the variables that configured the tokenizer and padding rather
# than re-typed as literals, so the saved bundle cannot drift from what the
# model was trained with.
objs = {
    'Tokenizer': tokenizer,
    'vocab_size': vocab_size,
    'max_length': max_length,
    'trunc_type': trunc_type,
    'padding_type': padding_type,
    'oov_tok': oov_tok,
}

save_object(objs, 'Functions_to_nlp.obj')

In [21]:
# Read the pickled bundle back from disk. Only unpickle files you trust:
# loading a pickle can execute arbitrary code.
with open('Functions_to_nlp.obj', 'rb') as source:
    tech_companies = pickle.Unpickler(source).load()

In [22]:
# Display the object that was just loaded from 'Functions_to_nlp.obj'.
tech_companies

{'Tokenizer': <keras_preprocessing.text.Tokenizer at 0x1acae7004f0>,
 'vocab_size': 5000,
 'max_length': 400,
 'trunc_type': 'post',
 'padding_type': 'post',
 'oov_tok': '<OOV>'}