In [1]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
import unicodedata
import re
import pandas as pd
import numpy as np
  
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Convolution1D
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

In [2]:
# Mount Google Drive inside the Colab runtime so the dataset can be read from it.
# This step is interactive: it asks for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [53]:
# Project folder inside the mounted Drive (relative to the notebook's working directory).
path = 'drive/My Drive/Doutorado/'
# Load the labelled news dataset from the Excel workbook stored in that folder.
df = pd.read_excel(f'{path}Fake_dataset_truncated.xlsx')

In [54]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so the row order (and anything downstream
# that depends on it) is the same on every Restart & Run All. The value matches
# the seed used for cross-validation further down in this notebook.
df = shuffle(df, random_state=7)

In [55]:
# Preview the first rows of the shuffled frame to check the columns that were loaded.
df.head()

Unnamed: 0,Classe,Texto
3215,fake,"Mais um general se pronuncia: ""A esquerda derr..."
6637,true,Em duas sessões plenárias esta semana mais pel...
1179,fake,Áudio entre Renan Calheiros e Sérgio Machado: ...
2162,fake,Bolívia e Venezuela estão enviando frotas de ô...
7096,true,Justiça quer que socialite quite dívida antes ...


In [56]:
# Encode the label column as integers: 'fake' -> 1, anything else -> 0.
# Values that are already 1 stay 1, so re-running this cell on an already
# encoded column no longer silently turns every label into 0.
df['Classe'] = df['Classe'].map(lambda x: 1 if (x == 'fake' or x == 1) else 0)

In [57]:
 import nltk
 # Fetch the NLTK resources used below: the stop-word lists and the punkt tokenizer data.
 for resource in ('stopwords', 'punkt'):
     nltk.download(resource)
 from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
# Build the set of tokens to discard: NLTK's Portuguese stop words plus every
# ASCII punctuation mark.
# The corpus reader is imported under an alias because this cell rebinds the name
# `stopwords` to the resulting set; without the alias, running the cell a second
# time failed with AttributeError ('set' object has no attribute 'words').
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('portuguese')) | set(punctuation)

In [59]:
def remove_stopwords(texto,stopwords):
    """Return `texto` lower-cased and stripped of every token found in `stopwords`.

    Args:
        texto: the text to clean; it is lower-cased before tokenisation.
        stopwords: collection of tokens to drop (membership is tested with `in`).

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(texto.lower()):
        if token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

In [60]:
def remove_special_characters(palavra):
    """Strip accents and special characters from a piece of text.

    Args:
        palavra: the text to clean (a word or a whole sentence).

    Returns:
        The text with accents removed and every character other than ASCII
        letters, digits, spaces and backslashes deleted.
    """
    # NFKD decomposition splits an accented character into its base character
    # followed by combining marks; dropping the combining marks removes the accent.
    nfkd = unicodedata.normalize('NFKD', palavra)
    sem_acento = ''.join(c for c in nfkd if not unicodedata.combining(c))

    # Raw string: the previous non-raw literal contained the invalid escape `\]`,
    # which Python reports as a warning. The pattern itself is unchanged, so the
    # backslash is still among the characters that are kept.
    return re.sub(r'[^a-zA-Z0-9 \\]', '', sem_acento)

In [61]:
# Drop stop words and punctuation tokens from every text in the dataset.
df['Texto'] = df['Texto'].apply(remove_stopwords, args=(stopwords,))

In [62]:
# Strip accents and special characters from every text in the dataset.
df['Texto'] = df['Texto'].apply(remove_special_characters)

In [294]:
# Hyper-parameters shared by the vectorisation step and the model factory.
MAX_FEATURES = 10000   # value passed to the Tokenizer as num_words and used as the Embedding input size
EMBED_SIZE = 150       # embedding dimension
RNN_CELL_SIZE = 32     # number of GRU units

# No fixed sequence length is imposed. Fixed lengths of 2885 and 130 were tried
# earlier (the original note cites a mean text length of 128.5).
MAX_LEN = None

# Fit the vocabulary on the cleaned texts and turn each text into a list of word ids.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(df['Texto'])
list_tokenized_train = tokenizer.texts_to_sequences(df['Texto'])

# Pad the id lists into one 2-D array; the labels are kept as a plain list.
X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
y_train = list(df['Classe'].values)

In [295]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention that pools a sequence of vectors into one context vector.

    Each time step of `values` is scored against `query` with
    V(tanh(W1(query) + W2(values))); the scores are soft-maxed along the time
    axis and used to weight-sum `values`.

    Args:
        units: width of the two projection layers W1 and W2.
        **kwargs: standard Keras Layer arguments (e.g. `name`), forwarded to
            the base class.
    """

    def __init__(self, units, **kwargs):
        # Forward the standard Layer kwargs so the layer can be named and
        # rebuilt from its config.
        super(BahdanauAttention, self).__init__(**kwargs)
        self.units = units  # kept so get_config() can report it
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: tensor of shape (batch_size, hidden size).
            values: tensor of shape (batch_size, max_len, hidden size).

        Returns:
            A `(context_vector, attention_weights)` tuple with shapes
            (batch_size, hidden_size) and (batch_size, max_length, 1).
        """
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # The extra axis lets the addition below broadcast along the time axis.
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # The last axis is 1 because self.V projects to a single unit; before
        # self.V the tensor has shape (batch_size, max_length, units).
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

    def get_config(self):
        """Return the layer configuration, including `units`, for serialisation."""
        config = super(BahdanauAttention, self).get_config()
        config.update({"units": self.units})
        return config

In [304]:
def create_model():
    """Build and compile the GRU + attention binary classifier.

    Reads MAX_LEN, MAX_FEATURES, EMBED_SIZE and RNN_CELL_SIZE from the notebook
    globals.

    Returns:
        A compiled Keras model that maps a batch of word-id sequences to one
        sigmoid output per sample.
    """
    token_ids = Input(shape=(MAX_LEN,), dtype="int32")
    embedded = Embedding(MAX_FEATURES, EMBED_SIZE)(token_ids)

    # The GRU returns both the per-step outputs and its final state; the final
    # state is the attention query and the per-step outputs are the values.
    encoder = GRU(
        RNN_CELL_SIZE,
        return_sequences=True,
        return_state=True,
        name="gru_0",
        kernel_initializer='glorot_uniform',
    )
    gru_outputs, last_state = encoder(embedded)
    context_vector, _ = BahdanauAttention(10)(last_state, gru_outputs)

    # Two Dense(200, relu) blocks, each followed by its own dropout rate.
    features = context_vector
    for drop_rate in (0.2, 0.05):
        features = Dense(200, activation="relu")(features)
        features = Dropout(drop_rate)(features)
    probability = Dense(1, activation="sigmoid")(features)

    classifier = keras.Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        metrics=['accuracy'],
    )
    return classifier

In [305]:
# Build and compile a fresh model for the single train/validation run below.
model = create_model()

In [306]:
# Summarize layers.
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "functional_105"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_57 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_54 (Embedding)        (None, None, 150)    1500000     input_57[0][0]                   
__________________________________________________________________________________________________
gru_0 (GRU)                     [(None, None, 32), ( 17664       embedding_54[0][0]               
__________________________________________________________________________________________________
bahdanau_attention_53 (Bahdanau ((None, 32), (None,  671         gru_0[0][1]                      
                                                                 gru_0[0][0]         

In [307]:
# Training configuration for the single run.
BATCH_SIZE, EPOCHS = 64, 10

# Train the model, holding out 20% of the samples for validation.
history = model.fit(
    x=X_train,
    y=np.array(y_train),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [308]:
from sklearn.metrics import accuracy_score

# Score the fitted model on the same samples that were passed to fit(), so this
# number measures fit to the training data rather than generalisation.
prediction = model.predict(X_train)
y_pred = prediction > 0.5  # sigmoid outputs above 0.5 count as the positive class
accuracy_score(y_train, y_pred)

0.9772222222222222

In [309]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [310]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [311]:
# Wrap the model factory in a scikit-learn compatible estimator for cross-validation.
# Reuses the EPOCHS / BATCH_SIZE constants from the single training run instead of
# repeating their values as literals, so both experiments stay in sync.
# Note: this rebinds `model`, replacing the network fitted earlier.
model = KerasClassifier(build_fn=create_model, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)

In [312]:
# Seed NumPy and TensorFlow. The models are built and trained by TensorFlow, so
# seeding NumPy alone left the fold scores free to vary from run to run.
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

# Stratified 5-fold cross-validation; a new model is built for every fold.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(model, X_train, y_train, cv=kfold, verbose = 10)
print(results.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.906, total=  53.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.3s remaining:    0.0s


[CV] .................................... , score=0.890, total=  53.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV] .................................... , score=0.903, total=  53.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s


[CV] .................................... , score=0.905, total=  53.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.6min remaining:    0.0s


[CV] .................................... , score=0.890, total=  52.6s
0.8986111164093018


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.4min finished


In [313]:
# Per-fold scores returned by cross_val_score (one value per split).
results

array([0.90625   , 0.88958335, 0.90277779, 0.90486109, 0.88958335])