In [None]:


#@title The MIT License (MIT)
#
# Copyright (c) 2024 Eric dos Santos.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Sistema de Classificação de Notícias Falsas

Este projeto tem como objetivo desenvolver uma rede neural para detecção de fake news em língua portuguesa, utilizando o dataset [Fake.br-Corpus](https://github.com/roneysco/Fake.br-Corpus). Com isso, buscamos criar um sistema capaz de identificar padrões e distinguir notícias falsas de verdadeiras, contribuindo para o combate à desinformação.

<table class="tfo-notebook-buttons" align="center">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/ericshantos/br_fake_news_detector_model/blob/main/br_fake_news_detector_model.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Rode no Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/ericshantos/br_fake_news_detector_model/blob/main/br_fake_news_detector_model/br_fake_news_detector_model.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />Visualize o código no GitHub</a>
  </td>
</table>

## Carregamento do dataset

In [None]:
# Fetch the Fake.br-Corpus repository into the current working directory
!git clone https://github.com/roneysco/Fake.br-Corpus
# Folder inside the clone from which the "fake" and "true" subfolders are read
DATA_PATH = "./Fake.br-Corpus/full_texts"

Cloning into 'Fake.br-Corpus'...
remote: Enumerating objects: 28763, done.[K
remote: Total 28763 (delta 0), reused 0 (delta 0), pack-reused 28763 (from 1)[K
Receiving objects: 100% (28763/28763), 37.10 MiB | 14.56 MiB/s, done.
Resolving deltas: 100% (14129/14129), done.
Updating files: 100% (21602/21602), done.


In [None]:
import pandas as pd
import os

# News directories: one subfolder per class under DATA_PATH
fake_dir = f"{DATA_PATH}/fake"
real_dir = f"{DATA_PATH}/true"

### Extração do conteúdo das notícias:


In [None]:
import os
import pandas as pd

def load_news(news_dir: str, label: int) -> pd.DataFrame:
    """Load every ``.txt`` file of a directory into a labelled DataFrame.

    Args:
        news_dir: Directory holding one news article per ``.txt`` file.
            Files with any other extension are ignored.
        label: Class label stored next to every article read from
            ``news_dir``.

    Returns:
        DataFrame with the columns ``text`` (full file content) and
        ``label``, one row per ``.txt`` file, ordered by file name. An
        empty directory yields an empty DataFrame that still has both
        columns.

    Raises:
        FileNotFoundError: If ``news_dir`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded shuffle downstream) reproducible.
    for filename in sorted(os.listdir(news_dir)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(news_dir, filename)

        # Explicit encoding: the default depends on the platform locale and
        # would garble accented Portuguese text on non-UTF-8 systems.
        with open(file_path, "r", encoding="utf-8") as file:
            records.append({"text": file.read(), "label": label})

    # Passing the columns explicitly guarantees the schema even with no rows
    return pd.DataFrame(records, columns=["text", "label"])

Resultado:

In [None]:
# Label convention used by the rest of the notebook: 0 = fake, 1 = real
fake_news = load_news(fake_dir, 0)
real_news = load_news(real_dir, 1)

## Pré-processamento dos dados

### Concatenar os DataFrames

Agrupar os DataFrames para gerar uma única base de dados robusta.

In [None]:
# Stack both classes into one frame, then shuffle all rows with a fixed seed.
# ignore_index only affects the concat, so the shuffled frame keeps a permuted index.
data_news = pd.concat([fake_news, real_news], ignore_index=True).sample(frac=1, random_state=13)

Informações sobre a base final:

In [None]:
# Summary of the combined frame: row count, columns, non-null counts and dtypes
data_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7200 entries, 3248 to 338
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7200 non-null   object
 1   label   7200 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 168.8+ KB


In [None]:
# Column-wise cast: a column is converted to float only when every one of its
# values, rendered as a string, looks numeric; otherwise it is returned unchanged.
data_news = data_news.apply(

    # If valid, cast the column to float
    lambda col: col.astype(float) if col.apply(

        # Check whether the value is made of digits (a single '.' is dropped first)
        lambda x: str(x).replace('.', '', 1).isdigit()
    ).all() else col
)

# Result
print(data_news.dtypes)

text      object
label    float64
dtype: object


### Limpeza de dados

In [None]:
# Install the Portuguese spaCy model and unidecode; all installer output is discarded
!python -m spacy download pt_core_news_sm > /dev/null 2>&1
!pip install unidecode > /dev/null 2>&1

from unidecode import unidecode
import spacy

# spaCy pipeline used by clean_text
nlp = spacy.load("pt_core_news_sm")

def clean_text(text):
    """Reduce a news text to a space-separated string of normalized lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped; the lemma of every
    remaining token is passed through ``unidecode`` to strip accents.

    Args:
        text: Raw text of one news article.

    Returns:
        The kept lemmas joined by single spaces.
    """
    kept_lemmas = []

    for tok in nlp(text):
        # Skip anything that is a stop word or a punctuation mark
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(unidecode(tok.lemma_))

    return ' '.join(kept_lemmas)

Limpar o conteúdo das notícias:

In [None]:
# Overwrites the raw text in place with its cleaned form; re-running this cell
# on its own would clean the already-cleaned text again.
data_news["text"] = data_news["text"].apply(clean_text)

In [None]:
# Re-inspect the frame after cleaning (columns, non-null counts, dtypes)
data_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7200 entries, 3248 to 338
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    7200 non-null   object 
 1   label   7200 non-null   float64
dtypes: float64(1), object(1)
memory usage: 168.8+ KB


## Treinamento

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Tokenizer object; num_words=10000 limits how many distinct words are kept
# when texts are converted to sequences
tokenizer = Tokenizer(num_words=10000)

# NOTE(review): fitted on every text, i.e. before the train/test split done
# further down, so the vocabulary also reflects the test articles — confirm
# this is acceptable.
tokenizer.fit_on_texts(data_news['text'])

# Convert the texts into sequences of integers
sequences = tokenizer.texts_to_sequences(data_news['text'])

### Prepara os rótulos e dados para treinamento

In [None]:
# Bring every sequence to a fixed length of 200 (pad_sequences pads or truncates)
X = pad_sequences(sequences, maxlen=200)

# News labels (fake or real)
y = data_news["label"]

### Divisão do conjunto de dados em treino e teste

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Report the shape of each split
print(f"Tamanho do conjunto de treino: {X_train.shape}")
print(f"Tamanho do conjunto de teste: {X_test.shape}")

Tamanho do conjunto de treino: (5760, 200)
Tamanho do conjunto de teste: (1440, 200)


### Arquitetura do modelo

In [None]:
model = Sequential([
  # Input layer: turns each token id (ids below 10000) into a dense
  # 128-dimensional vector. `input_length` is intentionally not passed: it is
  # deprecated in Keras 3 (it only triggers a warning) and is optional in
  # Keras 2; the sequence length is taken from the data when fitting.
  Embedding(input_dim=10000, output_dim=128),

  # Hidden layers: stacked LSTMs. The first two return the full sequence so
  # that the following LSTM receives one vector per timestep.
  LSTM(128, return_sequences=True),
  Dropout(0.2),
  LSTM(64, return_sequences=True),
  Dropout(0.2),
  LSTM(32),

  # Output layer: a single sigmoid unit for the binary label
  Dense(1, activation="sigmoid")
])



**Compilação do modelo**:

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported
# alongside the loss during training and evaluation
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

### Treinando o modelo

In [None]:
# Validate on a slice of the training data instead of the test split: the test
# split is what model.evaluate scores afterwards, so monitoring it during
# training would make that final score a repeat of the last validation epoch
# rather than an independent measurement.
# The labels are handed over as a NumPy array because validation_split is
# documented for NumPy arrays / tensors only.
history = model.fit(X_train, y_train.to_numpy(), epochs=5, batch_size=128, validation_split=0.1)

Epoch 1/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 2s/step - accuracy: 0.9964 - loss: 0.0237 - val_accuracy: 0.9500 - val_loss: 0.1705
Epoch 2/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2s/step - accuracy: 0.9975 - loss: 0.0187 - val_accuracy: 0.9493 - val_loss: 0.1812
Epoch 3/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2s/step - accuracy: 0.9985 - loss: 0.0121 - val_accuracy: 0.9521 - val_loss: 0.1905
Epoch 4/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 2s/step - accuracy: 0.9972 - loss: 0.0131 - val_accuracy: 0.9500 - val_loss: 0.1883
Epoch 5/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1s/step - accuracy: 0.9982 - loss: 0.0102 - val_accuracy: 0.9528 - val_loss: 0.1991


#### Avaliação do modelo

In [None]:
# Score the trained model on the test split and report loss and accuracy
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Perda: {loss}, Acurácia: {accuracy}")

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 182ms/step - accuracy: 0.9554 - loss: 0.1663
Perda: 0.19907745718955994, Acurácia: 0.9527778029441833


### Salvar o modelo

In [None]:
# Persist the trained model to a single .keras file in the working directory
model.save("br_fake_news_predict_model.keras")