In [None]:


#@title The MIT License (MIT)
#
# Copyright (c) 2025 Eric dos Santos.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Fake News Classification

This project develops a neural network that detects fake news written in Portuguese, trained on the [Fake.br-Corpus](https://github.com/roneysco/Fake.br-Corpus) dataset. The goal is a system that learns the patterns separating fake news from real news, contributing to the fight against misinformation.

<table class="tfo-notebook-buttons" align="center">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/ericshantos/br_fake_news_detector/blob/main/br_fake_news_detector_model.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run on Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/ericshantos/br_fake_news_detector/blob/main/br_fake_news_detector_model.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View the code on GitHub</a>
  </td>
</table>

## Dataset loading

In [2]:
# Fetch the Fake.br-Corpus repository into the current working directory
!git clone https://github.com/roneysco/Fake.br-Corpus
# Folder inside the clone from which the "fake" and "true" sub-folders are read
DATA_PATH = "./Fake.br-Corpus/full_texts"

Cloning into 'Fake.br-Corpus'...
remote: Enumerating objects: 28763, done.[K
remote: Total 28763 (delta 0), reused 0 (delta 0), pack-reused 28763 (from 1)[K
Receiving objects: 100% (28763/28763), 37.10 MiB | 11.95 MiB/s, done.
Resolving deltas: 100% (14129/14129), done.
Updating files: 100% (21602/21602), done.


In [3]:
import pandas as pd
import os

# Sub-folders of the corpus holding each class of article
fake_dir = DATA_PATH + "/fake"
real_dir = DATA_PATH + "/true"

### News content extraction:


In [4]:
import os
import pandas as pd

def load_news(news_dir: str, label: int) -> pd.DataFrame:
    """Load every ``.txt`` file of a directory into a labelled DataFrame.

    Args:
        news_dir: Directory containing one news article per ``.txt`` file.
            Entries without the ``.txt`` extension are ignored.
        label: Class label stored next to every article read from
            ``news_dir``.

    Returns:
        A DataFrame with a ``text`` column (the file contents) and a
        ``label`` column, one row per ``.txt`` file, ordered by file name.
        Both columns are present even when no file matches.

    Raises:
        FileNotFoundError: If ``news_dir`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    news = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order stable so any seeded shuffle/split applied later is reproducible.
    for filename in sorted(os.listdir(news_dir)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(news_dir, filename)

        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8, which would corrupt (or fail on) accented characters.
        with open(file_path, "r", encoding="utf-8") as file:
            news.append({"text": file.read(), "label": label})

    # Naming the columns keeps the schema intact for an empty directory.
    return pd.DataFrame(news, columns=["text", "label"])

Load the fake news (label 0) and the real news (label 1):

In [5]:
# Label convention used in this notebook: 0 = fake, 1 = real
fake_news = load_news(news_dir=fake_dir, label=0)
real_news = load_news(news_dir=real_dir, label=1)

## Data preprocessing

### Concatenate the DataFrames

Concatenate the two DataFrames into a single, shuffled dataset.

In [6]:
# Stack both classes under a fresh 0..n-1 index, then shuffle every row
# (frac=1) using a fixed seed.
data_news = (
    pd.concat([fake_news, real_news], ignore_index=True)
    .sample(frac=1, random_state=13)
)

Summary of the combined dataset:

In [7]:
# Print the row count, column dtypes and non-null counts of the combined data
data_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7200 entries, 3248 to 338
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7200 non-null   object
 1   label   7200 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 168.8+ KB


In [8]:
# Cast a column to float only when every one of its values, rendered as a
# string, consists of digits with at most one "." removed; any other column
# is returned unchanged.
data_news = data_news.apply(
    lambda column: column.astype(float)
    if all(str(value).replace('.', '', 1).isdigit() for value in column)
    else column
)

# Show the resulting dtype of each column
print(data_news.dtypes)

text      object
label    float64
dtype: object


### Data cleaning

In [9]:
# Install the Portuguese spaCy pipeline and unidecode. All command output is
# redirected to /dev/null, so a failed install only surfaces at the import /
# spacy.load calls below.
!python -m spacy download pt_core_news_sm > /dev/null 2>&1
!pip install unidecode > /dev/null 2>&1

from unidecode import unidecode
import spacy

# Pipeline object that clean_text calls on every article
nlp = spacy.load("pt_core_news_sm")

def clean_text(text):
    """Return ``text`` reduced to a space-separated string of lemmas.

    The text is run through the ``nlp`` pipeline. Tokens flagged as stop
    words or punctuation are dropped; the lemma of every remaining token is
    passed through ``unidecode`` and the results are joined with single
    spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        # Stop words and punctuation are excluded from the cleaned text
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(unidecode(token.lemma_))

    return ' '.join(kept_lemmas)

Clean the news content:

In [10]:
# Replace every article with its cleaned, lemmatised form
data_news["text"] = data_news["text"].map(clean_text)

In [11]:
# Re-check row count, dtypes and non-null counts after cleaning
data_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7200 entries, 3248 to 338
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    7200 non-null   object 
 1   label   7200 non-null   float64
dtypes: float64(1), object(1)
memory usage: 168.8+ KB


## Training

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Tokenizer object; num_words matches the Embedding input_dim (10000) used by
# the model defined further down.
tokenizer = Tokenizer(num_words=10000)

# Build the word index from every article in data_news.
# NOTE(review): this runs before the train/test split, so the vocabulary is
# learned from rows that later end up in the test set — confirm this is
# acceptable for the reported metrics.
tokenizer.fit_on_texts(data_news['text'])

# Converting texts into sequences of numbers
sequences = tokenizer.texts_to_sequences(data_news['text'])

### Prepare the labels and data for training

In [13]:
# Bring every token sequence to a fixed length of 200, the length expected by
# the model's Input(shape=(200,)) layer
X = pad_sequences(sequences, maxlen=200)

# News labels: 0 = fake, 1 = real (as assigned when the files were loaded)
y = data_news["label"]

### Save the tokenizer

In [14]:
import pickle

In [15]:
# Serialise the fitted tokenizer to tokenizer.pkl (presumably so the same word
# index can be reused outside this notebook — confirm with the consumer).
# NOTE(review): pickle files must only be loaded from trusted sources.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

### Splitting the dataset into training and test sets

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Report the shape of each feature array
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (5760, 200)
Test set size: (1440, 200)


### Model architecture

In [17]:
model = Sequential([

    # Input layer: sequences of 200 token ids (the maxlen used for padding)
    Input(shape=(200,)),

    # Embedding: maps each token id (input_dim=10000, the tokenizer's
    # num_words) to a 128-dimensional dense vector
    Embedding(input_dim=10000, output_dim=128),
    # Three stacked LSTMs of shrinking width; the first two set
    # return_sequences=True so the following LSTM receives a sequence
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),

    # Output layer: a single sigmoid unit for the binary fake/real label
    Dense(1, activation="sigmoid")
])

**Model compilation**:

In [18]:
# Binary cross-entropy pairs with the model's single sigmoid output unit
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

### Training the model

In [19]:
# Train for 5 epochs; the held-out test split doubles as the validation data
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - accuracy: 0.7121 - loss: 0.4965 - val_accuracy: 0.9368 - val_loss: 0.2330
Epoch 2/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9412 - loss: 0.2239 - val_accuracy: 0.9368 - val_loss: 0.2352
Epoch 3/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9443 - loss: 0.2158 - val_accuracy: 0.9368 - val_loss: 0.2337
Epoch 4/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9431 - loss: 0.2172 - val_accuracy: 0.9368 - val_loss: 0.2334
Epoch 5/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9384 - loss: 0.2313 - val_accuracy: 0.9368 - val_loss: 0.2346


#### Model evaluation

In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

In [21]:
# Predicted probabilities from the model's sigmoid output for the test set
y_pred_proba = model.predict(X_test)

# Applying the threshold: probabilities >= 0.7 become class 1, the rest class 0
y_pred = (y_pred_proba >= 0.7).astype(int)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step


In [22]:
# F1 of the thresholded predictions against the test labels
print("F1 Score: ", f1_score(y_true=y_test, y_pred=y_pred))

F1 Score:  0.9345794392523364


In [23]:
# Accuracy of the thresholded predictions against the test labels
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy:  0.9368055555555556


In [24]:
# Precision of the thresholded predictions against the test labels
print("Precision: ", precision_score(y_true=y_test, y_pred=y_pred))

Precision:  0.9558823529411765


In [25]:
# Recall of the thresholded predictions against the test labels
print("Recall: ", recall_score(y_true=y_test, y_pred=y_pred))

Recall:  0.9142053445850914


In [26]:
# ROC AUC is computed from the raw probabilities, not the thresholded classes
print("ROC AUC: ", roc_auc_score(y_true=y_test, y_score=y_pred_proba))

ROC AUC:  0.9588766763325289


### Save the model

In [27]:
# Write the trained model to a .keras file in the working directory
model.save("veritas-lstm-v1.1-ptbr.keras")