
# **Table of Contents**

1. [Installing and Importing Dependencies](#installing-and-importing-dependencies)  
2. [Mounting Google Drive](#mounting-google-drive)  
3. [Logging Configuration](#logging-configuration)  
4. [Preparing the DataFrame for the Fake.Br Corpus](#preparing-the-dataframe-for-the-fakebr-corpus)  
5. [Dataset Source Configuration and Column Mapping](#dataset-source-configuration-and-column-mapping)  
6. [Loading, Standardizing and Deduplicating Datasets](#loading-standardizing-and-deduplicating-datasets)  
7. [Text Preprocessing](#text-preprocessing)  
8. [Building MinHash-based LSH for Near-Duplicate Detection](#building-minhash-based-lsh-for-near-duplicate-detection)  
   8.1. [Creating MinHash Signatures](#creating-minhash-signatures)  
   8.2. [Creating LSH Index](#creating-lsh-index)  
9. [Computing Text Similarity Metrics](#computing-text-similarity-metrics)  
10. [Deduplicating](#deduplicating)  
11. [Transforming the Unified Dataset to Flat Format](#transforming-the-unified-dataset-to-flat-format)  
12. [Running the Full Data Processing Pipeline](#running-the-full-data-processing-pipeline)  
13. [Main Execution and Outputs](#main-execution-and-outputs)  
    13.1. [Unified Dataset Overview](#unified-dataset-overview)  
    13.2. [Flat Dataset Overview](#flat-dataset-overview)  
14. [Saving Current Session for Test Reuse](#saving-current-session-for-test-reuse)  


# **Installing and Importing Dependencies**

In [None]:
!pip install datasketch dill strsimpy



In [None]:
import dill
import json
import logging
import nltk
import numpy as np
import os
import pandas as pd
import re
import unicodedata

from datasketch import MinHash, MinHashLSH
from google.colab import drive
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from strsimpy.cosine import Cosine
from strsimpy.jaccard import Jaccard
from strsimpy.overlap_coefficient import OverlapCoefficient
from typing import Dict, Tuple

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# **Mounting Google Drive**

In [None]:
# Mount Google Drive into the Colab filesystem (a remount is requested even if it is already mounted).
drive.mount('/content/drive', force_remount=True)
# Root folder for every input and output file of this notebook. Later cells build
# paths by plain string concatenation (base_path + 'name.csv'), so the trailing
# slash is required.
base_path = '/content/drive/MyDrive/FakeNews/'

Mounted at /content/drive


# **Logging Configuration**




In [None]:
# Configure the root logger once for the whole notebook: INFO level and
# "timestamp - level - message" lines. force=True replaces handlers that are
# already attached to the root logger, so re-running this cell still takes effect.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True
)

# **Preparing the DataFrame for the Fake.Br Corpus**

In [None]:
def extract_fakebr_corpus(base_path):
    """
    Loads and structures the Fake.Br Corpus dataset with aligned fake and true news texts
    and their respective metadata, saving the result to a CSV file.

    A fake/true pair is included only when both text files and both metadata files
    exist and read_metadata() accepts both metadata files. Metadata values are mapped
    to the column names by position. When a metadata file provides fewer values than
    there are column names, the missing trailing columns are stored as None (and a
    warning is logged) instead of raising IndexError.

    Parameters:
    - base_path (str): Base directory containing the Fake.br-Corpus-master folder.

    Returns:
    - pd.DataFrame: The structured DataFrame that was written to disk.

    Output:
    - Saves the structured DataFrame as 'fake_br.csv' under the base path.
    """

    root_path = os.path.join(base_path, "Fake.br-Corpus-master/full_texts")
    fake_path = os.path.join(root_path, "fake")
    true_path = os.path.join(root_path, "true")
    fake_meta_path = os.path.join(root_path, "fake-meta-information")
    true_meta_path = os.path.join(root_path, "true-meta-information")

    meta_columns = [
        "author", "link", "category", "date_of_publication", "num_tokens",
        "num_words_no_punctuation", "num_types", "num_links", "num_uppercase_words",
        "num_verbs", "num_subjunctive_imperative_verbs", "num_nouns", "num_adjectives",
        "num_adverbs", "num_modal_verbs", "num_singular_first_second_pronouns",
        "num_plural_first_pronouns", "num_pronouns", "pausality", "num_characters",
        "avg_sentence_length", "avg_word_length", "perc_spelling_errors",
        "emotiveness", "diversity"
    ]

    # Fix the column order up front so the CSV keeps its header even when no
    # record is produced (an empty, header-less CSV cannot be read back).
    output_columns = ["fake_news", "true_news"]
    for column in meta_columns:
        output_columns += [f"fake_{column}", f"true_{column}"]

    records = []
    short_metadata_pairs = 0

    for fake_file in sorted(os.listdir(fake_path)):
        if not fake_file.endswith(".txt"):
            continue

        # Fake and true texts (and their metadata) are aligned by file name.
        fake_text_path = os.path.join(fake_path, fake_file)
        true_text_path = os.path.join(true_path, fake_file)

        meta_file = fake_file.replace(".txt", "-meta.txt")
        fake_meta_path_full = os.path.join(fake_meta_path, meta_file)
        true_meta_path_full = os.path.join(true_meta_path, meta_file)

        if not (os.path.exists(fake_text_path) and os.path.exists(true_text_path)):
            continue
        if not (os.path.exists(fake_meta_path_full) and os.path.exists(true_meta_path_full)):
            continue

        with open(fake_text_path, "r", encoding="utf-8") as file:
            fake_text = file.read().strip()

        with open(true_text_path, "r", encoding="utf-8") as file:
            true_text = file.read().strip()

        fake_meta = read_metadata(fake_meta_path_full)
        true_meta = read_metadata(true_meta_path_full)

        if fake_meta is None or true_meta is None:
            continue

        if len(fake_meta) < len(meta_columns) or len(true_meta) < len(meta_columns):
            short_metadata_pairs += 1

        row = {
            "fake_news": fake_text,
            "true_news": true_text
        }

        # Positional mapping; guard the index so a short metadata list pads with
        # None instead of raising IndexError.
        for i, column in enumerate(meta_columns):
            row[f"fake_{column}"] = fake_meta[i] if i < len(fake_meta) else None
            row[f"true_{column}"] = true_meta[i] if i < len(true_meta) else None

        records.append(row)

    if short_metadata_pairs:
        logging.warning(
            "Fake.Br: %d pair(s) had fewer than %d metadata values; missing columns were left empty",
            short_metadata_pairs, len(meta_columns)
        )
    if not records:
        logging.warning("Fake.Br: no records were extracted from %s", root_path)

    df = pd.DataFrame(records, columns=output_columns)
    output_path = os.path.join(base_path, "fake_br.csv")
    df.to_csv(output_path, index=False)
    return df

In [None]:
def read_metadata(file_path, expected_fields=25):
    """
    Reads a metadata file and returns its values, one per line.

    The default of 25 matches the 25 metadata column names that
    extract_fakebr_corpus() maps positionally onto the returned list; a shorter
    list would make that positional mapping run past the end of the list.

    Parameters:
    - file_path (str): Path to the UTF-8 encoded metadata file.
    - expected_fields (int): Exact number of lines the file must contain.

    Returns:
    - list[str] or None: Whitespace-stripped lines, or None when the file does not
      contain exactly `expected_fields` lines.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        lines = [line.strip() for line in file.readlines()]
    return lines if len(lines) == expected_fields else None

# **Dataset Source Configuration and Column Mapping**

In [None]:
# Per-source loading configuration consumed by load_datasets(). For each source:
# - 'file_path':    location passed to pd.read_csv (Drive path or hf:// URL).
# - 'text_columns': one or two column names; the first is copied to 'fake_news',
#                   the second (when present) to 'true_news'.
# - 'label_column': column copied to 'label', or None when the source has no label column.
# - 'delimiter':    optional CSV delimiter; load_datasets() falls back to ','.
file_config = {
    'Central de Fatos': {
        'file_path': base_path + 'central_de_fatos.csv',
        'text_columns': ['text_news'],
        'label_column': 'rating',
        'delimiter': ';'
    },
    'Fake.Br': {
        # Paired fake/true texts; file produced by extract_fakebr_corpus().
        'file_path': base_path + 'fake_br.csv',
        'text_columns': ['fake_news', 'true_news'],
        'label_column': None
    },
    'FakeRecogna': {
        'file_path': 'hf://datasets/recogna-nlp/FakeRecogna/FakeRecogna.csv',
        'text_columns': ['Noticia'],
        'label_column': 'Classe'
    },
    'FakeTrueBR': {
        # Paired fake/true texts.
        'file_path': base_path + 'FakeTrueBr_corpus.csv',
        'text_columns': ['fake', 'true'],
        'label_column': None
    }
}

# **Loading, Standardizing and Deduplicating Datasets**


In [None]:
def load_datasets():
    """
    Loads datasets from the specified sources in file_config, normalizes columns, and
    executes exact and orphan deduplication using dedupe_exact_and_orphans().

    Every source is reduced to the columns 'origin', 'fake_news', 'true_news',
    'label' and 'metadata'. Missing texts and labels stay missing (pd.NA) instead of
    being converted to the literal string 'nan', so rows without any text can
    actually be dropped and downstream notna() checks behave as intended.

    Parameters:
    - None

    Returns:
    - pd.DataFrame: Unified and deduplicated dataset.
    """
    def _as_text_keep_missing(series):
        # astype(str) alone turns NaN/None into the string 'nan'; put pd.NA back
        # wherever the source value was missing.
        return series.astype(str).where(series.notna(), pd.NA)

    frames = []
    standard_columns = ['origin', 'fake_news', 'true_news', 'label', 'metadata']

    logging.info("Loading and normalizing source datasets")
    for source, cfg in file_config.items():
        logging.info(f"  • Source: {source}")
        df = pd.read_csv(cfg['file_path'], delimiter=cfg.get('delimiter', ','))
        df['origin'] = source

        # Normalizing text columns: first configured column -> fake_news,
        # optional second column -> true_news.
        text_columns = cfg['text_columns']
        df['fake_news'] = _as_text_keep_missing(df[text_columns[0]])
        df['true_news'] = (_as_text_keep_missing(df[text_columns[1]]) if len(text_columns) == 2 else pd.NA)

        # Assign label column
        lbl = cfg.get('label_column')
        df['label'] = _as_text_keep_missing(df[lbl]) if lbl and lbl in df.columns else pd.NA

        # Adding metadata: every column that is neither a standard column nor the
        # label column is serialized to JSON (this includes the source text columns
        # when they are named differently from 'fake_news'/'true_news').
        extra_columns = [c for c in df.columns if c not in standard_columns and c != lbl]
        df['metadata'] = df[extra_columns].apply(lambda r: json.dumps(r.dropna().to_dict()), axis=1)
        frames.append(df[standard_columns])

    combined = pd.concat(frames, ignore_index=True, sort=False)
    logging.info(f"Combined {len(combined)} rows from all sources")

    # Drop rows without text
    before = len(combined)
    combined = combined.dropna(subset=['fake_news', 'true_news'], how='all')
    logging.info(f"Dropped {before - len(combined)} rows with no text")

    # Execute exact and orphan deduplication
    return dedupe_exact_and_orphans(combined)


In [None]:
def dedupe_exact_and_orphans(df):
    """
    Removes exact duplicates and deduplicates orphaned entries (missing either fake_news or true_news):
    - Exact duplicates are removed based on the combination (fake_news, true_news, label).
    - Orphaned entries are those where either fake_news or true_news is missing. These
      entries are deduplicated on their remaining text and label.

    Each input row ends up in exactly one of three groups (both texts present,
    true_news missing, only fake_news missing), so a row that lacks both texts is
    emitted once rather than twice. The result is ordered group by group and
    re-indexed from 0.

    Logs the number of rows removed for each type of duplication.

    Parameters:
    - df (pd.DataFrame): Unified dataset with at least columns 'fake_news', 'true_news', 'label'.

    Returns:
    - pd.DataFrame: Deduplicated dataset.
    """
    # 1) Removing exact duplicates based on (fake_news, true_news, label)
    before = len(df)
    df_exact = df.drop_duplicates(subset=['fake_news', 'true_news', 'label'])
    removed_exact = before - len(df_exact)
    logging.info(f"Exact deduplication: Removed {removed_exact} rows")

    # 2) Handling orphaned entries (rows with either fake_news or true_news missing)
    has_fake = df_exact['fake_news'].notna()
    has_true = df_exact['true_news'].notna()

    fake_only = df_exact[~has_true] \
        .drop_duplicates(subset=['fake_news', 'label'])
    # Require true_news to be present here: rows missing both texts already
    # belong to the group above and must not be counted a second time.
    true_only = df_exact[~has_fake & has_true] \
        .drop_duplicates(subset=['true_news', 'label'])
    mixed = df_exact[has_fake & has_true]

    df_final = pd.concat([mixed, fake_only, true_only], ignore_index=True, sort=False)
    logging.info(f"Orphan deduplication: Final unified dataset has {len(df_final)} rows after processing orphaned entries")
    return df_final


# **Text Preprocessing**


In [None]:
def _normalize_for_matching(text):
    """Lowercase, strip accents (NFD + drop combining marks) and keep only a-z and whitespace."""
    decomposed = unicodedata.normalize('NFD', text.lower())
    without_accents = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    return re.sub(r'[^a-z\s]', '', without_accents)


def _portuguese_stopwords():
    """Return the NLTK Portuguese stopwords in normalized form, built once and cached."""
    cached = getattr(_portuguese_stopwords, "_cache", None)
    if cached is None:
        # Stopwords get the same normalization as the text; otherwise accented
        # entries (e.g. "não") never match their accent-stripped tokens ("nao").
        cached = frozenset(
            normalized
            for normalized in (_normalize_for_matching(word) for word in stopwords.words('portuguese'))
            if normalized
        )
        _portuguese_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """
    Normalizes a text for similarity comparison.

    The text is lowercased, accents are removed, every character other than a-z
    and whitespace is dropped, the result is tokenized with the Portuguese NLTK
    tokenizer, and Portuguese stopwords (normalized the same way) are removed.

    Parameters:
    - text (str): Raw text.

    Returns:
    - str: Space-joined remaining tokens, or "" when `text` is not a string or is blank.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = _normalize_for_matching(text)

    tokens = word_tokenize(text, language="portuguese")
    stop_words = _portuguese_stopwords()
    tokens = [word for word in tokens if word not in stop_words]

    return " ".join(tokens)

# **Building MinHash-based LSH for Near-Duplicate Detection**

## **Creating MinHash Signatures**

In [None]:
def create_minhash(text, num_perm=128):
    """
    Generates a MinHash signature from the given text.

    The text is first normalized with preprocess_text(); the signature is then
    built from its distinct whitespace-separated tokens.

    Parameters:
    - text (str): Raw text to be transformed into a MinHash signature.
    - num_perm (int): Number of permutations for the MinHash object.

    Returns:
    - MinHash or None: MinHash signature of the input text, or None if the
      preprocessed text is empty.
    """
    text = preprocess_text(text)

    if not text:
        return None

    m = MinHash(num_perm=num_perm)
    # A MinHash keeps the minimum hash per permutation, so feeding the same token
    # again cannot change the signature; hashing each distinct token once gives
    # the same result with less work.
    for word in set(text.split()):
        m.update(word.encode('utf8'))

    return m

## **Creating LSH Index**

In [None]:
def build_lsh_index(dataset, threshold = 0.9, num_perm = 128):
    """
    Build an LSH index where each document is the concatenation of
    `fake_news` + `true_news`, so that inverted pairs are also caught.

    Missing texts (None, NaN, pd.NA) are left out of the concatenation. Rows whose
    combined text is empty, or becomes empty after preprocessing, are not indexed.
    Keys have the form "doc_<index label>", so the DataFrame index should be
    unique; otherwise later rows overwrite earlier ones in the returned map.

    Parameters:
    - dataset (pd.DataFrame): Dataset with 'fake_news' and 'true_news' columns.
    - threshold (float): Similarity threshold passed to MinHashLSH.
    - num_perm (int): Number of permutations for both the LSH index and the signatures.

    Returns both the LSH and a map from key -> (MinHash, original row).
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    index_map: Dict[str, Tuple[MinHash, pd.Series]] = {}

    for idx, row in dataset.iterrows():
        # `value or ""` is not a safe missing-value test: pd.NA raises TypeError
        # in a boolean context and float NaN is truthy (it would add the word
        # "nan" to the document). Use pd.notna() instead.
        parts = [str(row[column]) for column in ("fake_news", "true_news") if pd.notna(row[column])]
        pair_text = " ".join(parts).strip()
        if not pair_text:
            continue

        sig = create_minhash(pair_text, num_perm=num_perm)
        if sig is None:
            continue

        key = f"doc_{idx}"
        lsh.insert(key, sig)
        index_map[key] = (sig, row)

    return lsh, index_map

# **Computing Text Similarity Metrics**

In [None]:
def compute_similarity_metrics(text1: str, text2: str, ngram_size: int = 3) -> Tuple[float, float, float]:
    """
    Compute the Cosine, Jaccard, and Overlap similarity between two texts.

    Both texts are normalized with preprocess_text() before comparison, and all
    three metrics use the same n-gram size.

    Parameters:
    - text1 (str): First text to compare.
    - text2 (str): Second text to compare.
    - ngram_size (int): Size of n-grams to use for the similarity calculations.

    Returns:
    - cosine (float): Cosine similarity.
    - jaccard (float): Jaccard similarity.
    - overlap (float): Overlap coefficient similarity.

    All three values are 0.0 when either input is not a string, is empty after
    preprocessing, or is shorter than `ngram_size` (unless both preprocessed
    texts are identical, in which case all three are 1.0).
    """
    if not isinstance(text1, str) or not isinstance(text2, str):
        return 0.0, 0.0, 0.0

    text1, text2 = preprocess_text(text1), preprocess_text(text2)
    if not text1 or not text2:
        return 0.0, 0.0, 0.0

    if text1 == text2:
        return 1.0, 1.0, 1.0

    # No n-gram can be formed from a text shorter than ngram_size, so there is
    # nothing to compare; returning early also avoids dividing by an empty
    # n-gram profile inside the metric implementations.
    if len(text1) < ngram_size or len(text2) < ngram_size:
        return 0.0, 0.0, 0.0

    cosine = Cosine(ngram_size).similarity(text1, text2)
    jaccard = Jaccard(ngram_size).similarity(text1, text2)
    # Pass ngram_size explicitly so the overlap coefficient honours the same
    # setting as the other two metrics instead of the library default.
    overlap = OverlapCoefficient(ngram_size).similarity(text1, text2)

    return cosine, jaccard, overlap

# **Deduplicating**

In [None]:
def remove_near_duplicates(lsh, minhashes_map, threshold = 0.9):
    """
    Removes near-duplicate entries based on similarity metrics using MinHash LSH.
    The function compares `fake_news + true_news` pairs based on the 3 similarity metrics: Cosine, Jaccard, and Overlap.

    For every entry that has not been removed yet, the LSH candidates are compared
    against the current "keeper". A candidate whose highest metric reaches the
    threshold is removed, unless its label is "true" and the keeper's is not; in
    that case the keeper is removed and the candidate becomes the new keeper.
    Every surviving entry appears exactly once in the result, in the iteration
    order of `minhashes_map`.

    Parameters:
    - lsh (MinHashLSH): The LSH index for near-duplicate detection.
    - minhashes_map (dict): A dictionary mapping keys to (MinHash, DataFrame row) pairs.
    - threshold (float): The similarity threshold for considering two texts as near-duplicates.

    Returns:
    - pd.DataFrame: Deduplicated dataset.
    """
    def _pair_text(entry):
        # Join only the texts that are present so a missing value does not add
        # the word "nan" / "<NA>" to the compared text.
        parts = [str(entry[column]) for column in ("fake_news", "true_news") if pd.notna(entry[column])]
        return " ".join(parts).strip()

    def _label_of(entry):
        return str(entry['label']).lower() if pd.notna(entry['label']) else ""

    removed_keys = set()

    for key, (minhash, row) in minhashes_map.items():
        if key in removed_keys:
            continue

        candidates = lsh.query(minhash)
        keep_key, keep_row = key, row
        keep_text = _pair_text(keep_row)

        for cand_key in candidates:
            if cand_key == keep_key or cand_key in removed_keys:
                continue

            cand_row = minhashes_map[cand_key][1]
            cand_text = _pair_text(cand_row)

            cosine, jaccard, overlap = compute_similarity_metrics(keep_text, cand_text)
            if max(cosine, jaccard, overlap) < threshold:
                continue

            # Prefer true label. The keeper's label is read from the current
            # keeper row, which may differ from `row` after a switch.
            if _label_of(cand_row) == "true" and _label_of(keep_row) != "true":
                removed_keys.add(keep_key)
                keep_key, keep_row, keep_text = cand_key, cand_row, cand_text
            else:
                removed_keys.add(cand_key)

    # Build the result from the final removed set instead of appending a keeper
    # per iteration: a candidate promoted to keeper is visited again later as its
    # own key and would otherwise be emitted twice.
    results = [
        entry.to_dict()
        for entry_key, (_, entry) in minhashes_map.items()
        if entry_key not in removed_keys
    ]

    logging.info(f"Near-duplicate removal: Removed {len(removed_keys)} near-duplicates (threshold={threshold})")
    return pd.DataFrame(results)

# **Transforming the Unified Dataset to Flat Format**

In [None]:
def transform_to_flat_format(unified_df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates a flat version of the unified dataset where each text (fake or true) is treated as an individual example with its own label.
    - Preserves original labels (if available), including falsy ones such as 0.
    - If no label exists (missing or empty string), assigns 'fake' for fake_news and 'true' for true_news.
    - Drops duplicates based on text and label pairs.

    Parameters:
    - unified_df (pd.DataFrame): Unified dataset with 'origin', 'fake_news', 'true_news',
      'label' and 'metadata' columns.

    Returns:
    - pd.DataFrame: Flattened dataset with columns 'origin', 'text', 'label', 'metadata'
      and without duplicate (text, label) pairs. An input without rows yields an
      empty frame with those columns.
    """
    flat_columns = ["origin", "text", "label", "metadata"]
    # (source text column, label used when the row has no label of its own)
    text_sources = (("fake_news", "fake"), ("true_news", "true"))

    records = []
    logging.info("Transforming the unified dataset into flat format")

    for _, row in unified_df.iterrows():
        row_label = row.get("label")
        # Test for "missing" explicitly; `label or default` would also discard
        # legitimate falsy labels such as 0.
        has_label = bool(pd.notna(row_label)) and row_label != ""

        for text_column, fallback_label in text_sources:
            if pd.notna(row[text_column]):
                records.append({
                    "origin":   row["origin"],
                    "text":     row[text_column],
                    "label":    row_label if has_label else fallback_label,
                    "metadata": row["metadata"]
                })

    # Passing the columns keeps drop_duplicates() working when there are no records.
    flat_df = pd.DataFrame(records, columns=flat_columns)
    before_dedupe = len(flat_df)
    flat_df = flat_df.drop_duplicates(subset=["text", "label"]).reset_index(drop=True)
    logging.info(f"Flat dataset deduplication: Removed {before_dedupe - len(flat_df)} duplicate (text, label) pairs")
    return flat_df

# **Running the Full Data Processing Pipeline**

In [None]:
def run_pipeline(
    output_file = base_path + 'dataset_unified.csv', flat_output_file = base_path + 'dataset_unified_flat.csv', sim_threshold = 0.9):
    """
    Execute end‐to‐end pipeline:
      1) load & dedupe exact/orphan
      2) build MinHash LSH index
      3) remove near‐duplicates
      4) sort & save unified
      5) transform to flat, dedupe, sort & save

    Each major step logs before/after with counts.

    Parameters:
    - output_file (str): Destination CSV for the unified dataset.
    - flat_output_file (str): Destination CSV for the flat dataset.
    - sim_threshold (float): Threshold used for both the LSH index and the
      near-duplicate removal.

    Returns:
    - tuple[pd.DataFrame, pd.DataFrame] or None: (unified, flat) on success. Any
      exception is logged with its traceback and None is returned, so callers
      can detect a failed run (the output files may then be missing or left
      over from an earlier run).
    """
    try:
        logging.info("STEP 1/5: Loading & exact/orphan deduplication")
        unified = load_datasets()
        logging.info(f"STEP 1/5: {len(unified)} rows after exact/orphan deduplication")

        logging.info("STEP 2/5: Building MinHash LSH index (threshold=%.2f)", sim_threshold)
        lsh, idx_map = build_lsh_index(unified, threshold=sim_threshold)
        logging.info(f"STEP 2/5: LSH index contains {len(idx_map)} entries")

        logging.info("STEP 3/5: Removing near-duplicates (threshold=%.2f)", sim_threshold)
        unified = remove_near_duplicates(lsh, idx_map, threshold=sim_threshold)
        logging.info(f"STEP 3/5: {len(unified)} rows remain after fuzzy deduplication")

        logging.info("STEP 4/5: Sorting & saving unified dataset")
        unified = unified.sort_values(by=['origin','label'])
        unified.to_csv(output_file, index=False)
        logging.info(f"STEP 4/5: Unified saved to {output_file}")

        logging.info("STEP 5/5: Transforming to flat format & saving")
        flat = transform_to_flat_format(unified).sort_values(by=['origin','label'])
        flat.to_csv(flat_output_file, index=False)
        logging.info(f"STEP 5/5: Flat saved to {flat_output_file}")

        logging.info("Pipeline completed successfully: unified=%d, flat=%d", len(unified), len(flat))
        return unified, flat

    except Exception:
        # Deliberate best-effort: report the failure instead of raising.
        logging.error("Pipeline execution failed", exc_info=True)
        return None


# **Main Execution and Outputs**

In [None]:
# Execute the pipeline with its default output paths and threshold.
# run_pipeline() logs exceptions instead of raising them, so check the log above
# for "Pipeline execution failed" before trusting the files loaded below.
run_pipeline()

# Load the generated datasets back from disk (these are whatever files currently
# exist at these paths, which may come from an earlier run if the pipeline failed).
df_unificado = pd.read_csv(base_path + "dataset_unified.csv")
df_flat = pd.read_csv(base_path + "dataset_unified_flat.csv")

2025-05-02 16:49:54,593 - INFO - STEP 1/5: Loading & exact/orphan deduplication
2025-05-02 16:49:54,596 - INFO - Loading and normalizing source datasets
2025-05-02 16:49:54,600 - INFO -   • Source: Central de Fatos
2025-05-02 16:50:02,427 - INFO -   • Source: Fake.Br
2025-05-02 16:50:04,952 - INFO -   • Source: FakeRecogna
2025-05-02 16:50:08,808 - INFO -   • Source: FakeTrueBR
2025-05-02 16:50:09,643 - INFO - Combined 28941 rows from all sources
2025-05-02 16:50:09,695 - INFO - Dropped 0 rows with no text
2025-05-02 16:50:10,288 - INFO - Exact deduplication: Removed 23 rows
2025-05-02 16:50:10,576 - INFO - Orphan deduplication: Final unified dataset has 28918 rows after processing orphaned entries
2025-05-02 16:50:10,592 - INFO - STEP 1/5: 28918 rows after exact/orphan deduplication
2025-05-02 16:50:10,596 - INFO - STEP 2/5: Building MinHash LSH index (threshold=0.90)
2025-05-02 16:54:00,710 - INFO - STEP 2/5: LSH index contains 28918 entries
2025-05-02 16:54:00,711 - INFO - STEP 3/5:

## **Exploring the Final Datasets**

In [None]:
def display_dataframe_summary(df, name="DataFrame"):
    """
    Prints three headed sections for a DataFrame and renders each with display().

    Sections, in order: describe() statistics, the first 10 rows, the last 10 rows.

    Parameters:
    - df (pd.DataFrame): DataFrame to summarize.
    - name (str): Label used in the printed section headings.
    """
    sections = (
        (f"\nSummary for: {name}", lambda: df.describe()),
        (f"\nFirst 10 rows of {name}", lambda: df.head(10)),
        (f"\nLast 10 rows of {name}", lambda: df.tail(10)),
    )
    # Each view is built only after its heading is printed, one section at a time.
    for heading, build_view in sections:
        print(heading)
        display(build_view())

### **Unified Dataset Overview**

In [None]:
# Show describe() statistics plus the first and last 10 rows of the unified dataset reloaded from CSV.
display_dataframe_summary(df_unificado, name="Unified Dataset")


Summary for: Unified Dataset


Unnamed: 0,origin,fake_news,true_news,label,metadata
count,28633,28632,5241,23392.0,28633
unique,4,28627,4982,1024.0,28633
top,FakeRecogna,autor do massacre na escola nos eua. adivinha...,circula pelas redes sociais um vídeo que most...,1.0,"{""title_fake"": ""nova york acabou de aprovar ab..."
freq,11826,2,12,5917.0,1



First 10 rows of Unified Dataset


Unnamed: 0,origin,fake_news,true_news,label,metadata
0,Central de Fatos,Ao participar do Fórum Econômico Mundial de 20...,,"['AINDA É CEDO PARA DIZER', 'AINDA É CEDO PARA...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
1,Central de Fatos,Os candidatos à prefeitura de São Paulo têm us...,,"['AINDA É CEDO PARA DIZER', 'AINDA É CEDO PARA...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
2,Central de Fatos,"Na semana passada, 11 pré-candidatos à Presidê...",,"['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'EXAG...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
3,Central de Fatos,"Na fim da semana passada, o PT colocou no ar m...",,"['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'EXAG...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
4,Central de Fatos,Em entrevista concedida ao programa “Diálogos ...,,"['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'VERD...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
5,Central de Fatos,A Câmara dos Deputados votou na madrugada de q...,,"['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'VERD...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
6,Central de Fatos,"Candidato à reeleição, o prefeito de São Paulo...",,"['AINDA É CEDO PARA DIZER', 'FALSO', 'VERDADEI...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
7,Central de Fatos,"De olho nas eleições de 2022, o governador Joã...",,"['AINDA É CEDO PARA DIZER', 'SUBESTIMADO', 'EX...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
8,Central de Fatos,"Os aplicativos de transporte Uber, Cabify e 99...",,"['AINDA É CEDO PARA DIZER', 'VERDADEIRO', 'DE ...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
9,Central de Fatos,"Assim como o Brasil, Rio de Janeiro, Rio Grand...",,"['AINDA É CEDO PARA DIZER', 'VERDADEIRO', 'EXA...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."



Last 10 rows of Unified Dataset


Unnamed: 0,origin,fake_news,true_news,label,metadata
28623,FakeTrueBR,ivermectina supera vacinas: mais de 83% de efi...,Um estudo publicado na plataforma Research Sq...,,"{""title_fake"": ""ivermectina supera vacinas ao ..."
28624,FakeTrueBR,o ministro paulo guedes afirma que mandetta e...,O representante de uma vendedora de vacinas a...,,"{""title_fake"": ""mandetta embolsou r$ 5 bilhoes..."
28625,FakeTrueBR,segundo informacoes passada a nossa pagina es...,A apreensão de um suposto lobisomem pela Polí...,,"{""title_fake"": ""lobisomem foi capturado pela p..."
28626,FakeTrueBR,aqui esta um dos maiores responsaveis por tudo...,Circula nas redes sociais que o bilionário Ge...,,"{""title_fake"": ""george soros diz que bolsonaro..."
28627,FakeTrueBR,governadores escondem vacinas para desestabili...,Circula nas redes sociais a informação de que...,,"{""title_fake"": ""governadores estao escondendo ..."
28628,FakeTrueBR,"- bom dia, eu to aqui no paraguai. estamos aq...",O preço médio da gasolina nos postos do Brasi...,,"{""title_fake"": ""gasolina e vendida a r$ 2,62 n..."
28629,FakeTrueBR,pesquisem quem e michael yeadon. e pesquisam o...,Não há nenhuma comprovação que vacinas contra...,,"{""title_fake"": ""medico michael yeadon (ex-vice..."
28630,FakeTrueBR,na vacinacao do covid e super importante olhar...,A seringa que foi usada por uma técnica de en...,,"{""title_fake"": ""estao usando seringas vazias p..."
28631,FakeTrueBR,lider indigina morre apos tomar vachina! a mor...,A SESAI (Secretaria Especial de Saúde Indígen...,,"{""title_fake"": ""indigena morreu por causa de v..."
28632,FakeTrueBR,aborto aprovado ate um dia antes do nascimento...,"Nova York, 22 jan (EFE).- O estado de Nova Yo...",,"{""title_fake"": ""nova york acabou de aprovar ab..."


### **Flat Dataset Overview**

In [None]:
# Show describe() statistics plus the first and last 10 rows of the flat dataset reloaded from CSV.
display_dataframe_summary(df_flat, name="Flat Dataset")


Summary for: Flat Dataset


Unnamed: 0,origin,text,label,metadata
count,33610,33609,33610.0,33610
unique,4,33609,1026.0,28633
top,FakeRecogna,"Nova York, 22 jan (EFE).- O estado de Nova Yo...",1.0,"{""title_fake"": ""vacina brasileira contra covid..."
freq,11826,1,5917.0,2



First 10 rows of Flat Dataset


Unnamed: 0,origin,text,label,metadata
0,Central de Fatos,Ao participar do Fórum Econômico Mundial de 20...,"['AINDA É CEDO PARA DIZER', 'AINDA É CEDO PARA...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
1,Central de Fatos,Os candidatos à prefeitura de São Paulo têm us...,"['AINDA É CEDO PARA DIZER', 'AINDA É CEDO PARA...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
2,Central de Fatos,"Na semana passada, 11 pré-candidatos à Presidê...","['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'EXAG...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
3,Central de Fatos,"Na fim da semana passada, o PT colocou no ar m...","['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'EXAG...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
4,Central de Fatos,Em entrevista concedida ao programa “Diálogos ...,"['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'VERD...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
5,Central de Fatos,A Câmara dos Deputados votou na madrugada de q...,"['AINDA É CEDO PARA DIZER', 'EXAGERADO', 'VERD...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
6,Central de Fatos,"Candidato à reeleição, o prefeito de São Paulo...","['AINDA É CEDO PARA DIZER', 'FALSO', 'VERDADEI...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
7,Central de Fatos,"De olho nas eleições de 2022, o governador Joã...","['AINDA É CEDO PARA DIZER', 'SUBESTIMADO', 'EX...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
8,Central de Fatos,"Os aplicativos de transporte Uber, Cabify e 99...","['AINDA É CEDO PARA DIZER', 'VERDADEIRO', 'DE ...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."
9,Central de Fatos,"Assim como o Brasil, Rio de Janeiro, Rio Grand...","['AINDA É CEDO PARA DIZER', 'VERDADEIRO', 'EXA...","{""url"": ""https://piaui.folha.uol.com.br/lupa/2..."



Last 10 rows of Flat Dataset


Unnamed: 0,origin,text,label,metadata
33600,FakeTrueBR,Um estudo publicado na plataforma Research Sq...,True,"{""title_fake"": ""ivermectina supera vacinas ao ..."
33601,FakeTrueBR,O representante de uma vendedora de vacinas a...,True,"{""title_fake"": ""mandetta embolsou r$ 5 bilhoes..."
33602,FakeTrueBR,A apreensão de um suposto lobisomem pela Polí...,True,"{""title_fake"": ""lobisomem foi capturado pela p..."
33603,FakeTrueBR,Circula nas redes sociais que o bilionário Ge...,True,"{""title_fake"": ""george soros diz que bolsonaro..."
33604,FakeTrueBR,Circula nas redes sociais a informação de que...,True,"{""title_fake"": ""governadores estao escondendo ..."
33605,FakeTrueBR,O preço médio da gasolina nos postos do Brasi...,True,"{""title_fake"": ""gasolina e vendida a r$ 2,62 n..."
33606,FakeTrueBR,Não há nenhuma comprovação que vacinas contra...,True,"{""title_fake"": ""medico michael yeadon (ex-vice..."
33607,FakeTrueBR,A seringa que foi usada por uma técnica de en...,True,"{""title_fake"": ""estao usando seringas vazias p..."
33608,FakeTrueBR,A SESAI (Secretaria Especial de Saúde Indígen...,True,"{""title_fake"": ""indigena morreu por causa de v..."
33609,FakeTrueBR,"Nova York, 22 jan (EFE).- O estado de Nova Yo...",True,"{""title_fake"": ""nova york acabou de aprovar ab..."


# **Saving Current Session for Test Reuse**

In [None]:
# Functions exported for reuse by the test notebook, keyed by their own names.
# NOTE(review): dedupe_exact_and_orphans and remove_near_duplicates are called by
# load_datasets / run_pipeline but are not listed here — confirm that dill
# captures them through the functions' globals, or add them explicitly.
to_save = {
    "load_datasets": load_datasets,
    "preprocess_text": preprocess_text,
    "create_minhash": create_minhash,
    "build_lsh_index": build_lsh_index,
    "compute_similarity_metrics": compute_similarity_metrics,
    "transform_to_flat_format": transform_to_flat_format,
    "run_pipeline": run_pipeline
}

# Same Drive folder as base_path, written out in full here.
session_path = "/content/drive/MyDrive/FakeNews/gold_standard_functions_funcs.dill"
# Serialize the dictionary of functions to a single dill file.
with open(session_path, "wb") as f:
    dill.dump(to_save, f)
print("Selected functions saved successfully:", session_path)

Selected functions saved successfully: /content/drive/MyDrive/FakeNews/gold_standard_functions_funcs.dill
