# Crearea setului de date
1. Preluam stirile de pe veridica
2. Verificam similaritatea dintre ele si cele din setul online
3. Preluam stirile de pe Digi24
4. Preluam stirile de pe TNR
5. Combinam toate datele intr-un singur set

In [None]:
#preluam baza de date veridica
import requests
from bs4 import BeautifulSoup
import re
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Root URL of the Veridica site; prepended to relative article links.
BASE_URL = "https://www.veridica.ro"
# Inclusive range of "baza-de-date" listing pages to crawl.
START_PAGE = 1
END_PAGE = 187
# Accumulates one dict per successfully scraped article.
articles_data = []
MAX_WORKERS = 10  # number of worker threads

def scrape_article_page(article_url):
    """Scrape one Veridica article and return its title, tag and "ȘTIRE" text.

    The tag is derived from the upper-cased title: titles starting with
    "VERIDICA.MD:" are matched by keyword anywhere in the title, all other
    titles by their leading keyword. Unmatched titles get the tag "unknown".

    The content is made of the paragraphs under ``div.page-content``, from the
    first paragraph whose <strong> label contains "ȘTIRE" up to (but not
    including) the first later paragraph whose <strong> label mentions
    "NARAȚIUNE"/"NARAȚIUNI". Both the comma-below and the cedilla spelling of
    the Romanian letters are accepted for the two markers.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        A dict with the keys "url", "title", "tag" and "content", or None when
        no "ȘTIRE" text was found or when fetching/parsing failed (the error
        is printed).
    """
    try:
        response = requests.get(article_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title; the tag is derived from it.
        title_tag = soup.find("h1", class_="responsiveTitle")
        title = title_tag.get_text(strip=True) if title_tag else ""
        title_upper = title.upper()

        # Keyword groups in priority order, each mapped to its tag.
        keyword_tags = (
            (("FAKE NEWS",), "fake_news"),
            (("PROPAGANDĂ", "PROPAGANDA"), "propaganda"),
            (("DEZINFORMARE",), "misinformation"),
        )
        # "Veridica.md:" titles carry the keyword after the prefix, so they are
        # searched anywhere; other titles must start with the keyword.
        match_anywhere = title_upper.startswith("VERIDICA.MD:")
        tag = "unknown"
        for keywords, label in keyword_tags:
            if match_anywhere:
                matched = any(keyword in title_upper for keyword in keywords)
            else:
                matched = title_upper.startswith(keywords)
            if matched:
                tag = label
                break

        # Start marker in both spellings: S with comma below and S with
        # cedilla (U+015E), mirroring what the end-marker regex does for T.
        story_markers = ("ȘTIRE", "\u015eTIRE")

        # Collect only the "ȘTIRE" section.
        content = []
        capture = False
        for p in soup.select("div.page-content p"):
            strong = p.find("strong")
            strong_text = strong.get_text(strip=True).upper() if strong else ""

            if not capture:
                if any(marker in strong_text for marker in story_markers):
                    capture = True
                    # Remove the label itself so only the story text remains.
                    strong.extract()
                    text = p.get_text(strip=True)
                    if text:
                        content.append(text)
                continue

            # The narrative section marks the end of the story text.
            if strong and re.search(r"NARAȚIUNE|NARAȚIUNI|NARAŢIUNE|NARAŢIUNI", strong_text, re.IGNORECASE):
                break
            text = p.get_text(strip=True)
            if text:
                content.append(text)

        full_content = "\n".join(content).strip() if content else None

        if full_content:
            return {
                "url": article_url,
                "title": title,
                "tag": tag,
                "content": full_content
            }
        return None

    except Exception as e:
        print(f"Error scraping article: {article_url}\n{e}")
        return None

def extract_article_links_from_page(page_num):
    """Return the article URLs listed on one page of the Veridica database.

    The first link of every row of the ``rwd-table`` table is taken; hrefs
    that do not start with "http" are prefixed with ``BASE_URL``.

    Args:
        page_num: Number of the listing page to fetch.

    Returns:
        A list of article URLs; empty when the table is missing or when the
        request fails (the error is printed).
    """
    listing_url = f"{BASE_URL}/baza-de-date?page={page_num}"
    try:
        resp = requests.get(listing_url, timeout=10)
        resp.raise_for_status()
        table = BeautifulSoup(resp.text, 'html.parser').find("table", class_="rwd-table")
        if not table:
            return []

        row_anchors = (row.find("a", href=True) for row in table.find_all("tr"))
        hrefs = (anchor["href"] for anchor in row_anchors if anchor)
        return [
            target if target.startswith("http") else BASE_URL + target
            for target in hrefs
        ]
    except Exception as e:
        print(f"Failed to get links from page {page_num}: {e}")
        return []

# Crawl every listing page; the articles of one page are fetched concurrently.
for page in range(START_PAGE, END_PAGE + 1):
    print(f"Processing page {page}...")
    article_urls = extract_article_links_from_page(page)
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        pending = [executor.submit(scrape_article_page, link) for link in article_urls]
        # Keep only the successful scrapes, in completion order.
        articles_data.extend(
            scraped
            for scraped in (done.result() for done in as_completed(pending))
            if scraped
        )

# Persist the scraped articles as CSV.
with open("veridica_articles.csv", "w", newline='', encoding="utf-8") as f:
    csv_writer = csv.DictWriter(f, fieldnames=["url", "title", "tag", "content"])
    csv_writer.writeheader()
    for record in articles_data:
        csv_writer.writerow(record)

Verificam similaritatile dintre setul de pe veridica si cel online

In [None]:
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# Load the published FakeRom dataset (train split) as a DataFrame.
hf_dataset_dict = load_dataset("mateiaass/FakeRom")
hf_dataset = hf_dataset_dict["train"]
hf_df = hf_dataset.to_pandas()

# Load the locally scraped Veridica articles.
local_df = pd.read_csv("veridica_articles.csv")
hf_column = 'Text'
local_column = 'content'

# Missing texts are dropped, so the list positions reported below are
# positions in these lists, not necessarily DataFrame row indices.
hf_texts = hf_df[hf_column].dropna().astype(str).tolist()
local_texts = local_df[local_column].dropna().astype(str).tolist()

# NOTE(review): the later back-translation similarity cell deliberately uses a
# multilingual model for Romanian; confirm this model is adequate for
# Romanian text as well.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode both corpora.
hf_embeddings = model.encode(hf_texts, convert_to_tensor=True, show_progress_bar=True)
local_embeddings = model.encode(local_texts, convert_to_tensor=True, show_progress_bar=True)

# cosine_scores[i, j] is the similarity between local text i and HF text j.
cosine_scores = util.cos_sim(local_embeddings, hf_embeddings)

# Similarity threshold above which two entries count as a match.
threshold = 0.8

# Select every (local, hf) pair at or above the threshold in one vectorised
# pass instead of visiting each matrix cell from Python. Both torch.nonzero
# and boolean-mask indexing walk the matrix in row-major order, so the pairs
# come out in the same order as a nested i/j loop would produce them.
above_threshold = cosine_scores >= threshold
matches = [
    (i, j, score)
    for (i, j), score in zip(
        torch.nonzero(above_threshold).tolist(),
        cosine_scores[above_threshold].tolist(),
    )
]

# Report the result.
print(f"Found {len(matches)} similar entries with similarity >= {threshold}.")
print("Sample matches:")
for i, j, score in matches[:10]:  # show the first 10 matches
    print(f"\n[Local {i}] {local_texts[i][:200]}...\n[HF {j}] {hf_texts[j][:200]}...\n→ Similarity: {score:.4f}")


Preluam stirile de pe TNR

In [None]:
#preluam stirile satirice de pe TNR
import requests
from bs4 import BeautifulSoup
import csv
from concurrent.futures import ThreadPoolExecutor
import time

# Category name -> category landing page on timesnewroman.ro.
BASE_URLS = {
    "politic": "https://www.timesnewroman.ro/politic",
    "sport": "https://www.timesnewroman.ro/sport",
    "monden": "https://www.timesnewroman.ro/monden",
    "life-death": "https://www.timesnewroman.ro/life-death",
    "it-stiinta": "https://www.timesnewroman.ro/it-stiinta"
}

# Collection of a category stops once this many articles were scraped.
# NOTE(review): the CSV written by this cell is named
# "timesnewroman_200_per_category.csv" while this cap is 180 — confirm which
# value is intended.
MAX_ARTICLES_PER_CATEGORY = 180
# Upper bound on the listing pages visited per category.
MAX_PAGES_PER_CATEGORY = 100
# Thread pool size used when scraping the articles of one listing page.
MAX_WORKERS = 10

def extract_articles_from_page(category, category_url, page_num):
    """Return the article links found on one listing page of a category.

    Only ``a.article-url`` anchors whose href starts with "https://" are kept.

    Args:
        category: Category name stored with every link.
        category_url: Landing URL of the category.
        page_num: Number of the listing page to fetch.

    Returns:
        A list of ``{"category": ..., "url": ...}`` dicts; empty when the
        request fails (the error is printed).
    """
    page_url = f"{category_url}/page/{page_num}"
    try:
        resp = requests.get(page_url, timeout=10)
        resp.raise_for_status()
        anchors = BeautifulSoup(resp.text, "html.parser").select("a.article-url")
        hrefs = (anchor.get("href") for anchor in anchors)
        return [
            {"category": category, "url": target.strip()}
            for target in hrefs
            if target and target.startswith("https://")
        ]
    except Exception as e:
        print(f"Failed to fetch page: {page_url} -> {e}")
        return []

def scrape_article(article_info):
    """Scrape one Times New Roman article.

    Args:
        article_info: Dict with the keys "url" and "category".

    Returns:
        A dict with the keys "url", "title", "tag" (always "satire"),
        "category" and "content", or None when the article is premium, has no
        paragraph text, or fetching/parsing failed (the error is printed).
    """
    try:
        url, category = article_info["url"], article_info["category"]
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Premium articles are skipped.
        if soup.find("div", class_="join-premium-container"):
            return None

        # Title.
        heading = soup.find("h1", class_="mb-4")
        title = heading.get_text(strip=True) if heading else ""

        # Body text: the non-empty paragraphs of the content container.
        body = soup.find("div", class_="content-container page-editor-content mb-3")
        paragraph_texts = [p.get_text(strip=True) for p in body.find_all("p")] if body else []
        content = "\n".join(piece for piece in paragraph_texts if piece)

        if not content:
            return None

        return {
            "url": url,
            "title": title,
            "tag": "satire",
            "category": category,
            "content": content
        }
    except Exception as e:
        print(f"Failed to scrape article: {article_info['url']} -> {e}")
        return None

def collect_articles_for_category(category, url):
    """Scrape articles of one category, listing page by listing page.

    Collection stops when ``MAX_ARTICLES_PER_CATEGORY`` articles were
    gathered, when ``MAX_PAGES_PER_CATEGORY`` pages were visited, or when a
    listing page yields no links at all.

    Args:
        category: Category name.
        url: Landing URL of the category.

    Returns:
        The list of scraped article dicts.
    """
    collected = []
    seen_urls = set()

    print(f" Collecting articles for category: {category}")

    for page in range(1, MAX_PAGES_PER_CATEGORY + 1):
        if len(collected) >= MAX_ARTICLES_PER_CATEGORY:
            break

        page_articles = extract_articles_from_page(category, url, page)

        # Skip links already seen on an earlier page.
        fresh = [item for item in page_articles if item['url'] not in seen_urls]
        seen_urls.update(item['url'] for item in fresh)

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            for scraped in filter(None, pool.map(scrape_article, fresh)):
                collected.append(scraped)
                if len(collected) >= MAX_ARTICLES_PER_CATEGORY:
                    break

        print(f"Collected {len(collected)} articles for {category}...")

        if not page_articles:
            print(f"No more articles found in {category}, stopping early.")
            break

    return collected

# Collect the articles of every category into one flat list.
all_articles = []
for category, url in BASE_URLS.items():
    all_articles += collect_articles_for_category(category, url)

# Write everything to CSV.
csv_columns = ["url", "title", "tag", "category", "content"]
with open("timesnewroman_200_per_category.csv", "w", newline='', encoding="utf-8") as f:
    out = csv.DictWriter(f, fieldnames=csv_columns)
    out.writeheader()
    for record in all_articles:
        out.writerow(record)

print(f"\n Done! Total articles saved: {len(all_articles)}")


Preluam stirile de pe Digi24

In [None]:
#Preluam stirile reale de pe Digi24

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv

# Section name -> section landing page on digi24.ro.
BASE_SECTIONS = {
    "politica": "https://www.digi24.ro/stiri/actualitate/politica",
    "actualitate": "https://www.digi24.ro/stiri/actualitate",
    "economie": "https://www.digi24.ro/stiri/economie",
    "externe": "https://www.digi24.ro/stiri/externe",
    "sport": "https://www.digi24.ro/stiri/sport",
    "stil-de-viata": "https://www.digi24.ro/magazin/stil-de-viata"
}

# Number of listing pages fetched per section (pages 1..PAGES_PER_SECTION).
PAGES_PER_SECTION = 3
# Thread pool size used when scraping articles.
MAX_WORKERS = 10
# Site root, prepended to the site-relative hrefs found on listing pages.
# NOTE: this reassigns the BASE_URL set in the Veridica cell; Veridica
# functions re-run after this cell would read this value instead.
BASE_URL = "https://www.digi24.ro"

def extract_article_links(section_name, section_url, page):
    """Return the article links found on one listing page of a Digi24 section.

    Only hrefs of ``h2.article-title a`` anchors that start with "/" are kept;
    they are made absolute by prefixing ``BASE_URL``.

    Args:
        section_name: Section name stored with every link as "category".
        section_url: Landing URL of the section.
        page: Number of the listing page to fetch.

    Returns:
        A list of ``{"category": ..., "url": ...}`` dicts; empty when the
        request fails (the error is printed).
    """
    url = f"{section_url}?p={page}"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        title_anchors = BeautifulSoup(resp.text, "html.parser").select("h2.article-title a")
        hrefs = (anchor.get("href") for anchor in title_anchors)
        return [
            {"category": section_name, "url": BASE_URL + path}
            for path in hrefs
            if path and path.startswith("/")
        ]
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

def scrape_article(article_info):
    """Scrape one Digi24 article.

    Args:
        article_info: Dict with the keys "url" and "category".

    Returns:
        A dict with the keys "url", "title", "tag" (always "real_news") and
        "content", or None when the page has no paragraph text or when
        fetching/parsing failed (the error is printed).
    """
    try:
        url = article_info["url"]
        # The category is looked up but is not part of the returned record;
        # a missing key is still reported through the except branch.
        article_info["category"]

        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Title: first <h1> of the page.
        heading = soup.find("h1")
        title = heading.get_text(strip=True) if heading else ""

        # Body text: the non-empty paragraphs carrying a data-index attribute.
        indexed_paragraphs = soup.find_all("p", attrs={"data-index": True})
        paragraph_texts = [p.get_text(strip=True) for p in indexed_paragraphs]
        content = "\n".join(piece for piece in paragraph_texts if piece)

        if content:
            return {
                "url": url,
                "title": title,
                "tag": 'real_news',
                "content": content
            }
        return None

    except Exception as e:
        print(f"Failed to scrape article: {article_info['url']} -> {e}")
        return None


# Collect the article links of every section, keeping each URL only once.
# The section URLs are nested ("politica" sits under "actualitate"), so the
# same article may be listed by more than one section; without this check it
# would be downloaded and written to the CSV several times. The first section
# that lists a URL wins.
article_links = []
seen_urls = set()
for category, base_url in BASE_SECTIONS.items():
    for page in range(1, PAGES_PER_SECTION + 1):
        for link in extract_article_links(category, base_url, page):
            if link["url"] not in seen_urls:
                seen_urls.add(link["url"])
                article_links.append(link)

print(f"Collected {len(article_links)} article URLs. Now scraping content...")

# Scrape the articles in parallel; failed or empty articles come back as None
# and are skipped.
articles_data = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(scrape_article, info) for info in article_links]
    for future in as_completed(futures):
        result = future.result()
        if result:
            articles_data.append(result)

# Save to CSV.
with open("digi24_articles.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["url", "title", "tag", "content"])
    writer.writeheader()
    writer.writerows(articles_data)

print(f"Finished: {len(articles_data)} articles saved to digi24_articles.csv")


COMBINAM TOATE DATELE INTR-UN SINGUR SET

In [None]:
import pandas as pd
from datasets import load_dataset

# Fetch the FakeRom train split and turn it into a DataFrame.
hf_dataset_dict = load_dataset("mateiaass/FakeRom")
hf_dataset = hf_dataset_dict["train"]
df = hf_dataset.to_pandas()

# FakeRom folder name -> tag used throughout this notebook.
folder_to_tag = {
    "stiri_reale_img": "real_news",
    "stiri_satirice_img": "satire",
    "stiri_propagandistice_img": "propaganda",
    "stiri_fabricate_img": "fake_news",
    "stiri_plauzibile_img": "misinformation"
}

# Keep only the folder and the text, remove the literal marker
# "---Vaccin Sample---" (the surrounding text stays) and derive the tag.
df = df[["Folder", "Text"]].copy()
df["Text"] = df["Text"].str.replace("---Vaccin Sample---", "", regex=False).str.strip()
df["tag"] = df["Folder"].map(folder_to_tag)

# Rename, reduce to the two shared columns and drop incomplete rows.
df = (
    df.rename(columns={"Text": "content"})[["tag", "content"]]
    .dropna(subset=["tag", "content"])
)

# Locally scraped datasets.
# NOTE(review): the Digi24 scraping cell writes "digi24_articles.csv"; the
# "_deduplicated" file read here is presumably produced by a step that is not
# part of this code — confirm it exists.
veridica_df = pd.read_csv("veridica_articles.csv")
digi24_df = pd.read_csv("digi24_articles_deduplicated.csv")
times_df = pd.read_csv("timesnewroman_200_per_category.csv")

# Every scraped dataset must provide both shared columns.
scraped_frames = {"veridica": veridica_df, "digi24": digi24_df, "timesnewroman": times_df}
required_columns = {"tag", "content"}
for name, dataset in scraped_frames.items():
    if not required_columns <= set(dataset.columns):
        raise ValueError(f"The {name} dataset must contain 'tag' and 'content' columns.")

# Stack FakeRom and the scraped data, then shuffle reproducibly.
frames = [df] + [frame[["tag", "content"]] for frame in scraped_frames.values()]
combined_df = pd.concat(frames, ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the final dataset.
combined_df.to_csv("augmented_fake_rom_combined.csv", index=False)

# Summary.
print("Final combined dataset saved as 'augmented_fake_rom_combined.csv'.")
print(f"Total samples: {len(combined_df)}")
print("Tag distribution:")
print(combined_df['tag'].value_counts())


# Impartirea textelor in fragmente pentru a echilibra numarul de exemple pe clasa

In [None]:
import pandas as pd
import spacy
from collections import defaultdict

def _chunk_sentences(sentences, max_length):
    """Greedily pack sentences into space-joined chunks of at most max_length characters.

    A single sentence longer than ``max_length`` becomes a chunk of its own
    (it is not cut). No empty chunk is ever produced, including when the very
    first sentence is already longer than ``max_length``.
    """
    chunks = []
    current = ""
    for sent in sentences:
        if not current:
            # Nothing accumulated yet: start a chunk instead of flushing "".
            current = sent
        elif len(current) + 1 + len(sent) <= max_length:
            current += " " + sent
        else:
            chunks.append(current)
            current = sent
    if current:
        chunks.append(current)
    return chunks


# Load the spaCy model for Romanian (used for sentence splitting).
nlp = spacy.load("ro_core_news_sm")

# Load the combined dataset.
df = pd.read_csv("augmented_fake_rom_combined.csv")

# Remove exact duplicates based on "content".
df = df.drop_duplicates(subset="content").copy()

# Process the longest texts first.
df["content_length"] = df["content"].str.len()
df = df.sort_values(by="content_length", ascending=False).reset_index(drop=True)

# Parameters
tags_to_split = {"propaganda", "fake_news", "misinformation"}  # tags that may be chunked
max_length = 512        # maximum chunk length, in characters
chunking_limit = 1000   # a tag is no longer chunked once it has this many rows

# Per-tag row counts: before processing, and updated as chunks are accepted.
initial_tag_counts = df["tag"].value_counts().to_dict()
current_tag_counts = initial_tag_counts.copy()

# Net number of rows added per tag by chunking.
new_tag_counts = defaultdict(int)

# Final processed rows.
final_rows = []

for _, row in df.iterrows():
    tag = row["tag"]
    content = str(row["content"]).strip()

    # Tags outside tags_to_split are copied unchanged.
    if tag not in tags_to_split:
        final_rows.append({"tag": tag, "content": content})
        continue

    # Keep the text whole when the tag reached its limit or the text is short.
    if current_tag_counts[tag] >= chunking_limit or len(content) <= max_length:
        final_rows.append({"tag": tag, "content": content})
        continue

    # Split into sentences and pack them into chunks.
    doc = nlp(content)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    chunks = _chunk_sentences(sentences, max_length)

    # If splitting this text would exceed the limit, keep the original.
    if current_tag_counts[tag] - 1 + len(chunks) > chunking_limit:
        final_rows.append({"tag": tag, "content": content})
        continue

    # Otherwise replace the original row by its chunks.
    for chunk in chunks:
        final_rows.append({"tag": tag, "content": chunk})
    current_tag_counts[tag] = current_tag_counts[tag] - 1 + len(chunks)
    new_tag_counts[tag] += len(chunks) - 1  # for the summary

# Convert to a DataFrame and shuffle reproducibly.
new_df = pd.DataFrame(final_rows)
new_df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the result.
new_df.to_csv("augmented_fake_rom_combined_correctly_split.csv", index=False)

# Print the summary.
print("Processed and saved to 'augmented_fake_rom_combined_correctly_split.csv'.")
print(f"Original rows: {len(df)} → Final rows (with augmentation): {len(new_df)}")
print("Initial tag distribution:")
print(pd.Series(initial_tag_counts))
print("Final tag distribution:")
print(new_df['tag'].value_counts())
print("Augmentation impact (added chunks minus originals replaced):")
print(pd.Series(new_tag_counts))

# Cream setul de date tradus
1. Facem traducerile
2. Verificam similaritatea

Traducem setul de date ro->zh->ro

In [None]:
# 1. Setup si import-uri
!pip install google-cloud-translate==2.0.1 --quiet
!pip install pandas tqdm --quiet

import os
import time
import random
import pandas as pd
from tqdm.auto import tqdm
from google.api_core import exceptions
from google.cloud import translate_v2 as translate

# 2. Configuration
# NOTE(review): in the code shown here GCP_PROJECT_ID is only read by the
# placeholder check below.
GCP_PROJECT_ID = "translate"
# Service-account key file; exported through GOOGLE_APPLICATION_CREDENTIALS below.
PATH_TO_GCP_KEY = "/content/strange-bird-462911-i4-468a1f01f318.json"
# Input CSV (must have a 'content' column) and the output locations.
PATH_TO_DATASET = "augmented_fake_rom_combined_correctly_split.csv"
OUTPUT_CSV_PATH = "5_class_back_translated_dataset.csv"
ERROR_LOG_PATH = "translation_errors.log"

# Back-translation route: SOURCE -> INTERMEDIATE (or FALLBACK) -> TARGET.
SOURCE_LANGUAGE = 'ro'
INTERMEDIATE_LANGUAGE = 'zh-CN'
FALLBACK_LANGUAGE = 'en'
TARGET_LANGUAGE = 'ro'
# Seconds slept between dataset rows; also the base of the retry back-off.
DELAY_BETWEEN_CALLS = 0.1
# Maximum number of attempts per translation call.
MAX_RETRIES = 3

# Sanity-check the configuration (problems are only printed, not raised).
if GCP_PROJECT_ID == "your-gcp-project-id" or PATH_TO_GCP_KEY == "your-gcp-key-file.json":
    print("WARNING: Update GCP_PROJECT_ID si PATH_TO_GCP_KEY.")
if not os.path.exists(PATH_TO_DATASET):
    print(f"ERROR: Datasetul nu a fost gasit la '{PATH_TO_DATASET}'.")
if not os.path.exists(PATH_TO_GCP_KEY):
    print(f"ERROR: Cheia GCP nu a fost gasita la '{PATH_TO_GCP_KEY}'.")

# 3. Google Cloud authentication
# The client is created even when the checks above reported a missing key.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = PATH_TO_GCP_KEY
translate_client = translate.Client()

# 4. Functie de traducere cu retry

def safe_translate(text, source, target, retries=MAX_RETRIES):
    """Translate ``text`` with the Cloud Translation client, retrying API errors.

    The text is sent as plain text (``format_="text"``) so the service does
    not treat it as HTML and return HTML entities in the translation.

    Args:
        text: Text to translate.
        source: Source language code.
        target: Target language code.
        retries: Maximum number of attempts.

    Returns:
        ``(translated_text, None)`` on success, or ``(text, error_message)``
        when all attempts failed, a non-API error occurred, or ``retries`` is
        not positive.
    """
    for attempt in range(retries):
        try:
            result = translate_client.translate(
                text,
                source_language=source,
                target_language=target,
                format_="text",
            )
            return result['translatedText'], None
        except exceptions.GoogleAPICallError as e:
            if attempt < retries - 1:
                # Exponential back-off with jitter before the next attempt.
                sleep_time = DELAY_BETWEEN_CALLS * (2 ** attempt + random.random())
                time.sleep(sleep_time)
            else:
                return text, f"API_ERROR - {e.message}"
        except Exception as e:
            return text, f"GENERAL_ERROR - {str(e)}"

    # Only reached when the loop body never ran; callers always unpack a pair.
    return text, f"GENERAL_ERROR - no translation attempted (retries={retries})"


def back_translate(text: str, src_lang: str, intermediate_lang: str, target_lang: str):
    """Translate ``text`` to an intermediate language and back.

    Non-string input is converted to a string first (missing values become
    ""). If the translation into ``intermediate_lang`` fails,
    ``FALLBACK_LANGUAGE`` is used as the intermediate language instead.

    Returns:
        ``(back_translated_text, None)`` on success; otherwise the text
        (after the string conversion) together with an error description.
        Empty or whitespace-only input is returned with a "SKIP" message.
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    if text.strip() == "":
        return text, "SKIP - input invalid sau gol"

    # First attempt: the main intermediate language.
    pivot_lang = intermediate_lang
    pivot_text, primary_error = safe_translate(text, src_lang, pivot_lang)
    if primary_error:
        # The main intermediate language failed: retry through the fallback.
        pivot_lang = FALLBACK_LANGUAGE
        pivot_text, fallback_error = safe_translate(text, src_lang, pivot_lang)
        if fallback_error:
            return text, f"FAILED_BOTH - {primary_error} | FALLBACK - {fallback_error}"

    # Way back into the target language.
    final, final_error = safe_translate(pivot_text, pivot_lang, target_lang)
    if final_error:
        return text, final_error
    return final, final_error

# 5. Process the dataset
back_translated_content = []  # one entry per dataset row, in row order
error_log = []                # "index,error,snippet" lines for failed rows

try:
    df = pd.read_csv(PATH_TO_DATASET)
    print(f"Incarcat datasetul: {df.shape} linii.")

    # Make sure 'content' is text. Missing values must be filled BEFORE the
    # cast: astype(str) would turn NaN into the literal string "nan", which
    # would then be sent to the translator instead of being skipped.
    df['content'] = df['content'].fillna("").astype(str)

    for row in tqdm(df.itertuples(index=True), total=len(df), desc="Translating entries"):
        index = row.Index
        original_text = row.content

        if not isinstance(original_text, str):
            print(f"Warning: entry {index} is not a string (type={type(original_text)}). Converting.")

        translated_text, error = back_translate(
            original_text,
            SOURCE_LANGUAGE,
            INTERMEDIATE_LANGUAGE,
            TARGET_LANGUAGE
        )

        back_translated_content.append(translated_text)

        if error:
            # Commas are removed from the snippet because the log is comma-separated.
            error_log.append(f"{index},{error},{original_text[:100].replace(',', ' ')}")

        time.sleep(DELAY_BETWEEN_CALLS)

    df['back_translated_content'] = back_translated_content
    print("\nTraducerea inversa s-a terminat cu succes.")

except FileNotFoundError:
    print(f"ERROR: Fisierul '{PATH_TO_DATASET}' nu a fost gasit.")
except KeyError:
    print("ERROR: CSV-ul trebuie sa contina o coloana numita 'content'.")
except Exception as e:
    print(f"Unexpected error: {e}")

# 6. Save the results
if 'df' in locals() and not df.empty:
    df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
    print(f"Salvat datasetul augmentat la '{OUTPUT_CSV_PATH}'")

    if error_log:
        with open(ERROR_LOG_PATH, 'w', encoding='utf-8') as f:
            f.write("index,error,message\n")
            f.write("\n".join(error_log))
        print(f"Erorile au fost salvate in '{ERROR_LOG_PATH}'")
    else:
        print("Nicio eroare nu a fost intalnita in timpul traducerii.")

Analizam similaritatea semantica

In [None]:

# 1. setup si import-uri necesare
#!pip install pandas --quiet
#!pip install sentence-transformers --quiet
#!pip install matplotlib seaborn --quiet

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import os
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries installed and imported successfully.")

# 2. Configuration
PATH_TO_BACK_TRANSLATED_DATASET = "5_class_back_translated_dataset.csv"
FINAL_CSV_WITH_SIMILARITY_PATH = "similarity_report.csv"

# A multilingual model is used so that Romanian text is supported.
SIMILARITY_MODEL_NAME = 'distiluse-base-multilingual-cased-v1'

# Index label of the single row excluded from the analysis.
# NOTE(review): the comment and log message previously said 2448 while the
# code dropped 2124; the value the code actually used is kept here — confirm
# which row is meant.
ROW_TO_DROP = 2124

if not os.path.exists(PATH_TO_BACK_TRANSLATED_DATASET):
    print(f"ERROR: Dataset file not found at '{PATH_TO_BACK_TRANSLATED_DATASET}'.")
    print("Please make sure the file path is correct.")

# 3. Load the data and the model
try:
    # Load the back-translated dataset.
    df_results = pd.read_csv(PATH_TO_BACK_TRANSLATED_DATASET)
    print(f"Dataset '{PATH_TO_BACK_TRANSLATED_DATASET}' loaded successfully. Shape: {df_results.shape}")

    # Drop the excluded row if it is present; the message reports the index
    # that was really removed.
    if ROW_TO_DROP in df_results.index:
        df_results = df_results.drop(index=ROW_TO_DROP)
        print(f"Row {ROW_TO_DROP} has been dropped.")

    # Check that the required columns exist.
    required_cols = ['tag','content', 'back_translated_content']
    if not all(col in df_results.columns for col in required_cols):
        raise KeyError("The CSV must contain 'content', 'back_translated_content', and 'tag' columns.")

    # Load the sentence-transformer model.
    print(f"\nLoading sentence transformer model '{SIMILARITY_MODEL_NAME}'...")
    print("This may take a few moments.")
    model = SentenceTransformer(SIMILARITY_MODEL_NAME)
    print(" Model loaded successfully.")

except FileNotFoundError:
    print(f" FATAL ERROR: The file '{PATH_TO_BACK_TRANSLATED_DATASET}' was not found.")
except KeyError as e:
    print(f" FATAL ERROR: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


# 4. Compute the semantic similarity
# Runs only when both the model and the data were loaded by the previous step.
if 'model' in locals() and 'df_results' in locals():
    print("\n--- Starting Semantic Similarity Calculation ---")
    try:
        # Treat both columns as strings so missing values cannot cause errors.
        original_texts = df_results['content'].fillna('').astype(str).tolist()
        back_translated_texts = df_results['back_translated_content'].fillna('').astype(str).tolist()

        # Embed both versions of every text.
        print("Generating embeddings for original texts...")
        embeddings1 = model.encode(original_texts, convert_to_tensor=True, show_progress_bar=True)
        print("\nGenerating embeddings for back-translated texts...")
        embeddings2 = model.encode(back_translated_texts, convert_to_tensor=True, show_progress_bar=True)

        # Only the similarity between text i and ITS OWN back-translation is
        # needed, so compute the row-wise cosine similarity directly instead
        # of building the full N x N matrix and reading its diagonal.
        print("\nCalculating cosine similarity scores...")
        cosine_scores = util.pairwise_cos_sim(embeddings1, embeddings2)
        similarities = cosine_scores.tolist()

        # Store the scores as a new column (assigned by position).
        df_results['semantic_similarity'] = similarities
        print("Similarity calculation complete.")

    except Exception as e:
        print(f" An error occurred during similarity calculation: {e}")
else:
    print("\nSkipping calculation because the model or data was not loaded correctly.")


# 5. Analyse and save the results
# df_results does not exist when loading the CSV failed, so its presence is
# checked before its columns are inspected.
if 'df_results' in locals() and 'semantic_similarity' in df_results.columns:
    print("\n--- Overall Analysis of Semantic Similarity Scores ---")
    average_similarity = df_results['semantic_similarity'].mean()
    median_similarity = df_results['semantic_similarity'].median()
    std_dev = df_results['semantic_similarity'].std()
    min_similarity = df_results['semantic_similarity'].min()
    max_similarity = df_results['semantic_similarity'].max()

    print(f"Average Similarity: {average_similarity:.4f}")
    print(f"Median Similarity:  {median_similarity:.4f}")
    print(f"Standard Deviation: {std_dev:.4f}")
    print(f"Minimum Similarity: {min_similarity:.4f}")
    print(f"Maximum Similarity: {max_similarity:.4f}")

    # Per-tag analysis.
    print("\n--- Average Similarity per Tag Class ---")
    try:
        # Mean similarity per tag, highest first.
        similarity_by_tag = df_results.groupby('tag')['semantic_similarity'].mean().sort_values(ascending=False)
        print(similarity_by_tag.to_string())
    except KeyError:
        print("Could not perform grouped analysis because 'tag' column was not found.")

    # Distribution plot, with a fixed colour per tag.
    custom_palette = {
        'fake_news': 'tab:blue',
        'misinformation': 'tab:orange',
        'propaganda': 'tab:green',
        'real_news': 'tab:red',
        'satire': 'tab:purple'
    }

    plt.style.use('seaborn-v0_8-whitegrid')
    plt.figure(figsize=(12, 6))

    sns.histplot(
        data=df_results,
        x='semantic_similarity',
        hue='tag',
        bins=25,
        multiple='dodge',  # separate bars per tag
        palette=custom_palette,
        alpha=0.6,
        edgecolor='black'
    )

    # Save the final dataset.
    df_results.to_csv(FINAL_CSV_WITH_SIMILARITY_PATH, index=False, encoding='utf-8')
    print(f"\n Final dataset with similarity scores saved to '{FINAL_CSV_WITH_SIMILARITY_PATH}'")

    # Show the 5 least similar pairs for manual review.
    print("\n--- Top 5 lowest similarity examples for manual review ---")
    print(df_results.nsmallest(5, 'semantic_similarity')[['tag', 'content', 'back_translated_content', 'semantic_similarity']])

else:
    print("\nSkipping analysis because similarity scores were not calculated.")


# Incercarile nereusite de a introduce zgomot si de a reduce numarul de clase la 3


In [None]:
#incercarea cu setul cu 3 clase care n a mers

import csv

# Source CSV files, merged in this order.
input_files = [
    "digi24_articles.csv",
    "timesnewroman_200_per_category.csv",
    "veridica_articles.csv"
]

# Destination file and the columns it keeps.
output_file = "3_class_dataset.csv"
fieldnames = ["tag", "content"]

# Files whose rows all receive one fixed tag regardless of their own "tag"
# column; every other file keeps the tag stored in each row.
forced_tags = {"veridica_articles.csv": "fake_news"}

with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for file in input_files:
        override = forced_tags.get(file)
        with open(file, newline='', encoding='utf-8') as infile:
            for row in csv.DictReader(infile):
                content = row.get("content", "").strip()
                if not content:
                    # Rows without text are skipped.
                    continue

                tag = override if override is not None else row.get("tag", "").strip()

                # Rows without a tag are skipped as well.
                if tag:
                    writer.writerow({"tag": tag, "content": content})

print(f"Combined dataset saved to: {output_file}")


Experimentul de introdus zgomot

In [None]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


DATA_PATH = "3_class_dataset.csv"
RANDOM_SEED = 42
AUGMENT_FRACTION = 1.0  # 1.0 = augment the whole training set

# Seed both random generators for reproducibility.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load the data and remove rows with duplicate content.
print("Loading and deduplicating data...")
df = pd.read_csv(DATA_PATH)
print(f"Original dataset size: {len(df)}")

df = df.drop_duplicates(subset="content").reset_index(drop=True)
print(f"After deduplication: {len(df)}")

# Encode the tags as integer labels.
le = LabelEncoder()
df["label"] = le.fit_transform(df["tag"])
label_mapping = {
    class_name: code
    for class_name, code in zip(le.classes_, le.transform(le.classes_))
}
print("Label mapping:", label_mapping)
print("Class distribution:")
print(df["tag"].value_counts())

# Stratified split: 80% train, then the remaining 20% halved into
# validation and test (10% each).
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=RANDOM_SEED,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=RANDOM_SEED,
)

print(f"\nDataset splits:")
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name}: {len(split_df)} ({len(split_df)/len(df)*100:.1f}%)")

# definim functiile de noise pt augmentare
def word_dropout(text, p=0.1):
    """Randomly remove words from ``text``.

    Every whitespace-separated word is kept when a fresh random draw is
    greater than ``p``. Texts with at most one word are returned unchanged.
    When every word would be dropped, one randomly chosen word is returned.
    """
    words = text.split()
    if len(words) <= 1:  # too short to modify
        return text

    survivors = []
    for word in words:
        if random.random() > p:
            survivors.append(word)

    if survivors:
        return " ".join(survivors)
    # Nothing survived: keep one random word so the text is never empty.
    return random.choice(words)

def random_swap(text, n=1):
    """Swap randomly chosen pairs of adjacent words in ``text``.

    At most ``n`` swaps are performed, and never more than the number of
    adjacent pairs. Texts with fewer than two words are returned unchanged.
    """
    words = text.split()
    if len(words) < 2:
        return text

    shuffled = list(words)
    last_left_index = len(shuffled) - 2
    swap_count = min(n, len(shuffled) - 1)
    for _ in range(swap_count):
        left = random.randint(0, last_left_index)
        right = left + 1
        shuffled[left], shuffled[right] = shuffled[right], shuffled[left]
    return " ".join(shuffled)

def noisy_augment(text):
    """Apply each noise function to the text with probability 0.3.

    Missing or non-string values are returned as they are. Otherwise word
    dropout (p=0.1) and then a single adjacent-word swap are each applied
    when their own random draw falls below 0.3.
    """
    missing_or_not_text = pd.isna(text) or not isinstance(text, str)
    if missing_or_not_text:
        return text

    result = word_dropout(text, p=0.1) if random.random() < 0.3 else text
    result = random_swap(result, n=1) if random.random() < 0.3 else result
    return result

# Augment the training data only
print("\nApplying data augmentation...")
augmented_train_df = train_df.copy()
augmented_train_df["content"] = augmented_train_df["content"].apply(noisy_augment)

# Drop augmented rows whose text already exists in the original training set
# (for example rows the augmentation left unchanged)
augmented_train_df = augmented_train_df[
    ~augmented_train_df["content"].isin(train_df["content"])
].reset_index(drop=True)

print(f"Unique augmented samples: {len(augmented_train_df)}")

if AUGMENT_FRACTION > 0:
    # Append a random AUGMENT_FRACTION share of the augmented rows (capped at all of them)
    samples_to_add = int(len(augmented_train_df) * AUGMENT_FRACTION)
    augmented_subset = augmented_train_df.sample(
        n=min(samples_to_add, len(augmented_train_df)),
        random_state=RANDOM_SEED
    )
    final_train_df = pd.concat([train_df, augmented_subset], ignore_index=True)
else:
    final_train_df = train_df.copy()

# Shuffle the training set
final_train_df = final_train_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Validate and save the dataset
print("\nFinal dataset sizes:")
print(f"  Train (with augmentation): {len(final_train_df)} samples")
print(f"  Validation: {len(val_df)} samples")
print(f"  Test: {len(test_df)} samples")

# Leakage check: count texts that appear verbatim in more than one split
train_content = set(final_train_df["content"])
val_content = set(val_df["content"])
test_content = set(test_df["content"])

train_val_overlap = len(train_content & val_content)
train_test_overlap = len(train_content & test_content)
val_test_overlap = len(val_content & test_content)

if train_val_overlap > 0 or train_test_overlap > 0 or val_test_overlap > 0:
    print(f" Data leakage detected:")
    print(f"  Train-Val overlap: {train_val_overlap}")
    print(f"  Train-Test overlap: {train_test_overlap}")
    print(f"  Val-Test overlap: {val_test_overlap}")
else:
    print(" No data leakage detected")

# Save the new splits
final_train_df.to_csv("train_noisy.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

import json
# Convert the encoded labels to plain ints before writing them as JSON
label_mapping_serializable = {k: int(v) for k, v in label_mapping.items()}
with open("label_mapping.json", "w") as f:
    json.dump(label_mapping_serializable, f, indent=2)

print("\n Data prepared and saved:")
print("  Files: train_noisy.csv, val.csv, test.csv, label_mapping.json")

# Pipeline SVC si MNB
Include si testarile pentru setul de date tradus

Cautam hiperparametrii optimi si antrenam modelele

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Romanian stop words; the NLTK corpus is downloaded first if it is not available locally
def _romanian_stopwords():
    """Return NLTK's Romanian stop words as a set."""
    return set(nltk.corpus.stopwords.words('romanian'))


try:
    stopwords_set = _romanian_stopwords()
except LookupError:
    print("Downloading 'stopwords' corpus...")
    nltk.download('stopwords')
    stopwords_set = _romanian_stopwords()

# functii de preprocesare si curatare

def clean_for_ml(text):
    """Normalize raw text for the classical ML models.

    Lower-cases the text, removes every character that is not a word
    character, whitespace or hyphen, and collapses whitespace runs into
    single spaces. Non-string input yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        without_symbols = re.sub(r"[^\w\săâîșț-]", "", lowered, flags=re.UNICODE)
        return re.sub(r"\s+", " ", without_symbols).strip()
    return ""

def scrape_and_clean(url):
    """Download a page and return the cleaned text of all its <p> elements.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The cleaned text (str) on success. On a non-200 response or any
        exception, a dict of the form ``{"error": <message>}`` is returned
        instead, so callers must check the result type.
    """
    try:
        # Bound the request so an unresponsive host cannot block forever.
        page = requests.get(url, timeout=10)
        if page.status_code != 200:
            return {"error": f"Failed to fetch page. Status Code: {page.status_code}"}

        soup = BeautifulSoup(page.text, 'html.parser')
        paragraph = " ".join([p.text.strip() for p in soup.find_all('p')])

        # Apply the same cleaning used for the training data
        cleaned_text = clean_for_ml(paragraph)
        return cleaned_text

    except Exception as e:
        # Best-effort scraper: report the failure to the caller instead of raising
        return {"error": str(e)}


# Load the dataset and split the data

try:
    df = pd.read_csv("augmented_fake_rom_combined_correctly_split.csv")
except FileNotFoundError:
    # Report the file that was actually requested, then stop with a non-zero exit status.
    print("Error: 'augmented_fake_rom_combined_correctly_split.csv' not found. Please make sure the file is in the correct directory.")
    raise SystemExit(1)

# Drop rows without text, clean the text, and discard rows that end up empty
df = df.dropna(subset=["content"])
df["content_cleaned"] = df["content"].astype(str).apply(clean_for_ml)
df = df[df["content_cleaned"].str.strip() != ""]

X = df["content_cleaned"]
y_labels = df["tag"]

# Encode the string tags as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_labels)

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("-" * 30)


# Hyperparameter tuning with grid search

# Cross-validation strategy shared by both searches
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


print("Tuning Support Vector Machine...")
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(stopwords_set))),
    ('svm', SVC(kernel='linear', random_state=42))
])

# Parameter grid: n-gram range of the vectorizer and the SVM regularization strength
param_grid_svm = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [ 1 , 0.01, 0.1, 10]
}


# Instantiate the search. scikit-learn's scorer name is 'f1_macro' (underscore);
# 'f1-macro' is rejected as an invalid scoring value.
search_svm = GridSearchCV(
    pipeline_svm,
    param_grid_svm,
    cv=cv,
    scoring='f1_macro',
)
search_svm.fit(X_train, y_train)
best_svm = search_svm.best_estimator_

print(f"\nBest parameters for SVM: {search_svm.best_params_}")
print(f"Best cross-validated F1-score for SVM: {search_svm.best_score_:.4f}")
print("-" * 30)


print("Tuning Multinomial Naive Bayes...")
pipeline_nb = Pipeline([
    ('countvec', CountVectorizer(stop_words=list(stopwords_set))),
    ('nb', MultinomialNB())
])

# Only the smoothing parameter is tuned for Naive Bayes
param_grid_nb = {
    #'countvec__ngram_range': [(1, 1)],
    'nb__alpha': [0.01, 0.1, 0.5, 1.0]
}


search_nb = GridSearchCV(
    pipeline_nb,
    param_grid_nb,
    cv=cv,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1
)
search_nb.fit(X_train, y_train)
best_nb = search_nb.best_estimator_

print(f"\nBest parameters for MNB: {search_nb.best_params_}")
print(f"Best cross-validated F1-score for MNB: {search_nb.best_score_:.4f}")
print("-" * 30)


# Final evaluation on the test set

print("\n--- Final Evaluation on the Unseen Test Set ---")

# SVM evaluation
y_pred_svm = best_svm.predict(X_test)
print("Test Set Classification Report for Best SVM:\n")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

# MNB evaluation
y_pred_nb = best_nb.predict(X_test)
print("Test Set Classification Report for Best Multinomial Naive Bayes:\n")
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))

# Pull the fitted vectorizers out of the best pipelines so they can be saved on their own
svm_vectorizer = best_svm.named_steps['tfidf']
nb_vectorizer = best_nb.named_steps['countvec']

# Save the best pipelines, the label encoder and the vectorizers
print("\nSaving best models and artifacts...")
joblib.dump(best_svm, 'best_svm_pipeline.pkl')
joblib.dump(best_nb, 'best_nb_pipeline.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(svm_vectorizer, 'svm_vectorizer.pkl')
joblib.dump(nb_vectorizer, 'nb_vectorizer.pkl')
print("Vectorizers saved successfully.")
print("Artifacts saved successfully.")
print("-" * 30)


# Confusion-matrix plotting helper
def plot_conf_matrix(y_true, y_pred, model_name, labels):
    """Draw and show the confusion matrix for one model's predictions."""
    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(
        cmap="Blues", xticks_rotation=45
    )
    plt.title(f"Confusion Matrix - {model_name}")
    plt.tight_layout()
    plt.show()

plot_conf_matrix(y_test, y_pred_svm, "SVM original dataset", label_encoder.classes_)
plot_conf_matrix(y_test, y_pred_nb, "MultinomialNB original dataset", label_encoder.classes_)


# predictii pt 3 exemple
def predict_news_type(pipeline, text_input):
    """Predict the news category of a single text and return its class name.

    Args:
        pipeline: Fitted estimator exposing ``predict``.
        text_input: Cleaned article text. ``scrape_and_clean`` returns a
            ``{"error": ...}`` dict instead of text when fetching fails.

    Returns:
        The class name decoded with the module-level ``label_encoder``.

    Raises:
        ValueError: If ``text_input`` is not a string; the scraper's error
            message is included when one is available.
    """
    if not isinstance(text_input, str):
        # Surface the scraping failure instead of an obscure vectorizer error
        reason = text_input.get("error") if isinstance(text_input, dict) else None
        detail = reason if reason is not None else f"got {type(text_input).__name__}"
        raise ValueError(f"Cannot classify: no article text available ({detail})")

    pred_encoded = pipeline.predict([text_input])
    return label_encoder.inverse_transform(pred_encoded)[0]

print("\n--- Example Predictions on New Data ---")
try:
    # Three sample article URLs, one per site
    url1 = 'https://www.timesnewroman.ro/politic/cine-este-coco-papagalul-care-a-invatat-o-pe-anamaria-gavrila-sa-spuna-pacea-i-pace/'
    url2 = 'https://www.digi24.ro/alegeri-prezidentiale-2025/alegeri-prezidentiale-2025-turul-2-3243031'
    url3 = 'https://www.activenews.ro/opinii/Kievul-tocmai-a-aruncat-in-aer-Negocierile-de-Pace-–-Romania-in-pericol-maxim-195903'

    # scrape_and_clean returns a {"error": ...} dict instead of text when fetching fails
    text1 = scrape_and_clean(url1)
    text2 = scrape_and_clean(url2)
    text3 = scrape_and_clean(url3)

    print(f"Best SVM Prediction (case 1): {predict_news_type(best_svm, text1)}")
    print(f"Best SVM Prediction (case 2): {predict_news_type(best_svm, text2)}")
    print(f"Best SVM Prediction (case 3): {predict_news_type(best_svm, text3)}")
    print("-" * 15)
    print(f"Best NB Prediction (case 1): {predict_news_type(best_nb, text1)}")
    print(f"Best NB Prediction (case 2): {predict_news_type(best_nb, text2)}")
    print(f"Best NB Prediction (case 3): {predict_news_type(best_nb, text3)}")

except NotImplementedError as e:
    print(f"\nCould not run example predictions: {e}")
except Exception as e:
    # Any other failure (scraping or prediction) is reported without aborting the cell
    print(f"\nAn error occurred during prediction: {e}")


Testam MNB pe setul tradus

In [None]:
import pandas as pd
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
# Preprocessing function
def clean_for_ml(text):
    """Lower-case the text, strip symbols and collapse whitespace.

    Only word characters, whitespace and hyphens survive. Anything that is
    not a string is mapped to an empty string.
    """
    if isinstance(text, str):
        cleaned = re.sub(r"[^\w\săâîșț-]", "", text.lower(), flags=re.UNICODE)
        cleaned = re.sub(r"\s+", " ", cleaned)
        return cleaned.strip()
    return ""

# Load the saved MNB pipeline
print("Loading saved models and encoders...")
model = joblib.load("best_nb_pipeline.pkl")
# Rebuild the label encoder by hand from a hard-coded class list.
# NOTE(review): assumes this list matches the classes seen at training time — confirm against the saved encoder
original_classes = ['fake_news', 'misinformation', 'propaganda', 'real_news', 'satire']
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(original_classes)
print("Loaded model and label encoder.")

# Load and prepare the back-translated dataset
print("Loading and preparing dataset...")
df = pd.read_csv("5_class_back_translated_dataset.csv")
df = df.dropna(subset=["back_translated_content"])
df["content_cleaned"] = df["back_translated_content"].astype(str).apply(clean_for_ml)
df = df[df["content_cleaned"].str.strip() != ""]

X = df["content_cleaned"]
y_str = df["tag"]

# Encode the tags with the rebuilt encoder (raises on tags outside original_classes)
y = label_encoder.transform(y_str)

# Stratified split with test_size=0.2 and random_state=42; only the test part is used below
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


print("Running predictions...")
y_pred = model.predict(X_test)

# Evaluation
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

def plot_conf_matrix(y_true, y_pred, model_name, labels):
    """Plot and show a confusion matrix titled with the given model name.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        model_name: Text appended to "Confusion Matrix - " in the plot title.
        labels: Class names shown on the axes.
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    plot_obj = disp.plot(cmap="OrRd", xticks_rotation=45)
    plot_obj.ax_.grid(False)
    # Use the caller-supplied name; the title was previously hard-coded and ignored it.
    plot_obj.ax_.set_title(f"Confusion Matrix - {model_name}", pad=20)
    plt.tight_layout()
    plt.show()


# Pass the full model name so the rendered title stays the same as before.
plot_conf_matrix(y_test, y_pred, "MultinomialNB back-translated set", label_encoder.classes_)

Testam SVC pe setul tradus

In [None]:
import pandas as pd
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder

# Preprocess with the same function used at training time
def clean_for_ml(text):
    """Return a lower-cased, symbol-free, single-spaced version of the text.

    Non-string values produce an empty string.
    """
    if not isinstance(text, str):
        return ""
    stripped_symbols = re.sub(r"[^\w\săâîșț-]", "", text.lower(), flags=re.UNICODE)
    single_spaced = re.sub(r"\s+", " ", stripped_symbols)
    return single_spaced.strip()

# Load the saved SVM pipeline
print("Loading saved models and encoders...")
model = joblib.load("best_svm_pipeline.pkl")
# Rebuild the label encoder by hand from a hard-coded class list.
# NOTE(review): assumes this list matches the classes seen at training time — confirm against the saved encoder
original_classes = ['fake_news', 'misinformation', 'propaganda', 'real_news', 'satire']
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(original_classes)
print("Loaded model and label encoder.")

# Load and prepare the back-translated dataset
print("Loading and preparing dataset...")
df = pd.read_csv("5_class_back_translated_dataset.csv")
df = df.dropna(subset=["back_translated_content"])
df["content_cleaned"] = df["back_translated_content"].astype(str).apply(clean_for_ml)
df = df[df["content_cleaned"].str.strip() != ""]

X = df["content_cleaned"]
y_str = df["tag"]

# Encode the tags with the rebuilt encoder (raises on tags outside original_classes)
y = label_encoder.transform(y_str)

# Stratified split with test_size=0.2 and random_state=42; only the test part is used below
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Running predictions...")
y_pred = model.predict(X_test)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

def plot_conf_matrix(y_true, y_pred, model_name, labels):
    """Render the confusion matrix of the predictions, titled after the model."""
    display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred),
        display_labels=labels,
    )
    axes = display.plot(cmap="OrRd", xticks_rotation=45).ax_
    axes.grid(False)
    axes.set_title(f"Confusion Matrix - {model_name}", pad=20)
    plt.tight_layout()
    plt.show()


plot_conf_matrix(y_test, y_pred, "SVC back-translated set", label_encoder.classes_)


# Fine-tuning BERT
1. Experimente cu Optuna
2. BERT final
3. Testare bert pe setul tradus

In [None]:
#!pip install transformers[torch] datasets scikit-learn pandas matplotlib optuna --quiet

import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    BertForSequenceClassification
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
import random
import sys
from datetime import datetime
import matplotlib.pyplot as plt

# Redirect stdout/stderr to a timestamped log file for the rest of this cell.
# NOTE(review): the streams are only restored at the very end of the cell, so an
# exception in between leaves them redirected and the file open — consider try/finally.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = open(f"training_log_{timestamp}.txt", "w")
sys.stdout = log_file
sys.stderr = log_file

# Seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and preprocess: drop rows missing text or tag, force the text to str
df = pd.read_csv("augmented_fake_rom_combined_correctly_split.csv")
df = df.dropna(subset=["content", "tag"])
df["content"] = df["content"].astype(str)

# Encode the string tags as integer labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["tag"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

# Print the class distribution
for label, count in df["tag"].value_counts().items():
    print(f"{label}: {count} entries")

# Keep only the columns the model needs
df = df[["content", "label"]]
dataset = Dataset.from_pandas(df)

MODEL_CHECKPOINT = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(example):
    """Tokenize the "content" field, padding and truncating to 512 tokens."""
    texts = example["content"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)


# Sanity check: show the type and text of the first example
print(type(dataset[0]["content"]), dataset[0]["content"])
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the data: 80% train, then the remaining 20% halved into validation and test
split_1 = tokenized_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset = split_1["train"]
temp_dataset = split_1["test"]
split_2 = temp_dataset.train_test_split(test_size=0.5, seed=SEED)
val_dataset = split_2["train"]
test_dataset = split_2["test"]

# Metrics function
def compute_metrics(eval_pred):
    """Return accuracy, weighted F1 and macro F1 for a (logits, labels) pair."""
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predicted)
    weighted_f1 = f1_score(labels, predicted, average="weighted")
    macro_f1 = f1_score(labels, predicted, average="macro")
    return {
        "accuracy": accuracy,
        "f1_weighted": weighted_f1,
        "f1_macro": macro_f1,
    }

def model_init():
    """Build a fresh classifier from the pretrained checkpoint, one output per label."""
    n_classes = len(label_mapping)
    return BertForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=n_classes
    )

# Hyperparameter search space
def optuna_hp_space(trial: optuna.Trial):
    """Sample one set of training hyperparameters from the Optuna trial."""
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 6)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.3)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)
    return {
        "learning_rate": learning_rate,
        "num_train_epochs": num_train_epochs,
        "weight_decay": weight_decay,
        "per_device_train_batch_size": batch_size,
        "warmup_steps": warmup_steps,
    }

# Run the hyperparameter search
# Search-time arguments: evaluate and log every epoch, never save checkpoints
hpo_training_args = TrainingArguments(
    output_dir="./bert_results_hpo",
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    report_to='none',
    seed=SEED,
)


# model_init is passed instead of a model so a fresh model is built via that callback
hpo_trainer = Trainer(
    args=hpo_training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)


print("--- Starting Rigorous Hyperparameter Search ---")
try:
    # 7 Optuna trials, maximizing validation accuracy
    best_run = hpo_trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=7,
        compute_objective=lambda metrics: metrics["eval_accuracy"],
    )

    print("\n--- Hyperparameter Search Complete ---")
    print(f"Best objective value (Accuracy): {best_run.objective}")
    print(f"Best hyperparameters: {best_run.hyperparameters}")

except Exception as e:
    # Best-effort fallback: build an ad-hoc object exposing the same
    # `objective` and `hyperparameters` attributes with fixed default values
    print(f"Error during hyperparameter search: {e}")
    best_run = type('BestRun', (), {
        'objective': 0.0,
        'hyperparameters': {
            'learning_rate': 2e-5, 'num_train_epochs': 4, 'weight_decay': 0.01,
            'per_device_train_batch_size': 16, 'warmup_steps': 100
        }
    })()
    print("Using default hyperparameters due to search failure.")


# Train the final model
print("\n--- Training final model with best hyperparameters ---")

# Checkpoint every epoch and reload the best one (by accuracy) when training ends;
# the tuned hyperparameters are unpacked into the arguments
final_training_args = TrainingArguments(
    output_dir="./bert_results_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to='none',
    metric_for_best_model="accuracy", # select the best checkpoint by accuracy
    greater_is_better=True,
    save_total_limit=2,
    seed=SEED,
    **best_run.hyperparameters
)

callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Use the stock Trainer rather than a custom trainer
final_trainer = Trainer(
    model=model_init(),
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    callbacks=callbacks
)

print("Training final model...")
train_result = final_trainer.train()

# Evaluation on the held-out test split
print("\n--- Evaluating final model on test set ---")
test_metrics = final_trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Final Test set metrics:", test_metrics)

# Generate predictions and take the arg-max class per example
predictions = final_trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=-1)

print("\nFinal Classification Report:")
print(classification_report(y_true, y_pred, target_names=list(label_mapping.keys())))

# Plot the metrics collected in the trainer's log history
logs = final_trainer.state.log_history
train_loss = [x['loss'] for x in logs if 'loss' in x]
eval_loss = [x['eval_loss'] for x in logs if 'eval_loss' in x]
accuracy = [x['eval_accuracy'] for x in logs if 'eval_accuracy' in x]
f1_macro = [x['eval_f1_macro'] for x in logs if 'eval_f1_macro' in x]
f1_weighted = [x['eval_f1_weighted'] for x in logs if 'eval_f1_weighted' in x]
epochs = list(range(1, len(accuracy) + 1))

# Loss curves (plotted against the list index, i.e. starting at 0)
plt.figure()
plt.plot(train_loss, label="Train Loss")
plt.plot(eval_loss, label="Eval Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Evaluation Loss")
plt.legend()
plt.savefig("loss_curve.png")

# Validation metrics per epoch
plt.figure()
plt.plot(epochs, accuracy, label="Accuracy")
plt.plot(epochs, f1_macro, label="F1 Macro")
plt.plot(epochs, f1_weighted, label="F1 Weighted")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.title("Evaluation Metrics over Epochs")
plt.legend()
plt.savefig("metrics_curve.png")

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(label_mapping.keys()))
disp_plot = disp.plot(xticks_rotation=45, cmap='Blues')
disp_plot.ax_.grid(False)
disp_plot.ax_.set_title("Confusion Matrix - Final Test Set", pad=20)
plt.tight_layout()
plt.savefig("confusion_matrix.png")

# Save the final model together with its tokenizer
print("\nSaving final model...")
final_trainer.save_model("./fine_tuned_bert_optimized")
tokenizer.save_pretrained("./fine_tuned_bert_optimized")

print("Training complete!")

# Restore the real streams and close the log file
sys.stdout = sys.__stdout__
sys.stderr = sys.__stderr__
log_file.close()

print("Training complete! Log saved.")

In [None]:
#!pip install transformers[torch] datasets scikit-learn pandas matplotlib optuna --quiet

import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    BertForSequenceClassification
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import random
import sys
from datetime import datetime
import matplotlib.pyplot as plt

# --- Setup Logging ---
# stdout/stderr are redirected to a timestamped log file for the rest of this cell.
# NOTE(review): the streams are only restored at the very end of the cell, so an
# exception in between leaves them redirected and the file open — consider try/finally.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = open(f"training_log_{timestamp}.txt", "w")
sys.stdout = log_file
sys.stderr = log_file

# --- Reproducibility ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# --- Basic Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Data Loading and Preprocessing ---
# Drop rows missing text or tag and force the text to str
df = pd.read_csv("augmented_fake_rom_combined_correctly_split.csv")
df = df.dropna(subset=["content", "tag"])
df["content"] = df["content"].astype(str)

# Encode the string tags as integer labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["tag"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

# Print class distribution
for label, count in df["tag"].value_counts().items():
    print(f"{label}: {count} entries")

# Keep only the columns the model needs
df = df[["content", "label"]]
dataset = Dataset.from_pandas(df)

MODEL_CHECKPOINT = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(example):
    """Run the tokenizer on the "content" field with fixed-length (512) padding and truncation."""
    encoded = tokenizer(
        example["content"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return encoded


# Sanity check: show the type and text of the first example
print(type(dataset[0]["content"]), dataset[0]["content"])
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the data: 80% train, then the remaining 20% halved into validation and test
split_1 = tokenized_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset = split_1["train"]
temp_dataset = split_1["test"]
split_2 = temp_dataset.train_test_split(test_size=0.5, seed=SEED)
val_dataset = split_2["train"]
test_dataset = split_2["test"]

# Balanced class weights.
# NOTE(review): these are computed from the full df (before the split), not only
# from the training rows — confirm this is intended.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["label"]),
    y=df["label"]
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class Weights:", class_weights_tensor)

# Metrics
def compute_metrics(eval_pred):
    """Score arg-max predictions: accuracy plus weighted and macro F1."""
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predicted_ids)
    f1_w = f1_score(labels, predicted_ids, average="weighted")
    f1_m = f1_score(labels, predicted_ids, average="macro")
    return {
        "accuracy": acc,
        "f1_weighted": f1_w,
        "f1_macro": f1_m,
    }

# Custom trainer that applies the class weights in the loss
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: Tensor of per-class weights, or None for an
            unweighted loss.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss.

        Returns the loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        Extra keyword arguments are accepted and ignored.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # class_weights defaults to None; only move it to the logits' device when set
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Model initialization
def model_init():
    """Load the pretrained checkpoint with a classification head sized to the label set."""
    label_count = len(label_mapping)
    classifier = BertForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=label_count,
    )
    return classifier

# Hyperparameter search space
def optuna_hp_space(trial: optuna.Trial):
    """Draw learning rate, epochs, weight decay, batch size and warmup steps from the trial."""
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    n_epochs = trial.suggest_int("num_train_epochs", 3, 6)
    decay = trial.suggest_float("weight_decay", 0.01, 0.3)
    train_batch = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
    warmup = trial.suggest_int("warmup_steps", 0, 500)
    return {
        "learning_rate": lr,
        "num_train_epochs": n_epochs,
        "weight_decay": decay,
        "per_device_train_batch_size": train_batch,
        "warmup_steps": warmup,
    }

# Run the hyperparameter search
# Search-time arguments: evaluate and log every epoch, never save checkpoints
hpo_training_args = TrainingArguments(
    output_dir="./bert_results_hpo",
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    report_to='none',
    seed=SEED,
)

# model_init is passed instead of a model so a fresh model is built via that callback
hpo_trainer = CustomTrainer(
    class_weights=class_weights_tensor,
    args=hpo_training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

print("--- Starting Rigorous Hyperparameter Search (Objective: Maximize F1-Macro) ---")
try:
    # 10 Optuna trials, maximizing validation macro F1
    best_run = hpo_trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=10,
        compute_objective=lambda metrics: metrics["eval_f1_macro"],
    )

    print("\n--- Hyperparameter Search Complete ---")
    print(f"Best objective value (F1-Macro): {best_run.objective}")
    print(f"Best hyperparameters: {best_run.hyperparameters}")

except Exception as e:
    # Best-effort fallback: build an ad-hoc object exposing the same
    # `objective` and `hyperparameters` attributes with fixed default values
    print(f"Error during hyperparameter search: {e}")
    best_run = type('BestRun', (), {
        'objective': 0.0,
        'hyperparameters': {
            'learning_rate': 2e-5, 'num_train_epochs': 4, 'weight_decay': 0.01,
            'per_device_train_batch_size': 16, 'warmup_steps': 100
        }
    })()
    print("Using default hyperparameters due to search failure.")

# Train the final model with the best hyperparameters
print("\n--- Training final model with best hyperparameters ---")

# Checkpoint every epoch and reload the best one (by macro F1) when training ends;
# the tuned hyperparameters are unpacked into the arguments
final_training_args = TrainingArguments(
    output_dir="./bert_results_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to='none',
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,
    seed=SEED,
    **best_run.hyperparameters
)

callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Final run uses the class-weighted trainer as well
final_trainer = CustomTrainer(
    class_weights=class_weights_tensor,
    model=model_init(),
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    callbacks=callbacks
)

print("Training final model...")
train_result = final_trainer.train()
# Evaluate on the held-out test split
print("\n--- Evaluating final model on test set ---")
test_metrics = final_trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Final Test set metrics:", test_metrics)

# Generate predictions and take the arg-max class per example
predictions = final_trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=-1)

print("\nFinal Classification Report:")
print(classification_report(y_true, y_pred, target_names=list(label_mapping.keys())))

# Plot the metrics collected in the trainer's log history
logs = final_trainer.state.log_history
train_loss = [x['loss'] for x in logs if 'loss' in x]
eval_loss = [x['eval_loss'] for x in logs if 'eval_loss' in x]
accuracy = [x['eval_accuracy'] for x in logs if 'eval_accuracy' in x]
f1_macro = [x['eval_f1_macro'] for x in logs if 'eval_f1_macro' in x]
f1_weighted = [x['eval_f1_weighted'] for x in logs if 'eval_f1_weighted' in x]
epochs = list(range(1, len(accuracy) + 1))

# Loss curves (plotted against the list index, i.e. starting at 0)
plt.figure()
plt.plot(train_loss, label="Train Loss")
plt.plot(eval_loss, label="Eval Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Evaluation Loss")
plt.legend()
plt.savefig("loss_curve.png")

# Validation metrics per epoch
plt.figure()
plt.plot(epochs, accuracy, label="Accuracy")
plt.plot(epochs, f1_macro, label="F1 Macro")
plt.plot(epochs, f1_weighted, label="F1 Weighted")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.title("Evaluation Metrics over Epochs")
plt.legend()
plt.savefig("metrics_curve.png")

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(label_mapping.keys()))
disp_plot = disp.plot(xticks_rotation=45, cmap='Blues')
disp_plot.ax_.grid(False)
disp_plot.ax_.set_title("Confusion Matrix - Final Test Set", pad=20)
plt.tight_layout()
plt.savefig("confusion_matrix.png")

# Save the final model together with its tokenizer
print("\nSaving final model...")
final_trainer.save_model("./fine_tuned_bert_optimized")
tokenizer.save_pretrained("./fine_tuned_bert_optimized")

print("Training complete!")
# Restore the real streams and close the log file
sys.stdout = sys.__stdout__
sys.stderr = sys.__stderr__
log_file.close()

print("Training complete! Log saved.")


BERT varianta finala

In [None]:

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    BertForSequenceClassification
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import random
import sys
from datetime import datetime
import matplotlib.pyplot as plt


# Redirect stdout/stderr to a timestamped log file.
# NOTE(review): nothing visible here restores the streams if a later statement
# raises — consider try/finally around the redirected section.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = open(f"training_log_{timestamp}.txt", "w")
sys.stdout = log_file
sys.stderr = log_file

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Load the data, drop rows missing text or tag, force the text to str
df = pd.read_csv("augmented_fake_rom_combined_correctly_split.csv")
df = df.dropna(subset=["content", "tag"])
df["content"] = df["content"].astype(str)

# Encode the string tags as integer labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["tag"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

# Print the class distribution
for label, count in df["tag"].value_counts().items():
    print(f"{label}: {count} entries")

# Keep only the columns the model needs
df = df[["content", "label"]]
dataset = Dataset.from_pandas(df)

MODEL_CHECKPOINT = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(example):
    return tokenizer(example["content"], padding="max_length", truncation=True, max_length=512)


print(type(dataset[0]["content"]), dataset[0]["content"])
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the data 80/10/10: hold out 20% first, then cut that part in half
# into validation and test. Both splits use the same seed.
split_1 = tokenized_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset, temp_dataset = split_1["train"], split_1["test"]
split_2 = temp_dataset.train_test_split(test_size=0.5, seed=SEED)
val_dataset, test_dataset = split_2["train"], split_2["test"]

# Class weights ("balanced" mode). They are computed from the label column of
# the full dataframe, i.e. over all rows rather than the training split only.
all_labels = df["label"]
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(all_labels),
    y=all_labels,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class Weights:", class_weights_tensor)

# Metrics reported at every evaluation.
def compute_metrics(eval_pred):
    """Return accuracy, weighted F1 and macro F1 for a (logits, labels) pair.

    The predicted class is the argmax of the logits over the last axis.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted)
    weighted_f1 = f1_score(labels, predicted, average="weighted")
    macro_f1 = f1_score(labels, predicted, average="macro")
    return {
        "accuracy": accuracy,
        "f1_weighted": weighted_f1,
        "f1_macro": macro_f1,
    }

# Custom Trainer that applies per-class weights in the loss.
# NOTE(review): the original note said this class was dropped from the final
# run, but the training code in this cell still instantiates it.
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: tensor of per-class weights handed to
            ``nn.CrossEntropyLoss``; ``None`` (the default) means an
            unweighted loss.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally weighted) cross-entropy loss for one batch.

        ``labels`` is popped from ``inputs`` so the model only produces logits
        and the loss is computed here. Extra keyword arguments passed by the
        Trainer are accepted and ignored.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The constructor default is None; calling .to() on it would raise
        # AttributeError, so fall back to an unweighted loss in that case.
        weight = None
        if self.class_weights is not None:
            # Keep the weights on the same device as the logits.
            weight = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

def model_init():
    """Build a fresh sequence classifier from the pretrained checkpoint.

    The model gets one output per entry in ``label_mapping`` and a classifier
    dropout of 0.3. No encoder dropout override is passed; the two candidate
    overrides are kept below, disabled.
    """
    model_kwargs = {
        "num_labels": len(label_mapping),
        # "hidden_dropout_prob": 0.1,
        # "attention_probs_dropout_prob": 0.1,
        "classifier_dropout": 0.3,
    }
    return BertForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, **model_kwargs
    )

# Fixed hyperparameters; they are unpacked into TrainingArguments below.
hyperparameters = dict(
    learning_rate=4.021289548621635e-05,
    num_train_epochs=4,
    weight_decay=0.27790532029550763,
    per_device_train_batch_size=16,
    warmup_steps=427,
)
print(f"Using fixed hyperparameters: {hyperparameters}")


# Train the final model with the fixed hyperparameters.
print("\n--- Training model with specified hyperparameters ---")

training_args = TrainingArguments(
    **hyperparameters,
    output_dir="./bert_results_final",
    seed=SEED,
    report_to='none',
    per_device_eval_batch_size=16,
    # Evaluate, log and checkpoint once per epoch.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # At the end of training reload the checkpoint with the lowest eval loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Stop early after 3 evaluations without improvement.
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

trainer = CustomTrainer(
    model=model_init(),
    args=training_args,
    class_weights=class_weights_tensor,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)

print("Starting model training...")
train_result = trainer.train()

# Evaluate the trained model on the held-out test split.
print("\n--- Evaluating final model on test set ---")
test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Final Test set metrics:", test_metrics)

# Generate the predictions: predicted class = argmax of the logits.
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=-1)

# Pass the complete list of label ids explicitly. Without `labels`, a class
# that happens to be absent from the test split makes the number of reported
# classes differ from len(target_names) and classification_report raises.
final_report_str = classification_report(
    y_true,
    y_pred,
    labels=[int(label_id) for label_id in label_mapping.values()],
    target_names=list(label_mapping.keys()),
)
print("\nFinal Classification Report:")
print(final_report_str)


# Plot training metrics collected by the Trainer. Logging and evaluation are
# both configured per epoch, so each list holds one value per epoch.
logs = trainer.state.log_history
train_loss = [entry['loss'] for entry in logs if 'loss' in entry]
eval_loss = [entry['eval_loss'] for entry in logs if 'eval_loss' in entry]
accuracy = [entry['eval_accuracy'] for entry in logs if 'eval_accuracy' in entry]
f1_macro = [entry['eval_f1_macro'] for entry in logs if 'eval_f1_macro' in entry]
f1_weighted = [entry['eval_f1_weighted'] for entry in logs if 'eval_f1_weighted' in entry]
epochs = list(range(1, len(accuracy) + 1))

# Loss curves. The x values are given explicitly as 1-based epoch numbers so
# this figure uses the same "Epoch" axis as the metrics figure below
# (plotting the bare lists would start the axis at 0).
plt.figure()
plt.plot(range(1, len(train_loss) + 1), train_loss, label="Train Loss")
plt.plot(range(1, len(eval_loss) + 1), eval_loss, label="Eval Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Evaluation Loss")
plt.legend()
plt.grid(True)
plt.savefig("loss_curve.png")

# Evaluation metrics per epoch.
plt.figure()
plt.plot(epochs, accuracy, label="Accuracy")
plt.plot(epochs, f1_macro, label="F1 Macro")
plt.plot(epochs, f1_weighted, label="F1 Weighted")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.title("Evaluation Metrics over Epochs")
plt.legend()
plt.savefig("metrics_curve.png")

# Confusion matrix on the test split.
# `labels` fixes the matrix to one row/column per known class; otherwise a
# class missing from both y_true and y_pred shrinks the matrix and it no
# longer matches the number of display labels.
cm = confusion_matrix(
    y_true,
    y_pred,
    labels=[int(label_id) for label_id in label_mapping.values()],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(label_mapping.keys()))
disp_plot = disp.plot(xticks_rotation=45, cmap='Blues')
disp_plot.ax_.grid(False)
disp_plot.ax_.set_title("Confusion Matrix - Final Test Set", pad=20)
plt.tight_layout()
plt.savefig("confusion_matrix.png")

# Save the trained model and the tokenizer into the same directory.
print("\nSaving final model...")
final_model_dir = "./fine_tuned_bert_optimized"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training complete!")

# Undo the stdout/stderr redirection set up at the top of this cell and
# release the log file.
# NOTE(review): inside a Jupyter/Colab kernel sys.__stdout__ is presumably not
# the stream that renders cell output -- confirm the prints below are shown.
sys.stdout, sys.stderr = sys.__stdout__, sys.__stderr__
log_file.close()

# Repeat the headline result outside the log.
print("Training complete! Log saved to file.")
separator = "=" * 50
print("\n" + separator)
print("Final Classification Report (Test Set)")
print(separator)
print(final_report_str)

Testare BERT pe setul tradus

In [None]:
# Install dependencies
#!pip install transformers datasets scikit-learn matplotlib

import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import matplotlib.pyplot as plt


# Load the back-translated dataset. The column that is tokenized further down
# is "back_translated_content", so rows missing it must be dropped as well:
# a NaN there would reach the tokenizer as a non-string value.
df = pd.read_csv("5_class_back_translated_dataset.csv")
df = df.dropna(subset=["content", "back_translated_content", "tag"])
# Make sure the tokenizer only ever receives plain strings.
df["back_translated_content"] = df["back_translated_content"].astype(str)

# Load the fine-tuned tokenizer and classifier from the same directory.
model_dir = "/content/fine_tuned_bert_optimized"
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)


def tokenize_function(example):
    """Tokenize the back-translated text, truncating and padding to 512 tokens."""
    return tokenizer(
        example["back_translated_content"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )

# Wrap the dataframe, tokenize it, encode the string tags as class ids and
# rename that column to "label" (the name selected in set_format below).
dataset = Dataset.from_pandas(df)
tokenized_dataset = (
    dataset.map(tokenize_function, batched=True)
    .class_encode_column("tag")
    .rename_column("tag", "label")
)
# Expose only the model inputs and the label, as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)


# Split the set the same way as in the training cell: 80/10/10 with seed 42.
# NOTE(review): the original comment calls this "stratified", but no
# stratification argument is passed to train_test_split -- confirm intent.
split_1 = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = split_1["train"], split_1["test"]
split_2 = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = split_2["train"], split_2["test"]


# Evaluation-only configuration: no training, nothing reported to external
# trackers.
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=8,
)

# The Trainer is used here only as a convenient prediction loop.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    eval_dataset=test_dataset,
)

# Predict on the test split; the predicted class is the argmax of the logits.
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Class names in label-id order.
# NOTE(review): assumes ids 0..4 correspond to these names in this order for
# both the encoded "tag" column and the fine-tuned model -- confirm.
label_names = ["fake_news", "misinformation", "propaganda", "real_news", "satire"]
label_ids = list(range(len(label_names)))

# Metrics. `labels` pins the report to all known classes so a class missing
# from the test split cannot cause a mismatch with `target_names`, and the
# rows are shown with readable names instead of bare numeric ids.
print(
    "Classification Report:\n",
    classification_report(
        true_labels,
        pred_labels,
        labels=label_ids,
        target_names=label_names,
    ),
)

# Confusion matrix with one row/column per known class, so its shape always
# matches the display labels.
cm = confusion_matrix(true_labels, pred_labels, labels=label_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap='OrRd', xticks_rotation=45)
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()
