In [None]:
!pip install transformers torch datasets


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import math

In [None]:
# Hugging Face hub id of the checkpoint to score with.
# NOTE(review): the hub id spells "RoBERT" (not "RoBERTa"); presumably a
# Romanian BERT-style encoder -- confirm against the model card.
model_name = "readerbench/RoBERT-base"

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): a sequence-classification head is requested on top of a *base*
# checkpoint. If the checkpoint ships no classifier weights, transformers
# initialises that head randomly, so every downstream score would depend on
# the random draw -- confirm that this model class is intended.
# Seeding first makes any such random initialisation reproducible across runs.
torch.manual_seed(0)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to that device and switch it to inference mode.
model.to(device).eval()


In [None]:
def compute_metrics(model, tokenizer, text, verbose=False):
    """Score ``text`` with ``model`` and return ``(perplexity, neg_log_likelihood)``.

    The text is tokenized, run through the model without gradient tracking,
    and the resulting logits are soft-maxed over their last dimension. The
    negative log-likelihood is the mean of ``-log(p_max)``, where ``p_max`` is
    the largest probability along that dimension. Perplexity is the exponential
    of that negative log-likelihood.

    Args:
        model: Callable model whose output exposes a ``.logits`` tensor.
        tokenizer: Callable tokenizer; its result must support ``.to(device)``
            and be unpackable as keyword arguments for ``model``.
        text: The string to score.
        verbose: When true, print a one-line debugging summary.

    Returns:
        tuple[float, float]: ``(perplexity, neg_log_likelihood)``.
    """
    # Run on whatever device the model itself lives on instead of relying on a
    # notebook-global ``device`` variable being defined and in sync.
    model_device = getattr(model, "device", None)
    if model_device is None:
        model_device = next(model.parameters()).device

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model_device)

    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # The 1e-9 offset guards against log(0) when a probability underflows.
    neg_log_likelihood = -torch.mean(torch.log(probabilities.max(dim=-1).values + 1e-9)).item()

    # Perplexity is exp(NLL); previously the raw NLL was returned in both slots.
    # The NLL is bounded above by -log(1e-9), so exp() cannot overflow here.
    perplexity = math.exp(neg_log_likelihood)

    # Only print debugging info if verbose=True
    if verbose:
        print(f"Text: {text[:50]}... | NLL: {neg_log_likelihood:.4f}")

    return perplexity, neg_log_likelihood


In [None]:
import json
from tqdm import tqdm

In [None]:
# Region names to process; each one is used as a JSON file stem when the
# per-region data is loaded.
#
# Alternative region set, currently disabled:
#   Ardeal, Banat, Bucovina, Canada_EN, Canada_Quebec, Crisana, Dobrogea,
#   Germania, Italia, Maramures, Moldova, Muntenia, Oltenia, Serbia, Spania,
#   Ucraina, UK
regions = [
    'Balti', 'Cahul', 'Calarasi', 'Causeni',
    'Comrat', 'Criuleni', 'Hincesti', 'Ialoveni',
    'Orhei', 'Sangerei', 'Soroca', 'Ungheni',
]

In [None]:
import json
from tqdm import tqdm
import numpy as np

# Decode explicitly as UTF-8 so the parsed text does not depend on the
# platform's default locale encoding (consistent with the other open() calls
# in this notebook that pass encoding="utf-8").
with open("/kaggle/input/dataset-tari/dataset/Italia.json", encoding="utf-8") as f:
    italia = json.load(f)

In [None]:
import json
import numpy as np
from tqdm import tqdm

# Score every row of every region and persist the raw per-row metrics together
# with per-region means.
#
# Output layout (one single-element list per region):
#   {region: [{"content": {...}, "titles": {...},
#              "perp_mean_content": float, "perp_mean_titles": float}]}
results_regions = {}

for region in regions:
    results_regions[region] = []
    results_content = {"perplexity": [], "neg_log_likelihood": []}
    results_titles = {"perplexity": [], "neg_log_likelihood": []}

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(f"/kaggle/input/dataset-tari/dataset/Rep_Moldova/{region}.json", encoding="utf-8") as f:
        region_json = json.load(f)

    for row in region_json:
        # The body text lives under 'content', or under 'text' when 'content' is absent.
        body_text = row['content'] if 'content' in row else row['text']
        content_perplexity, content_nll = compute_metrics(model, tokenizer, body_text, verbose=False)
        results_content['perplexity'].append(content_perplexity)
        results_content['neg_log_likelihood'].append(content_nll)

        title_perplexity, title_nll = compute_metrics(model, tokenizer, row['title'], verbose=False)
        results_titles['perplexity'].append(title_perplexity)
        results_titles['neg_log_likelihood'].append(title_nll)

    # The "perp_mean_*" fields are means of the perplexity lists (they were
    # previously averaged over the negative log-likelihood lists).
    # An empty region yields NaN, as before, but without a NumPy RuntimeWarning.
    perp_content_mean = (
        float(np.mean(results_content['perplexity'])) if results_content['perplexity'] else float('nan')
    )
    perp_titles_mean = (
        float(np.mean(results_titles['perplexity'])) if results_titles['perplexity'] else float('nan')
    )

    results_regions[region].append({
        'content': results_content,
        'titles': results_titles,
        'perp_mean_content': perp_content_mean,
        'perp_mean_titles': perp_titles_mean
    })

with open("/kaggle/working/results_regions.json", "w", encoding="utf-8") as f:
    json.dump(results_regions, f, indent=4)

print("\n✅ Results successfully saved to /kaggle/working/results_regions.json")


In [None]:
import json
import numpy as np

# Read the per-region results back from disk.
with open("/kaggle/working/results_regions.json", "r", encoding="utf-8") as f:
    results_regions = json.load(f)

# Dataset-wide accumulators, filled region by region.
all_perplexities_content = []
all_perplexities_titles = []

print("\n📊 **Perplexity Statistics Per Region:**")
for region, data in results_regions.items():
    # Flatten the perplexity lists of every entry stored for this region.
    region_perplexities_content = [value for item in data for value in item["content"]["perplexity"]]
    region_perplexities_titles = [value for item in data for value in item["titles"]["perplexity"]]

    # Both lists must be non-empty, otherwise the region is reported and skipped.
    if not (region_perplexities_content and region_perplexities_titles):
        print(f"⚠️ No data available for {region}. Skipping...")
        continue

    # Mean / min / max over all values of the region, content first, then titles.
    mean_content, min_content, max_content = (
        stat(region_perplexities_content) for stat in (np.mean, np.min, np.max)
    )
    mean_titles, min_titles, max_titles = (
        stat(region_perplexities_titles) for stat in (np.mean, np.min, np.max)
    )

    # Per-region report
    print(f"📍 {region}:")
    print(f"   - **Content**: Mean = {mean_content:.4f}, Min = {min_content:.4f}, Max = {max_content:.4f}")
    print(f"   - **Titles**:  Mean = {mean_titles:.4f}, Min = {min_titles:.4f}, Max = {max_titles:.4f}\n")

    # Feed the dataset-wide accumulators
    all_perplexities_content += region_perplexities_content
    all_perplexities_titles += region_perplexities_titles

# Dataset-wide report, only when both accumulators received data.
if not (all_perplexities_content and all_perplexities_titles):
    print("⚠️ No perplexity data found in the dataset.")
else:
    dataset_mean_content, dataset_min_content, dataset_max_content = (
        stat(all_perplexities_content) for stat in (np.mean, np.min, np.max)
    )
    dataset_mean_titles, dataset_min_titles, dataset_max_titles = (
        stat(all_perplexities_titles) for stat in (np.mean, np.min, np.max)
    )

    print("\n📊 **Overall Dataset Perplexity Statistics:**")
    print(f"   - **Content**: Mean = {dataset_mean_content:.4f}, Min = {dataset_min_content:.4f}, Max = {dataset_max_content:.4f}")
    print(f"   - **Titles**:  Mean = {dataset_mean_titles:.4f}, Min = {dataset_min_titles:.4f}, Max = {dataset_max_titles:.4f}")


In [None]:
import os
# Sanity check: report whether the results file is present on disk.
results_path = "/kaggle/working/results_regions.json"
print("File exists:", os.path.exists(results_path))


In [None]:
import json
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import os

# sent_tokenize needs the Punkt sentence-splitting resources. Newer NLTK
# releases look them up under 'punkt_tab' while older ones use 'punkt';
# requesting both keeps this cell working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

dataset_path = "/kaggle/input/dataset-tari/dataset/Rep_Moldova"

# Sorted so the report order is stable from run to run; os.listdir returns
# entries in arbitrary order.
region_files = sorted(f for f in os.listdir(dataset_path) if f.endswith(".json"))

all_perplexities_sentences = []

print("\n📊 **Perplexity Statistics Per Region (Sentence-Level):**")
for region_file in region_files:
    # Strip only the trailing extension to obtain the region name.
    region_name = os.path.splitext(region_file)[0]
    region_perplexities_sentences = []

    # Load JSON data
    with open(os.path.join(dataset_path, region_file), "r", encoding="utf-8") as f:
        region_data = json.load(f)

    for entry in region_data:
        # The body text lives under "content", with "text" as the fallback key
        # (the same fallback the document-level scoring loop applies).
        # `or ""` also turns an explicit null into an empty string so it is
        # not rendered as the literal "None".
        content_text = entry.get("content") or entry.get("text") or ""
        title_text = entry.get("title") or ""

        combined_text = f"{content_text} {title_text}".strip()  # Combine content and title text

        if not combined_text:
            print(f"⚠️ No text data found for {region_name}. Skipping...")
            continue  # Skip entries without any text

        # Split text into sentences.
        # NOTE(review): sent_tokenize is called with its default language; the
        # dataset appears to be Romanian -- confirm the default model is acceptable.
        sentences = sent_tokenize(combined_text)

        for sentence in sentences:
            if sentence.strip():  # Ignore empty sentences
                # First element of the result is the perplexity value
                sentence_result = compute_metrics(model, tokenizer, sentence)
                region_perplexities_sentences.append(sentence_result[0])

    if not region_perplexities_sentences:
        print(f"⚠️ No valid sentence-level data for {region_name}. Skipping...")
        continue  # Avoid crashing if no sentences exist

    # Compute mean, min, and max over all sentence values of the region
    mean_sentences = np.mean(region_perplexities_sentences)
    min_sentences = np.min(region_perplexities_sentences)
    max_sentences = np.max(region_perplexities_sentences)

    # Print per-region statistics
    print(f"📍 {region_name}:")
    print(f"   - **Sentences**: Mean = {mean_sentences:.4f}, Min = {min_sentences:.4f}, Max = {max_sentences:.4f}\n")

    # Collect for overall dataset statistics
    all_perplexities_sentences.extend(region_perplexities_sentences)

# Compute overall dataset statistics
if all_perplexities_sentences:
    dataset_mean_sentences = np.mean(all_perplexities_sentences)
    dataset_min_sentences = np.min(all_perplexities_sentences)
    dataset_max_sentences = np.max(all_perplexities_sentences)

    print("\n📊 **Overall Dataset Perplexity Statistics (Sentence-Level):**")
    print(f"   - **Sentences**: Mean = {dataset_mean_sentences:.4f}, Min = {dataset_min_sentences:.4f}, Max = {dataset_max_sentences:.4f}")
else:
    print("⚠️ No perplexity data found in the dataset.")
