In [4]:
import ast
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

In [5]:
# ------------------------------------------------------------------
# Notebook settings -- the cells below read these constants.
# ------------------------------------------------------------------
# Seed handed to the fold splitter so the shuffled split is repeatable.
SEED = 42

# How many cross-validation folds to generate.
NUM_FOLDS = 5

# CSV that is loaded into the working DataFrame (path relative to the notebook).
INPUT_FILE_PATH = "../data/merged_dataframe_with_language.csv"

In [6]:
# Load the merged dataset.
df = pd.read_csv(INPUT_FILE_PATH)

# NOTE: no language filter is applied here -- every row of the CSV is kept,
# whatever its 'language' value.

# Raw article texts, in DataFrame row order.
texts = df["content"].tolist()

# Collapse 'domain' into three integer classes:
#   0 -> "URW", 1 -> "CC", 2 -> anything else (including missing values).
DOMAIN_TO_LABEL = {"URW": 0, "CC": 1}
OTHER_LABEL = 2

# Vectorised lookup: domains absent from the mapping become NaN, which is then
# replaced by the catch-all class before casting back to plain integers.
df["label"] = df["domain"].map(DOMAIN_TO_LABEL).fillna(OTHER_LABEL).astype(int)
labels = df["label"].tolist()

NUM_LABELS = 3  # Ternary classification: URW, CC, Other

In [7]:
# Build a shuffled, seeded splitter that stratifies the folds on the class labels.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# For every fold, report how the held-out rows are spread over languages and labels.
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels), start=1):
    # Select both partitions of the current fold by row position.
    train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]

    # Frequency tables (value -> count) of the validation partition, one per column.
    lang_counts, label_counts = (
        val_df[column].value_counts().to_dict() for column in ("language", "label")
    )

    print(f"\nFold {fold}/{NUM_FOLDS}")
    print("Language distribution in validation set:", lang_counts)
    print("Label distribution in validation set:", label_counts)


Fold 1/5
Language distribution in validation set: {'BG': 49, 'PT': 39, 'EN': 35, 'HI': 23}
Label distribution in validation set: {0: 86, 2: 30, 1: 30}

Fold 2/5
Language distribution in validation set: {'EN': 49, 'PT': 38, 'BG': 35, 'HI': 23}
Label distribution in validation set: {0: 86, 2: 30, 1: 29}

Fold 3/5
Language distribution in validation set: {'EN': 44, 'PT': 41, 'BG': 37, 'HI': 23}
Label distribution in validation set: {0: 86, 1: 30, 2: 29}

Fold 4/5
Language distribution in validation set: {'PT': 43, 'EN': 41, 'BG': 38, 'HI': 23}
Label distribution in validation set: {0: 86, 1: 30, 2: 29}

Fold 5/5
Language distribution in validation set: {'BG': 52, 'PT': 39, 'EN': 31, 'HI': 23}
Label distribution in validation set: {0: 86, 1: 30, 2: 29}
