In [None]:
from datasets import load_dataset
import pandas as pd
import glob

In [None]:
!pip install langdetect

In [None]:
# access Catalan dataset: the three parquet splits live under one Hugging Face repo
hf_repo = "hf://datasets/mteb/CataloniaTweetClassification/"
splits = {
    'train': 'catalan/train-00000-of-00001.parquet',
    'test': 'catalan/test-00000-of-00001.parquet',
    'validation': 'catalan/validation-00000-of-00001.parquet',
}

# read train, test and validation (in that order) into their own frames
df_train, df_test, df_val = (
    pd.read_parquet(hf_repo + splits[split_name])
    for split_name in ("train", "test", "validation")
)

# save locally, without the row index
for split_frame, csv_name in (
    (df_train, "train_cat.csv"),
    (df_test, "test_cat.csv"),
    (df_val, "val_cat.csv"),
):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# access Spanish dataset: same Hugging Face repo, `spanish/` sub-folder
hf_repo = "hf://datasets/mteb/CataloniaTweetClassification/"
splits = {
    'train': 'spanish/train-00000-of-00001.parquet',
    'test': 'spanish/test-00000-of-00001.parquet',
    'validation': 'spanish/validation-00000-of-00001.parquet',
}

# read train, test and validation (in that order) into their own frames
df_train, df_test, df_val = (
    pd.read_parquet(hf_repo + splits[split_name])
    for split_name in ("train", "test", "validation")
)

# save locally, without the row index
for split_frame, csv_name in (
    (df_train, "train_es.csv"),
    (df_test, "test_es.csv"),
    (df_val, "val_es.csv"),
):
    split_frame.to_csv(csv_name, index=False)

## Dataset exploration

In [None]:
# open dataset as df
# NOTE(review): `path` is not assigned in any cell shown in this notebook, so this
# line fails on a fresh kernel. Presumably it should point at one of the CSVs
# written by the download cells — confirm and define it before this cell.
df = pd.read_csv(path, encoding="utf-8")

In [None]:
# preview the first rows
display(df.head())

# overall size and schema
print("\n Dataset shape:", df.shape)
print("\n Columns:", df.columns.tolist())

# label distribution: absolute counts first, then percentages
label_col = df['label']
print("\n Label distribution:")
print(label_col.value_counts())
print("\n Label distribution (%)")
print(label_col.value_counts(normalize=True).round(3) * 100)

# tweet length, stored as two helper columns on df:
# characters, and whitespace-separated words
tweet_text = df['text']
df['char_len'] = tweet_text.str.len()
df['word_len'] = tweet_text.str.split().str.len()

print("\n Average tweet length (chars):", df['char_len'].mean())
print(" Average tweet length (words):", df['word_len'].mean())

# missing values per column
print("\n Missing values per column:")
print(df.isnull().sum())

# repeated tweets: this count excludes the first occurrence of each text ...
repeats_after_first = df.duplicated(subset='text')
print("\n Number of duplicate tweets:", repeats_after_first.sum())

# ... while this selection keeps every copy (keep=False), so its size is larger
in_duplicate_group = df.duplicated(subset='text', keep=False)
duplicates = df[in_duplicate_group]

print(f" Found {len(duplicates)} duplicate rows")

# show the copies next to each other
duplicates = duplicates.sort_values(by="text")
display(duplicates)


### Duplicate removal 

In [None]:
# removing duplicates from Spanish dataset
df_es = pd.read_csv(path, encoding="utf-8")
print("Before removing duplicates:", df_es.shape)

# keep=False marks every copy of a repeated tweet, so no occurrence survives
is_repeated = df_es.duplicated(subset="text", keep=False)
df_es = df_es[~is_repeated].reset_index(drop=True)

print("After removing duplicates:", df_es.shape)

### Portuguese tweets removal

In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
# Fixed seed for langdetect — presumably so detections are reproducible across
# runs (TODO confirm against the langdetect documentation).
DetectorFactory.seed = 42

In [None]:
# function to drop portuguese tweets
def drop_portuguese(df, text_col="text", detector=None):
    """Return a copy of `df` without the rows detected as Portuguese.

    Each value of `df[text_col]` is passed to the language detector and the
    result is stored in a new `lang_detected` column of the returned frame.
    Rows whose detection raised are tagged "error" and are kept: only rows
    tagged "pt" are removed. The input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    text_col : str, default "text"
        Name of the column with the text to classify.
    detector : callable, optional
        Function mapping one text to a language code. Defaults to the
        notebook-level `detect` imported from langdetect.

    Returns
    -------
    pandas.DataFrame
        The non-Portuguese rows, with a `lang_detected` column and a fresh
        0..n-1 index.
    """
    if detector is None:
        detector = detect

    langs = []
    for text in df[text_col]:
        try:
            langs.append(detector(text))
        except Exception:
            # best effort: an undetectable text must not abort the whole pass,
            # but KeyboardInterrupt / SystemExit are no longer swallowed
            langs.append("error")

    # assign() builds a new frame, so the caller's DataFrame gains no column
    tagged = df.assign(lang_detected=langs)
    return tagged[tagged["lang_detected"] != "pt"].reset_index(drop=True)

In [None]:
# filter the de-duplicated Spanish tweets down to those not detected as Portuguese
df_es_clean = drop_portuguese(df_es, "text")
shape_after_filter = df_es_clean.shape
print("After removing portuguese tweets:", shape_after_filter)

In [None]:
# check new label distribution: absolute counts, then percentages
clean_labels = df_es_clean['label']
label_counts = clean_labels.value_counts()
label_percent = clean_labels.value_counts(normalize=True).round(3) * 100

print("\n Label distribution:")
print(label_counts)
print("\n Label distribution (%)")
print(label_percent)

In [None]:
# persist the cleaned Spanish training set; index=False (the documented boolean,
# as used by every other to_csv call in this notebook) keeps the row index out
df_es_clean.to_csv("train_es_clean.csv", index=False)

### Subset selection

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# create nested stratified chunks from the cleaned dataset
# (ceil(total_size / chunk_size) chunks: 20 with the default arguments)

def _stratified_take(frame, n_rows, seed):
    """Split off `n_rows` label-stratified rows from `frame`; return (taken, rest)."""
    if n_rows >= len(frame):
        # train_test_split requires train_size to be smaller than the number
        # of samples, so when everything that is left is requested we hand
        # the frame over as-is together with an empty remainder
        return frame, frame.iloc[0:0]
    taken, rest = train_test_split(
        frame,
        train_size=n_rows,
        stratify=frame['label'],
        random_state=seed
    )
    return taken, rest


def stratified_chunks(csv_path, output_prefix, total_size=2000, chunk_size=100, random_state=42):
    """Sample `total_size` rows and write them as stratified chunks and nested subsets.

    Steps:
      1. read `csv_path` and print its size and label distribution;
      2. draw a sample of `total_size` rows, stratified on the `label` column;
      3. split that sample into consecutive chunks of `chunk_size` rows, each
         stratified on `label`, and write every chunk to
         `{output_prefix}_chunk{k}.csv` (k starts at 1);
      4. write the cumulative (nested) subsets, i.e. chunks 1..k concatenated,
         to `{output_prefix}_{n}.csv`, where n is the number of rows in the subset.

    The last chunk takes whatever rows are left, so it is smaller than
    `chunk_size` when `total_size` is not a multiple of it.

    Parameters
    ----------
    csv_path : str
        CSV file with at least a `label` column.
    output_prefix : str
        Prefix (may include a directory) of every file written.
    total_size : int, default 2000
        Number of rows to sample from the full dataset.
    chunk_size : int, default 100
        Number of rows per chunk.
    random_state : int, default 42
        Seed of the initial sample; the chunk starting at row offset `o` is
        drawn with seed `random_state + o`.

    Raises
    ------
    ValueError
        If a size is not positive or `total_size` exceeds the rows available.
    """
    if total_size <= 0 or chunk_size <= 0:
        raise ValueError("total_size and chunk_size must be positive")

    # load data
    df = pd.read_csv(csv_path)

    print("Full dataset size:", len(df))
    print("Label distribution in full dataset (%):")
    print(df['label'].value_counts(normalize=True).round(3) * 100, "\n")

    if total_size > len(df):
        raise ValueError(
            f"total_size={total_size} exceeds the {len(df)} rows available in {csv_path}"
        )

    # stratified sample of total_size
    remaining, _ = _stratified_take(df, total_size, random_state)

    # carve the sample into stratified chunks; the final iteration receives
    # exactly the rows that are left instead of asking sklearn for a split
    chunks = []
    for offset in range(0, total_size, chunk_size):
        chunk, remaining = _stratified_take(remaining, chunk_size, random_state + offset)
        chunks.append(chunk)

        # save individual chunk
        chunk.to_csv(f"{output_prefix}_chunk{offset // chunk_size + 1}.csv", index=False)

    # save cumulative subsets, named after their actual number of rows
    for i in range(len(chunks)):
        cumulative = pd.concat(chunks[:i + 1])
        cumulative.to_csv(f"{output_prefix}_{len(cumulative)}.csv", index=False)

    print(f"Created {len(chunks)} chunks")

In [None]:
# Catalan subset selection
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where that exact folder exists. Output files use the relative prefix
# "subset_cat", so they are written to the current working directory.
stratified_chunks(r"C:\Users\emmar\Documents\GitHub\coannotating-catalan\data\CAT_dataset\train_cat.csv", output_prefix="subset_cat")

In [None]:
# Spanish subsets selection
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where that exact folder exists. Output files use the relative prefix
# "subset_es", so they are written to the current working directory.
stratified_chunks(r"C:\Users\emmar\Documents\GitHub\coannotating-catalan\data\ES_dataset\train_es_clean.csv", output_prefix="subset_es")

### Chunk cleaning + indexing

In [None]:
import glob
import os
import re

import pandas as pd

In [None]:
# function to remove line breaks and tabs and normalize multiple spaces
def clean_text(text):
    """Collapse every run of whitespace in `text` into one space and trim the ends.

    Line feeds, carriage returns and tabs count as whitespace for
    `str.split()`, so splitting and re-joining covers them without a
    separate replacement step.
    """
    words = text.split()
    return " ".join(words)

In [None]:
# function to clean the text and assign unique IDs across multiple chunk CSVs for a given language
def clean_and_ids(chunk_path_pattern, lang_prefix, save_overwrite=False):
    """Clean the `text` column of every chunk CSV and number the rows across files.

    Files matching `chunk_path_pattern` are processed in natural order, so
    `chunk2` comes before `chunk10`. Rows receive consecutive IDs of the form
    `{lang_prefix}_{n}`, where n is zero-padded to at least three digits and
    keeps counting from one file to the next. Whitespace in `text` (line
    breaks, tabs, repeated spaces) is collapsed to single spaces; missing
    texts stay missing. Each result keeps only the columns `id`, `text` and
    `label`.

    Parameters
    ----------
    chunk_path_pattern : str
        Glob pattern selecting the chunk CSV files.
    lang_prefix : str
        Prefix of every generated ID.
    save_overwrite : bool, default False
        If True each input file is overwritten; otherwise the result is
        written next to it as `<name>_final.csv`.

    Returns
    -------
    list of pandas.DataFrame
        One cleaned frame per processed file, in processing order; an empty
        list when no file matches.
    """

    def _natural_key(path):
        # re.split with a capturing group alternates text and digit runs, so
        # odd positions are always the numeric parts; the path itself breaks
        # ties such as "chunk01" vs "chunk1"
        parts = re.split(r"(\d+)", path)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)], path

    matched = set(glob.glob(chunk_path_pattern))

    def _is_own_output(path):
        # "<name>_final.csv" written by an earlier run also matches patterns
        # like "...chunk*.csv"; skip it when its source file is matched too,
        # otherwise a second run would number the same rows twice
        base, ext = os.path.splitext(path)
        return base.endswith("_final") and base[:-len("_final")] + ext in matched

    chunk_files = sorted((f for f in matched if not _is_own_output(f)), key=_natural_key)
    if not chunk_files:
        print("⚠️ No files found with that pattern.")
        return []

    current_id = 0
    all_chunks = []

    for file in chunk_files:
        df = pd.read_csv(file)
        n_rows = len(df)

        # unique IDs (3 digits is a minimum width: IDs from 1000 on are wider)
        df["id"] = [f"{lang_prefix}_{i:03d}" for i in range(current_id, current_id + n_rows)]
        current_id += n_rows

        # clean text column: splitting on any whitespace and re-joining with
        # single spaces removes line breaks, tabs and repeated spaces
        df["text"] = df["text"].str.split().str.join(" ")

        # reorder columns
        df = df[["id", "text", "label"]]

        all_chunks.append(df)

        # save file
        if save_overwrite:
            df.to_csv(file, index=False)
        else:
            base, ext = os.path.splitext(file)
            df.to_csv(f"{base}_final.csv", index=False)

    return all_chunks

#### Catalan

In [None]:
# Clean and number the Catalan chunk files (IDs prefixed "CAT").
# NOTE(review): absolute, machine-specific Windows path. The call is the cell's
# last expression, so the returned list of DataFrames is shown as cell output.
clean_and_ids(r"C:\Users\emmar\Documents\GitHub\coannotating-catalan\data\CAT_dataset\subset_cat_chunk*.csv", "CAT")

#### Spanish

In [None]:
# Clean and number the Spanish chunk files (IDs prefixed "ES").
# NOTE(review): absolute, machine-specific Windows path. The call is the cell's
# last expression, so the returned list of DataFrames is shown as cell output.
clean_and_ids(r"C:\Users\emmar\Documents\GitHub\coannotating-catalan\data\ES_dataset\subset_es_chunk*.csv", "ES")