In [None]:
# Load and preprocess all data
# Data loaded:
# - Standard Lithuanian Wikipedia
# - Samogitian Wikipedia
# - C4 Lithuanian
# - Additional Samogitian prose (from CSV files)
# - Samogitian crawler (from previous code block)
# Uses:
# - toxiclt.csv (available from https://huggingface.co/datasets/PeterGraebner/LDNOOBW_V2)

# Outputs: processed_corpus contains:
#-- processed_samogitian.json     # Samogitian corpus with metadata, json
#-- processed_lithuanian.json     # Lithuanian corpus with metadata, json
#-- samogitian_corpus.txt         # Samogitian corpus, Plain text version with one doc/entry
#-- lithuanian_corpus.txt         # Lithuanian corpus, Plain text version with one doc/entry
#-- samogitian_dataset/           # Samogitian corpus, HuggingFace Dataset 
#-- lithuanian_dataset/           # Lithuanian corpus, HuggingFace Dataset 
#-- processing_stats.json         # Statistics about preprocessing, json
#-- corpus_summary.md             # Summary

In [None]:
# Load necessary packages
import hashlib
import json
import logging
import os
import re
import unicodedata
from dataclasses import dataclass
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import fasttext
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from huggingface_hub import hf_hub_download
from tqdm.auto import tqdm

In [None]:
# configure logging
# Root logging setup: every record at INFO or above goes both to a log file
# in the working directory and to the stream handler (notebook/console output).
logging.basicConfig(
    handlers=[
        logging.FileHandler("corpus_processing.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger shared by the loading and processing code below.
logger = logging.getLogger("corpus_processor")

In [None]:
# Loads all data sources
# - Lithuanian Wikipedia
# - Samogitian Wikipedia
# - Lithuanian C4
# - Additional Samogitian prose text (e.g. The Little Prince)

def load_all_data_sources(
    c4_sample_size: int = 100000,
    seed: int = 42,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Load every raw data source and split it into Samogitian and Lithuanian lists.

    Sources read:
      * Samogitian and Lithuanian Wikipedia (``wikimedia/wikipedia`` 20231101 dumps)
      * ``moresmg1.csv`` / ``moresmg.csv`` in the working directory (Samogitian prose)
      * a random sample of the Lithuanian C4 split (``allenai/c4``, ``lt``)
      * JSON files under ``samogitian_corpus/`` and its direct sub-directories
        (output of the Samogitian crawler)

    Args:
        c4_sample_size: Maximum number of C4 documents to sample.
        seed: Seed for the C4 sample so that re-running the notebook yields
            the same corpus.

    Returns:
        ``(samogitian_data, lithuanian_data)``; each element is a list of dicts
        that always contain ``'text'`` and ``'source'`` plus whatever metadata
        the source provides (``title``, ``url``, ``id``, ``label``, ``domain``).

    Problems with the optional local sources (CSV files, crawler JSON) are
    logged and skipped; failures of the Hugging Face downloads propagate.
    """
    samogitian_data: List[Dict[str, Any]] = []
    lithuanian_data: List[Dict[str, Any]] = []

    logger.info("Loading Wikipedia datasets...")
    # Load Samogitian and Lithuanian Wikipedia
    ds_smg = load_dataset("wikimedia/wikipedia", "20231101.bat-smg", split="train")
    ds_lt = load_dataset("wikimedia/wikipedia", "20231101.lt", split="train")

    samogitian_data.extend(
        {
            'text': item['text'],
            'title': item.get('title', ''),
            'source': 'wikipedia_smg',
            'url': item.get('url', ''),
            'id': item.get('id', '')
        }
        for item in ds_smg
    )
    logger.info(f"Loaded {len(ds_smg)} Samogitian Wikipedia documents")

    # Standard Lithuanian Wikipedia
    lithuanian_data.extend(
        {
            'text': item['text'],
            'title': item.get('title', ''),
            'source': 'wikipedia_lt',
            'url': item.get('url', ''),
            'id': item.get('id', '')
        }
        for item in ds_lt
    )
    logger.info(f"Loaded {len(ds_lt)} Lithuanian Wikipedia documents")

    # Additional Samogitian prose
    logger.info("Loading additional Samogitian prose...")
    for fname, src in [("moresmg1.csv", "little_prince_smg"), ("moresmg.csv", "prose_smg")]:
        try:
            df = pd.read_csv(fname)
            loaded = 0
            for row in df.to_dict("records"):
                text = row["text"]
                # Empty CSV cells are read as NaN (a float); such rows would
                # crash the regex-based cleaning later, so skip them here.
                if not isinstance(text, str) or not text.strip():
                    continue
                samogitian_data.append({
                    "text":  text,
                    "source": src,
                    "id":     str(row.get("id", "")),
                    "label":  row.get("label", "")
                })
                loaded += 1
            logger.info(f"Loaded {loaded} {src} docs")
        except Exception as e:
            # Best effort: these files are optional extras.
            logger.error(f"Error loading {fname}: {e}")

    # C4 Lithuanian data
    logger.info("Loading C4 Lithuanian data...")
    ds_c4 = load_dataset("allenai/c4", "lt", split="train")
    sample_size = min(c4_sample_size, len(ds_c4))
    # Seeded generator: the same documents are selected on every run.
    sample_indices = np.random.default_rng(seed).choice(len(ds_c4), size=sample_size, replace=False)
    ds_sample = ds_c4.select(sample_indices)
    mapped_c4 = ds_sample.map(
        lambda ex: {"text": ex["text"], "url": ex.get("url", ""), "source": "c4_lt"},
        remove_columns=ds_sample.column_names
    )
    lithuanian_data.extend(list(mapped_c4))
    logger.info(f"Loaded {sample_size} C4 Lithuanian docs")

    # Files from Samogitian crawler
    logger.info("Loading targeted crawler data...")
    crawler_docs = []
    json_files = []
    try:
        if os.path.exists("samogitian_corpus"):
            json_files.extend(glob("samogitian_corpus/*.json"))
            for sub in os.listdir("samogitian_corpus"):
                subp = os.path.join("samogitian_corpus", sub)
                if os.path.isdir(subp):
                    json_files.extend(glob(f"{subp}/*.json"))
    except Exception as e:
        logger.error(f"Error loading targeted crawler data: {e}")
    # Aggregate/statistics files written by the crawler are not documents.
    # Sorting makes the load order (and therefore deduplication) deterministic.
    json_files = sorted(
        f for f in json_files
        if not f.endswith(("stats.json", "summary.json", "corpus.json"))
    )

    for jf in json_files:
        # Handle errors per file so one unreadable file does not discard
        # every file that comes after it.
        try:
            with open(jf, "r", encoding="utf-8") as f:
                doc = json.load(f)
        except Exception as e:
            logger.error(f"Error loading targeted crawler data: {jf}: {e}")
            continue
        # A file holds either a single document or a list of documents.
        candidates = doc if isinstance(doc, list) else [doc]
        crawler_docs.extend(
            d for d in candidates
            if isinstance(d, dict) and isinstance(d.get("text"), str)
        )

    for doc in crawler_docs:
        samogitian_data.append({
            "text":   doc["text"],
            "title":  doc.get("title", ""),
            "url":    doc.get("url", ""),
            "source": doc.get("source", "targeted_crawl"),
            "domain": doc.get("domain", "")
        })
    logger.info(f"Loaded {len(crawler_docs)} crawler docs")

    logger.info(f"Total: {len(samogitian_data)} Samogitian, {len(lithuanian_data)} Lithuanian")
    return samogitian_data, lithuanian_data

In [None]:
# Preprocessing
# Checks for quality:
# - Quality: reasonable text length, not excessively repetitive or filled with special characters
# - Drops documents that contain toxic words
# - Keeps only the relevant languages (Lithuanian and the Samogitian dialect)
# - Skips documents that duplicate one already seen
# - Boilerplate removal

@dataclass
class Config:
    """Thresholds and file paths that control corpus filtering."""

    # Bounds on document length, in characters.
    min_text_length: int = 60
    max_text_length: int = 100_000

    # Confidence of language identification.
    lang_detection_threshold: float = 0.6

    # Upper limits for the repetition ratio and for the share of
    # non-alphanumeric, non-whitespace characters in a document.
    repetition_thresh: float = 0.4
    special_char_thresh: float = 0.6

    # Words from https://huggingface.co/datasets/PeterGraebner/LDNOOBW_V2
    toxic_csv_path: str = "toxiclt.csv"

class CorpusProcessor:
    """Clean, filter, deduplicate and export a raw corpus for one language.

    For every document the pipeline removes boilerplate, applies the quality
    checks (length, repetition, special characters, toxic terms, word
    diversity), drops duplicates, and drops documents that the language
    identifier confidently assigns to a different language. Results are
    written to ``output_dir`` as JSON, plain text and a Hugging Face dataset.
    """

    # Boilerplate and citation patterns. Compiled once (case-insensitive)
    # instead of being rebuilt for every document.
    _BOILERPLATE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'Copyright © \d{4}.*?(\n|$)',
            r'All rights reserved.*?(\n|$)',
            r'Visos teisės saugomos.*?(\n|$)',

            r'Privacy Policy.*?(\n|$)',
            r'Terms of Service.*?(\n|$)',
            r'Privatumo politika.*?(\n|$)',
            r'Naudojimo sąlygos.*?(\n|$)',
            r'Slapukų politika.*?(\n|$)',
            r'Cookies.*?(\n|$)',
            r'Slapukai.*?(\n|$)',

            # Citations and references
            r'\[\d+\]',
            r'References\s*:.*?(\n\n|$)',
            r'Nuorodos\s*:.*?(\n\n|$)',
            r'Nūruodas\s*:.*?(\n\n|$)',
            r'Šaltiniai\s*:.*?(\n\n|$)',
            r'Bibliography\s*:.*?(\n\n|$)',
            r'Literatūra\s*:.*?(\n\n|$)',
            r'Šaltenē\s*:.*?(\n\n|$)',
        )
    )
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, output_dir: str = "processed_data", config: Optional["Config"] = None):
        """Create the output directory and load the language model and toxic-term list.

        Args:
            output_dir: Directory that receives all output files (created if missing).
            config: Processing thresholds; a fresh ``Config()`` is used when omitted.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True, parents=True)
        # Build the default per instance: a `Config()` default argument would be
        # evaluated once and then shared (and mutated) across all processors.
        self.config = config if config is not None else Config()
        # Same named logger the rest of the notebook uses.
        self.logger = logging.getLogger("corpus_processor")

        # Load Fasttext language identification model (glotlid best for Samogitian)
        model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
        self.lang_model = fasttext.load_model(model_path)

        # Load toxic words: every string cell of every column is one term.
        nsfw_df = pd.read_csv(self.config.toxic_csv_path)
        self.nsfw_terms = set()
        for col in nsfw_df.columns:
            for value in nsfw_df[col].dropna():
                # Skip non-string cells (e.g. numeric index columns) and empty
                # terms; an empty term would match every document.
                if isinstance(value, str):
                    term = value.strip().lower()
                    if term:
                        self.nsfw_terms.add(term)

    def identify_language(self, text: str) -> Tuple[str, float]:
        """Return ``(language_code, confidence)`` for the top prediction.

        Whitespace is collapsed to single spaces before the text is passed to
        the model. The language code is the label with the ``__label__``
        prefix and everything after the first underscore removed.
        """
        text = self._WHITESPACE.sub(' ', text).strip()
        labels, probabilities = self.lang_model.predict(text, k=1)  # only keep top k=1 predicted language
        lang = labels[0].replace('__label__', '').split('_')[0]
        # Convert to a built-in float so the value matches the annotation and
        # serialises cleanly regardless of the array dtype returned by the model.
        return lang, float(probabilities[0])

    def quality_check(self, text: str) -> Tuple[float, str]:
        """Score a document and name the first failed check.

        Returns:
            ``(score, reason)``. ``score`` is ``0.0`` with a reason of
            ``"too_short"``, ``"too_long"``, ``"short"``,
            ``"repetitive_content"``, ``"excessive_special_chars"`` or
            ``"toxic"`` when a hard check fails. Otherwise the reason is
            ``"acceptable"`` and the score is ``min(1.0, 2 * unique_words / words)``.
        """
        if len(text) < self.config.min_text_length:
            return 0.0, "too_short"
        if len(text) > self.config.max_text_length:
            return 0.0, "too_long"

        # Check for excessive repetition: Jaccard similarity between the
        # character sets of nearby fixed-size text blocks.
        text_block_size = 100
        # NOTE(review): this rejects every text shorter than two blocks (200
        # characters), so `config.min_text_length` has no effect below 200 —
        # confirm this is intended rather than "skip the repetition check".
        if len(text) < text_block_size * 2:
            return 0.0, "short"
        # Build each block's character set once instead of once per comparison.
        block_chars = [
            set(text[start:start + text_block_size])
            for start in range(0, len(text) - text_block_size, text_block_size)
        ]
        repetitions = 0
        for i in range(len(block_chars) - 1):
            # Compare each block with at most the four blocks that follow it.
            for j in range(i + 1, min(i + 5, len(block_chars))):
                intersection = len(block_chars[i] & block_chars[j])
                union = len(block_chars[i] | block_chars[j])
                if intersection / max(1, union) > 0.8:
                    repetitions += 1
        rep_ratio = repetitions / max(1, len(block_chars) - 1)
        if rep_ratio > self.config.repetition_thresh:
            return 0.0, "repetitive_content"

        # Check for excessive special characters
        special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
        if special_chars / max(1, len(text)) > self.config.special_char_thresh:
            return 0.0, "excessive_special_chars"

        # Reject documents that contain a toxic term (case-insensitive
        # substring match against the loaded term list).
        text_lower = text.lower()
        if any(term in text_lower for term in self.nsfw_terms):
            return 0.0, "toxic"

        # Word diversity: share of distinct whitespace-separated tokens.
        words = text.split()
        word_diversity = len(set(words)) / max(1, len(words))
        quality_score = min(1.0, word_diversity * 2)
        return quality_score, "acceptable"

    def remove_boilerplate(self, text: str) -> str:
        """Replace boilerplate and citation matches with a space and collapse whitespace."""
        for pattern in self._BOILERPLATE_PATTERNS:
            text = pattern.sub(' ', text)
        return self._WHITESPACE.sub(' ', text).strip()

    def compute_text_hash(self, text: str) -> str:
        """Return the MD5 hex digest of the NFKC-normalised, lower-cased, whitespace-collapsed text.

        Used only as a deduplication key.
        """
        t = unicodedata.normalize("NFKC", text).lower()
        t = self._WHITESPACE.sub(" ", t).strip()
        return hashlib.md5(t.encode()).hexdigest()

    def _process_language_corpus(self, corpus: List[Dict[str, Any]], language_name: str) -> List[Dict[str, Any]]:
        """Filter ``corpus`` and write the surviving documents to disk.

        Args:
            corpus: Raw documents; each should carry a string under ``'text'``.
            language_name: ``"samogitian"`` selects the expected language code
                ``sgs``; any other value selects ``lit``. Also used in the
                output file names.

        Returns:
            The kept documents, each with the cleaned ``text``,
            ``language_confidence``, ``text_hash`` and any of
            ``title``/``url``/``source``/``id`` present on the input.

        Side effects:
            Writes ``processed_<language_name>.json``,
            ``<language_name>_corpus.txt`` and the directory
            ``<language_name>_dataset`` under ``self.output_dir``.
        """
        filtered_corpus = []
        text_hashes = set()
        rejected: Dict[str, int] = {}
        expected_lang = "sgs" if language_name == "samogitian" else "lit"

        for doc in tqdm(corpus, desc=f"Filtering {language_name}"):
            raw_text = doc.get('text')
            if not isinstance(raw_text, str):
                # e.g. NaN from an empty CSV cell; the regex cleaner needs a str.
                rejected["missing_text"] = rejected.get("missing_text", 0) + 1
                continue

            clean_text = self.remove_boilerplate(raw_text)
            quality_score, reason = self.quality_check(clean_text)
            if quality_score < 0.5:
                # An "acceptable" document can still score below the cut-off
                # when fewer than a quarter of its words are distinct.
                key = "low_word_diversity" if reason == "acceptable" else reason
                rejected[key] = rejected.get(key, 0) + 1
                continue

            # Skip duplicates
            text_hash = self.compute_text_hash(clean_text)
            if text_hash in text_hashes:
                rejected["duplicate"] = rejected.get("duplicate", 0) + 1
                continue
            text_hashes.add(text_hash)

            # Drop only documents the model confidently assigns to another
            # language; low-confidence mismatches are kept.
            lang, confidence = self.identify_language(clean_text)
            if lang != expected_lang and confidence > self.config.lang_detection_threshold:
                rejected["wrong_language"] = rejected.get("wrong_language", 0) + 1
                continue

            processed_doc = {
                'text': clean_text,
                'language_confidence': confidence,
                'text_hash': text_hash
            }
            for key in ['title', 'url', 'source', 'id']:
                if key in doc:
                    processed_doc[key] = doc[key]

            filtered_corpus.append(processed_doc)

        self.logger.info(
            f"{language_name}: kept {len(filtered_corpus)} of {len(corpus)} documents; "
            f"rejected by reason: {rejected}"
        )

        output_file = self.output_dir / f"processed_{language_name}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(filtered_corpus, f, ensure_ascii=False, indent=2)
        # Plain-text version: one document per entry, separated by a blank line.
        text_file = self.output_dir / f"{language_name}_corpus.txt"
        with open(text_file, 'w', encoding='utf-8') as f:
            for doc in filtered_corpus:
                f.write(doc['text'] + "\n\n")

        dataset = Dataset.from_dict({
            "text": [doc["text"] for doc in filtered_corpus],
            "source": [doc.get("source", "unknown") for doc in filtered_corpus],
            "id": [doc.get("id", str(i)) for i, doc in enumerate(filtered_corpus)]
        })
        dataset.save_to_disk(str(self.output_dir / f"{language_name}_dataset"))

        return filtered_corpus

In [None]:
def main(output_dir: str = "processed_corpus"):
    """Run the whole pipeline: load all sources, then filter and export both corpora.

    Args:
        output_dir: Directory that receives the processed corpora.
    """
    samogitian_data, lithuanian_data = load_all_data_sources()
    processor = CorpusProcessor(output_dir=output_dir)
    processed_samogitian = processor._process_language_corpus(samogitian_data, "samogitian")
    processed_lithuanian = processor._process_language_corpus(lithuanian_data, "lithuanian")

    # Use the module-level logger: CorpusProcessor does not define a `logger`
    # attribute. Interpolating a variable also avoids nesting double quotes
    # inside the f-string, which is a syntax error before Python 3.12.
    logger.info(f"Processing complete. saved to: {output_dir}")
    logger.info(f"Samogitian corpus: {len(processed_samogitian):,} documents")
    logger.info(f"Lithuanian corpus: {len(processed_lithuanian):,} documents")


# Entry point: run the full pipeline when this code is executed directly.
if __name__ == "__main__":
    main()