In [None]:
# Get data: crawl selected websites for Samogitian-language text
# Results are written to samogitian_corpus.json and samogitian_corpus.txt inside the output directory
# Crawl statistics are written to crawler_stats.json in the same directory

In [None]:
## Part 0: Load packages
import os
import time
import json
import logging
from pathlib import Path
from collections import defaultdict
from urllib.parse import urlparse, urljoin
import requests
import fasttext
from huggingface_hub import hf_hub_download
from bs4 import BeautifulSoup

In [None]:
# Targeted common crawler for Samogitian Language
    
# Logging: INFO-and-above records go to a log file and to the default stream (stderr)
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_file_handler = logging.FileHandler("samogitian_crawler.log")
_stream_handler = logging.StreamHandler()

logging.basicConfig(
    handlers=[_file_handler, _stream_handler],
    format=_LOG_FORMAT,
    level=logging.INFO,
)

# Module-wide logger used by the crawler below
logger = logging.getLogger("samogitian_crawler")

class SamogitianWebCrawler:
    """Same-domain web crawler that keeps only pages identified as Samogitian.

    Pages are fetched breadth-first starting from a seed URL. The visible text
    of each page is classified with the GlotLID fastText model; pages whose top
    label is ``sgs_Latn`` with a probability above ``language_threshold`` are
    written to ``<output_dir>/<domain>/<n>.json`` and returned to the caller.

    Attributes:
        output_dir: Directory that receives per-domain pages, the combined
            corpus files and ``crawler_stats.json``.
        language_threshold: Minimum top-label probability for a page to count
            as Samogitian (used when no explicit threshold is passed).
        stats: Per-domain counters (``pages_visited``, ``samogitian_pages``,
            ``errors`` by category).
        lang_model: Loaded fastText language-identification model.
    """

    # Elements removed before text extraction (boilerplate, not running text).
    _NOISE_TAGS = ['script', 'style', 'header', 'footer', 'nav', 'meta', 'noscript']
    # Elements treated as text blocks when collecting paragraphs.
    _BLOCK_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'article', 'section', 'div']
    # GlotLID label for Samogitian written in Latin script.
    _SAMOGITIAN_LABEL = '__label__sgs_Latn'
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

    def __init__(self, output_dir="samogitian_corpus"):
        """Create the output directory and load the language-ID model.

        Args:
            output_dir: Directory for all crawler output; created if missing.
        """
        self.output_dir = output_dir
        self.language_threshold = 0.6
        os.makedirs(output_dir, exist_ok=True)

        # statistics for logging
        self.stats = defaultdict(lambda: {
            "pages_visited": 0,
            "samogitian_pages": 0,
            "errors": defaultdict(int)
        })

        # Load fasttext model (glotlid works best with Samogitian)
        model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
        self.lang_model = fasttext.load_model(model_path)
        # Log only once the model is actually available.
        logger.info("Loaded lang id model.")

    def is_samogitian(self, text, threshold=None):
        """Return True when ``text`` is classified as Samogitian.

        Args:
            text: Text to classify; ``None``, empty or very short text
                (fewer than 20 characters after whitespace is collapsed)
                is rejected without calling the model.
            threshold: Minimum probability of the top label. Defaults to
                ``self.language_threshold`` when ``None``.

        Returns:
            bool: True if the top predicted label is ``__label__sgs_Latn`` and
            its probability is strictly greater than the threshold.
        """
        if threshold is None:
            threshold = self.language_threshold

        # fastText rejects input containing newlines, so collapse every run of
        # whitespace into a single space before predicting.
        sample = " ".join(text.split()) if text else ""
        if len(sample) < 20:
            return False

        labels, probabilities = self.lang_model.predict(sample[:1000], k=1)
        if len(labels) == 0:
            return False
        # float() turns a NumPy scalar into a plain number so a real bool is returned.
        return labels[0] == self._SAMOGITIAN_LABEL and float(probabilities[0]) > threshold

    def extract_text(self, html):
        """Extract the readable text of an HTML document.

        Boilerplate elements are removed, then the text of each innermost
        block element is collected. Blocks of 10 characters or fewer are
        dropped.

        Args:
            html: HTML markup (``str`` or ``bytes``).

        Returns:
            str: Text blocks joined by blank lines, or ``""`` if parsing fails.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup(self._NOISE_TAGS):
                tag.decompose()

            # Get text with paragraph breaks
            paragraphs = []
            for block in soup.find_all(self._BLOCK_TAGS):
                # Only innermost blocks are emitted: a <div> wrapping <p>
                # elements would otherwise repeat the text of every child.
                # Trade-off: text placed directly in a container that also
                # holds nested blocks is not collected.
                if block.find(self._BLOCK_TAGS) is not None:
                    continue
                # A separator keeps words apart across inline tags
                # ("Hello <b>world</b>" must not become "Helloworld").
                text = block.get_text(" ", strip=True)
                if len(text) > 10:
                    paragraphs.append(text)
            return "\n\n".join(paragraphs)
        except Exception as e:
            # Best effort: an unparseable page simply yields no text.
            logger.error(f"Error extracting text: {e}")
            return ""

    @staticmethod
    def _normalize_link(base_url, href, base_domain):
        """Resolve ``href`` against ``base_url`` and normalise it for queueing.

        Returns:
            str | None: ``scheme://netloc/path[?query]`` (fragment dropped, empty
            path mapped to ``/``) when the link is an http(s) URL on
            ``base_domain``; ``None`` for anchors, ``javascript:`` links, other
            schemes, other domains and malformed URLs.
        """
        href = href.strip()
        if href.startswith(("#", "javascript:")):
            return None
        try:
            parts = urlparse(urljoin(base_url, href))
        except ValueError:
            # Malformed URL (e.g. broken IPv6 literal); one bad link must not
            # discard the rest of the page's links.
            return None
        if parts.scheme not in ("http", "https") or parts.netloc != base_domain:
            return None
        normalized = f"{parts.scheme}://{parts.netloc}{parts.path or '/'}"
        if parts.query:
            normalized += "?" + parts.query
        return normalized

    def _process_page(self, url, base_domain, domain_dir):
        """Fetch one page, store it if Samogitian, and collect same-domain links.

        Returns:
            tuple: ``(page_data, links)`` where ``page_data`` is the saved page
            record or ``None``, and ``links`` is a list of normalised URLs.

        Raises:
            requests.exceptions.RequestException: On network-level failures;
            handled by the caller so they can be counted per category.
        """
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as in the original (main() silences the matching warning), but
        # it means the SSL error handler will rarely fire; confirm it is intended.
        response = requests.get(
            url,
            timeout=30,  # avoid hanging forever on unresponsive servers
            verify=False,
            headers={'User-Agent': self._USER_AGENT}
        )

        # Check if successful
        if response.status_code != 200:
            logger.warning(f"Got status code {response.status_code} for {url}")
            return None, []

        # Without an explicit charset requests falls back to ISO-8859-1 for
        # text/* responses, which corrupts diacritics; sniff the encoding from
        # the body instead.
        if "charset" not in response.headers.get("Content-Type", "").lower():
            response.encoding = response.apparent_encoding

        html = response.text
        # Parsed once and reused for both the title and the links.
        soup = BeautifulSoup(html, "html.parser")

        page_data = None
        text = self.extract_text(html)
        if self.is_samogitian(text):
            logger.info(f"Found Samogitian content: {url}")
            self.stats[base_domain]["samogitian_pages"] += 1
            page_number = self.stats[base_domain]["samogitian_pages"]

            title = soup.title.get_text(strip=True) if soup.title else ""
            page_data = {
                "url": url,
                "text": text,
                "title": title,
                "domain": base_domain,
                "source": "targeted_crawl"
            }
            filename = os.path.join(domain_dir, f"{page_number}.json")
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(page_data, f, ensure_ascii=False, indent=2)

        # extract links with same domain; resolve against the final URL so
        # relative links stay correct after redirects
        links = []
        for anchor in soup.find_all("a", href=True):
            link = self._normalize_link(response.url, anchor["href"], base_domain)
            if link is not None:
                links.append(link)
        return page_data, links

    def _save_stats(self):
        """Write the per-domain statistics to ``crawler_stats.json``."""
        stats_path = os.path.join(self.output_dir, "crawler_stats.json")
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(dict(self.stats), f, ensure_ascii=False, indent=2)

    def crawl_website(self, start_url, max_pages=500, max_depth=5, delay=2):
        """Crawl one website breadth-first and collect its Samogitian pages.

        Args:
            start_url: Seed URL; only links on its host are followed.
            max_pages: Maximum number of pages to fetch.
            max_depth: Maximum link distance from ``start_url``.
            delay: Seconds to wait after every request, successful or not.

        Returns:
            list[dict]: One record per Samogitian page with the keys ``url``,
            ``text``, ``title``, ``domain`` and ``source``.
        """
        logger.info(f"Crawling {start_url}")
        # Parse domain
        base_domain = urlparse(start_url).netloc
        domain_stats = self.stats[base_domain]

        # Create domain directory (':' from an explicit port is not portable)
        domain_dir = os.path.join(
            self.output_dir, base_domain.replace('.', '_').replace(':', '_')
        )
        os.makedirs(domain_dir, exist_ok=True)

        # Initialize crawl (url, depth). `queued` remembers every URL ever
        # enqueued so a link found on many pages is queued only once; the
        # normalised seed is included so the home page is not fetched twice.
        visited = set()
        queued = {start_url, self._normalize_link(start_url, "", base_domain)}
        to_visit = [(start_url, 0)]
        samogitian_pages = []

        while to_visit and len(visited) < max_pages:
            url, depth = to_visit.pop(0)
            # skip repeats and pages beyond the depth limit
            if url in visited or depth > max_depth:
                continue

            visited.add(url)
            # Counted only for pages that are actually requested.
            domain_stats["pages_visited"] += 1
            logger.info(f"Visiting: {url} (depth {depth})")

            try:
                page_data, links = self._process_page(url, base_domain, domain_dir)

            except requests.exceptions.SSLError as e:
                logger.error(f"SSL Error processing {url}: {e}")
                domain_stats["errors"]["ssl"] += 1

            except requests.exceptions.Timeout as e:
                logger.error(f"Timeout processing {url}: {e}")
                domain_stats["errors"]["timeout"] += 1

            except requests.exceptions.ConnectionError as e:
                logger.error(f"Connection error processing {url}: {e}")
                domain_stats["errors"]["connection"] += 1

            except Exception as e:
                # Per-page boundary: one broken page must not stop the crawl.
                logger.error(f"Error processing {url}: {e}")
                domain_stats["errors"]["other"] += 1

            else:
                if page_data is not None:
                    samogitian_pages.append(page_data)
                # Links found at the depth limit would be skipped anyway.
                if depth < max_depth:
                    for link in links:
                        if link not in queued:
                            queued.add(link)
                            to_visit.append((link, depth + 1))

            # Politeness delay after every request, including failed ones.
            time.sleep(delay)

        self._save_stats()

        logger.info(f"Found {len(samogitian_pages)} Samogitian pages on {start_url}")
        return samogitian_pages

    def crawl_multiple(self, websites):
        """Crawl several websites and write the combined corpus files.

        Args:
            websites: Iterable of URLs or bare host names; ``http://`` is
                prepended when no http(s) scheme is present.

        Returns:
            list[dict]: Samogitian page records from all websites. The same
            records are written to ``samogitian_corpus.json`` and, one page per
            line, to ``samogitian_corpus.txt`` inside ``output_dir``.
        """
        all_pages = []
        for website in websites:
            if website.startswith(("http://", "https://")):
                url = website
            else:
                url = f"http://{website}"
            try:
                all_pages.extend(self.crawl_website(url))
            except Exception as e:
                # A failing site must not prevent the remaining sites from
                # being crawled.
                logger.error(f"Error crawling {website}: {e}")

        combined_json = os.path.join(self.output_dir, "samogitian_corpus.json")
        with open(combined_json, "w", encoding="utf-8") as f:
            json.dump(all_pages, f, ensure_ascii=False, indent=2)

        combined_txt = os.path.join(self.output_dir, "samogitian_corpus.txt")
        with open(combined_txt, "w", encoding="utf-8") as f:
            # One page per line: newlines inside a page are flattened.
            f.writelines(pg["text"].replace("\n", " ") + "\n" for pg in all_pages)

        return all_pages

def main():
    """Crawl the configured websites and print a summary of the results.

    Prints the total number of Samogitian pages found, the per-domain ratio of
    Samogitian pages to visited pages, and the error counts summed over all
    domains.
    """
    import urllib3

    # The crawler issues requests with verify=False; silence the warning
    # urllib3 would otherwise emit for each of them.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    websites = [
        "https://zkd.lt",
        "https://zemaitiuzeme.lt",
        "https://zemaiciukalba.lt",
        "https://skouds.lt",
    ]
    crawler = SamogitianWebCrawler(output_dir="samogitian_corpus")
    all_pages = crawler.crawl_multiple(websites)

    # Report results
    print("\nCrawling complete!")
    print(f"Found {len(all_pages)} Samogitian pages across {len(websites)} websites")

    # Per-domain hit rate; max(1, ...) guards against division by zero
    print("\nPages by domain:")
    for domain, domain_stats in crawler.stats.items():
        hits = domain_stats["samogitian_pages"]
        visited = domain_stats["pages_visited"]
        share = hits / max(1, visited) * 100
        print(f"  {domain}: {hits}/{visited} pages ({share:.1f}%)")

    # Error counts summed across all domains
    print("\nErrors by type:")
    totals = defaultdict(int)
    for domain_stats in crawler.stats.values():
        for kind, occurrences in domain_stats["errors"].items():
            totals[kind] += occurrences
    for kind, occurrences in totals.items():
        print(f"  {kind}: {occurrences}")


if __name__ == "__main__":
    main()