In [None]:
import csv
import time
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# --- Crawler configuration ---
BASE_URL: str = "https://www.lrt.lt"  # site root; also the crawl seed
HEADERS: dict = {"User-Agent": "Mozilla/5.0"}  # sent with every HTTP request
MAX_ARTICLES: int = 250  # stop discovering once this many article links exist
CRAWL_DELAY: float = 0.5  # seconds

# --- Crawler state (mutated while crawling) ---
# visited_pages: pages already fetched; article_links: article URLs found so far
visited_pages, article_links = set(), set()

def get_full_url(href, base_url=None):
    """Resolve an ``href`` attribute value to an absolute URL.

    Args:
        href: Raw ``href`` value taken from an ``<a>`` tag.
        base_url: Site root used to resolve links that start with ``/``.
            Defaults to the module-level ``BASE_URL``.

    Returns:
        The absolute URL for ``http://``/``https://`` links, root-relative
        links (``/path``) and protocol-relative links (``//host/path``);
        ``None`` for anything else (``mailto:``, ``#fragment``, bare
        relative paths, ...).
    """
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("/"):
        # urljoin, unlike string concatenation, resolves protocol-relative
        # "//host/path" links against the base scheme instead of gluing
        # them onto the site root ("https://www.lrt.lt//host/path").
        base = BASE_URL if base_url is None else base_url
        return urljoin(base, href)
    return None

def extract_links_from_page(url):
    """Fetch ``url`` and sort its same-site links into articles and pages.

    Links that look like articles (contain ``/naujienos/`` and more than
    four slashes) are added to the module-level ``article_links`` set as a
    side effect. The remaining same-site links that are not in
    ``visited_pages`` are returned for further crawling.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A list of page URLs in document order, without duplicates. Empty if
        the page could not be loaded (the error is printed).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Treat 4xx/5xx answers as failures instead of mining error pages.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        print(f"Failed to load {url}: {e}")
        return []

    links = []
    queued = set()  # pages already returned from this call
    for a in soup.find_all("a", href=True):
        full_url = get_full_url(a['href'])
        # Prefix match: a substring test would also accept foreign URLs that
        # merely mention the site, e.g. inside a query string.
        if not full_url or not full_url.startswith(BASE_URL):
            continue
        if "/naujienos/" in full_url and full_url.count("/") > 4:
            article_links.add(full_url)
        elif full_url not in visited_pages and full_url not in queued:
            queued.add(full_url)
            links.append(full_url)

    return links

def parse_article(url):
    """Download one article page and extract its title and body text.

    Args:
        url: Absolute URL of the article.

    Returns:
        A dict with keys ``"title"``, ``"url"`` and ``"text"``, or ``None``
        if the request or parsing fails (the error is printed).
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)
        # Do not store the body of an error page as if it were an article.
        res.raise_for_status()
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, "html.parser")

        heading = soup.find("h1")
        # A " " separator keeps words apart where inline tags (links, bold)
        # split the text; with strip=True alone the pieces are glued together.
        title = heading.get_text(" ", strip=True) if heading is not None else "No Title"

        paragraphs = (p.get_text(" ", strip=True) for p in soup.find_all("p"))
        # Only paragraphs longer than 40 characters are kept.
        text = "\n".join(p for p in paragraphs if len(p) > 40)
        return {"title": title, "url": url, "text": text}
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Start crawling from homepage
# Breadth-first crawl; a deque gives O(1) pops from the front of the queue.
to_visit = deque([BASE_URL])

while to_visit and len(article_links) < MAX_ARTICLES:
    current_url = to_visit.popleft()
    if current_url in visited_pages:
        continue
    visited_pages.add(current_url)

    print(f"Visiting: {current_url} | Articles found: {len(article_links)}")
    new_links = extract_links_from_page(current_url)
    to_visit.extend(new_links)
    time.sleep(CRAWL_DELAY)

print(f"Collected {len(article_links)} article links. Starting to scrape content...")

# Scrape article content
articles = []
# Sort before truncating: set iteration order differs between runs, so
# slicing the raw set would pick a different subset of articles every time.
for i, url in enumerate(sorted(article_links)[:MAX_ARTICLES], start=1):
    print(f"[{i}] Scraping: {url}")
    article = parse_article(url)
    if article:
        articles.append(article)
    time.sleep(CRAWL_DELAY)

# Save to CSV
with open("/content/drive/MyDrive/lrt_articles.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "url", "text"])
    writer.writeheader()
    writer.writerows(articles)

print("Saved lrt.lt articles to lrt_articles.csv")

Visiting: https://www.lrt.lt | Articles found: 0
Visiting: https://www.lrt.lt/programa | Articles found: 178
Visiting: https://www.lrt.lt/ | Articles found: 178
Visiting: https://www.lrt.lt/en/news-in-english | Articles found: 180
Visiting: https://www.lrt.lt/ru/novosti | Articles found: 180
Visiting: https://www.lrt.lt/pl/wiadomosci | Articles found: 180
Visiting: https://www.lrt.lt/ua/novini | Articles found: 180
Visiting: https://www.lrt.lt/paieska | Articles found: 180
Visiting: https://www.lrt.lt/naujienos/lietuvoje | Articles found: 180
Visiting: https://www.lrt.lt/lituanica/aktualijos/751/2553116/pakistano-karacio-mieste-atidarytas-lietuvos-garbes-konsulatas | Articles found: 187
Visiting: https://www.lrt.lt/lituanica/aktualijos | Articles found: 188
Visiting: https://www.lrt.lt/naujienos/verslas | Articles found: 188
Visiting: https://www.lrt.lt/naujienos/sveikata | Articles found: 204
Visiting: https://www.lrt.lt/naujienos/svietimas | Articles found: 204
Visiting: https://www.

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Load the scraped articles CSV from Google Drive into a DataFrame
# (the cleaning step below reads its "text" column)
df = pd.read_csv("/content/drive/MyDrive/lrt_articles.csv", encoding="utf-8")

# Basic Lithuanian stopwords list (lowercase; matched against whole words)
lithuanian_stopwords = {
    # conjunctions and particles
    "ir", "kad", "bet", "nes", "jog", "ar",
    "tai", "taip", "čia",
    # personal pronouns
    "aš", "tu", "jis", "ji", "mes", "jūs", "jie", "jos",
    "man", "tau", "jam", "jai", "mums", "jums", "juos", "jas",
    # question words
    "kur", "kada", "kaip", "kas", "kuo", "kodėl",
    # demonstratives
    "tą", "tas", "tie", "tiek", "šitą", "šis", "šita", "ši", "šiuos",
    # prepositions
    "su", "be", "nuo", "iki", "už", "ant", "po", "per", "apie", "prie",
}

def clean_text(text):
    """Normalize raw article text into a lowercase, stopword-free string.

    Steps: strip HTML markup, drop boilerplate phrases, keep only Latin and
    Lithuanian letters, lowercase, and remove the words listed in
    ``lithuanian_stopwords``.

    Args:
        text: Article text; any non-string value is converted with ``str()``.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    import unicodedata  # stdlib; local so this cell needs no extra import line

    # Remove HTML tags
    text = BeautifulSoup(str(text), "html.parser").get_text()

    # Compose characters (NFC) so that letters stored as base letter plus
    # combining mark match the precomposed letters in the whitelist below
    # instead of being stripped.
    text = unicodedata.normalize("NFC", text)

    # Remove known junk phrases
    # NOTE(review): each pattern matches anywhere in a line (no word
    # boundary) and deletes through the end of that line - confirm that
    # truncating a paragraph at e.g. "video" is intended.
    junk_patterns = [
        r"skaitykite daugiau.*",
        r"daugiau straipsnių.*",
        r"nuotraukos.*",
        r"video.*"
    ]
    for pattern in junk_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Replace (rather than delete) every character that is not a Latin or
    # Lithuanian letter or whitespace, so that "a,b" becomes "a b" and the
    # neighbouring words are not merged into "ab".
    lithuanian_letters = unicodedata.normalize("NFC", "ąčęėįšųūžĄČĘĖĮŠŲŪŽ")
    text = re.sub(rf"[^a-zA-Z{lithuanian_letters}\s]", " ", text)

    # Convert to lowercase after filtering
    text = text.lower()

    # split() with no argument already collapses runs of whitespace
    words = text.split()

    # Remove stopwords
    filtered = [word for word in words if word not in lithuanian_stopwords]

    return " ".join(filtered)

# Clean every article body
raw_text = df["text"].astype(str)
df["clean_text"] = raw_text.apply(clean_text)

# Keep only articles with at least 50 words left after cleaning
word_counts = df["clean_text"].str.split().str.len()
df = df[word_counts >= 50]

# Persist the columns used by the later steps
output_columns = ["title", "url", "clean_text"]
df[output_columns].to_csv(
    "/content/drive/MyDrive/lrt_articles_cleaned.csv",
    index=False,
    encoding="utf-8",
)

print("Cleaned and saved to lrt_articles_cleaned.csv")

Cleaned and saved to lrt_articles_cleaned.csv


In [None]:
!pip install sentence-transformers faiss-cpu transformers datasets

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the cleaned articles CSV and embed its "clean_text" column
df = pd.read_csv("/content/drive/MyDrive/lrt_articles_cleaned.csv")

# One string per article; texts, embeddings and the index all follow this order
# NOTE(review): each article is embedded as a single string - whether the
# model truncates long inputs is not visible here; confirm.
texts = df["clean_text"].tolist()
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(model_name)

print("Generating embeddings...")
# Encode all articles at once; the result is requested as a NumPy array
embeddings = embedder.encode(texts, convert_to_numpy=True)

# Build FAISS index
# The L2 index is created with the embedding width (second array dimension)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print("Stored embeddings in FAISS index.")

Generating embeddings...
Stored embeddings in FAISS index.


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Model: seq2seq checkpoint used to generate the answers
# NOTE(review): the sample answer in the last cell's output merely repeats
# the prompt - confirm this model handles Lithuanian well enough.
llm_name = "google/flan-t5-small"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForSeq2SeqLM.from_pretrained(llm_name)

# Create generator pipeline (text in, generated text out)
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# RAG-style query function
def query_rag(question, top_k=5, max_context_chars=1500):
    """Answer ``question`` with retrieval-augmented generation and print it.

    The question is embedded with ``embedder``, the ``top_k`` nearest
    documents are looked up in the FAISS ``index``, their text (from
    ``texts``) is concatenated and truncated to ``max_context_chars``, and
    the resulting prompt is passed to ``generator``.

    Args:
        question: The user question.
        top_k: Number of nearest documents to retrieve.
        max_context_chars: Maximum number of context characters placed in
            the prompt.

    Returns:
        None. The generated answer is printed.
    """
    query_embedding = embedder.encode([question])
    _distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; texts[-1] would silently return the last document instead.
    retrieved_docs = [texts[i] for i in indices[0] if i >= 0]

    context = "\n\n".join(retrieved_docs)
    context = context[:max_context_chars]

    prompt = f"Atsakyk i klausima remdamasis sia informacija:\n{context}\n\nKlausimas: {question}"

    result = generator(prompt, max_new_tokens=200)[0]["generated_text"]
    print("\n--- RESPONSE ---")
    print(result.strip())

Device set to use cpu


In [None]:
# Example query: "What did Ukraine announce about the Russian fighter jet?"
query_rag("Ką Ukraina paskelbė apie rusų naikintuvą?")


--- RESPONSE ---
Atsakyk i klausima remdamasis sia informacija: kremliaus urnalistas pavelas zarubinas rusijos valstybiniame televizijos kanale rossija paskelb dokumentin film kuriame vladimiras putinas pabr rusijos vidaus auditorijos param karui propagavo pasiaukojimo ideal karo nam frontuose greiiausiai siekdamas paruoti rusijos visuomen ilgesniam karui  invazij
