In [11]:
# Simplified and optimized for better Greek text processing

import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import warnings
warnings.filterwarnings('ignore')

# LangChain imports
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document

print("All dependencies imported successfully!")


# Configuration
LLM_MODEL = "ilsp/meltemi-instruct"
ARTIFACTS_FILE = "artefacts.json"
VECTOR_STORE_PATH = "museum_faiss_langchain"


All dependencies imported successfully!


In [12]:

# STEP 1: Load and preprocess data
def extract_field_text(field_data):
    """Extract text from multilingual field"""
    if isinstance(field_data, dict):
        return field_data.get('gr', '') or str(field_data.get(list(field_data.keys())[0] if field_data.keys() else '', ''))
    elif isinstance(field_data, str):
        return field_data
    else:
        return str(field_data) if field_data else ''

def clean_html_text(text):
    """Clean HTML tags and normalize text"""
    if not text:
        return ""
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Decode HTML entities
    text = text.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
    text = text.replace('&nbsp;', ' ').replace('&quot;', '"')
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

def load_museum_data(artifacts_file):
    """Load museum data from JSON file with improved preprocessing"""
    print(f"Loading data from {artifacts_file}...")
    
    with open(artifacts_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    processed = []
    for item in data.get('data', []):
        if not item.get('attributes'):
            continue
            
        attrs = item['attributes']
        
        # Extract and clean text content
        title = clean_html_text(extract_field_text(attrs.get('title', '')))
        description = clean_html_text(extract_field_text(attrs.get('description', '')))
        creator = clean_html_text(extract_field_text(attrs.get('creator', '')))
        subtitle = clean_html_text(extract_field_text(attrs.get('subtitle', '')))
        
        # Skip items with no meaningful content
        if not title and not description:
            continue
        
        artifact = {
            "id": item['id'],
            "code": attrs.get('code', ''),
            "title": title,
            "creator": creator,
            "subtitle": subtitle,
            "description": description,         
            "room": attrs.get('mapRoom', ''),
            "category": "MUSEUM_ARTIFACT"
        }
        processed.append(artifact)
    
    print(f"Loaded {len(processed)} artifacts")
    return processed


In [13]:

# STEP 2: Create Documents with better chunking
def create_documents(artifacts):
    """Create LangChain documents from artifacts with improved chunking"""
    print("Creating LangChain documents...")
    
    # Greek-optimized text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,  # Larger chunks for better context
        chunk_overlap=100,  # Good overlap for context preservation
        length_function=len,
        separators=["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
        keep_separator=True
    )
    
    all_documents = []
    
    for artifact in tqdm(artifacts, desc="Processing artifacts"):
        # Combine artifact information in a structured way
        full_text = f"""Title: {artifact['title']}
Creator: {artifact['creator']}
Subtitle: {artifact['subtitle']}
Code: {artifact['code']}
Room: {artifact['room']}
Description: {artifact['description']}
Category: {artifact['category']}"""
        
        # Split text into chunks
        chunks = text_splitter.split_text(full_text.strip())
        
        # Create Document objects with meaningful chunks only
        for i, chunk in enumerate(chunks):
            if len(chunk.strip()) > 50:  # Filter very short chunks
                doc = Document(
                    page_content=chunk,
                    metadata={
                        'artifact_id': artifact['id'],
                        'title': artifact['title'],
                        'creator': artifact['creator'],
                        'room': artifact['room'],
                        'chunk_id': i,
                        'source': 'museum_artifact'
                    }
                )
                all_documents.append(doc)
    
    print(f"Created {len(all_documents)} document chunks")
    return all_documents


In [14]:

# STEP 3: Setup improved embeddings
def setup_embeddings():
    """Setup embedding model optimized for Greek text"""
    print("Loading embedding model...")
    
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    
    print("Embedding model loaded successfully!")
    return embeddings

# STEP 4: Create vector store with better retrieval
def create_vector_store(documents, embeddings):
    """Create FAISS vector store from documents with improved settings"""
    print("Creating vector store...")
    
    if not documents:
        print("No documents found!")
        return None, None
    
    # Create vector store with cosine similarity
    vectorstore = FAISS.from_documents(
        documents=documents,
        embedding=embeddings
    )
    
    # Create retriever with better parameters
    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 5,  # Retrieve top 5 chunks
            "score_threshold": 0.3  # Minimum relevance threshold
        }
    )
    
    print("Vector store created successfully!")
    return vectorstore, retriever

def save_vector_store(vectorstore, path):
    """Save vector store to disk"""
    if vectorstore:
        vectorstore.save_local(path)
        print(f"Vector store saved to {path}")

def load_vector_store(path, embeddings):
    """Load vector store from disk"""
    vectorstore = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 5, "score_threshold": 0.3}
    )
    print(f"Vector store loaded from {path}")
    return vectorstore, retriever

In [15]:
# STEP 5: Setup LLM
def setup_llm(llm_model):
    """Setup Ollama LLM with optimized parameters"""
    print(f"Loading LLM: {llm_model}...")
    
    llm = OllamaLLM(
        model=llm_model, 
        temperature=0.1,
        num_predict=512,  # Reasonable response length
        num_thread=4
    )
    
    print("LLM loaded successfully!")
    return llm


In [16]:
# STEP 6: Create RAG chain
def create_rag_chain(llm, retriever):
    """Create RAG chain with Greek language support and better prompting"""
    print("Setting up Greek-aware RAG chain...")

    template = """Είσαι ένας ειδικός βοηθός μουσείου στη Βιβλιοθήκη Ωνάση που απαντά σε ερωτήσεις για αντικείμενα και έργα τέχνης.

Χρησιμοποίησε τις παρακάτω πληροφορίες για να απαντήσεις στην ερώτηση:
{context}

Ερώτηση: {question}

Οδηγίες:
- Εάν η ερώτηση είναι στα ελληνικά, απάντησε στα ελληνικά
- Εάν η ερώτηση είναι στα αγγλικά, απάντησε στα αγγλικά
- Χρησιμοποίησε τις πληροφορίες από το πλαίσιο για να δώσεις μια λεπτομερή απάντηση
- Παρέχε εκπαιδευτικές και ενδιαφέρουσες πληροφορίες
- Εάν το πλαίσιο δεν περιέχει σχετικές πληροφορίες, πες το στην κατάλληλη γλώσσα

Απάντηση:"""
    
    # Create prompt
    prompt = PromptTemplate(
        template=template,
        input_variables=["question", "context"]
    )
    
    # Create QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True
    )
    
    print("RAG chain setup complete!")
    return qa_chain

In [17]:


# STEP 7: Query functions with improved processing
def preprocess_question(question):
    """Preprocess question for better retrieval"""
    
    question = question.strip()
    
    # Expand common Greek terms with synonyms for better retrieval
    expansions = {
        'Ωνάσης': 'Ωνάσης Αριστοτέλης Onassis',
        'Κάλλας': 'Κάλλας Μαρία Callas',
        'μουσείο': 'μουσείο έκθεση συλλογή',
        'αντικείμενο': 'αντικείμενο έργο artifact',
        'ζωγραφικός': 'ζωγραφικός πίνακας painting',
        'γλυπτό': 'γλυπτό sculpture άγαλμα',
        'καθρέφτης': 'καθρέφτης mirror καθρέπτης'
    }
    
    processed = question
    for key, value in expansions.items():
        if key in processed:
            processed = processed.replace(key, value)
    
    return processed

def query_with_rag(qa_chain, question):
    """Query using RAG with improved error handling"""
    if not qa_chain:
        return "QA chain not setup!"
    
    try:
        # Preprocess question
        processed_question = preprocess_question(question)
        
        # Get answer
        result = qa_chain.invoke({"query": processed_question})
        answer = result["result"]
        source_docs = result["source_documents"]
        
        # Clean answer from template artifacts
        answer = clean_answer(answer)
        
        return {
            "answer": answer,
            "source_documents": source_docs,
            "question": question,
            "processed_question": processed_question
        }
    except Exception as e:
        print(f"Error during query: {e}")
        return {
            "answer": f"Συγγνώμη, προέκυψε σφάλμα: {str(e)}",
            "source_documents": [],
            "question": question,
            "error": str(e)
        }

def clean_answer(answer):
    """Clean the answer from template artifacts"""
    if not answer or len(answer.strip()) < 5:
        return "Δεν βρέθηκαν επαρκείς πληροφορίες για την ερώτηση αυτή."
    
    answer = answer.strip()
    
    # Remove common template artifacts
    unwanted_prefixes = [
        "Είσαι ένας ειδικός βοηθός",
        "Χρησιμοποίησε τις παρακάτω",
        "Πλαίσιο:",
        "Ερώτηση:",
        "Απάντηση:",
        "Οδηγίες:"
    ]
    
    for prefix in unwanted_prefixes:
        if answer.startswith(prefix):
            answer = answer[len(prefix):].strip()
    
    # Ensure proper capitalization
    if answer and not answer[0].isupper():
        answer = answer[0].upper() + answer[1:]
    
    # Add period if missing
    if answer and not answer.endswith(('.', '!', '?')):
        answer += '.'
    
    return answer


In [18]:

# STEP 8: Main workflow function
def create_greek_qa_system(artifacts_file):
    """Create the complete Greek QA system"""
    
    print("Initializing Greek QA RAG System...")
    print("=" * 60)
    
    # Load and preprocess data
    print("Step 1: Loading and preprocessing documents...")
    artifacts = load_museum_data(artifacts_file)
    
    if not artifacts:
        print("No artifacts loaded. Please check your JSON file.")
        return None
    
    # Create documents
    print("Step 2: Creating document chunks...")
    documents = create_documents(artifacts)
    
    # Setup embeddings
    print("Step 3: Setting up embeddings...")
    embeddings = setup_embeddings()
    
    # Create vector store
    print("Step 4: Building vector database...")
    vectorstore, retriever = create_vector_store(documents, embeddings)
    save_vector_store(vectorstore, VECTOR_STORE_PATH)
    
    # Setup LLM
    print("Step 5: Initializing LLM...")
    llm = setup_llm(LLM_MODEL)
    
    # Create QA chain
    print("Step 6: Building QA chain...")
    qa_chain = create_rag_chain(llm, retriever)
    
    print("Greek QA RAG System ready!")
    print("=" * 60)
    
    return qa_chain, vectorstore, embeddings

In [19]:

# STEP 9: Test function
def test_system(qa_chain):
    """Test the system with Greek questions"""
    
    test_questions = [
        "Δώσε μου πληροφορίες για τα γλυπτά του μουσείου",
        "Ποιοι είναι οι διάσημοι ζωγράφοι που έχουν έργα στη συλλογή",
        "Δώσε μου πληροφορίες για τους πίνακες του μουσείου",
        "Δώσε μου πληροφορίες για τον Ωνάση",
        "Δώσε μου πληροφορίες για τα εκθέματα που αφορούν τον Ωνάση"
    ]
    
    results = []
    print("Testing RAG Pipeline:")
    print("=" * 60)
    
    for i, question in enumerate(test_questions):
        print(f"\nQUESTION {i+1}: {question}")
        print("-" * 60)
        
        try:
            # RAG query
            rag_result = query_with_rag(qa_chain, question)
            print(f"RAG Answer: {rag_result['answer']}")
            
            print(f"\nSources ({len(rag_result['source_documents'])}):") 
            for j, doc in enumerate(rag_result['source_documents'][:3]):
                print(f"  {j+1}. {doc.metadata.get('title', 'Unknown')}")
                print(f"     Creator: {doc.metadata.get('creator', 'Unknown')}")
                print(f"     Room: {doc.metadata.get('room', 'Unknown')}")
                print(f"     Content: {doc.page_content[:100]}...")
            
            results.append(rag_result)
            
        except Exception as e:
            print(f"Error: {e}")
        
        print("\n" + "=" * 60)
    
    # Print summary
    successful = sum(1 for r in results if 'error' not in r)
    print(f"\nTest Results: {successful}/{len(test_questions)} successful")
    
    return results

In [None]:

# USAGE EXAMPLE
if __name__ == "__main__":
    # Initialize the system
    qa_chain, vectorstore, embeddings = create_greek_qa_system(ARTIFACTS_FILE)
    
    if qa_chain:
        # Test the system
        test_results = test_system(qa_chain)
        
        # Start interactive mode (uncomment to use)
        # interactive_qa(qa_chain)
        
        print("\nGreek QA RAG System Complete!")
        #print("Use interactive_qa(qa_chain) for interactive questions.")

Initializing Greek QA RAG System...
Step 1: Loading and preprocessing documents...
Loading data from artefacts.json...
Loaded 86 artifacts
Step 2: Creating document chunks...
Creating LangChain documents...


Processing artifacts: 100%|██████████| 86/86 [00:00<00:00, 7569.52it/s]

Created 156 document chunks
Step 3: Setting up embeddings...
Loading embedding model...





Embedding model loaded successfully!
Step 4: Building vector database...
Creating vector store...
Vector store created successfully!
Vector store saved to museum_faiss_langchain
Step 5: Initializing LLM...
Loading LLM: ilsp/meltemi-instruct...
LLM loaded successfully!
Step 6: Building QA chain...
Setting up Greek-aware RAG chain...
RAG chain setup complete!
Greek QA RAG System ready!
Testing RAG Pipeline:

QUESTION 1: Δώσε μου πληροφορίες για τα γλυπτά του μουσείου
------------------------------------------------------------
RAG Answer: Δώσε μου πληροφορίες για τα γλυπτά του μουσείου έκθεση συλλογήυ

Γεια σας! Θα χαρούμε να σας δώσω πληροφορίες σχετικά με τη συλλογή γλυπτών της Βιβλιοθήκης Ωνάσση.

Η Βιβλιοθήκη Ωνάσση διαθέτει μια εντυπωσιακή συλλογή γλυπτών, η οποία περιλαμβάνει έργα από μερικούς από τους πιο σημαντικούς Έλληνες και διεθνείς καλλιτέχνες. Ακολουθούν μερικά παραδείγματα:

1. "Altars 3" του Silvina Der Meguerditchian: Αυτό το έργο τέχνης δημιουργήθηκε το 2018 και είναι κ