In [None]:
# Embedding Evaluation Notebook
from sentence_transformers import SentenceTransformer, util
import numpy as np

from torch import nn

# Multilingual sentence encoder shared by the rest of the notebook.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

# Hand-written parallel sentences: (source sentence, English translation).
# NOTE(review): the source sentences look like standard Kannada rather than
# Tulu -- confirm with a native speaker before drawing conclusions about Tulu.
pairs = [
    ("ತುಳು ನಾಡು ಸುಂದರವಾಗಿದೆ.", "Tulu Nadu is beautiful."),
    ("ಅವನು ಶಾಲೆಗೆ ಹೋದನು.", "He went to school."),
    ("ಮಳೆ ಬರುತ್ತಿದೆ.", "It is raining."),
    ("ಆಕೆ ಹಾಡು ಹಾಡುತ್ತಾಳೆ.", "She sings a song."),
]

# Split the pairs into two parallel lists, one per language.
tulu_texts, english_texts = map(list, zip(*pairs))

# Encode each side as a tensor with one embedding row per sentence.
emb_tulu = model.encode(tulu_texts, convert_to_tensor=True)
emb_eng = model.encode(english_texts, convert_to_tensor=True)

# Row-wise cosine similarity: entry k compares the k-th source sentence
# with its own English translation (not an all-vs-all matrix).
cos = nn.CosineSimilarity(dim=1)
similarities = cos(emb_tulu, emb_eng)

# Report one score per sentence pair.
for (source_sentence, english_sentence), sim in zip(pairs, similarities):
    print(f"{source_sentence}  <->  {english_sentence}  → Similarity: {sim.item():.3f}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Stack both sets of sentence embeddings into one (n_samples, dim) NumPy
# matrix for t-SNE. The embeddings were created with convert_to_tensor=True,
# so they are torch tensors; move them to host memory explicitly because
# np.vstack cannot consume tensors that live on a GPU.
combined = np.vstack([
    emb_tulu.detach().cpu().numpy(),
    emb_eng.detach().cpu().numpy(),
])
labels = ["Tulu"] * len(emb_tulu) + ["English"] * len(emb_eng)

# Reduce to 2D. scikit-learn requires perplexity < n_samples; the default of
# 30 raises ValueError for a handful of sentences, so clamp it to the data.
perplexity = min(30.0, len(combined) - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
reduced = tsne.fit_transform(combined)

# Plot one scatter series per language so the legend separates them.
plt.figure(figsize=(6, 5))
for lang in ["Tulu", "English"]:
    idx = [i for i, l in enumerate(labels) if l == lang]
    plt.scatter(reduced[idx, 0], reduced[idx, 1], label=lang, s=60)
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.legend()
plt.title("Tulu-English Embedding Visualization (t-SNE)")
plt.show()


In [None]:
import faiss, json
from sentence_transformers import SentenceTransformer

# Load the sentence encoder and the prebuilt FAISS index.
# NOTE(review): assumes the index was built from embeddings of this same
# model and that row k of the index corresponds to line k of passages.jsonl
# -- confirm against the indexing script.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
index = faiss.read_index("data/processed/faiss_index.bin")

# Read the passage texts, one JSON object per line; skip blank lines
# (e.g. a trailing newline at end of file) instead of crashing in json.loads.
texts = []
with open("data/processed/passages.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        texts.append(json.loads(line)["text"])

# A size mismatch means index ids may not line up with passage positions.
if index.ntotal != len(texts):
    print(
        f"Warning: index holds {index.ntotal} vectors but {len(texts)} passages "
        "were loaded; results may be misaligned."
    )

query = "Tulu Nadu is located in Karnataka"
# FAISS expects a 2-D float32 array of shape (n_queries, dim).
q_vec = model.encode([query], convert_to_numpy=True).astype("float32")
D, I = index.search(q_vec, 3)

print("\nTop similar results:")
for idx in I[0]:
    # FAISS pads missing neighbours with -1 when the index has fewer than k
    # vectors; texts[-1] would silently show the wrong passage, so skip them.
    if idx < 0 or idx >= len(texts):
        continue
    print("→", texts[idx][:200], "\n")
