In [None]:
!pip install -U FlagEmbedding transformers accelerate pandas scikit-learn sentencepiece


Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m120.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3


In [None]:
# Opens Colab's file picker and stores the chosen file(s) in the working directory.
# NOTE(review): the recorded output shows a re-upload being saved under a "(1)" suffix,
# while load_song_dataset() defaults to the un-suffixed name — confirm the intended
# copy is the one being read.
from google.colab import files
uploaded = files.upload()   # choose hindi_bollywood_songs_100_clean.csv


Saving hindi_bollywood_songs_100_clean.csv to hindi_bollywood_songs_100_clean (1).csv


In [None]:
# ============================================================
# HINDI SONG RAG RECOMMENDER (BGE-M3 + HF LLM + COSINE SIM)
# ============================================================

import pandas as pd
from FlagEmbedding import BGEM3FlagModel
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# ------------------------------------------------------------
# 0. HUGGING FACE LLM SETUP (runs locally in Colab)
# ------------------------------------------------------------

# Hub identifier of the causal LM that writes the final recommendations.
HF_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

print("üîÅ Loading Hugging Face LLM (this may take a minute the first time)‚Ä¶")

# Start from CPU and switch to the GPU only when CUDA reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using device:", device)

# Tokenizer and model are module-level globals read by generate_llm_response().
hf_tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
hf_model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL_NAME,
    # half precision on GPU, full precision on CPU
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
    # let accelerate place the weights on GPU; keep the default placement on CPU
    device_map=None if device != "cuda" else "auto",
)

# ------------------------------------------------------------
# FUNCTION 1: Load Hindi Songs Dataset
# ------------------------------------------------------------
def load_song_dataset(csv_path="hindi_bollywood_songs_100_clean.csv"):
    """Read the song CSV and add the text columns used for retrieval and prompting.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every metadata column present as strings, plus:
        ``description`` (pipe-separated summary shown to the LLM) and
        ``embed_text`` (space-separated text that gets embedded).
    """
    df = pd.read_csv(csv_path)

    text_columns = ["title", "artist", "movie", "genre", "mood",
                    "tempo", "decade", "lyrics", "tags"]

    # Normalise every metadata column to plain strings:
    #  - a missing column becomes empty text instead of a KeyError further down,
    #  - NaN becomes "",
    #  - numeric values (e.g. a decade parsed as an int) become text so that the
    #    string concatenations below cannot raise a TypeError.
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].map(lambda value: "" if pd.isna(value) else str(value))
        else:
            df[col] = ""

    # human-readable description for the LLM context
    df["description"] = (
        "Title: " + df["title"] +
        " | Artist: " + df["artist"] +
        " | Movie: " + df["movie"] +
        " | Genre: " + df["genre"] +
        " | Mood: " + df["mood"] +
        " | Tempo: " + df["tempo"] +
        " | Decade: " + df["decade"]
    )

    # text that will be embedded by BGE-M3
    df["embed_text"] = (
        df["title"] + " " + df["artist"] + " " + df["movie"] + " " +
        df["genre"] + " " + df["mood"] + " " + df["tempo"] + " " +
        df["decade"] + " " + df["tags"] + " " + df["lyrics"]
    )

    return df

# ------------------------------------------------------------
# FUNCTION 2: Load Embedding Model + Generate Embeddings
# ------------------------------------------------------------
def embed_songs(df):
    """Load the BGE-M3 embedding model and embed every song's ``embed_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``embed_text`` column.

    Returns
    -------
    tuple
        ``(model, embeddings)`` where ``model`` is the loaded ``BGEM3FlagModel``
        and ``embeddings`` is a list with one dense vector per row of ``df``,
        in row order.
    """
    # First call will download BGE-M3 from HuggingFace
    print("üîÅ Loading BGE-M3 embedding model‚Ä¶")
    model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

    print("‚öôÔ∏è Encoding songs into embeddings‚Ä¶")
    embed_texts = df["embed_text"].tolist()

    if embed_texts:
        # Encode the whole list in one call so that batch_size actually batches the
        # texts, instead of invoking the model once per song.
        dense_vecs = model.encode(embed_texts, batch_size=12, max_length=8192)["dense_vecs"]
        # Keep the return shape callers rely on: a list of per-song vectors.
        embeddings = list(dense_vecs)
    else:
        # Nothing to encode for an empty frame.
        embeddings = []

    print("‚úÖ Embeddings ready.")
    return model, embeddings

# ------------------------------------------------------------
# FUNCTION 3: Retrieve Top-K Songs via Cosine Similarity
# ------------------------------------------------------------
def retrieve_top_songs(user_query, model, df, song_embeddings, top_k=5):
    """Return the ``top_k`` songs whose embeddings are most similar to the query.

    Parameters
    ----------
    user_query : str
        Free-text description of what the user wants to hear.
    model :
        Embedding model exposing ``encode(...)["dense_vecs"]``.
    df : pandas.DataFrame
        Song frame with a ``description`` column; row *i* must correspond to
        ``song_embeddings[i]``.
    song_embeddings : sequence
        One dense vector per row of ``df``.
    top_k : int
        Maximum number of songs to return (values below zero are treated as zero).

    Returns
    -------
    list of tuple
        ``(description, cosine_similarity)`` pairs, highest similarity first.
    """
    print("üîç Retrieving songs for query:", user_query)
    query_vec = model.encode(user_query, batch_size=12, max_length=8192)["dense_vecs"]

    if len(song_embeddings) > 0:
        # One call scores the query against every song at once.
        sims = cosine_similarity([query_vec], list(song_embeddings))[0]
    else:
        # cosine_similarity rejects an empty matrix; there is simply nothing to rank.
        sims = []

    # sort by similarity desc (stable, so ties keep dataset order)
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
    top_scores = ranked[:max(top_k, 0)]

    # Embeddings are positional, so look rows up by position rather than by index
    # label; this stays correct when df has a non-default index.
    descriptions = df["description"]
    results = [(descriptions.iloc[idx], score) for idx, score in top_scores]

    print("\nTop retrieved songs (for debugging):")
    for desc, score in results:
        print(f"  ‚Ä¢ {desc}  | sim={score:.3f}")

    return results

# ------------------------------------------------------------
# FUNCTION 4: Build RAG Prompt for the LLM
# ------------------------------------------------------------
def build_rag_prompt(user_query, retrieved_songs):
    """Assemble the instruction prompt handed to the LLM.

    ``retrieved_songs`` is an iterable of ``(description, score)`` pairs; only the
    descriptions are used, rendered as a bulleted list. The returned string starts
    and ends with a newline.
    """
    bullets = []
    for description, _score in retrieved_songs:
        bullets.append("- {}".format(description))
    song_block = "\n".join(bullets)

    # The empty first and last entries produce the leading and trailing newline.
    sections = [
        "",
        "You are a Hindi music recommendation expert.",
        "",
        "USER PREFERENCE / QUERY:",
        format(user_query),
        "",
        "RETRIEVED SONGS (Your ONLY knowledge base):",
        song_block,
        "",
        "TASK:",
        "Using ONLY the above songs as your knowledge:",
        "1. Recommend 3‚Äì5 songs that best match the user's mood/genre/decade request.",
        "2. For each song, explain in 1‚Äì2 lines why it fits what the user asked.",
        "3. Use simple, friendly language.",
        "4. Do NOT talk about embeddings, vectors, cosine similarity, or technical stuff.",
        "Answer directly to the user.",
        "",
    ]
    return "\n".join(sections)

# ------------------------------------------------------------
# FUNCTION 5: Generate Response with Hugging Face LLM
# ------------------------------------------------------------
def generate_llm_response(prompt):
    """Generate a completion for ``prompt`` with the module-level HF model.

    Uses the globals ``hf_tokenizer`` and ``hf_model``. Sampling is enabled
    (temperature 0.7, top-p 0.9) and at most 350 new tokens are produced.

    Parameters
    ----------
    prompt : str
        Full prompt text to send to the model.

    Returns
    -------
    str
        Only the newly generated text, stripped of surrounding whitespace.
    """
    # NOTE(review): the prompt is sent as raw text; if this checkpoint expects a
    # chat/instruction template, consider applying it before tokenizing.
    inputs = hf_tokenizer(prompt, return_tensors="pt").to(hf_model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    outputs = hf_model.generate(
        **inputs,
        max_new_tokens=350,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=hf_tokenizer.eos_token_id
    )

    # For a causal LM the returned sequence is the prompt tokens followed by the new
    # ones. Dropping the prompt by token count is reliable, whereas comparing decoded
    # text to the original prompt fails whenever decoding does not reproduce the
    # prompt string exactly.
    generated_tokens = outputs[0][prompt_token_count:]
    return hf_tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# ------------------------------------------------------------
# FUNCTION 6: Full RAG Pipeline
# ------------------------------------------------------------
def run_song_rag_pipeline():
    """Run the interactive flow end to end: load, embed, ask, retrieve, generate, print.

    Reads the query from standard input and prints the LLM's answer; returns nothing.
    """
    # Load the catalogue and embed it.
    songs_df = load_song_dataset()
    embed_model, song_embeddings = embed_songs(songs_df)

    # Show a few sample queries, then prompt the user for theirs.
    example_lines = (
        "\nExample query ideas:",
        "- 'romantic slow heartbreak song from 2010s'",
        "- 'high energy party dance song'",
        "- 'patriotic emotional songs'",
    )
    for line in example_lines:
        print(line)
    user_query = input("\nEnter your mood / genre / vibe: ")

    # Retrieve the closest songs and let the LLM phrase the recommendation.
    retrieved = retrieve_top_songs(
        user_query, embed_model, songs_df, song_embeddings, top_k=5
    )
    answer = generate_llm_response(build_rag_prompt(user_query, retrieved))

    print("\n====================== RECOMMENDATIONS ======================\n")
    print(answer)
    print("\n=============================================================")

# ------------------------------------------------------------
# RUN PIPELINE
# ------------------------------------------------------------
# Runs the full pipeline when the cell executes; it waits on input() for the user's query.
run_song_rag_pipeline()


üîÅ Loading Hugging Face LLM (this may take a minute the first time)‚Ä¶
Using device: cuda


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



üîÅ Loading BGE-M3 embedding model‚Ä¶


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

onnx/model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

‚öôÔ∏è Encoding songs into embeddings‚Ä¶


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


‚úÖ Embeddings ready.

Example query ideas:
- 'romantic slow heartbreak song from 2010s'
- 'high energy party dance song'
- 'patriotic emotional songs'

Enter your mood / genre / vibe: romantic slow emotional
üîç Retrieving songs for query: romantic slow emotional

Top retrieved songs (for debugging):
  ‚Ä¢ Title: Muskurane | Artist: Arijit Singh | Movie: CityLights | Genre: romantic | Mood: calm;warm | Tempo: slow | Decade: 2010s  | sim=0.596
  ‚Ä¢ Title: Mann Bhareya 2.0 | Artist: B Praak | Movie: Shershaah | Genre: romantic | Mood: heartbreak;intense | Tempo: slow | Decade: 2020s  | sim=0.593
  ‚Ä¢ Title: Tum Hi Ho | Artist: Arijit Singh | Movie: Aashiqui 2 | Genre: romantic | Mood: emotional;heartbreak;intense | Tempo: slow | Decade: 2010s  | sim=0.583
  ‚Ä¢ Title: Sun Raha Hai | Artist: Shreya Ghoshal | Movie: Aashiqui 2 | Genre: romantic | Mood: intense;heartbreak | Tempo: slow | Decade: 2010s  | sim=0.581
  ‚Ä¢ Title: Teri Mitti | Artist: B Praak | Movie: Kesari | Genre: patrio