In [None]:
import pandas as pd

# Location of the schemes dataset. Kept in one constant so the notebook can be
# pointed at another copy of the CSV without hunting through the cell.
DATA_PATH = "/content/ConvoProject_CustomMadeDataset.csv"

df = pd.read_csv(DATA_PATH)

# Trim and lower-case the header names so later cells can use fixed keys such
# as "scheme name" regardless of how the CSV header was typed.
df.columns = df.columns.str.strip().str.lower()

# Fail fast, with a readable message, if a column used by later cells is absent.
REQUIRED_COLUMNS = {"description", "scheme name", "category"}
missing_columns = REQUIRED_COLUMNS - set(df.columns)
assert not missing_columns, f"CSV is missing expected columns: {sorted(missing_columns)}"

# A bare last expression uses the notebook's rich table display.
df.head()

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the dataset rows and user queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Empty CSV cells are read by pandas as NaN (a float). encode() is given a list
# of texts, so missing descriptions are replaced by "" and every value is
# coerced to str before encoding.
descriptions = df["description"].fillna("").astype(str).tolist()

# One embedding per dataset row, in the same order as df.
embeddings = model.encode(
    descriptions,
    show_progress_bar=True
)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every description embedding against every
# other one: entry [i, j] compares row i with row j of `embeddings`.
similarity_matrix = cosine_similarity(embeddings, embeddings)


In [None]:
def recommend_by_query(query, top_k=3):
    """Print the schemes whose descriptions are closest to a free-text query.

    The query is embedded with ``model`` and compared, by cosine similarity,
    against every row of ``embeddings``. The best ``top_k`` rows are printed
    as ``<scheme name> | score: <similarity rounded to 3 places>``.

    Args:
        query: Text to search for.
        top_k: Number of results to print.

    Returns:
        None. Results are printed.
    """
    query_vector = model.encode([query])
    similarity = cosine_similarity(query_vector, embeddings)[0]

    # argsort is ascending, so reverse it to get best-first row positions.
    ranked_rows = similarity.argsort()[::-1]

    for row in ranked_rows[:top_k]:
        scheme = df.iloc[row]["scheme name"]
        print(scheme, "| score:", round(similarity[row], 3))


In [None]:
# Try the query-based recommender on a few single-word queries.
for keyword in ("student", "farmer", "insurance"):
    recommend_by_query(keyword)


In [None]:
# Lookup key for scheme names: lower-case, drop every character that is not
# a-z, 0-9 or a space, then trim the ends.
scheme_names_lower = df["scheme name"].str.lower()
scheme_names_clean = scheme_names_lower.str.replace(r"[^a-z0-9 ]", "", regex=True)
df["scheme_name_norm"] = scheme_names_clean.str.strip()

# Lookup key for categories: lower-case and trimmed.
categories_lower = df["category"].str.lower()
df["category_norm"] = categories_lower.str.strip()


In [None]:
def smart_search(user_input, top_k=5):
    """Print scheme recommendations for a scheme name, a category, or free text.

    The input is tried against three modes, in this order:

    1. Scheme mode: the normalised input equals a value in
       ``df["scheme_name_norm"]``. The ``top_k`` most similar *other* schemes
       (according to ``similarity_matrix``) are printed with their scores.
    2. Category mode: the lower-cased, trimmed input equals a value in
       ``df["category_norm"]``. Every scheme of that category is printed.
    3. Semantic mode: otherwise the input is embedded with ``model`` and the
       ``top_k`` closest rows of ``embeddings`` are printed with their scores.

    Args:
        user_input: Text typed by the user.
        top_k: Maximum number of results in scheme and semantic modes.

    Returns:
        None. Results are printed.
    """
    import re  # function-scope import keeps this cell self-contained

    user_input = user_input.lower().strip()

    # Normalise the input with the same rule that built df["scheme_name_norm"]
    # (delete everything except a-z, 0-9 and space). That rule *deletes*
    # hyphens, so "pm-kisan" is stored as "pmkisan"; turning "-" into a space
    # alone can never match such a name. The hyphen-as-space form is kept as a
    # second key so inputs like "pm-kisan" still find a scheme stored as
    # "pm kisan".
    name_keys = {
        re.sub(r"[^a-z0-9 ]", "", user_input).strip(),
        re.sub(r"[^a-z0-9 ]", "", user_input.replace("-", " ")).strip(),
    }
    name_keys.discard("")  # blank input must not match a blank scheme name

    # Use row *positions*, not index labels: similarity_matrix and df.iloc are
    # both positional, and the two only coincide for a default RangeIndex.
    name_hits = df["scheme_name_norm"].isin(list(name_keys)).to_numpy().nonzero()[0]

    if name_hits.size:
        print(" Mode: Scheme-based recommendation\n")

        idx = int(name_hits[0])
        # Drop the queried scheme explicitly instead of assuming it sorts
        # first; a duplicate description would tie with it at the top.
        neighbours = [
            (i, score)
            for i, score in enumerate(similarity_matrix[idx])
            if i != idx
        ]
        neighbours.sort(key=lambda pair: pair[1], reverse=True)

        for i, score in neighbours[:top_k]:
            print(f"{df.iloc[i]['scheme name']} | score: {round(score, 3)}")
        return

    category_mask = df["category_norm"] == user_input
    if category_mask.any():
        print(" Mode: Category-based search\n")

        for name in df.loc[category_mask, "scheme name"]:
            print(name)
        return

    print(" Mode: Semantic keyword-based search\n")

    query_embedding = model.encode([user_input])
    scores = cosine_similarity(query_embedding, embeddings)[0]
    # argsort is ascending, so reverse it to get best-first row positions.
    top_indices = scores.argsort()[::-1][:top_k]

    for i in top_indices:
        print(f"{df.iloc[i]['scheme name']} | score: {round(scores[i], 3)}")


In [None]:
# Run smart_search on a few single-word inputs, asking for five results each.
for keyword in ("student", "farmer", "insurance"):
    smart_search(keyword, 5)


In [None]:
# Run smart_search on two category-style inputs with the default top_k.
for term in ("agriculture", "education"):
    smart_search(term)


In [None]:
# Query with a full scheme name, including its parenthesised abbreviation.
smart_search("PM KISAN SAMMAN NIDHI (PM-KISAN)", top_k=3)
