In [None]:
from google.colab import files

# Open Colab's upload dialog; the result is keyed by the uploaded filename(s).
uploaded = files.upload()  # pick your train.csv
if not uploaded:
    # Nothing came back (e.g. the dialog was dismissed). Fail with a clear
    # message instead of the bare StopIteration that next() would raise.
    raise RuntimeError("No file was uploaded; re-run this cell and choose your train.csv.")
# Use the name Colab actually saved the file under (first uploaded file).
train_path = next(iter(uploaded))
print("Using train file:", train_path)

Saving train.csv to train.csv
Using train file: train.csv


In [None]:
# Open Colab's upload dialog; the result is keyed by the uploaded filename(s).
uploaded = files.upload()  # pick your test.csv
if not uploaded:
    # Nothing came back (e.g. the dialog was dismissed). Fail with a clear
    # message instead of the bare StopIteration that next() would raise.
    raise RuntimeError("No file was uploaded; re-run this cell and choose your test.csv.")
# Use the name Colab actually saved the file under (first uploaded file).
test_path = next(iter(uploaded))
print("Using test file:", test_path)

Saving test.csv to test.csv
Using test file: test.csv


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random


# ---------- 1. Preprocessing ----------
def preprocess(text: str) -> str:
    """
    Normalize a chat utterance for TF-IDF matching.

    Anything that is not a str is treated as the empty string. The text is
    lowercased, every run of characters outside a-z, 0-9, apostrophe,
    '?', '!', '.', ':', ',' and space is replaced by one space, and
    whitespace is then collapsed and trimmed.
    """
    raw = text if isinstance(text, str) else ""
    lowered = raw.lower()
    # Drop everything except letters, digits and common chat punctuation.
    filtered = re.sub(r"[^a-z0-9'?!.:, ]+", " ", lowered)
    # Squeeze repeated whitespace and strip the ends.
    return re.sub(r"\s+", " ", filtered).strip()


# ---------- 2. Load data ----------
def load_data(train_path: str = "train.csv"):
    """
    Read the training CSV and add the text column used for retrieval.

    The file must provide "Question" and "Answer" columns. Missing values
    in either are treated as empty strings, the two are joined with a single
    space, and the result is cleaned with preprocess() and stored in a new
    "combined_text" column. The augmented DataFrame is returned.
    """
    frame = pd.read_csv(train_path)

    questions = frame["Question"].fillna("")
    answers = frame["Answer"].fillna("")

    # One retrieval document per row: question text followed by answer text.
    merged = questions + " " + answers
    frame["combined_text"] = merged.apply(preprocess)

    return frame


# ---------- 3. Build retrieval model ----------
class RetrievalChatbot:
    """
    Retrieval-based chatbot: a user message is vectorized with TF-IDF and
    answered with the stored "Answer" of one of the most similar training rows.
    """

    def __init__(self, train_df: pd.DataFrame, min_sim: float = 0.15, top_k: int = 3):
        """
        train_df must have columns: Question, Answer, combined_text
        ("category" is optional and is returned alongside the answer if present).
        min_sim: minimum cosine similarity to trust a retrieved answer.
        top_k: number of top candidates to choose from (adds some variety).
        """
        # Positional row lookups (iloc) must line up with the rows of X_train.
        self.train_df = train_df.reset_index(drop=True)
        self.min_sim = min_sim
        self.top_k = top_k

        # Vectorize combined_text using TF-IDF (unigrams + bigrams)
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        self.X_train = self.vectorizer.fit_transform(self.train_df["combined_text"])

        # Optionally keep conversation history
        self.history = []  # list of (user_utterance, bot_answer)

    def _fallback_response(self, user_input: str) -> str:
        """
        Generic replies for when similarity is too low.
        You can make this smarter / more varied.

        user_input is currently unused; the reply is picked at random.
        """
        generic_replies = [
            "I'm not completely sure how to answer that, but I'm listening.",
            "Interesting! Tell me more about that.",
            "That's a good question. What do you think?",
            "I don't have a perfect answer, but I'm here to chat!",
        ]
        return random.choice(generic_replies)

    def _pick_index(self, sims) -> int:
        """
        Choose which training row to answer with.

        sims: 1-D array of cosine similarities, one per training row.

        With top_k > 1 the row is drawn at random from the top_k most similar
        rows, restricted to rows whose similarity is at least min_sim, so an
        unrelated row is never returned just because it ranks in the top_k.
        If no row qualifies (or top_k <= 1) the single best row is returned.
        """
        best_idx = int(np.argmax(sims))
        if self.top_k <= 1:
            return best_idx

        top_indices = sims.argsort()[-self.top_k:][::-1]
        eligible = [int(i) for i in top_indices if sims[i] >= self.min_sim]
        if not eligible:
            return best_idx
        return random.choice(eligible)

    def get_best_match(self, user_input: str):
        """
        Returns: (answer, category, similarity)

        similarity is the cosine similarity of the row whose answer is
        returned, so the score always describes the answer it accompanies.
        category is None when the row has no usable category.
        """
        processed = preprocess(user_input)
        if not processed:
            # NOTE: the similarity reported here is 0.0, so respond() replaces
            # this prompt with a generic fallback whenever min_sim > 0.
            return "Could you rephrase that?", None, 0.0

        user_vec = self.vectorizer.transform([processed])
        sims = cosine_similarity(user_vec, self.X_train)[0]

        chosen_idx = self._pick_index(sims)

        row = self.train_df.iloc[chosen_idx]
        answer = row["Answer"]
        category = row.get("category", None)
        # A missing category cell is read back as NaN, which is truthy;
        # normalise it to None so callers can test "if category".
        if isinstance(category, float) and np.isnan(category):
            category = None

        return answer, category, float(sims[chosen_idx])

    def respond(self, user_input: str) -> tuple:
        """
        High-level response function used in the chat loop.

        Returns: (answer, category, similarity). When similarity is below
        min_sim the answer is replaced by a generic fallback reply and
        category is None. Every exchange is appended to self.history.
        """
        answer, category, sim = self.get_best_match(user_input)

        if sim < self.min_sim:
            answer = self._fallback_response(user_input)
            category = None

        # store in history
        self.history.append((user_input, answer))

        return answer, category, sim


# ---------- 4. Simple command-line chat loop ----------
def chat_loop(train_path: str = "train.csv", min_sim: float = 0.15, top_k: int = 3):
    """
    Run an interactive chat session on stdin/stdout.

    train_path: CSV file passed to load_data().
    min_sim, top_k: forwarded to RetrievalChatbot.

    The loop ends when the user types 'exit' or 'quit' (case-insensitive),
    or when input is interrupted or exhausted.
    """
    print("Loading data and building chatbot model...")
    train_df = load_data(train_path)
    bot = RetrievalChatbot(train_df, min_sim=min_sim, top_k=top_k)

    print("Chatbot is ready! Type 'exit' or 'quit' to stop.\n")

    farewell = "Bot: Goodbye! ðŸ‘‹"

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupting the cell or running out of input ends the chat
            # cleanly instead of leaving a traceback.
            print()
            print(farewell)
            break

        if user_input.lower() in {"exit", "quit"}:
            print(farewell)
            break

        bot_answer, category, sim = bot.respond(user_input)
        if category:
            # optionally show the category (useful for debugging / explanation)
            print(f"Bot ({category}, sim={sim:.2f}): {bot_answer}")
        else:
            print(f"Bot: {bot_answer}")

In [None]:
# Start the interactive chat session when this cell is run.
# NOTE: chat_loop() is called without arguments, so it loads "train.csv" from
# the working directory rather than the train_path picked in the upload cell.
if __name__ == "__main__":
    chat_loop()

Loading data and building chatbot model...
Chatbot is ready! Type 'exit' or 'quit' to stop.

You: What is your name?
Bot (professional, sim=0.78): I don't have a name.
You: How old are you?
Bot (friendly, sim=0.70): I don't really have an age. 
You: exit
Bot: Goodbye! ðŸ‘‹
