In [1]:
from google.colab import files

uploaded = files.upload()  # pick your train.csv
train_path = next(iter(uploaded.keys()))
print("Using train file:", train_path)

Saving train.csv to train.csv
Using train file: train.csv


In [2]:
uploaded = files.upload()  # pick your test.csv
test_path = next(iter(uploaded.keys()))
print("Using test file:", test_path)

Saving test.csv to test.csv
Using test file: test.csv


In [5]:
import json
import random
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# ==========================
# 1. Preprocessing
# ==========================

def preprocess(text: str) -> str:
    """
    Basic text cleaning: lowercase, remove weird chars, normalize spaces.
    """
    if not isinstance(text, str):
        text = ""
    text = text.lower()
    # keep letters, digits, and common punctuation
    text = re.sub(r"[^a-z0-9'?!.:, ]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# ==========================
# 2. Data loading
# ==========================

def load_train_data(path: str = "train.csv") -> pd.DataFrame:
    """
    Load train.csv and create a 'combined_text' column for retrieval.
    IMPORTANT CHANGE: Use ONLY Question text for retrieval to reduce bias
    toward generic answers.
    """
    df = pd.read_csv(path)

    required_cols = {"Question", "Answer"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"train.csv must contain columns: {required_cols}")

    df["Question"] = df["Question"].fillna("")
    df["Answer"] = df["Answer"].fillna("")

    # Use only the question text for retrieval
    df["combined_text"] = df["Question"].apply(preprocess)

    return df


# ==========================
# 3. Base retrieval chatbot
# ==========================

class RetrievalChatbot:
    """
    A retrieval-based chatbot that:
    - learns from train.csv Q/A pairs
    - retrieves the most similar past example using TFâ€“IDF + cosine similarity
    - avoids repeating very recent answers for a given user
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        min_sim: float = 0.15,
        max_candidates: int = 100,
        max_history_turns: int = 3,
        repeat_window: int = 5,
    ):
        """
        train_df must have:
        - Question
        - Answer
        - category (optional)
        - combined_text (preprocessed Question)

        min_sim: minimum similarity to trust retrieval. Below this â†’ fallback.
        max_candidates: how many top candidates to consider before choosing.
        max_history_turns: number of past turns used as short-term context.
        repeat_window: how many last bot answers to avoid repeating.
        """
        self.train_df = train_df.reset_index(drop=True)
        self.min_sim = min_sim
        self.max_candidates = max_candidates
        self.max_history_turns = max_history_turns
        self.repeat_window = repeat_window

        # Vectorizer for retrieval (on combined_text = Question-only)
        self.vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            stop_words="english",
            min_df=2,
            max_df=0.95,
        )
        self.X_train = self.vectorizer.fit_transform(self.train_df["combined_text"])

        # history: dict[user_id] = list of dicts: {"user": str, "bot": str}
        self.histories: Dict[str, List[Dict[str, str]]] = {}

    # ---------- history helpers ----------

    def _get_history(self, user_id: str) -> List[Dict[str, str]]:
        return self.histories.get(user_id, [])

    def _append_history(self, user_id: str, user_msg: str, bot_msg: str) -> None:
        hist = self.histories.setdefault(user_id, [])
        hist.append({"user": user_msg, "bot": bot_msg})

    def _build_context(self, user_id: str) -> str:
        """
        Turn last few exchanges into a short context string.
        """
        recent = self._get_history(user_id)[-self.max_history_turns :]
        pieces = []
        for turn in recent:
            pieces.append(f"user: {turn['user']}")
            pieces.append(f"bot: {turn['bot']}")
        return " ".join(pieces)

    def _recent_answers_set(self, user_id: str) -> set:
        """
        Collect last few bot answers to avoid repeating them.
        """
        hist = self._get_history(user_id)
        recent = hist[-self.repeat_window :]
        return {turn["bot"] for turn in recent}

    # ---------- generic replies ----------

    def _fallback_response(self, user_input: str) -> str:
        """
        Generic replies when similarity is too low.
        """
        generic_replies = [
            "I'm not completely sure how to answer that, but I'm listening.",
            "Interesting! Tell me more about that.",
            "That's a good question. What do you think?",
            "I'm not sure, but I'm happy to chat about it!",
        ]
        return random.choice(generic_replies)

    # ---------- retrieval core ----------

    def _retrieve_best_match(
        self,
        contextualized_input: str,
        user_id: str,
    ) -> Tuple[str, Optional[str], float]:
        """
        Retrieve a good, non-recent answer from train_df.
        Returns (answer, category, best_similarity).
        """
        processed = preprocess(contextualized_input)
        if not processed:
            return "Could you rephrase that?", None, 0.0

        user_vec = self.vectorizer.transform([processed])
        sims = cosine_similarity(user_vec, self.X_train)[0]

        # sort indices by similarity (descending)
        ranked_indices = sims.argsort()[::-1]

        # consider only the top max_candidates
        ranked_indices = ranked_indices[: self.max_candidates]

        recent_answers = self._recent_answers_set(user_id)
        chosen_idx = None
        chosen_sim = 0.0

        # Choose first highly similar answer that isn't recently used
        for idx in ranked_indices:
            sim = float(sims[idx])
            candidate_answer = self.train_df.iloc[idx]["Answer"]

            if sim < self.min_sim:
                # beyond this point, all sims will be lower
                break

            if candidate_answer in recent_answers:
                # skip recently used answers
                continue

            chosen_idx = int(idx)
            chosen_sim = sim
            break

        # If we didn't find a non-recent candidate above min_sim,
        # fall back to the single best match (even if repeated),
        # so we don't always fallback to generic.
        if chosen_idx is None:
            best_idx = int(ranked_indices[0])
            chosen_idx = best_idx
            chosen_sim = float(sims[best_idx])

        row = self.train_df.iloc[chosen_idx]
        answer = row["Answer"]
        category = row.get("category", None)

        return answer, category, chosen_sim

    def respond(
        self,
        user_input: str,
        user_id: str = "default",
    ) -> Tuple[str, Optional[str], float]:
        """
        Main response function.
        Uses short-term conversation context + retrieval + repetition control.
        """
        context = self._build_context(user_id)
        if context:
            contextualized_input = context + " user: " + user_input
        else:
            contextualized_input = user_input

        answer, category, sim = self._retrieve_best_match(contextualized_input, user_id)

        if sim < self.min_sim:
            # If even our chosen candidate is low-sim, go generic
            answer = self._fallback_response(user_input)
            category = None

        self._append_history(user_id, user_input, answer)
        return answer, category, sim


# ==========================
# 4. Personalization: user profiles & tone
# ==========================

class PersonalizedRetrievalChatbot(RetrievalChatbot):
    """
    Extension of RetrievalChatbot with:
    - long-term user profiles stored in JSON
    - tone personalization (friendly / enthusiastic / formal / witty)
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        profile_path: str = "profiles.json",
        **kwargs,
    ):
        super().__init__(train_df, **kwargs)
        self.profile_path = Path(profile_path)
        self.user_profiles: Dict[str, Dict] = self._load_profiles()

    # ---------- profiles ----------

    def _load_profiles(self) -> Dict[str, Dict]:
        if self.profile_path.exists():
            try:
                return json.loads(self.profile_path.read_text(encoding="utf-8"))
            except Exception:
                return {}
        return {}

    def _save_profiles(self) -> None:
        self.profile_path.write_text(
            json.dumps(self.user_profiles, indent=2), encoding="utf-8"
        )

    def get_profile(self, user_id: str) -> Dict:
        return self.user_profiles.get(user_id, {})

    def update_profile(self, user_id: str, **updates) -> None:
        profile = self.user_profiles.get(user_id, {})
        profile.update(updates)
        self.user_profiles[user_id] = profile
        self._save_profiles()

    # ---------- tone shaping ----------

    def _apply_tone(self, answer: str, tone: Optional[str]) -> str:
        """
        Simple tone adjustment.
        """
        if not tone:
            return answer

        tone = tone.lower()
        if tone == "friendly":
            return answer + " ðŸ˜Š"
        elif tone == "enthusiastic":
            return answer + " ðŸŽ‰"
        elif tone == "formal":
            if not answer.lower().startswith(("certainly", "of course", "indeed")):
                return "Certainly. " + answer
            return answer
        elif tone == "witty":
            extras = [
                " (At least that's what my circuits think.)",
                " (Spoken like a true chatbot philosopher.)",
                " I promise I didn't just make that up. Well, maybe a little.",
            ]
            return answer + random.choice(extras)
        else:
            return answer

    # ---------- respond override ----------

    def respond(
        self,
        user_input: str,
        user_id: str = "default",
    ) -> Tuple[str, Optional[str], float]:
        """
        Personalized response:
        - uses history/context + retrieval + repetition control
        - then adjusts tone based on user profile
        """
        profile = self.get_profile(user_id)
        preferred_tone = profile.get("preferred_tone")

        answer, category, sim = super().respond(user_input, user_id=user_id)

        answer = self._apply_tone(answer, preferred_tone)

        # Replace last history bot text with toned version
        history = self._get_history(user_id)
        if history:
            history[-1]["bot"] = answer

        return answer, category, sim


# ==========================
# 5. Simple CLI chat loop
# ==========================

def chat_loop():
    print("Loading training data...")
    train_df = load_train_data("train.csv")

    print("Building personalized retrieval chatbot...")
    bot = PersonalizedRetrievalChatbot(
        train_df,
        profile_path="profiles.json",
        min_sim=0.15,
        max_candidates=100,
        max_history_turns=3,
        repeat_window=5,
    )

    # --- user onboarding / personalization ---
    print("\nWelcome! Let's personalize your experience a bit.")
    user_id = input("Choose a username (or press Enter for 'default'): ").strip()
    if not user_id:
        user_id = "default"

    profile = bot.get_profile(user_id)
    if profile:
        print(f"\nWelcome back, {profile.get('name', user_id)}!")
        if "preferred_tone" in profile:
            print(f"Your current preferred tone is: {profile['preferred_tone']}")
    else:
        # New profile
        name = input("What should I call you? (name or nickname): ").strip()
        if not name:
            name = user_id
        print("\nHow would you like me to talk to you?")
        print("Options: friendly / enthusiastic / formal / witty (or leave blank)")
        tone = input("Preferred tone: ").strip().lower() or None

        bot.update_profile(user_id, name=name, preferred_tone=tone)
        print(f"\nNice to meet you, {name}!")

    print("\nChatbot is ready! Type 'exit' or 'quit' to stop.")
    print("-" * 50)

    # --- main loop ---
    while True:
        user_input = input(f"{user_id}: ").strip()
        if user_input.lower() in {"exit", "quit"}:
            print("Bot: Goodbye! ðŸ‘‹")
            break

        answer, category, sim = bot.respond(user_input, user_id=user_id)

        if category:
            print(f"Bot ({category}, sim={sim:.2f}): {answer}")
        else:
            print(f"Bot: {answer}")

In [None]:
if __name__ == "__main__":
    chat_loop()

Loading training data...
Building personalized retrieval chatbot...

Welcome! Let's personalize your experience a bit.
Choose a username (or press Enter for 'default'): John
What should I call you? (name or nickname): Johnny

How would you like me to talk to you?
Options: friendly / enthusiastic / formal / witty (or leave blank)
Preferred tone: witty

Nice to meet you, Johnny!

Chatbot is ready! Type 'exit' or 'quit' to stop.
--------------------------------------------------
John: Hi!
Bot (caring, sim=1.00): Hello there! (Spoken like a true chatbot philosopher.)
John: How was your day?
Bot (professional, sim=0.64): Hello. (At least that's what my circuits think.)
John: How old are you?
Bot (witty, sim=0.57): Hey. (Spoken like a true chatbot philosopher.)
