In [None]:
from google.colab import files

uploaded = files.upload()  # pick your train.csv
# Fail with a clear message instead of a bare StopIteration when the
# returned mapping is empty (e.g. the upload dialog was dismissed).
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose your train.csv.")
# The mapping's keys are used as file names; if several files were
# uploaded, only the first one is used.
train_path = next(iter(uploaded))
print("Using train file:", train_path)

Saving train.csv to train.csv
Using train file: train.csv


In [None]:
uploaded = files.upload()  # pick your test.csv
# Fail with a clear message instead of a bare StopIteration when the
# returned mapping is empty (e.g. the upload dialog was dismissed).
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose your test.csv.")
# The mapping's keys are used as file names; if several files were
# uploaded, only the first one is used.
test_path = next(iter(uploaded))
print("Using test file:", test_path)

Saving test.csv to test.csv
Using test file: test.csv


# Overall idea
We already have a dataset (`train.csv`) containing:


*   A `Question` (what a user might say).
*   An `Answer` (how the bot could respond).


*   A `category` (type of response, like “witty”, “friendly”, etc.).
The chatbot does **not generate new sentences from scratch**. Instead, when the user types something:


*   It converts that text into a vector using **TF–IDF**.
*   It compares this vector to all vectors of training `combined_text` (Question + Answer).


*   It finds the **most similar** training example **using cosine similarity**.
*   It returns the **stored Answer** from that example.









In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# The `preprocess` function
What it does, step by step:


*   Type check: If `text` is not a string (maybe `NaN` or `None` from the CSV), we replace it with empty string `""` to avoid errors.
*   Lowercasing: This makes the matching case-insensitive: “Hello” and “hello” are treated the same.


*   Remove unwanted characters: Uses `re.sub` (regular expression) to keep only **a)** lowercase letters, **b)** digits, **c)** spaces, and **d)** some punctuation commonly used in chat (`' ? ! . : ,`). Everything else (emojis, weird symbols, etc.) is replaced with a space.
*   Normalize whitespace: **a)** `re.sub(r"\s+", " ", text)` collapses multiple spaces (or tabs/newlines) into a single space. **b)** `.strip()` removes any leading or trailing whitespace.

Result: a clean, normalized string that’s consistent for vectorization.



In [None]:
# ---------- 1. Preprocessing ----------
def preprocess(text: str) -> str:
    """
    Normalize raw chat text before vectorization.

    Non-string input (e.g. NaN or None coming from a CSV) is treated as an
    empty string. The text is lowercased, every run of characters other
    than a-z, 0-9, space and the punctuation ' ? ! . : , is replaced by a
    single space, and whitespace is then collapsed and trimmed.

    Returns the cleaned string (possibly empty).
    """
    cleaned = text.lower() if isinstance(text, str) else ""
    # Anything outside the allowed character set becomes a space.
    cleaned = re.sub(r"[^a-z0-9'?!.:, ]+", " ", cleaned)
    # Squash whitespace runs and drop leading/trailing spaces.
    return re.sub(r"\s+", " ", cleaned).strip()

## Loading the Data: `load_data`
* Read the CSV:
  * df = pd.read_csv(train_path) loads train.csv into a pandas DataFrame.
  * This DataFrame has at least `Question`, `Answer`, `category`.

* Create combined_text
  * df["Question"].fillna("") replaces missing questions with an empty string.

  * df["Answer"].fillna("") does the same for answers.

  * They’re concatenated with a space in between: Question + " " + Answer.

  * Why? Because we want the retrieval model to look at both the question and answer together when learning similarity. That way, if the user asks something like the original question, or something that resembles that Q/A pair, we can match it.

* Preprocess combined text

  * `.apply(preprocess)` runs the cleaning function over each combined string.

  * The result is stored in df["combined_text"].

So `load_data` returns a DataFrame where each row has:

* `Question`

* `Answer`

* `category`

* `combined_text` (cleaned “Question + Answer” string)

In [None]:
# ---------- 2. Load data ----------
def load_data(train_path: str = "train.csv"):
    """
    Read the training CSV and add a cleaned `combined_text` column.

    The CSV is expected to provide `Question` and `Answer` columns.
    Missing values in either are treated as empty strings; the two are
    joined with a single space and passed through `preprocess`.

    Returns the DataFrame with the extra `combined_text` column.
    """
    frame = pd.read_csv(train_path)

    questions = frame["Question"].fillna("")
    answers = frame["Answer"].fillna("")

    # One text field per row, used later as the retrieval document.
    frame["combined_text"] = (questions + " " + answers).apply(preprocess)
    return frame

## The `RetrievalChatbot` Class

What it sets up:

* Store training data

  * `self.train_df = train_df.reset_index(drop=True)`

  * `reset_index(drop=True)` ensures a clean 0..N-1 index.

* Hyperparameters

  * `min_sim`: If the best similarity is below this threshold, we consider the match “too weak” and instead use a generic fallback response.

  * `top_k`: Instead of always choosing the single best match, we pick randomly among the top k matches. This introduces slight variety in answers.

* TF–IDF Vectorizer

  * `self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))`

  * TF–IDF stands for Term Frequency – Inverse Document Frequency.

  * `ngram_range=(1, 2)` means we consider:

    * unigrams: single words like “hello”

    * bigrams: pairs of words like “good morning”

  * This helps capture small phrases as well as single words.

* Fitting the vectorizer

* `self.X_train = self.vectorizer.fit_transform(self.train_df["combined_text"])`

* `fit_transform`:

    * learns a vocabulary and IDF weights from `combined_text`

    * creates a sparse matrix representation of all `combined_text` rows.

* `self.X_train` is a matrix where:

    * each row corresponds to a training example,

    * each column is a TF–IDF feature for a specific word/phrase.

* Conversation history (optional)

  * `self.history = []` is a list of `(user_utterance, bot_answer)` pairs.

  * In this base version, we don’t use it for logic; it’s just there to keep a record.

---------------------------
### `_fallback_response`

  Purpose: handle cases where the model doesn’t find a good match. This avoids giving a completely irrelevant stored answer when we’re not confident in the similarity.

Steps:

  * Define a list of generic responses.

  * Use random.choice to return one of them.


---------------------------
### `get_best_match`
* Preprocess the user input

  * `processed = preprocess(user_input)`

  * Cleaned version of what the user typed.

* Edge case: empty text

  * If after preprocessing the string is empty (e.g., user typed only punctuation), we immediately return:

    * a polite message asking to rephrase

    * `category = None`

    * similarity `0.0`

* Vectorize user input

  * `user_vec = self.vectorizer.transform([processed])`

  * Uses the same TF‚ÄìIDF mapping as the training data:

    * creates a 1-row vector for this input.

* Compute cosine similarities

  * `sims = cosine_similarity(user_vec, self.X_train)[0]`

  * This produces an array of similarity scores:

    * length = number of training examples.

    * In general cosine similarity lies between -1 and 1, but because TF–IDF weights are never negative, the scores here range from 0 to 1.

  * Higher score = more similar.

* Find best match

  * `best_idx = int(np.argmax(sims))` gives index of highest similarity.

  * `best_sim = float(sims[best_idx])` gives that highest score.

* Optional: choose among top-k matches

  * If top_k > 1:

    * `sims.argsort()` returns indices sorted by similarity.

    * `[-self.top_k:]` takes the last `top_k` indices = top `k` highest scores.

    * `[::-1]` reverses them so they are in descending order.

    * `random.choice(top_indices)` picks one of these top candidates.

  * Otherwise (`top_k <= 1`), we just take the best one: `chosen_idx = best_idx`.

This introduces small randomness to avoid repeating exactly the same answer every time for similar inputs.

* Retrieve the data row

  * `row = self.train_df.iloc[chosen_idx]`

  * `answer = row["Answer"]`

  * `category = row.get("category", None)` gets the category if the column exists.

* Return values

  * `answer`: the text that the bot should say.

  * `category`: label like “witty”, “friendly”, etc. (useful for logging/debugging).

  * `best_sim`: the similarity of the best match (for trust/confidence).

---------------------------
### `respond`

This is what the chat loop calls directly.

Steps:

* Find best match

  * Calls `get_best_match(user_input)`.

  * Gets back `(answer, category, sim)`.

* Check similarity threshold

  * If `sim < self.min_sim`, then:

    * We decide the match isn't trustworthy.

    * Replace answer with a generic fallback using `_fallback_response`.

    * Set `category` to `None` because this is no longer a specific categorized answer from the dataset.

* Update conversation history

  * `self.history.append((user_input, answer))`

  * Appends the pair `(user_input, final_answer)`.

* Return:

  * Returns `answer`, `category`, and `sim` for the outer code to use (e.g. for printing).

---------------------------
### `chat_loop`
* Initialization

  * Prints a message.

  * `train_df = load_data("train.csv")` loads and preprocesses the data.

  * `bot = RetrievalChatbot(train_df, min_sim=0.15, top_k=3)` builds the model.

  * `min_sim=0.15` means if similarity is < 0.15, we use fallback responses.

  * `top_k=3` means we randomly pick among the top 3 best matches for variety.

* User prompt

  * Prints instructions to the user.

* Infinite loop for conversation

  * `user_input = input("You: ").strip()` reads user message from the terminal.

  * If the user types exit or quit (any case), the loop breaks and the program ends.

* Get bot response

  * `bot_answer, category, sim = bot.respond(user_input)`

  * If `sim` is high enough, this will be an answer from the dataset.

  * If `sim` is too low, bot.respond will have switched to a generic fallback.

* Print answer

  * If category is not `None`, we print debug info:

    * Bot (category, sim=0.xx): answer.

    * This is helpful for understanding what kind of response it chose and how confident it was.

  * Otherwise, we just print:

    * Bot: answer.


### Summary in One Sentence

The code builds a retrieval chatbot that:

* Reads example Q/A pairs from train.csv,

* Converts them into TF–IDF vectors,

* For each new user input, finds the most similar existing Q/A pair using cosine similarity,

* Returns the stored answer (or a fallback if similarity is too low),

* And runs this in a simple command-line chat loop.

In [None]:
# ---------- 3. Build retrieval model ----------
class RetrievalChatbot:
    """
    Retrieval chatbot: answers by returning the stored `Answer` of the
    training row whose TF-IDF vector is most similar to the user input.
    """

    def __init__(self, train_df: pd.DataFrame, min_sim: float = 0.15, top_k: int = 3):
        """
        train_df must have columns `Answer` and `combined_text`; a
        `category` column is used when present.
        min_sim: minimum cosine similarity to trust a retrieved answer.
        top_k: number of top candidates to choose from (adds some variety).
        """
        self.train_df = train_df.reset_index(drop=True)
        self.min_sim = min_sim
        self.top_k = top_k

        # Vectorize combined_text using TF-IDF (unigrams + bigrams)
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        self.X_train = self.vectorizer.fit_transform(self.train_df["combined_text"])

        # Optionally keep conversation history
        self.history = []  # list of (user_utterance, bot_answer)

    def _fallback_response(self, user_input: str) -> str:
        """
        Return one of a few generic replies, chosen at random.

        Used when similarity is too low to trust a retrieved answer.
        `user_input` is accepted for future use but is not read here.
        """
        generic_replies = [
            "I'm not completely sure how to answer that, but I'm listening.",
            "Interesting! Tell me more about that.",
            "That's a good question. What do you think?",
            "I don't have a perfect answer, but I'm here to chat!",
        ]
        return random.choice(generic_replies)

    def _choose_index(self, sims) -> int:
        """
        Pick the training-row index to answer with, given similarity scores.

        With top_k <= 1 this is simply the best-scoring row. Otherwise one
        of the top_k rows is picked at random, but only among rows whose own
        similarity is positive and at least `min_sim`. Without that filter a
        row with zero similarity could be returned whenever fewer than top_k
        rows actually match. If no row clears the threshold, the best row is
        returned and the caller's threshold check decides what to do.
        """
        sims = np.asarray(sims)
        best_idx = int(np.argmax(sims))
        if self.top_k <= 1:
            return best_idx

        # Highest scores first.
        top_indices = np.argsort(sims)[-self.top_k:][::-1]
        trusted = [
            int(i)
            for i in top_indices
            if sims[i] > 0.0 and sims[i] >= self.min_sim
        ]
        if not trusted:
            return best_idx
        return random.choice(trusted)

    def get_best_match(self, user_input: str):
        """
        Returns: (answer, category, best_similarity)

        `best_similarity` is the highest cosine similarity over all training
        rows. `answer` and `category` come from the row picked by
        `_choose_index`. A missing category is returned as None. If the
        input is empty after preprocessing, a rephrase prompt is returned
        with category None and similarity 0.0.
        """
        processed = preprocess(user_input)
        if not processed:
            return "Could you rephrase that?", None, 0.0

        user_vec = self.vectorizer.transform([processed])
        sims = cosine_similarity(user_vec, self.X_train)[0]

        best_sim = float(np.max(sims))
        chosen_idx = self._choose_index(sims)

        row = self.train_df.iloc[chosen_idx]
        answer = row["Answer"]
        category = row.get("category", None)
        # A missing CSV cell arrives as NaN, which is truthy and would be
        # displayed as "nan"; report it as "no category" instead.
        if pd.isna(category):
            category = None

        return answer, category, best_sim

    def respond(self, user_input: str) -> tuple:
        """
        High-level response function used in the chat loop.

        Returns: (answer, category, sim). When `sim` is below `min_sim` the
        answer is replaced by a generic fallback and category is None.
        The (user_input, answer) pair is appended to `self.history`.
        """
        answer, category, sim = self.get_best_match(user_input)

        if sim < self.min_sim:
            answer = self._fallback_response(user_input)
            category = None

        # store in history
        self.history.append((user_input, answer))

        return answer, category, sim


# ---------- 4. Simple command-line chat loop ----------
def chat_loop(train_path: str = "train.csv", min_sim: float = 0.15, top_k: int = 3):
    """
    Run an interactive chat session on standard input/output.

    train_path: CSV file passed to `load_data`.
    min_sim, top_k: forwarded to `RetrievalChatbot`.

    The loop ends when the user types 'exit' or 'quit' (any case), or when
    input is closed or interrupted. Blank lines are ignored.
    """
    print("Loading data and building chatbot model...")
    train_df = load_data(train_path)
    bot = RetrievalChatbot(train_df, min_sim=min_sim, top_k=top_k)

    print("Chatbot is ready! Type 'exit' or 'quit' to stop.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt: leave cleanly, not with a traceback.
            print("\nBot: Goodbye! 👋")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Bot: Goodbye! 👋")
            break
        if not user_input:
            # Nothing typed; prompt again.
            continue

        bot_answer, category, sim = bot.respond(user_input)
        if category:
            # optionally show the category (useful for debugging / explanation)
            print(f"Bot ({category}, sim={sim:.2f}): {bot_answer}")
        else:
            print(f"Bot: {bot_answer}")

In [None]:
# Start the interactive chat only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    chat_loop()

Loading data and building chatbot model...
Chatbot is ready! Type 'exit' or 'quit' to stop.

You: What is your name?
Bot (professional, sim=0.78): I don't have a name.
You: How old are you?
Bot (friendly, sim=0.70): I don't really have an age. 
You: exit
Bot: Goodbye! 👋
