# Install dependencies

In [1]:
# CELL 1 – Install dependencies (with full NLTK fix)

import importlib
import subprocess
import sys

# Third-party packages this notebook needs. They are installed one at a time
# so the progress line identifies which package pip is working on.
print("Installing required packages...")
packages = [
    "streamlit",
    "scikit-learn",
    "nltk",
    "pandas",
    "datasets",
    "sentence-transformers",
    "torch",
]

for pkg in packages:
    print(f"Installing {pkg}...")
    # check_call raises CalledProcessError when pip exits with a non-zero
    # status, which stops the cell at the package that failed.
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

# nltk is imported only now: importing it at the top of the cell fails on a
# fresh runtime where the package has not been installed yet. The import
# system caches directory listings, so refresh them before importing a
# package that was installed while this process was already running.
importlib.invalidate_caches()
import nltk

print("\nDownloading NLTK data (punkt, punkt_tab, stopwords)...")
failed_downloads = []
for resource in ("punkt_tab", "punkt", "stopwords"):
    # nltk.download returns a falsy value when the resource could not be
    # fetched; remember it so the banner below does not claim success.
    if not nltk.download(resource, quiet=False):
        failed_downloads.append(resource)

print("\n======================================")
if failed_downloads:
    print(f"✗ Could not download NLTK data: {', '.join(failed_downloads)}")
else:
    print("✓ All packages and NLTK data installed successfully!")
print("======================================")


Installing required packages...
Installing streamlit...
Installing scikit-learn...
Installing nltk...
Installing pandas...
Installing datasets...
Installing sentence-transformers...
Installing torch...

Downloading NLTK data (punkt, punkt_tab, stopwords)...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



✓ All packages and NLTK data installed successfully!


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# NLTK FIX for Colab – run once

import nltk

# Download the three NLTK resources named in the banner, framed by divider
# lines so the section stands out in the cell output.
divider = "=" * 60

print("Downloading NLTK tokenizers (punkt + punkt_tab + stopwords)...")
print(divider)

for resource in ("punkt_tab", "punkt", "stopwords"):
    nltk.download(resource, quiet=False)

print(divider)
print("✓ NLTK tokenizers installed correctly on Colab!")
print(divider)


Downloading NLTK tokenizers (punkt + punkt_tab + stopwords)...
✓ NLTK tokenizers installed correctly on Colab!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Imports

In [3]:
# CELL 2 – Imports

import streamlit as st
import pandas as pd
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from collections import defaultdict
import warnings
# Install a blanket "ignore" filter: with no category or module given, every
# warning raised after this point is suppressed (presumably to keep the cell
# output readable).
warnings.filterwarnings("ignore")

# Reaching this line means every import above succeeded.
print("✓ All libraries imported successfully!")




✓ All libraries imported successfully!


# Text Analyzer (English only)

In [4]:
# CELL 3 – English-only text analyzer

class EnglishTextAnalyzer:
    """Analyze English user queries: intent + sentiment + keywords.

    Intent and sentiment are rule based. A keyword counts only when it
    appears as a whole word or phrase (optionally followed by a simple
    inflectional ending), so "ship" no longer fires on "relationship" and
    "late" no longer fires on "later".
    """

    # Keywords per intent. Dict order matters: on a tie the intent listed
    # first wins, because a later intent must score strictly higher.
    _INTENT_KEYWORDS = {
        "track_order": ["track", "tracking", "where is my order", "order status"],
        "return_policy": ["return", "refund", "exchange", "money back"],
        "shipping": ["shipping", "delivery", "ship", "arrive"],
        "account": ["account", "login", "password", "sign in"],
        "payment": ["pay", "payment", "card", "credit", "billing"],
        "complaint": ["angry", "upset", "bad", "problem", "issue"],
    }
    _NEGATIVE_WORDS = ["angry", "upset", "bad", "terrible", "hate", "problem", "issue", "late"]
    _POSITIVE_WORDS = ["great", "good", "amazing", "thanks", "thank you", "love"]

    # Endings tolerated directly after a keyword ("return" -> "returns",
    # "returned", "returning"). Anything else must be listed explicitly.
    _SUFFIXES = r"(?:s|es|d|ed|ing)?"

    def __init__(self):
        self.stopwords_en = set(stopwords.words("english"))

    @classmethod
    def _count_hits(cls, text, keywords):
        """Return how many of *keywords* occur in *text* as whole words.

        *text* is expected to be lower-cased already. Each keyword counts at
        most once, no matter how often it occurs.
        """
        return sum(
            1
            for kw in keywords
            if re.search(rf"\b{re.escape(kw)}{cls._SUFFIXES}\b", text)
        )

    def analyze_intent(self, text: str):
        """Detect the query intent with simple keyword rules.

        Returns:
            A dict with "type" (one of the intent names, or "general" when
            no keyword matched) and "confidence": 0.9 for two or more
            matched keywords, 0.6 for exactly one, 0.4 for none.
        """
        lowered = text.lower()

        best_intent = "general"
        best_score = 0
        for intent, keywords in self._INTENT_KEYWORDS.items():
            score = self._count_hits(lowered, keywords)
            if score > best_score:
                best_intent, best_score = intent, score

        if best_score >= 2:
            confidence = 0.9
        elif best_score == 1:
            confidence = 0.6
        else:
            confidence = 0.4

        return {"type": best_intent, "confidence": confidence}

    def analyze_sentiment(self, text: str):
        """Classify sentiment by comparing negative and positive keyword hits.

        Returns:
            A dict with "type" ("negative", "positive" or "neutral") and a
            fixed "polarity" of -0.6, 0.6 or 0.0 respectively. Equal counts
            (including zero hits on both sides) are neutral.
        """
        lowered = text.lower()
        neg_score = self._count_hits(lowered, self._NEGATIVE_WORDS)
        pos_score = self._count_hits(lowered, self._POSITIVE_WORDS)

        if neg_score > pos_score:
            return {"type": "negative", "polarity": -0.6}
        if pos_score > neg_score:
            return {"type": "positive", "polarity": 0.6}
        return {"type": "neutral", "polarity": 0.0}

    def extract_keywords(self, text: str, max_keywords: int = 8):
        """Return up to *max_keywords* most frequent content words of *text*.

        The text is lower-cased, every character other than ASCII letters,
        digits and whitespace is replaced by a space, and the result is
        tokenized. Stop words and tokens of one or two characters are
        dropped. Words with equal counts keep their first-seen order.
        """
        # Local import: the file's import cell only brings in defaultdict.
        from collections import Counter

        clean = re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())
        tokens = [
            tok
            for tok in word_tokenize(clean)
            if tok not in self.stopwords_en and len(tok) > 2
        ]
        return [word for word, _ in Counter(tokens).most_common(max_keywords)]

    def analyze(self, text: str):
        """Run intent, sentiment and keyword analysis and bundle the results."""
        return {
            "intent": self.analyze_intent(text),
            "sentiment": self.analyze_sentiment(text),
            "keywords": self.extract_keywords(text),
        }


print("✓ EnglishTextAnalyzer defined")


✓ EnglishTextAnalyzer defined


# Response Generator (English only)

In [5]:
# CELL 4 – Context‑aware English response generator

class EnglishResponseGenerator:
    """Prepend an intent/sentiment-dependent lead-in to a base answer and
    report the tone that was chosen for it."""

    def __init__(self, analyzer: EnglishTextAnalyzer):
        self.analyzer = analyzer

    def _build_prefix(self, analysis):
        """Pick the opening sentence for the reply from the analysis dict."""
        intent_type = analysis["intent"]["type"]
        mood = analysis["sentiment"]["type"]

        # Complaints and negative sentiment take priority over every
        # intent-specific lead-in.
        if intent_type == "complaint" or mood == "negative":
            return "We are really sorry for the inconvenience. "

        lead_ins = (
            ("track_order", "Here is how you can check your order status: "),
            ("return_policy", "Here is an overview of our return policy: "),
        )
        for name, lead_in in lead_ins:
            if intent_type == name:
                return lead_in
        return "Here is the information you requested: "

    def determine_tone(self, analysis):
        """Map the analysis to "empathetic", "informative" or "neutral"."""
        mood = analysis["sentiment"]["type"]
        intent_type = analysis["intent"]["type"]

        if intent_type == "complaint" or mood == "negative":
            tone = "empathetic"
        elif intent_type in ("track_order", "return_policy", "shipping"):
            tone = "informative"
        else:
            tone = "neutral"
        return tone

    def generate(self, base_answer: str, analysis):
        """Return ``(prefixed answer, tone)`` for *base_answer*."""
        reply = self._build_prefix(analysis) + base_answer
        return reply, self.determine_tone(analysis)


print("✓ EnglishResponseGenerator defined")


✓ EnglishResponseGenerator defined


# Load English FAQ data

In [6]:
# CELL 5 – Load English FAQ data (WebFAQ subset or fallback)

def _fallback_english_faqs():
    """Return the small handcrafted FAQ set used when WebFAQ is unavailable."""
    return [
        {
            "question_en": "How can I track my order?",
            "answer_en": "Log in to your account, open the 'Your Orders' page, and click on the order to see detailed tracking information.",
            "category": "Shipping",
            "source": "fallback",
        },
        {
            "question_en": "What is your return policy?",
            "answer_en": "You can return most items within 30 days of delivery as long as they are in their original condition.",
            "category": "Returns",
            "source": "fallback",
        },
        {
            "question_en": "How long does shipping take?",
            "answer_en": "Standard shipping usually takes between 5 and 7 business days.",
            "category": "Shipping",
            "source": "fallback",
        },
        {
            "question_en": "How do I reset my password?",
            "answer_en": "Click on 'Forgot password' on the login page and follow the instructions sent to your email.",
            "category": "Account",
            "source": "fallback",
        },
        {
            "question_en": "What payment methods do you accept?",
            "answer_en": "We accept major credit cards, debit cards, and PayPal.",
            "category": "Payment",
            "source": "fallback",
        },
    ]


def load_english_faqs(max_rows: int = 20000):
    """Load a real English FAQ subset; fallback to small handcrafted set.

    Args:
        max_rows: Upper bound on the number of dataset rows requested from
            the "train" split.

    Returns:
        A non-empty list of dicts with the keys "question_en", "answer_en",
        "category" and "source". The handcrafted set is returned when the
        dataset cannot be loaded or when no row survives filtering.
    """
    # NOTE(review): the run recorded with this notebook failed on the call
    # below with "BuilderConfig 'eng' not found"; the English configs it
    # listed are 'eng-qrels', 'eng-corpus' and 'eng-queries'. Until the
    # loader is ported to those configs this path ends in the fallback.
    try:
        print("Loading WebFAQ English subset from Hugging Face...")
        ds = load_dataset("PaDaS-Lab/webfaq-retrieval", "eng", split=f"train[:{max_rows}]")
        faqs = []

        for row in ds:
            query = str(row.get("query", "")).strip()
            positives = row.get("positive_passages", [])
            if not query or not positives:
                continue

            # Only the first positive passage is used as the answer.
            passage = positives[0]
            if isinstance(passage, dict):
                ans = str(passage.get("passage_text", "")).strip()
            else:
                ans = str(passage).strip()

            # Skip pairs that are too short to be a useful question/answer.
            if len(query) < 8 or len(ans) < 20:
                continue

            faqs.append(
                {
                    "question_en": query,
                    "answer_en": ans,
                    "category": row.get("category", "General"),
                    "source": "WebFAQ-eng",
                }
            )
    except Exception as e:
        # Deliberate best effort: any loading or parsing problem degrades to
        # the handcrafted FAQ set instead of aborting the notebook.
        print(f"Failed to load WebFAQ, using fallback. Error: {e}")
        return _fallback_english_faqs()

    if not faqs:
        # An empty list would leave callers with nothing to match against.
        print("No usable FAQs found in WebFAQ, using fallback.")
        return _fallback_english_faqs()

    print(f"✓ Loaded {len(faqs)} English FAQs from WebFAQ")
    return faqs


# Load the FAQ list once and bind it to the global `faqs_data`, then report
# how many entries were obtained (dataset rows or the handcrafted fallback).
print("Loading English FAQs...")
faqs_data = load_english_faqs(max_rows=20000)
print(f"✓ Total English FAQs: {len(faqs_data)}")


Loading English FAQs...
Loading WebFAQ English subset from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

Failed to load WebFAQ, using fallback. Error: BuilderConfig 'eng' not found. Available: ['ara-qrels', 'ara-corpus', 'ara-queries', 'aze-qrels', 'aze-corpus', 'aze-queries', 'ben-qrels', 'ben-corpus', 'ben-queries', 'bul-qrels', 'bul-corpus', 'bul-queries', 'cat-qrels', 'cat-corpus', 'cat-queries', 'ces-qrels', 'ces-corpus', 'ces-queries', 'dan-qrels', 'dan-corpus', 'dan-queries', 'deu-qrels', 'deu-corpus', 'deu-queries', 'ell-qrels', 'ell-corpus', 'ell-queries', 'eng-qrels', 'eng-corpus', 'eng-queries', 'est-qrels', 'est-corpus', 'est-queries', 'fas-qrels', 'fas-corpus', 'fas-queries', 'fin-qrels', 'fin-corpus', 'fin-queries', 'fra-qrels', 'fra-corpus', 'fra-queries', 'heb-qrels', 'heb-corpus', 'heb-queries', 'hin-qrels', 'hin-corpus', 'hin-queries', 'hrv-qrels', 'hrv-corpus', 'hrv-queries', 'hun-qrels', 'hun-corpus', 'hun-queries', 'ind-qrels', 'ind-corpus', 'ind-queries', 'isl-qrels', 'isl-corpus', 'isl-queries', 'ita-qrels', 'ita-corpus', 'ita-queries', 'jpn-qrels', 'jpn-corpus', 'j

# BERT-based English FAQ matcher

In [7]:
# CELL 6 – BERT-based English matcher (English only)

class EnglishBERTFAQMatcher:
    """Match user questions to FAQ entries by Sentence-BERT similarity (English only)."""

    def __init__(self, faqs, analyzer: EnglishTextAnalyzer, generator: EnglishResponseGenerator):
        self.faqs = faqs
        self.analyzer = analyzer
        self.generator = generator

        print("Loading Sentence-BERT model for English (all-MiniLM-L6-v2)...")
        model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.model = SentenceTransformer(model_name)
        print("✓ Model loaded")

        # Embed the FAQ questions right away so lookups only encode the query.
        self._build_index()

    def _build_index(self):
        """Embed every FAQ question and keep the result in ``self.embeddings``."""
        faq_questions = []
        for entry in self.faqs:
            faq_questions.append(entry["question_en"])

        print(f"Encoding {len(faq_questions)} FAQ questions...")
        self.embeddings = self.model.encode(
            faq_questions,
            convert_to_tensor=True,
            show_progress_bar=True,
        )
        print("✓ FAQ embeddings ready")

        # Purely informational: reports whether CUDA is available to torch.
        device_note = "✓ Using GPU acceleration" if torch.cuda.is_available() else "✓ Using CPU"
        print(device_note)

    def _find_best(self, user_question: str):
        """Return ``(index, confidence)`` of the FAQ most similar to the question."""
        question_vec = self.model.encode(user_question, convert_to_tensor=True)
        similarity_row = util.pytorch_cos_sim(question_vec, self.embeddings)[0]
        top_idx = int(torch.argmax(similarity_row).item())
        top_score = float(similarity_row[top_idx])

        # The similarity can be negative; clamp it to [0, 1] for display.
        return top_idx, max(0.0, min(top_score, 1.0))

    def get_intelligent_answer(self, user_question: str):
        """Analyze the question, pick the closest FAQ and build the reply dict."""
        analysis = self.analyzer.analyze(user_question)
        top_idx, confidence = self._find_best(user_question)
        matched = self.faqs[top_idx]

        reply, tone = self.generator.generate(matched.get("answer_en", ""), analysis)

        response = {}
        response["answer"] = reply
        response["confidence"] = confidence
        response["category"] = matched.get("category", "General")
        response["analysis"] = analysis
        response["tone"] = tone
        response["matched_question"] = matched.get("question_en", "")
        response["source"] = matched.get("source", "unknown")
        return response


print("✓ EnglishBERTFAQMatcher defined")


✓ EnglishBERTFAQMatcher defined


# Initialize system

In [8]:
# CELL 7 – Initialize full English system

# Wire the pipeline together: analyzer -> response generator -> matcher.
# The resulting globals (`analyzer`, `generator`, `matcher`) are what the
# following cells use.
print("\n======================================")
print("INITIALIZING ENGLISH FAQ SYSTEM")
print("======================================")

analyzer = EnglishTextAnalyzer()
print("✓ EnglishTextAnalyzer initialized")

# The generator keeps a reference to the analyzer it is given.
generator = EnglishResponseGenerator(analyzer)
print("✓ EnglishResponseGenerator initialized")

# Constructing the matcher loads the embedding model and encodes all FAQ
# questions, so this is the slow step of the cell.
matcher = EnglishBERTFAQMatcher(faqs_data, analyzer, generator)
print("✓ EnglishBERTFAQMatcher initialized")

# Summary banner with the number of FAQs handed to the matcher.
print("\n======================================")
print("✓ SYSTEM READY (ENGLISH ONLY)")
print(f"Total FAQs: {len(faqs_data)}")
print("Model: all-MiniLM-L6-v2 (Sentence-BERT)")
print("======================================")



INITIALIZING ENGLISH FAQ SYSTEM
✓ EnglishTextAnalyzer initialized
✓ EnglishResponseGenerator initialized
Loading Sentence-BERT model for English (all-MiniLM-L6-v2)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Model loaded
Encoding 5 FAQ questions...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✓ FAQ embeddings ready
✓ Using CPU
✓ EnglishBERTFAQMatcher initialized

✓ SYSTEM READY (ENGLISH ONLY)
Total FAQs: 5
Model: all-MiniLM-L6-v2 (Sentence-BERT)


# Test (English only)

In [9]:
# CELL 8 – Quick English tests

def _preview(text, limit):
    """Return *text* cut to *limit* characters, adding "..." only when cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


# Smoke test: send a handful of representative questions through the full
# pipeline and print what was matched and how the reply was worded.
print("\n" + "=" * 80)
print("ENGLISH FAQ CHATBOT – TESTS")
print("=" * 80)

test_queries = [
    "How can I track my order?",
    "My order is late and I am very upset!",
    "What is your return policy?",
    "How long does shipping usually take?",
    "How can I reset my password?",
]

for q in test_queries:
    print("\n----------------------------------------")
    print(f"Question: {q}")
    result = matcher.get_intelligent_answer(q)
    # The ellipsis now appears only when the text was really shortened;
    # before, it was appended to complete questions and answers as well.
    print(f"Matched FAQ: {_preview(result['matched_question'], 80)}")
    print(f"Confidence: {result['confidence']:.1%}")
    print(f"Tone: {result['tone']}")
    print(f"Answer: {_preview(result['answer'], 200)}")

print("\n" + "=" * 80)
print("✓ TESTS COMPLETED")
print("=" * 80)



ENGLISH FAQ CHATBOT – TESTS

----------------------------------------
Question: How can I track my order?
Matched FAQ: How can I track my order?...
Confidence: 100.0%
Tone: informative
Answer: Here is how you can check your order status: Log in to your account, open the 'Your Orders' page, and click on the order to see detailed tracking information....

----------------------------------------
Question: My order is late and I am very upset!
Matched FAQ: How can I track my order?...
Confidence: 45.6%
Tone: empathetic
Answer: We are really sorry for the inconvenience. Log in to your account, open the 'Your Orders' page, and click on the order to see detailed tracking information....

----------------------------------------
Question: What is your return policy?
Matched FAQ: What is your return policy?...
Confidence: 100.0%
Tone: informative
Answer: Here is an overview of our return policy: You can return most items within 30 days of delivery as long as they are in their original conditi