<a href="https://colab.research.google.com/github/cn8972/Echo-Bot/blob/main/AIIntegratedChatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import hashlib
import json
import re
from pathlib import Path
from typing import List, Dict

from bs4 import BeautifulSoup
from pypdf import PdfReader
from tqdm import tqdm

import numpy as np
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import pickle

# ✅ Use your absolute Windows path to the data_raw folder
DATA_RAW = Path(r"C:\Users\cn897\OneDrive\Nobles Research Portfolio\Master in Artificial Intelligence\MSAI 631 Artificial Intelligence for Human-Computer Interaction\HFCHATBOT\data_raw")

DATA_PROCESSED = Path("data_processed")
INDEX_DIR = Path("index")

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 900          # words
CHUNK_OVERLAP = 140       # words

def _read_txt(path: Path) -> str:
    return path.read_text(encoding="utf-8", errors="ignore")

def _read_html(path: Path) -> str:
    html = path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()
    text = soup.get_text("\n")
    return re.sub(r"\n{2,}", "\n", text)

def _read_pdf(path: Path) -> str:
    reader = PdfReader(str(path))
    pages = []
    for page in reader.pages:
        try:
            pages.append(page.extract_text() or "")
        except Exception:
            pages.append("")
    return "\n".join(pages)

def _normalize(text: str) -> str:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

def _file_id(path: Path) -> str:
    digest = hashlib.sha256(path.read_bytes()).hexdigest()[:16]
    return f"{path.name}::{digest}"

def _chunk(text: str, source_id: str) -> List[Dict]:
    words = text.split()
    chunks: List[Dict] = []
    start = 0
    while start < len(words):
        end = min(len(words), start + CHUNK_SIZE)
        payload = " ".join(words[start:end])
        chunks.append({"source_id": source_id, "chunk_id": len(chunks), "text": payload})
        if end == len(words):
            break
        start = end - CHUNK_OVERLAP
    return chunks

def load_docs() -> List[Dict]:
    docs: List[Dict] = []
    for p in DATA_RAW.rglob("*"):
        if p.is_dir():
            continue
        ext = p.suffix.lower()
        if ext in {".txt", ".md"}:
            raw = _read_txt(p)
        elif ext in {".html", ".htm"}:
            raw = _read_html(p)
        elif ext == ".pdf":
            raw = _read_pdf(p)
        else:
            continue
        sid = _file_id(p)
        clean = _normalize(raw)
        docs.extend(_chunk(clean, sid))
    return docs

def build_indexes(chunks: List[Dict]) -> None:
    DATA_PROCESSED.mkdir(exist_ok=True)
    INDEX_DIR.mkdir(exist_ok=True)

    with open(DATA_PROCESSED / "chunks.jsonl", "w", encoding="utf-8") as f:
        for c in chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")

    tokenized = [c["text"].split() for c in chunks]
    bm25 = BM25Okapi(tokenized)
    with open(INDEX_DIR / "bm25.pkl", "wb") as f:
        pickle.dump({"bm25": bm25, "docs": chunks}, f)

    model = SentenceTransformer(EMB_MODEL)
    corpus = [c["text"] for c in chunks]
    embeddings = model.encode(corpus, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
    embeddings = embeddings.astype(np.float32)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
    with open(INDEX_DIR / "docs.pkl", "wb") as f:
        pickle.dump(chunks, f)

def main() -> None:
    if not DATA_RAW.exists():
        print(f"DATA_RAW path does not exist: {DATA_RAW}")
        return
    docs = load_docs()
    if not docs:
        print("No ingestable files found in data_raw/.")
        return
    print(f"Building indexes for {len(docs)} chunks…")
    build_indexes(docs)
    print("Ingest complete.")

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'pypdf'

In [9]:
import os

# ✅ Set environment variables manually for Colab
os.environ["MicrosoftAppId"] = "your-app-id-here"
os.environ["MicrosoftAppPassword"] = "your-app-password-here"
os.environ["MicrosoftAIServicesEndpoint"] = "https://t6languageservicev1.cognitiveservices.azure.com/"
os.environ["MicrosoftAIServicesKey"] = "your-api-key-here"

# ⚙️ Configuration class
class DefaultConfig():
    # Bot Configuration
    PORT = 3978
    APP_ID = os.environ.get("MicrosoftAppId", "")
    APP_PASSWORD = os.environ.get("MicrosoftAppPassword", "")
    # Added to support interaction with Azure AI language API
    ENDPOINT_URI = os.environ.get("MicrosoftAIServicesEndpoint", "")
    API_KEY = os.environ.get("MicrosoftAIServicesKey", "")

# app.py

# From DOCS EchoBot example
from bots import EchoBot
from config import DefaultConfig

# Azure Cognitive Services imports (from Microsoft Docs)
# https://learn.microsoft.com/en-us/python/api/overview/azure/ai-textanalytics-readme
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient


# Load configuration (keys, endpoints, etc.)
CONFIG = DefaultConfig()

# Extended for T6 Project in MSAI 631 - Adding sentiment analysis to the bot
credential = AzureKeyCredential(CONFIG.API_KEY)
endpoint = CONFIG.ENDPOINT_URI
text_analytics_client = TextAnalyticsClient(
    endpoint=endpoint,
    credential=credential
)
# End extension for T6 Project


# -------------------------------
# Listen for incoming messages
# -------------------------------
from fastapi import APIRouter, Request
from fastapi.responses import Response
from starlette import status

from botbuilder.schema import Activity
# Assumes you created these earlier in app startup:
#   - ADAPTER: BotFrameworkAdapter (or compatible)
#   - BOT: your bot instance with an async on_turn(turn_context) method
#   - text_analytics_client: Azure Text Analytics TextAnalyticsClient
#   - CONFIG: DefaultConfig holding keys, etc.

router = APIRouter()

@router.post("/api/messages")
async def messages(req: Request) -> Response:
    """
    Main bot message handler.
    Parses the incoming activity, performs sentiment analysis on user text,
    and forwards the activity to the Bot Framework adapter.
    """
    # Validate content type
    if "application/json" not in req.headers.get("Content-Type", ""):
        return Response(status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE)

    # Parse request body
    body = await req.json()

    # ----------------------------------------------------------------------
    # 2024/1/5 – MSAI 631 – Start Perform Sentiment Analysis Here
    # ----------------------------------------------------------------------
    # Only attempt sentiment analysis if a text field is present.
    user_text = body.get("text", "")
    if isinstance(user_text, str) and user_text.strip():
        print(f"[MSAI 631] textToUse: {user_text}")

        # Build Text Analytics document payload
        documents = [{"id": "1", "language": "en", "text": user_text}]

        try:
            # Call Azure Text Analytics
            response = text_analytics_client.analyze_sentiment(documents)

            # Extract the first successful result
            first_ok = next((doc for doc in response if not doc.is_error), None)
            if first_ok:
                # Attach sentiment to the activity body without overwriting original text
                body["sentiment"] = {
                    "overall": first_ok.sentiment,                       # "positive" | "neutral" | "negative" | "mixed"
                    "scores": {
                        "positive": first_ok.confidence_scores.positive,
                        "neutral":  first_ok.confidence_scores.neutral,
                        "negative": first_ok.confidence_scores.negative,
                    },
                }
            else:
                body["sentiment"] = {"overall": "unknown", "scores": {}}

        except Exception as ex:  # Defensive: do not block the conversation if NLP fails
            print(f"[MSAI 631] Sentiment analysis error: {ex}")
            body["sentiment"] = {"overall": "error", "reason": str(ex)}
    # ----------------------------------------------------------------------
    # 2024/1/5 – MSAI 631 – END Perform Sentiment Analysis Here
    # ----------------------------------------------------------------------

    # Deserialize into a Bot Framework Activity and continue normal processing
    activity = Activity().deserialize(body)
    auth_header = req.headers.get("Authorization", "")

    # Hand off to the Bot Framework adapter
    response = await ADAPTER.process_activity(
        auth_header, activity, lambda turn_context: BOT.on_turn(turn_context)
    )

    # Return appropriate HTTP status
    if response is None:
        return Response(status_code=status.HTTP_200_OK)
    return Response(status_code=response.status)



# 🔍 Test the config
config = DefaultConfig()
print("App ID:", config.APP_ID)
print("Endpoint:", config.ENDPOINT_URI)

# 📅 2024/1/5 - START Extended for T6 Project in MSAI 631 - adding sentiment analysis to the bot
credential = AzureKeyCredential(CONFIG.API_KEY)
endpoint = CONFIG.ENDPOINT_URI
text_analytics_client = TextAnalyticsClient(endpoint=endpoint, credential=credential)
# 📅 2024/1/5 - STOP Extended for T6 Project in MSAI 631

# Main bot message handler
async def on_incoming_request(req: Request) -> Response:
    if "application/json" in req.headers["Content-Type"]:
        body = await req.json()

        # 📅 2024/1/5 - MSAI 631 - Start Perform Sentiment Analysis Here
        text = body["text"]
        print(f"text: {text}")
        documents = [{
            "id": "1",
            "language": "en",
            "text": text
        }]
        response = text_analytics_client.analyze_sentiment(documents)
        successful_responses = [doc for doc in response if not doc.is_error]
        body["text"] = successful_responses
        # 📅 2024/1/5 - MSAI 631 - End Perform Sentiment Analysis Here

        activity = Activity().deserialize(body)
        auth_header = req.headers["Authorization"] if "Authorization" in req.headers else ""
        response = await ADAPTER.process_activity(auth_header, activity, BOT.on_turn)
        return response
    else:
        return Response(status=HTTPStatus.UNSUPPORTED_MEDIA_TYPE)

ImportError: cannot import name 'EchoBot' from 'bots' (unknown location)

In [10]:
import os
from fastapi import APIRouter, Request
from fastapi.responses import Response
from starlette import status
from http import HTTPStatus

from botbuilder.schema import Activity
from botbuilder.core import BotFrameworkAdapter # Added import for BotFrameworkAdapter
from bots import EchoBot
from config import DefaultConfig

from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# ✅ Set environment variables manually for Colab or local testing
os.environ["MicrosoftAppId"] = "your-app-id-here"
os.environ["MicrosoftAppPassword"] = "your-app-password-here"
os.environ["MicrosoftAIServicesEndpoint"] = "https://t6languageservicev1.cognitiveservices.azure.com/"
os.environ["MicrosoftAIServicesKey"] = "your-api-key-here"

# ⚙️ Configuration class
class DefaultConfig():
    PORT = 3978
    APP_ID = os.environ.get("MicrosoftAppId", "")
    APP_PASSWORD = os.environ.get("MicrosoftAppPassword", "")
    ENDPOINT_URI = os.environ.get("MicrosoftAIServicesEndpoint", "")
    API_KEY = os.environ.get("MicrosoftAIServicesKey", "")

# 🔧 Load configuration and initialize Azure Text Analytics client
CONFIG = DefaultConfig()
credential = AzureKeyCredential(CONFIG.API_KEY)
text_analytics_client = TextAnalyticsClient(endpoint=CONFIG.ENDPOINT_URI, credential=credential)

# Initialize BotFrameworkAdapter and EchoBot
ADAPTER = BotFrameworkAdapter(CONFIG.APP_ID, CONFIG.APP_PASSWORD) # Define ADAPTER
BOT = EchoBot() # Define BOT

# 🔄 FastAPI router setup
router = APIRouter()

@router.post("/api/messages")
async def messages(req: Request) -> Response:
    """Main bot message handler with sentiment analysis."""
    if "application/json" not in req.headers.get("Content-Type", ""):
        return Response(status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE)

    body = await req.json()

    # 📅 2024/1/5 – MSAI 631 – Start Perform Sentiment Analysis Here
    user_text = body.get("text", "")
    if isinstance(user_text, str) and user_text.strip():
        print(f"[MSAI 631] textToUse: {user_text}")
        documents = [{"id": "1", "language": "en", "text": user_text}]
        try:
            response = text_analytics_client.analyze_sentiment(documents)
            first_ok = next((doc for doc in response if not doc.is_error), None)
            if first_ok:
                body["sentiment"] = {
                    "overall": first_ok.sentiment,
                    "scores": {
                        "positive": first_ok.confidence_scores.positive,
                        "neutral": first_ok.confidence_scores.neutral,
                        "negative": first_ok.confidence_scores.negative,
                    },
                }
            else:
                body["sentiment"] = {"overall": "unknown", "scores": {}}
        except Exception as ex:
            print(f"[MSAI 631] Sentiment analysis error: {ex}")
            body["sentiment"] = {"overall": "error", "reason": str(ex)}
    # 📅 2024/1/5 – MSAI 631 – END Perform Sentiment Analysis Here

    activity = Activity().deserialize(body)
    auth_header = req.headers.get("Authorization", "")
    response = await ADAPTER.process_activity(auth_header, activity, BOT.on_turn)

    return Response(status_code=response.status if response else status.HTTP_200_OK)

# 🔍 Test the config
config = DefaultConfig()
print("App ID:", config.APP_ID)
print("Endpoint:", config.ENDPOINT_URI)

# 📅 2024/1/5 - START Extended for T6 Project in MSAI 631 - adding sentiment analysis to the bot
# This section is already covered by the initialization above, keeping for reference if needed
# credential = AzureKeyCredential(CONFIG.API_KEY)
# endpoint = CONFIG.ENDPOINT_URI
# text_analytics_client = TextAnalyticsClient(endpoint=endpoint, credential=credential)
# 📅 2024/1/5 - STOP Extended for T6 Project in MSAI 631

# Removed the duplicate on_incoming_request function

ImportError: cannot import name 'EchoBot' from 'bots' (unknown location)

In [3]:
import sys
import subprocess

# Install the Azure Text Analytics package
subprocess.check_call([sys.executable, "-m", "pip", "install", "azure-ai-textanalytics"])

0

In [4]:
# Create the 'bots' directory
import os
if not os.path.exists("bots"):
    os.makedirs("bots")

In [5]:
# Create the __init__.py file in the bots directory
init_py_content = """
from .echo_bot import EchoBot
"""
with open("bots/__init__.py", "w") as f:
    f.write(init_py_content)

In [11]:
# Create the echo_bot.py file in the bots directory
echo_bot_content = """
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from botbuilder.core import ActivityHandler, TurnContext
from botbuilder.schema import ChannelAccount


class EchoBot(ActivityHandler):
    async def on_members_added(
        self,
        members_added: ChannelAccount,
        turn_context: TurnContext
    ):
        for member in members_added:
            if member.id != turn_context.activity.recipient.id:
                await turn_context.send_activity("Hello and welcome!")

    async def on_message(self, turn_context: TurnContext):
        await turn_context.send_activity(f"Echo: {turn_context.activity.text}")

"""
with open("bots/echo_bot.py", "w") as f:
    f.write(echo_bot_content)

In [7]:
!pip install botbuilder-core

Collecting botbuilder-core
  Downloading botbuilder_core-4.17.0-py3-none-any.whl.metadata (3.9 kB)
Collecting botbuilder-schema==4.17.0 (from botbuilder-core)
  Downloading botbuilder_schema-4.17.0-py2.py3-none-any.whl.metadata (3.7 kB)
Collecting botframework-connector==4.17.0 (from botbuilder-core)
  Downloading botframework_connector-4.17.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting botframework-streaming==4.17.0 (from botbuilder-core)
  Downloading botframework_streaming-4.17.0-py3-none-any.whl.metadata (3.8 kB)
Collecting jsonpickle<1.5,>=1.2 (from botbuilder-core)
  Downloading jsonpickle-1.4.2-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting msrest==0.7.* (from botbuilder-schema==4.17.0->botbuilder-core)
  Downloading msrest-0.7.1-py3-none-any.whl.metadata (21 kB)
Collecting msal>=1.31.1 (from botframework-connector==4.17.0->botbuilder-core)
  Downloading msal-1.33.0-py3-none-any.whl.metadata (11 kB)
Collecting azure-core>=1.24.0 (from msrest==0.7.*->botbuilder-schema==4

In [4]:
# Create the 'bots' directory
import os
if not os.path.exists("bots"):
    os.makedirs("bots")

In [5]:
# Create the __init__.py file in the bots directory
init_py_content = """
from .echo_bot import EchoBot
"""
with open("bots/__init__.py", "w") as f:
    f.write(init_py_content)

In [4]:
# Create the 'bots' directory
import os
if not os.path.exists("bots"):
    os.makedirs("bots")

# Create the echo_bot.py file in the bots directory
echo_bot_content = """
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from botbuilder.core import ActivityHandler, TurnContext
from botbuilder.schema import ChannelAccount


class EchoBot(ActivityHandler):
    async def on_members_added(
        self,
        members_added: ChannelAccount,
        turn_context: TurnContext
    ):
        for member in members_added:
            if member.id != turn_context.activity.recipient.id:
                await turn_context.send_activity("Hello and welcome!")

    async def on_message(self, turn_context: TurnContext):
        await turn_context.send_activity(f"Echo: {turn_context.activity.text}")

"""
with open("bots/echo_bot.py", "w") as f:
    f.write(echo_bot_content)

In [5]:
# Create the config.py file
config_content = """
import os

class DefaultConfig():
    PORT = 3978
    APP_ID = os.environ.get("MicrosoftAppId", "")
    APP_PASSWORD = os.environ.get("MicrosoftAppPassword", "")
    ENDPOINT_URI = os.environ.get("MicrosoftAIServicesEndpoint", "")
    API_KEY = os.environ.get("MicrosoftAIServicesKey", "")
"""

with open("config.py", "w") as f:
    f.write(config_content)

In [8]:
import json
import uuid
import requests
from datetime import datetime, timezone

BOT_URL = "http://localhost:3978/api/messages"  # adjust if the bot runs on a different port

def build_activity(text: str) -> dict:
    """
    Construct a minimal Bot Framework 'message' activity
    that is compatible with a locally running EchoBot-style endpoint.
    """
    return {
        "type": "message",
        "id": str(uuid.uuid4()),
        "serviceUrl": "http://localhost:56180",   # placeholder; not used by most local handlers
        "channelId": "emulator",
        "from": {"id": "user1", "name": "User"},
        "recipient": {"id": "bot", "name": "Bot"},
        "conversation": {"id": str(uuid.uuid4())},
        "locale": "en-US",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "text": text
    }

def main():
    payload = build_activity("I hated the movie. It was so slow!")
    headers = {"Content-Type": "application/json"}

    resp = requests.post(BOT_URL, headers=headers, data=json.dumps(payload))
    print("Status:", resp.status_code)
    try:
        print("Response JSON:", json.dumps(resp.json(), indent=2))
    except Exception:
        print("Raw response text:", resp.text)

if __name__ == "__main__":
    main()


ConnectionError: HTTPConnectionPool(host='localhost', port=3978): Max retries exceeded with url: /api/messages (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f6d95cbbf20>: Failed to establish a new connection: [Errno 111] Connection refused'))