In [1]:
!pip install gradio
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from sentence_transformers import SentenceTransformer
import gradio as gr
import time
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import h5py
from sentence_transformers import SentenceTransformer
import gradio as gr
import time
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from tensorflow.keras import layers, Model, Input

model_dir = '/content/'

try:
    # Load the saved data
    print("Loading question data...")
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point model_dir at a question_data.pkl you produced yourself.
    with open(os.path.join(model_dir, 'question_data.pkl'), 'rb') as f:
        saved_data = pickle.load(f)

    questions_df = pd.DataFrame(saved_data['questions'])
    question_to_embedding = saved_data['question_to_embedding']
    optimal_threshold = saved_data.get('optimal_threshold', 0.6)  # Default if missing

    print("✅ Question data loaded successfully!")
    print(f"Found {len(questions_df)} questions in the database")

    # Get a sample embedding to determine dimension. Fail with a clear message
    # when the mapping is empty instead of an IndexError from indexing a list.
    if not question_to_embedding:
        raise ValueError(
            "question_data.pkl contains no entries in 'question_to_embedding'; "
            "cannot determine the embedding dimension"
        )
    sample_embedding = next(iter(question_to_embedding.values()))
    embedding_dim = sample_embedding.shape[0]
    print(f"Embedding dimension: {embedding_dim}")

    # Load the transformer model
    print("Loading transformer model...")
    transformer_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

    # Create a compatible model: embedding_dim -> 128 -> 64 -> 32 dense stack.
    # NOTE(review): the encoder is built from scratch here and no saved weights
    # are loaded in this cell, so it runs with freshly initialised parameters.
    # Confirm whether trained Siamese weights should be loaded before serving.
    print("Creating compatible encoder model...")
    inputs = Input(shape=(embedding_dim,))
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32, activation='tanh')(x)
    encoder_model = Model(inputs=inputs, outputs=x)
    encoder_model.compile(optimizer='adam', loss='mse')

    print("✅ Model created successfully!")

except Exception as e:
    # Surface the failure in the cell output, then re-raise so later code
    # never runs against half-initialised globals.
    print(f"Error in initialization: {e}")
    raise

def preprocess_text(text):
    """Normalise text for embedding lookup.

    Lower-cases the input, collapses every run of whitespace into a single
    space, and drops leading/trailing whitespace.
    """
    tokens = text.lower().split()
    return ' '.join(tokens)

def find_most_similar_question(query, questions_df, transformer_model, encoder_model, threshold=optimal_threshold):
    """Return the stored question/answer pair that best matches ``query``.

    The query is preprocessed, embedded with ``transformer_model``, projected
    through ``encoder_model``, and compared by cosine similarity against the
    projected embeddings of every question in ``questions_df``. Stored
    embeddings are looked up in the module-level ``question_to_embedding``
    mapping; questions without a stored embedding score 0.

    Args:
        query: Raw user question.
        questions_df: DataFrame with ``question`` and ``answer`` columns and,
            optionally, a ``processed_question`` column used as the lookup key.
        transformer_model: Object exposing ``encode(list_of_str)``.
        encoder_model: Object exposing ``predict(array, verbose=0)``.
        threshold: Accepted for interface compatibility. The confidence tiers
            below use the fixed cut-offs 0.8 and 0.6 and do not consult it.

    Returns:
        dict with keys ``question``, ``answer``, ``similarity``,
        ``confidence`` ("high"/"medium"/"low"), ``emoji`` and ``explanation``.

    Raises:
        ValueError: If ``questions_df`` has no rows.
    """
    if len(questions_df) == 0:
        raise ValueError("questions_df is empty; there is nothing to match the query against")

    # Embed the query: transformer embedding first, then the encoder projection.
    processed_query = preprocess_text(query)
    query_transformer_embedding = transformer_model.encode([processed_query])[0]
    query_siamese_embedding = encoder_model.predict(
        np.asarray(query_transformer_embedding).reshape(1, -1), verbose=0
    )[0]

    # Resolve the lookup key of every row once, instead of re-checking the
    # column layout on each loop iteration.
    if 'processed_question' in questions_df.columns:
        lookup_keys = list(questions_df['processed_question'])
    else:
        lookup_keys = [preprocess_text(q) for q in questions_df['question']]

    # Rows without a stored embedding keep a similarity of 0.
    similarities = np.zeros(len(lookup_keys), dtype=float)
    known_rows = [row for row, key in enumerate(lookup_keys) if key in question_to_embedding]
    if known_rows:
        # One batched predict call replaces one call per question, which
        # dominated the response time.
        stored_embeddings = np.stack(
            [np.asarray(question_to_embedding[lookup_keys[row]]).reshape(-1) for row in known_rows]
        )
        projected = encoder_model.predict(stored_embeddings, verbose=0)
        similarities[known_rows] = cosine_similarity([query_siamese_embedding], projected)[0]

    # Find the most similar question (first index wins on ties).
    max_similarity_idx = int(np.argmax(similarities))
    max_similarity = float(similarities[max_similarity_idx])

    # Determine confidence level
    if max_similarity >= 0.8:
        confidence = "high"
        emoji = "✅"
        explanation = "I'm very confident about this answer."
    elif max_similarity >= 0.6:
        confidence = "medium"
        emoji = "⚠️"
        explanation = "I believe this is the correct answer, but please verify."
    else:
        confidence = "low"
        emoji = "❓"
        explanation = "I'm not entirely sure this answers your question. Consider rephrasing."

    # Get the best match
    best_match = questions_df.iloc[max_similarity_idx]

    return {
        "question": best_match['question'],
        "answer": best_match['answer'],
        "similarity": max_similarity,
        "confidence": confidence,
        "emoji": emoji,
        "explanation": explanation
    }

def visualize_query_similarity(query):
    """Plot where ``query`` falls among the stored questions in a 2-D t-SNE map.

    Uses the module-level ``questions_df``, ``question_to_embedding``,
    ``transformer_model`` and ``encoder_model``. Every question (and the
    query) is projected through the encoder, reduced to two dimensions with
    t-SNE, and drawn as a scatter plot coloured by cosine similarity to the
    query. The three most similar questions are annotated and the query is
    drawn as a gold star.

    Args:
        query: Raw user question.

    Returns:
        The ``matplotlib.pyplot`` module, with the similarity map as the
        current figure.

    Raises:
        ValueError: If ``questions_df`` has no rows.
    """
    if len(questions_df) == 0:
        raise ValueError("questions_df is empty; there is nothing to visualise")

    # Embed the query: transformer embedding first, then the encoder projection.
    processed_query = preprocess_text(query)
    query_transformer_embedding = transformer_model.encode([processed_query])[0]
    query_siamese_embedding = encoder_model.predict(
        np.asarray(query_transformer_embedding).reshape(1, -1), verbose=0
    )[0]

    # Get all questions
    question_texts = questions_df['question'].tolist()
    if 'processed_question' in questions_df.columns:
        processed_questions = questions_df['processed_question'].tolist()
    else:
        processed_questions = [preprocess_text(q) for q in question_texts]

    # Use the stored transformer embedding when there is one, otherwise encode
    # the question on the fly; then project everything in one batched call
    # instead of one predict call per question.
    transformer_embeddings = [
        question_to_embedding[q] if q in question_to_embedding else transformer_model.encode([q])[0]
        for q in processed_questions
    ]
    question_embeddings = encoder_model.predict(
        np.stack([np.asarray(emb).reshape(-1) for emb in transformer_embeddings]), verbose=0
    )

    # TSNE for visualization (the query is the last row)
    all_embeddings = np.vstack([question_embeddings, query_siamese_embedding])
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(5, len(all_embeddings)-1))
    embeddings_2d = tsne.fit_transform(all_embeddings)

    # Create plot. The figure from the previous call is closed first so a
    # long-running server does not accumulate one open figure per request;
    # closing a label that does not exist is a no-op.
    figure_label = "query_similarity_map"
    plt.close(figure_label)
    plt.figure(num=figure_label, figsize=(10, 8))

    # Calculate similarities for coloring
    similarities = cosine_similarity([query_siamese_embedding], question_embeddings)[0]

    # Plot the points
    scatter = plt.scatter(
        embeddings_2d[:len(question_texts), 0],
        embeddings_2d[:len(question_texts), 1],
        c=similarities,
        cmap='coolwarm',
        alpha=0.7,
        s=100
    )

    # Plot the query point
    plt.scatter(
        embeddings_2d[-1, 0],
        embeddings_2d[-1, 1],
        marker='*',
        color='gold',
        s=200,
        edgecolor='black',
        linewidth=1,
        label='Your Query'
    )

    # Add colorbar and annotations for the three most similar questions
    plt.colorbar(scatter, label="Similarity to Query")
    top_indices = np.argsort(similarities)[-3:][::-1]
    for idx in top_indices:
        plt.annotate(
            f"Q{idx+1}: {question_texts[idx][:30]}...",
            (embeddings_2d[idx, 0], embeddings_2d[idx, 1]),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=8,
            bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.7)
        )

    plt.title(f"Query Similarity Map\nQuery: '{query}'")
    plt.tight_layout()

    return plt

# --- Test the Model ---
# Smoke test: run one known question through the matcher and print the result.
print("\nTesting the system with example queries:")
test_query = "How can I apply to Zewail City?"
result = find_most_similar_question(
    query=test_query,
    questions_df=questions_df,
    transformer_model=transformer_model,
    encoder_model=encoder_model,
)
print(
    f"Query: {test_query}",
    f"Best match: {result['question']}",
    f"Confidence: {result['confidence']} ({result['similarity']:.2f})",
    f"Answer: {result['answer'][:100]}...",
    sep="\n",
)

def gradio_answer_query(query, history, visualize=False):
    """Process user query and return response with updated chat history.

    Args:
        query: Text from the question box. ``None`` and blank/whitespace-only
            input are treated as "nothing asked".
        history: Chat history as a list of ``{"role", "content"}`` dicts, or
            ``None`` for an empty conversation. New messages are appended to it.
        visualize: When true, also build the query-similarity plot.

    Returns:
        Tuple ``(textbox_value, history, visualization)``: an empty string to
        clear the input box, the updated history, and the plot (or ``None``).
    """
    # Guard against None as well as blank input; calling .strip() on None
    # would raise AttributeError.
    if query is None or not query.strip():
        return "", history, None

    # Get answer using the Siamese network
    start_time = time.time()
    result = find_most_similar_question(query, questions_df, transformer_model, encoder_model)
    response_time = time.time() - start_time

    # Format response based on confidence levels
    formatted_response = f"{result['emoji']} **{result['confidence'].title()} Confidence** (Score: {result['similarity']:.2f})\n\n"
    formatted_response += f"Your question matched: \"{result['question']}\"\n\n"
    formatted_response += f"**Answer:** {result['answer']}\n\n"
    formatted_response += f"*{result['explanation']}*"

    # Add response time (covers the matching step only, not the plot)
    formatted_response += f"\n\n<small>Response time: {response_time*1000:.1f} ms</small>"

    # Create visualization if requested
    visualization = None
    if visualize:
        visualization = visualize_query_similarity(query)

    # Update history with proper message format for type="messages"
    if history is None:
        history = []

    # Use dictionary format with 'role' and 'content' keys
    history.append({"role": "user", "content": query})
    history.append({"role": "assistant", "content": formatted_response})

    return "", history, visualization

def add_text(text):
    """Identity helper: hand back ``text`` exactly as received."""
    unchanged = text
    return unchanged

def clear_chat():
    """Return a pair of ``None`` values, one per UI output being reset."""
    cleared = (None, None)
    return cleared

# --- Gradio Interface ---
# Stylesheet passed to gr.Blocks(css=...). The selectors pair up with the UI
# code as follows:
#   .header / .footer / .info-card  -> class attributes inside the gr.HTML snippets
#   .chatbox / .primary-btn / .example-chip / .visualization
#                                   -> elem_classes on the Gradio components
# NOTE(review): .container is not referenced by the UI code in this cell.
custom_css = """
body {
    font-family: 'Poppins', 'Segoe UI', sans-serif;
    background: linear-gradient(135deg, #f5f7fa 0%, #e4ecfb 100%);
}
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.header {
    text-align: center;
    padding: 20px 0;
    background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%);
    color: white;
    border-radius: 10px;
    margin-bottom: 20px;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}
.footer {
    text-align: center;
    margin-top: 30px;
    font-size: 0.8em;
    color: #718096;
}
.chatbox {
    border-radius: 10px !important;
    overflow: hidden;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
}
.primary-btn {
    background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%) !important;
    color: white !important;
    border: none !important;
    padding: 10px 20px !important;
    border-radius: 6px !important;
    transition: all 0.3s !important;
}
.primary-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 10px rgba(0,0,0,0.2) !important;
}
.example-chip {
    background: #e6f0ff !important;
    color: #1a365d !important;
    border: 1px solid #c3d8f5 !important;
    border-radius: 16px !important;
    padding: 5px 12px !important;
    font-size: 13px !important;
    transition: all 0.3s !important;
    cursor: pointer !important;
}
.example-chip:hover {
    background: #d1e2ff !important;
    transform: translateY(-1px) !important;
    box-shadow: 0 2px 5px rgba(0,0,0,0.1) !important;
}
.info-card {
    background: rgba(255,255,255,0.9);
    border-radius: 10px;
    padding: 15px;
    margin-bottom: 15px;
    box-shadow: 0 4px 15px rgba(0,0,0,0.05);
}
.visualization {
    margin-top: 20px;
    border-radius: 10px;
    overflow: hidden;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}
"""

# Create Gradio blocks interface
# Layout: header, then a two-column row (chat + plot + example chips on the
# left, static info cards on the right), then a footer. Event wiring for the
# main controls is at the bottom of the `with` block.
print("\nCreating Gradio interface...")
with gr.Blocks(css=custom_css) as demo:
    # Header
    gr.HTML("""
    <div class="header">
        <h1>Zewail City Admission Assistant</h1>
        <p>Powered by Siamese Neural Network and Transformer Embeddings</p>
    </div>
    """)

    with gr.Row():
        # Left column - Chat interface
        with gr.Column(scale=3):
            # Chat area
            chatbot = gr.Chatbot(
                label="Conversation",
                height=500,
                elem_id="chatbox",
                elem_classes="chatbox",
                type="messages",
            )

            # Input area
            with gr.Row():
                query_input = gr.Textbox(
                    placeholder="Type your question about Zewail City admissions...",
                    label="Your Question",
                    lines=2,
                    max_lines=5,
                    show_label=False,
                    scale=4
                )

                # Add visualization toggle
                visualize_toggle = gr.Checkbox(
                    label="Show Visualization",
                    value=False,
                    scale=1,
                    info="Display a visualization of query similarity"
                )

                submit_btn = gr.Button("Ask", variant="primary", elem_classes="primary-btn", scale=1)

            with gr.Row():
                clear_btn = gr.Button("Clear Chat", variant="secondary")

            # Visualization area. The component is always visible; it only
            # receives a plot when the "Show Visualization" box is ticked.
            visualization = gr.Plot(label="Query Similarity Visualization", visible=True, elem_classes="visualization")

            # Every control that asks a question feeds the same handler with
            # the same components, so the lists are defined once.
            answer_inputs = [query_input, chatbot, visualize_toggle]
            answer_outputs = [query_input, chatbot, visualization]

            # Example questions
            gr.HTML("<h3>Example Questions</h3>")

            # Randomly sample example questions from the dataset (up to 8)
            example_questions = questions_df['question'].sample(min(8, len(questions_df))).tolist()

            # Two rows of up to four example chips each. Clicking a chip puts
            # the full question into the textbox and then submits it.
            for row_start in (0, 4):
                with gr.Row():
                    for question in example_questions[row_start:row_start + 4]:
                        # Limit the length of displayed questions
                        display_question = question if len(question) < 40 else question[:37] + "..."
                        example_btn = gr.Button(display_question, elem_classes="example-chip")
                        example_btn.click(
                            fn=lambda q=question: q,  # Use default parameter to capture current value
                            inputs=[],
                            outputs=[query_input]
                        ).then(
                            fn=gradio_answer_query,
                            inputs=answer_inputs,
                            outputs=answer_outputs
                        )

        # Right column - Info
        with gr.Column(scale=2):
            # About Zewail City
            gr.HTML("""
            <div class="info-card">
                <h3>About Zewail City</h3>
                <p>Zewail City of Science and Technology is a nonprofit, independent institution of learning,
                research and innovation founded by Nobel laureate Ahmed Zewail.</p>
                <p><b>Contact:</b> admissions@zewailcity.edu.eg<br>
                <b>Phone:</b> +20-1033077738<br>
                <b>Website:</b> <a href="https://www.zewailcity.edu.eg" target="_blank">zewailcity.edu.eg</a></p>
            </div>
            """)

            # How it works
            gr.HTML("""
            <div class="info-card">
                <h3>How it works</h3>
                <p>This chatbot uses advanced AI technology to understand and answer your questions:</p>
                <ol>
                    <li><b>Transformer Embeddings</b> - Your question is processed by a state-of-the-art language model</li>
                    <li><b>Siamese Neural Network</b> - A specialized neural network finds the most similar questions in our database</li>
                    <li><b>Confidence Scoring</b> - The system provides a confidence level with each answer</li>
                </ol>
                <p>The visualization shows where your query fits among known questions in the embedding space.</p>
            </div>
            """)

            # Model information
            gr.HTML(f"""
            <div class="info-card">
                <h3>AI Model Details</h3>
                <p>This assistant uses your data with a compatible Siamese network.</p>
                <ul>
                    <li><b>Knowledge base:</b> {len(questions_df)} question-answer pairs</li>
                    <li><b>Transformer model:</b> paraphrase-multilingual-mpnet-base-v2</li>
                    <li><b>Embedding dimension:</b> {embedding_dim} </li>
                    <li><b>Similarity threshold:</b> {optimal_threshold:.2f}</li>
                </ul>
            </div>
            """)

    # Footer
    gr.HTML("""
    <div class="footer">
        <p>Developed with ❤️ for Zewail City | © 2025 All Rights Reserved</p>
    </div>
    """)

    # Set up event handlers: the Ask button and pressing Enter in the textbox
    # both run the same query handler.
    submit_btn.click(
        fn=gradio_answer_query,
        inputs=answer_inputs,
        outputs=answer_outputs
    )

    query_input.submit(
        fn=gradio_answer_query,
        inputs=answer_inputs,
        outputs=answer_outputs
    )

    # clear_chat returns (None, None): one value to reset the chat history and
    # one to reset the visualization.
    clear_btn.click(
        fn=clear_chat,
        inputs=None,
        outputs=[chatbot, visualization]
    )

# Launch the interface
# share=True requests a temporary public *.gradio.live URL, so anyone with the
# link can reach the app. debug=True keeps this cell running so errors and
# logs stay visible; set debug=False to release the cell.
demo.launch(share=True, debug=True)

Loading question data...
✅ Question data loaded successfully!
Found 30 questions in the database
Embedding dimension: 768
Loading transformer model...
Creating compatible encoder model...
✅ Model created successfully!

Testing the system with example queries:
Query: How can I apply to Zewail City?
Best match: How can I apply to Zewail City?
Confidence: high (1.00)
Answer: To apply to Zewail City, visit the official website, create an account, fill out the online applicat...

Creating Gradio interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5adcbeea22069a8e1a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
