# **Educational Historical Chatbot**

A specialized chatbot built using the T5 transformer model, fine-tuned on historical Q&A data to provide accurate and informative responses about various historical topics, figures, and events.


# 1. Importing Dependencies

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import langdetect
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
from transformers import create_optimizer
from sklearn.model_selection import train_test_split
from tf_keras.src.callbacks import EarlyStopping, ModelCheckpoint
from datasets import Dataset

In [3]:
# Download NLTK resources
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# 2. Mount Google Drive and Load Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
history_data = "/content/drive/MyDrive/2025/data/history_data_20000.csv"

# 3. Tokinezation of the data and Fine-tune the Model

This includes:

- **Data Collection:** Historical Q&A pairs from various sources
- **Preprocessing:** Formatting inputs as "question: {text}" and targets as direct answers
- **Fine-tuning:** Using the T5-small base model with custom - hyperparameters
- **Evaluation:** Testing on a held-out dataset with BLEU, ROUGE, and exact match metrics

In [6]:
# 1. Load and preprocess the dataset for CSV
def load_dataset(file_path):
    import pandas as pd

    # Load CSV file
    df = pd.read_csv(file_path)

    # Ensure required columns exist
    if 'user' not in df.columns or 'bot' not in df.columns:
        raise ValueError("CSV file must contain 'user' and 'bot' columns")

    # Format input-output pairs **without `</s>`**
    df['input_text'] = df['user'].apply(lambda x: f"question: {x}")
    df['target_text'] = df['bot']  # No need to append `</s>`

    return df

def tokenize_data(df, tokenizer, max_input_length=512, max_target_length=128):
    input_texts = df['input_text'].tolist()
    target_texts = df['target_text'].tolist()

    inputs = tokenizer(
        input_texts,
        truncation=True,
        padding='max_length',
        max_length=max_input_length,
        return_tensors='tf'
    )
    # Convert target texts to strings before tokenization
    target_texts = [str(text) for text in df['target_text'].tolist()]

    targets = tokenizer(
        target_texts,
        truncation=True,
        padding='max_length',
        max_length=max_target_length,
        return_tensors='tf'
    )

    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'decoder_input_ids': targets['input_ids'][:, :-1]  # Shifted left for decoder
        },
        targets['input_ids'][:, 1:]  # Shifted right as labels
    ))

    # Make sure labels are explicitly added to the dataset
    dataset = dataset.map(lambda x, y: ({**x, 'labels': y}, y))

    return dataset

# 3. Fine-tune the model using TensorFlow methods
def fine_tune_model(train_dataset, val_dataset, model_name='t5-small', epochs=6, batch_size=4):
    # Load pre-trained model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = TFT5ForConditionalGeneration.from_pretrained(model_name)

    # Prepare batched datasets
    train_dataset = train_dataset.shuffle(1000).batch(batch_size)
    val_dataset = val_dataset.batch(batch_size)

    # Number of training steps
    num_train_steps = len(list(train_dataset)) * epochs

    optimizer, lr_schedule = create_optimizer(
        init_lr=1e-4,
        num_train_steps=num_train_steps,
        num_warmup_steps=int(0.1 * num_train_steps),
        weight_decay_rate=0.01,
    )

    # Compile the model (ensure 'labels' is specified for loss calculation)
    model.compile(optimizer=optimizer)

    # Set up callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
        ModelCheckpoint(
            filepath='./checkpoints/model_{epoch}.keras',
            save_best_only=True,
            monitor='val_loss'
        )
    ]

    # Train the model
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        callbacks=callbacks
    )

    # Save the model
    model.save_pretrained('/content/drive/MyDrive/2025/model/history_chatbot_model')
    tokenizer.save_pretrained('/content/drive/MyDrive/2025/model/history_chatbot_model')

    return model, tokenizer, history


# 4. Generate Responses to verify how the model is performing

In [7]:
# 4. Generate responses
def generate_response(user_input, model, tokenizer, max_length=100):
    input_text = f"question: {user_input}"

    input_ids = tokenizer(input_text, return_tensors='tf').input_ids

    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,  # Prevent repeated phrases
        top_k=50,
        top_p=0.90,   # Lower top_p for more deterministic responses
        temperature=0.6,  # Reduce randomness for accuracy
        do_sample=True
    )

    bot_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return bot_response

# 5. Evaluate the model
def evaluate_chatbot(test_data, model, tokenizer):
    responses = []
    references = []

    for _, row in test_data.iterrows():
        user_input = row['user']
        reference = row['bot']

        generated = generate_response(user_input, model, tokenizer)

        responses.append(generated)
        references.append(reference)

    # Calculate BLEU score
    try:
        import nltk
        from nltk.translate.bleu_score import corpus_bleu

        # Download necessary NLTK data if not present
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        references_tokenized = [[ref.split()] for ref in references]
        responses_tokenized = [resp.split() for resp in responses]

        bleu_score = corpus_bleu(references_tokenized, responses_tokenized)
        print(f"BLEU Score: {bleu_score}")
    except:
        print("Couldn't calculate BLEU score. Continuing with other metrics.")
        bleu_score = 0

    # Calculate ROUGE score
    try:
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
        for ref, gen in zip(references, responses):
            score = scorer.score(ref, gen)
            for key in rouge_scores:
                rouge_scores[key] += score[key].fmeasure

        # Average the scores
        for key in rouge_scores:
            rouge_scores[key] /= len(references)

        print(f"ROUGE Scores: {rouge_scores}")
    except:
        print("Couldn't calculate ROUGE scores. Continuing with other metrics.")
        rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

    # Simple exact match accuracy
    exact_matches = sum(1 for gen, ref in zip(responses, references) if gen == ref)
    accuracy = exact_matches / len(responses)
    print(f"Exact Match Accuracy: {accuracy}")

    return bleu_score, rouge_scores, accuracy, responses

# 6. Create a simple web interface using Gradio
def create_web_interface(model, tokenizer):
    import gradio as gr

    def chatbot_interface(user_input):
        if not user_input.strip():
            return "Please ask a history question."

        # Check if question is history-related
        history_keywords = [
            "history", "ancient", "medieval", "century", "war", "empire", "king",
            "queen", "civilization", "revolution", "world war", "dynasty", "emperor",
            "archaeological", "historical", "middle ages", "renaissance", "prehistoric",
            "civil war", "cold war", "rome", "egypt", "greece", "china", "mesopotamia",
            "pharaoh", "caesar", "viking", "ottoman", "byzantine", "mongol", "crusade",
            "independence", "conquest", "colonization", "monarchy", "republic"
        ]

        # Very basic domain filtering
        is_history_related = any(keyword.lower() in user_input.lower() for keyword in history_keywords)

        if not is_history_related:
            return "I'm a history chatbot. Please ask me about historical events, figures, or periods."

        response = generate_response(user_input, model, tokenizer)
        return response

    demo = gr.Interface(
        fn=chatbot_interface,
        inputs=gr.Textbox(lines=2, placeholder="Ask me about history..."),
        outputs="text",
        title="Historical Knowledge Chatbot (T5)",
        description="Ask questions about historical events, figures, and periods.",
        examples=[
            ["Who was Cleopatra?"],
            ["Tell me about the fall of the Roman Empire."],
            ["What caused World War I?"],
            ["Explain the significance of the Renaissance."],
            ["Who was Genghis Khan?"]
        ]
    )

    return demo

In [8]:
df = load_dataset(history_data)
print(df[['input_text', 'target_text']].head(10))


                                          input_text  \
0                                    question: Hello   
1                                       question: Hi   
2                                question: Hey there   
3                                  question: Goodbye   
4                            question: See you later   
5                       question: Who was Cleopatra?   
6  question: Tell me about the fall of the Roman ...   
7  question: Tell me about the fall of the Roman ...   
8     question: What was life like in ancient Egypt?   
9  question: What was the significance of the Mag...   

                                         target_text  
0  Hi there! Welcome to HistoryBot. I'm here to h...  
1  Hello! I'm your guide to the past. Whether you...  
2  Hey! Ready to dive into history? From the pyra...  
3  Goodbye! Thank you for exploring history with ...  
4  See you later! The chronicles of history will ...  
5  Cleopatra VII (69-30 BCE) was the last active ... 

# 5. Run all the function to Train the Model

In [19]:
# Main execution flow
if __name__ == "__main__":

    # Load and preprocess data from CSV
    df = load_dataset(history_data)

    # Split the data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

    # Initialize tokenizer
    tokenizer = T5Tokenizer.from_pretrained('t5-small')

    # Tokenize data
    train_dataset = tokenize_data(train_df, tokenizer)
    val_dataset = tokenize_data(val_df, tokenizer)

    # Fine-tune the model
    model, tokenizer, history = fine_tune_model(
        train_dataset,
        val_dataset,
        model_name='t5-small',  # You can use 't5-base' for better results
        epochs=6,
        batch_size=4
    )

    # Evaluate the model
    bleu_score, rouge_scores, accuracy, responses = evaluate_chatbot(test_df, model, tokenizer)

    # Save evaluation results
    evaluation_results = {
        'bleu_score': bleu_score,
        'rouge_scores': rouge_scores,
        'accuracy': accuracy,
        'examples': [
            {
                'user_input': row['user'],
                'reference': row['bot'],
                'generated': resp
            }
            for (_, row), resp in zip(test_df.iterrows(), responses)
        ]
    }

    import json
    with open('/content/drive/MyDrive/2025/data/evaluation_results.json', 'w') as f:
        json.dump(evaluation_results, f, indent=2)

    # Create web interface
    demo = create_web_interface(model, tokenizer)
    demo.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGene

Epoch 1/6



Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
BLEU Score: 0.039384050217590734
ROUGE Scores: {'rouge1': 0.09020043868587588, 'rouge2': 0.03760325604688853, 'rougeL': 0.07707438696990156}
Exact Match Accuracy: 0.0
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c1a690206e92668d20.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
