# Fine-tune GPT-2 on NCERT Class 7 Text Data
This notebook fine-tunes a pretrained GPT-2 model on NCERT Class 7 textbook text so that it can serve as a personalized teacher/coach.

In [None]:
# Uncomment the line below to install the required libraries if they are not already installed
# (%pip installs into the environment of the running kernel)
# %pip install transformers datasets torch accelerate matplotlib


In [None]:
import os
from pathlib import Path
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
import torch
from datetime import datetime


In [None]:
# Set paths for data
# base_dir is walked recursively for .txt files by load_text_files below;
# save_model_dir is used both as the Trainer output directory and as the
# destination for the final model/tokenizer save.
base_dir = "path/to/your/data"  # Replace with the directory containing your NCERT text files
save_model_dir = "./class7_gpt2_model"

# Load and prepare dataset
def load_text_files(base_dir):
    """Load all text files from a directory and subdirectories.

    Files are visited in a deterministic order (directories and file names
    sorted alphabetically at every level), so the resulting dataset is the
    same on every run and on every filesystem.

    Args:
        base_dir: Root directory to search recursively.

    Returns:
        A list with the full contents of each file whose name ends in
        '.txt', decoded as UTF-8. Empty if the directory does not exist or
        contains no matching files.
    """
    text_data = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order instead of the filesystem's arbitrary listing order.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text_data.append(f.read())
    return text_data

print(f"{datetime.now()} - Loading text data...")
text_data = load_text_files(base_dir)
print(f"{datetime.now()} - Loaded {len(text_data)} text files.")

# Fail fast with an actionable message: a wrong or placeholder base_dir makes
# the loader return an empty list, and every later cell would otherwise run
# on an empty dataset.
if not text_data:
    raise FileNotFoundError(
        f"No .txt files found under {base_dir!r}. "
        "Set base_dir to the directory containing the NCERT text files."
    )

# Create Hugging Face Dataset (one example per source file, in a single "text" column)
dataset = Dataset.from_dict({"text": text_data})
print(f"{datetime.now()} - Dataset created with {len(dataset)} examples.")


In [None]:
# Load tokenizer
print(f"{datetime.now()} - Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Register '[PAD]' as the padding token: the tokenization step pads every
# example to max_length and therefore needs a pad token to be defined.
# The model's embedding matrix is resized to len(tokenizer) after the model is loaded.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add padding token for training

# Tokenize dataset
def tokenize_function(examples, max_length=1024, tok=None):
    """Tokenize a batch of documents into fixed-length training rows.

    Each document is tokenized in full and then split into consecutive
    windows of ``max_length`` tokens, so long files contribute all of their
    text instead of only their first ``max_length`` tokens. The last window
    of a document is right-padded with the pad token. Documents that produce
    no tokens yield no rows.

    Intended for ``Dataset.map(..., batched=True, remove_columns=["text"])``:
    the number of returned rows may differ from the number of input rows,
    which is only valid when the original columns are removed.

    Args:
        examples: Batch dict with a "text" key holding a list of strings.
        max_length: Window size in tokens (1024 is GPT-2's context length).
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        Dict with "input_ids" and "attention_mask", each a list of lists of
        length ``max_length``.
    """
    tok = tokenizer if tok is None else tok
    encoded = tok(examples["text"])  # no truncation: keep every token
    pad_id = tok.pad_token_id

    input_ids, attention_mask = [], []
    for ids in encoded["input_ids"]:
        for start in range(0, len(ids), max_length):
            chunk = list(ids[start:start + max_length])
            n_pad = max_length - len(chunk)
            input_ids.append(chunk + [pad_id] * n_pad)
            # Mask out padding so it is ignored by attention
            attention_mask.append([1] * len(chunk) + [0] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask}

print(f"{datetime.now()} - Tokenizing dataset...")
# batched=True hands tokenize_function a dict of lists rather than one example at a time;
# remove_columns drops the raw "text" column so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print(f"{datetime.now()} - Tokenization complete.")


In [None]:
# Batch collator configured for causal (non-masked) language modelling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Histogram of how many tokens each raw document contains (measured without truncation)
token_lengths = []
for text in text_data:
    token_lengths.append(len(tokenizer(text)["input_ids"]))

plt.hist(token_lengths, bins=50)
plt.xlabel("Token Length")
plt.ylabel("Frequency")
plt.title("Token Length Distribution")
plt.show()


In [None]:
# Load GPT-2 Model
print(f"{datetime.now()} - Loading GPT-2 model...")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# The tokenizer had a '[PAD]' special token added, so the embedding table is
# resized to the tokenizer's current length to keep token ids in range.
model.resize_token_embeddings(len(tokenizer))  # Adjust token embeddings
print(f"{datetime.now()} - Model loaded.")


In [None]:
# Define Training Arguments
# No evaluation strategy is set (the library default is "no"): the Trainer in
# this notebook is given only a training set, and requesting per-epoch
# evaluation without an eval_dataset makes the run fail.
training_args = TrainingArguments(
    output_dir=save_model_dir,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Adjust based on available accelerator memory
    save_steps=10_000,
    save_total_limit=2,
    # fp16 mixed precision is rejected by TrainingArguments on Apple Silicon (MPS)
    # and CPU, so enable it only when a CUDA GPU is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

print(f"{datetime.now()} - Training arguments configured.")


In [None]:
# Assemble the Trainer from the model, arguments, data and collator built in earlier cells
print(f"{datetime.now()} - Setting up Trainer...")
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer, data_collator=data_collator,
)

# Run the fine-tuning loop
print(f"{datetime.now()} - Starting training...")
trainer.train()
print(f"{datetime.now()} - Training complete.")

# Persist the fine-tuned weights together with the tokenizer in the same directory
print(f"{datetime.now()} - Saving model to {save_model_dir}...")
model.save_pretrained(save_model_dir)
tokenizer.save_pretrained(save_model_dir)


In [None]:
# Test the Trained Model
def generate_response(prompt, max_length=50):
    """Generate a sampled continuation of ``prompt`` using the trained model.

    Args:
        prompt: Text the model should continue.
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens plus generated tokens).

    Returns:
        The decoded sequence (prompt followed by the continuation) with
        special tokens removed.
    """
    # Training leaves the model in train mode; switch to eval so dropout
    # does not perturb generation.
    model.eval()
    # Place inputs on whatever device the model lives on instead of assuming
    # Apple "mps", and keep the attention mask alongside the input ids.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,  # without this, temperature/top_k/top_p are ignored (greedy decoding)
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Test
# Smoke test: send one curriculum-style question through generate_response
# and print the decoded text it returns.
prompt = "Explain the properties of integers."
response = generate_response(prompt)
print("Response:", response)
