<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/Advanced_Educational_Content_Generation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    BertForSequenceClassification,
    BertTokenizer
)
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import numpy as np
from nltk.tokenize import sent_tokenize
import spacy
import textstat
import json
import logging
from tqdm import tqdm

@dataclass
class ContentMetrics:
    readability_score: float
    coherence_score: float
    factual_accuracy: float
    engagement_score: float
    overall_score: float

class ContentEvaluator:
    def __init__(self):
        # Load BERT model for coherence checking
        self.coherence_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        self.coherence_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load spaCy for linguistic analysis
        self.nlp = spacy.load('en_core_web_sm')

    def evaluate_readability(self, content: str) -> float:
        """Evaluate text readability using multiple metrics"""
        flesch_score = textstat.flesch_reading_ease(content)
        grade_level = textstat.coleman_liau_index(content)

        # Normalize scores to 0-1 range
        normalized_flesch = flesch_score / 100
        normalized_grade = (20 - min(grade_level, 20)) / 20

        return (normalized_flesch + normalized_grade) / 2

    def evaluate_coherence(self, content: str) -> float:
        """Evaluate text coherence using BERT"""
        inputs = self.coherence_tokenizer(
            content,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            outputs = self.coherence_model(**inputs)
            coherence_score = torch.sigmoid(outputs.logits).item()

        return coherence_score

    def evaluate_factual_accuracy(self, content: str) -> float:
        """
        Evaluate factual accuracy using NER and relationship checking
        This is a simplified version - in production, you'd want to check against a knowledge base
        """
        doc = self.nlp(content)

        # Check for presence of facts (entities, numbers, dates)
        facts_count = len([ent for ent in doc.ents])
        sentences = len(list(doc.sents))

        if sentences == 0:
            return 0.0

        facts_per_sentence = facts_count / sentences
        return min(facts_per_sentence / 3, 1.0)  # Normalize to 0-1

    def evaluate_engagement(self, content: str) -> float:
        """
        Evaluate potential engagement based on various factors
        """
        doc = self.nlp(content)

        # Check for interactive elements
        question_count = len([sent for sent in doc.sents
                            if sent.text.strip().endswith('?')])

        # Check for variety in sentence structure
        sentence_lengths = [len(sent) for sent in doc.sents]
        length_variety = np.std(sentence_lengths) if sentence_lengths else 0

        # Normalize scores
        question_score = min(question_count / 5, 1.0)
        variety_score = min(length_variety / 20, 1.0)

        return (question_score + variety_score) / 2

class ContentGenerator:
    def __init__(self):
        # Load GPT-2 for creative content generation
        self.gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

        # Load T5 for structured content generation
        self.t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')

        # Initialize evaluator
        self.evaluator = ContentEvaluator()

    def generate_content(
        self,
        prompt: str,
        target_audience: str,
        content_type: str,
        min_length: int = 200,
        max_length: int = 800
    ) -> Tuple[str, ContentMetrics]:
        """Generate educational content with specified parameters"""

        # Construct enhanced prompt
        enhanced_prompt = f"""
        Create {content_type} content about: {prompt}
        Target audience: {target_audience}
        Make it engaging and educational.
        Include examples and explanations.
        """

        # Generate initial content using GPT-2
        inputs = self.gpt2_tokenizer.encode(
            enhanced_prompt,
            return_tensors="pt",
            max_length=100
        )

        outputs = self.gpt2_model.generate(
            inputs,
            min_length=min_length,
            max_length=max_length,
            num_return_sequences=3,
            no_repeat_ngram_size=3,
            top_p=0.95,
            temperature=0.7,
            do_sample=True
        )

        generated_contents = [
            self.gpt2_tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

        # Evaluate and select best content
        best_content = None
        best_metrics = None
        best_score = -1

        for content in generated_contents:
            metrics = self._evaluate_content(content)
            if metrics.overall_score > best_score:
                best_score = metrics.overall_score
                best_content = content
                best_metrics = metrics

        # Refine content using T5 if needed
        if best_metrics.overall_score < 0.7:
            refined_content = self._refine_content(best_content)
            refined_metrics = self._evaluate_content(refined_content)

            if refined_metrics.overall_score > best_metrics.overall_score:
                return refined_content, refined_metrics

        return best_content, best_metrics

    def _evaluate_content(self, content: str) -> ContentMetrics:
        """Evaluate content using multiple metrics"""
        readability = self.evaluator.evaluate_readability(content)
        coherence = self.evaluator.evaluate_coherence(content)
        factual = self.evaluator.evaluate_factual_accuracy(content)
        engagement = self.evaluator.evaluate_engagement(content)

        # Calculate weighted overall score
        weights = {
            'readability': 0.25,
            'coherence': 0.3,
            'factual': 0.25,
            'engagement': 0.2
        }

        overall = (
            weights['readability'] * readability +
            weights['coherence'] * coherence +
            weights['factual'] * factual +
            weights['engagement'] * engagement
        )

        return ContentMetrics(
            readability_score=readability,
            coherence_score=coherence,
            factual_accuracy=factual,
            engagement_score=engagement,
            overall_score=overall
        )

    def _refine_content(self, content: str) -> str:
        """Refine content using T5 model"""
        input_text = f"refine educational content: {content}"
        inputs = self.t5_tokenizer.encode(
            input_text,
            return_tensors="pt",
            max_length=1024,
            truncation=True
        )

        outputs = self.t5_model.generate(
            inputs,
            max_length=1024,
            num_return_sequences=1,
            no_repeat_ngram_size=3,
            top_p=0.95,
            temperature=0.7
        )

        refined_content = self.t5_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )
        return refined_content

class EducationalContentPipeline:
    def __init__(self, num_workers: int = 4):
        self.generator = ContentGenerator()
        self.num_workers = num_workers

        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def generate_curriculum(
        self,
        topics: List[Dict],
        target_audience: str
    ) -> List[Dict]:
        """
        Generate a complete curriculum of educational content

        Args:
            topics: List of dicts containing topic info
            target_audience: Target

This advanced educational content generation system includes several sophisticated features:

Multi-Model Approach:
Uses GPT-2 for creative content generation
Uses T5 for content refinement
Uses BERT for coherence evaluation
Uses spaCy for linguistic analysis


Comprehensive Content Evaluation:
Readability scoring using multiple metrics
Coherence evaluation using BERT
Factual accuracy checking using NER
Engagement scoring based on interactive elements
Weighted scoring system for overall quality


Advanced Features:
Parallel processing for curriculum generation
Automatic content refinement when quality thresholds aren't met
Detailed metrics and logging
Support for different content types and audience levels
Error handling and recovery


Production-Ready Elements:
Type hints for better code maintainability
Logging system for monitoring and debugging
Progress tracking with tqdm
JSON output for easy integration
Modular design for easy extension

To use this system, you'll need to install the required dependencies:
bashCopypip install transformers torch spacy textstat nltk tqdm
python -m spacy download en_core_web_sm
The system can be used to generate entire curriculums or individual pieces of content, with detailed quality metrics for each generation. Would you like me to explain any specific part in more detail or show how to customize it for particular use cases?