# Advanced Analysis and Log Mining

This notebook demonstrates advanced techniques for analyzing student learning data and detecting error patterns.

## Topics Covered
- Log data extraction and processing
- Pattern detection in student errors
- Statistical analysis of learning outcomes
- Visualization of learning trajectories

In [None]:
# Setup
import sys
sys.path.insert(0, '../..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

from core.learning_logger import LearningLogger
from core.exercise_spec import load_exercise_spec
from core.symbolic_engine import SymbolicEngine
from core.computation_engine import ComputationEngine
from core.hint_engine import HintEngine

sns.set_style("whitegrid")
%matplotlib inline

## 1. Loading and Processing Log Data

This section builds a small sample MathLang learning log in memory with `LearningLogger` and converts its records to a pandas DataFrame for analysis.

In [None]:
# Example: build a small in-memory log to show the record structure
logger = LearningLogger()

# One simulated session: the problem statement, a single step, then the end marker
for _event in (
    dict(phase="problem", expression="x + x", rendered="Problem: x + x", status="ok"),
    dict(phase="step", expression="2*x", rendered="Step: 2*x", status="ok"),
    dict(phase="end", expression="done", rendered="End: done", status="ok"),
):
    logger.record(**_event)

# Tabulate the recorded events so they can be inspected with pandas
df = pd.DataFrame(logger.records)
print("Log Data Structure:")
print(df.head())
print(f"\nColumns: {df.columns.tolist()}")

## 2. Error Pattern Detection

Identify common mistake patterns in student work.

In [None]:
class ErrorPatternAnalyzer:
    """Group 'mistake' log records by the reason stored in their metadata.

    Patterns accumulate on the instance: every call to ``analyze_logs`` adds
    to ``self.patterns`` instead of resetting it.
    """

    def __init__(self):
        # Maps reason -> list of {'expression', 'phase', 'timestamp'} dicts.
        self.patterns = {}

    def analyze_logs(self, logs):
        """Extract error patterns from logs.

        Args:
            logs: Iterable of log-record dicts. Only records whose ``status``
                is ``'mistake'`` are used; records without a ``status`` key
                are ignored. The reason is read from ``meta['reason']`` and
                falls back to ``'unknown'`` when ``meta`` is missing, ``None``
                or has no ``reason`` entry.

        Returns:
            dict: ``self.patterns`` (the same object held by the instance),
            mapping each reason to the list of matching records.
        """
        for log in logs:
            if log.get('status') != 'mistake':
                continue

            # `meta` may be absent or explicitly None; treat both as empty.
            meta = log.get('meta') or {}
            reason = meta.get('reason', 'unknown')

            self.patterns.setdefault(reason, []).append({
                'expression': log.get('expression'),
                'phase': log.get('phase'),
                'timestamp': log.get('timestamp')
            })

        return self.patterns

    def get_pattern_frequencies(self):
        """Return a dict mapping each reason to its number of occurrences."""
        return {reason: len(items) for reason, items in self.patterns.items()}

    def visualize_patterns(self):
        """Show a bar chart of error-pattern frequencies.

        Prints a short message and returns without plotting when no patterns
        have been collected yet.
        """
        freqs = self.get_pattern_frequencies()
        if not freqs:
            print("No error patterns recorded; nothing to plot.")
            return

        # Stringify labels so non-string reasons (e.g. None) can be drawn
        # as categorical bar labels.
        labels = [str(reason) for reason in freqs]
        counts = list(freqs.values())

        plt.figure(figsize=(10, 6))
        plt.bar(labels, counts)
        plt.xlabel('Error Pattern')
        plt.ylabel('Frequency')
        plt.title('Error Pattern Distribution')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Example usage: create an analyzer, then feed it log records.
# `your_logs` below is a placeholder for your own list of log-record dicts.
analyzer = ErrorPatternAnalyzer()
# analyzer.analyze_logs(your_logs)
# analyzer.visualize_patterns()

## 3. Hint Effectiveness Analysis

Analyze which hints are most effective for different error types.

In [None]:
def analyze_hint_effectiveness(logs):
    """Measure how often a hinted mistake is immediately followed by success.

    A "hint event" is a record whose ``status`` is ``'mistake'`` and whose
    ``meta`` dict contains a ``'hint'`` entry. The event counts as successful
    when the very next record in ``logs`` has ``status == 'ok'``.

    Args:
        logs: Sequence (indexable, with ``len``) of log-record dicts.
            Records without ``status`` or with ``meta`` missing/``None``
            are tolerated and simply never count as hint events.

    Returns:
        pandas.Series: mean of ``next_success`` per ``hint_type`` (index).
        Hints whose type is missing are grouped under a null key, which
        ``groupby`` drops by default. An empty float Series is returned when
        there are no hint events.
    """
    hint_events = []
    for i, log in enumerate(logs):
        # `meta` may be absent or explicitly None; treat both as empty.
        meta = log.get('meta') or {}
        if log.get('status') != 'mistake' or 'hint' not in meta:
            continue

        hint = meta['hint']
        # Only dict-shaped hints carry a 'type'; anything else has no type.
        hint_type = hint.get('type') if isinstance(hint, dict) else None

        # Success is judged on the immediately following record only.
        has_next = i + 1 < len(logs)
        next_success = has_next and logs[i + 1].get('status') == 'ok'

        hint_events.append({
            'hint_type': hint_type,
            'next_success': next_success
        })

    if not hint_events:
        # Explicit dtype: a bare pd.Series() yields an object Series and
        # emits a deprecation warning on pandas 1.x.
        return pd.Series(dtype=float, name='next_success')

    events_df = pd.DataFrame(hint_events)
    return events_df.groupby('hint_type')['next_success'].mean()

# Example
# effectiveness = analyze_hint_effectiveness(your_logs)
# print(f"Hint Effectiveness:\n{effectiveness}")

## 4. Learning Trajectory Visualization

Visualize how students progress through problems.

In [None]:
def plot_learning_trajectory(logs, window=5):
    """Plot the rolling success rate across the records of a session.

    Each record counts as a success when its ``status`` equals ``'ok'``.
    The plotted value at step *k* is the mean success over the last
    ``window`` records up to and including *k* (fewer at the start, since
    ``min_periods=1``).

    Args:
        logs: Log records in any form accepted by ``pd.DataFrame``
            (typically a list of dicts with a ``status`` key).
        window: Size of the rolling window in steps. Defaults to 5.

    Returns:
        None. The figure is shown with ``plt.show()``. When there are no
        records, or none of them has a ``status`` field, a message is
        printed and nothing is plotted.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    df = pd.DataFrame(logs)
    if df.empty or 'status' not in df.columns:
        print("No log records with a 'status' field; nothing to plot.")
        return

    # 1 for a successful record, 0 otherwise (missing status counts as 0).
    success = (df['status'] == 'ok').astype(int)
    rolling_success = success.rolling(window=window, min_periods=1).mean()
    step_num = list(range(len(df)))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(step_num, rolling_success, marker='o', linewidth=2)
    ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='50% threshold')
    ax.set_xlabel('Step Number')
    ax.set_ylabel(f'Success Rate (rolling {window}-step average)')
    ax.set_title('Learning Trajectory')
    # Slight headroom above 1.0 so points at 100% are not clipped.
    ax.set_ylim(0, 1.1)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

# Example
# plot_learning_trajectory(your_logs)

## 5. Multi-Student Comparison

Compare performance across multiple students.

In [None]:
def _summarize_student_logs(student_id, logs):
    """Compute step, success and error counts for one student's records."""
    # reindex guarantees both columns exist even for empty or partial logs.
    df = pd.DataFrame(logs).reindex(columns=['phase', 'status'])

    is_step = df['phase'].isin(['step', 'end'])
    is_ok = df['status'] == 'ok'

    total_steps = int(is_step.sum())
    successful_steps = int((is_step & is_ok).sum())

    return {
        'student_id': student_id,
        'total_steps': total_steps,
        'successful_steps': successful_steps,
        'success_rate': successful_steps / total_steps if total_steps > 0 else 0,
        'errors': int((df['status'] == 'mistake').sum())
    }


def _plot_student_comparison(comparison_df):
    """Draw side-by-side bar charts of success rate and error count."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Success rates
    axes[0].bar(comparison_df['student_id'], comparison_df['success_rate'])
    axes[0].set_xlabel('Student ID')
    axes[0].set_ylabel('Success Rate')
    axes[0].set_title('Student Success Rates')
    axes[0].set_ylim(0, 1)

    # Error counts
    axes[1].bar(comparison_df['student_id'], comparison_df['errors'], color='coral')
    axes[1].set_xlabel('Student ID')
    axes[1].set_ylabel('Number of Errors')
    axes[1].set_title('Error Frequency by Student')

    plt.tight_layout()
    plt.show()


def compare_students(student_logs_dict, plot=True):
    """Compare step success and error counts across multiple students.

    Records whose ``phase`` is ``'step'`` or ``'end'`` count as steps; a step
    is successful when its ``status`` is ``'ok'``. Errors are all records
    (any phase) whose ``status`` is ``'mistake'``.

    Args:
        student_logs_dict: Mapping of student id -> log records (any form
            accepted by ``pd.DataFrame``, typically a list of dicts).
            Students with empty logs get zero counts.
        plot: When True (default), show bar charts of the success rates and
            error counts. Pass False to only compute the metrics.

    Returns:
        pandas.DataFrame: one row per student with columns ``student_id``,
        ``total_steps``, ``successful_steps``, ``success_rate`` and
        ``errors``. The frame is empty (but has these columns) when the
        input mapping is empty.
    """
    columns = ['student_id', 'total_steps', 'successful_steps', 'success_rate', 'errors']
    metrics = [
        _summarize_student_logs(student_id, logs)
        for student_id, logs in student_logs_dict.items()
    ]
    comparison_df = pd.DataFrame(metrics, columns=columns)

    # Nothing to draw for an empty comparison.
    if plot and not comparison_df.empty:
        _plot_student_comparison(comparison_df)

    return comparison_df

# Example
# student_data = {
#     'student_1': logs1,
#     'student_2': logs2,
#     'student_3': logs3
# }
# comparison = compare_students(student_data)
# print(comparison)

## 6. Exercise Difficulty Analysis

Analyze which exercises are most challenging.

In [None]:
def analyze_exercise_difficulty(exercise_logs_dict):
    """Summarize attempts and error rate per exercise.

    Args:
        exercise_logs_dict: Mapping of exercise id -> log records (any form
            accepted by ``pd.DataFrame``, typically a list of dicts).
            Exercises with empty logs get zero attempts and a zero rate.

    Returns:
        pandas.DataFrame: one row per exercise with columns

        - ``exercise_id``
        - ``avg_attempts``: number of records whose ``phase`` is ``'step'``.
          Despite the name this is a raw count, not an average.
        - ``error_rate``: number of records with ``status == 'mistake'``
          (in any phase) divided by ``avg_attempts``; 0 when there are no
          step records.

        The frame is empty (but has these columns) for an empty mapping.

    NOTE(review): mistakes are counted over all phases while attempts only
    count 'step' records, so ``error_rate`` can exceed 1 if mistakes are
    logged outside the 'step' phase - confirm this is intended.
    """
    columns = ['exercise_id', 'avg_attempts', 'error_rate']
    difficulty_metrics = []

    for exercise_id, logs in exercise_logs_dict.items():
        # reindex guarantees both columns exist even for empty or partial logs.
        df = pd.DataFrame(logs).reindex(columns=['phase', 'status'])

        attempts = int((df['phase'] == 'step').sum())
        errors = int((df['status'] == 'mistake').sum())

        difficulty_metrics.append({
            'exercise_id': exercise_id,
            'avg_attempts': attempts,
            'error_rate': errors / attempts if attempts > 0 else 0
        })

    return pd.DataFrame(difficulty_metrics, columns=columns)

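# Example
# exercise_data = {
#     'exercise_1': logs1,
#     'exercise_2': logs2
# }
# difficulty = analyze_exercise_difficulty(exercise_data)
# print(difficulty)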

## Summary

This notebook provided tools for:
- Processing and analyzing learning logs
- Detecting error patterns
- Evaluating hint effectiveness
- Visualizing learning trajectories
- Comparing student performance
- Assessing exercise difficulty

Use these techniques to gain insights into student learning and improve educational interventions.