
# **Adaptive Concept Drift Detection Framework**
**PySpark Consumer with Real-time Dashboard**

In [1]:
#Install required packages (if needed)
# Run this cell first if packages are not installed:
# !pip install pyspark kafka-python numpy pandas matplotlib ipywidgets

In [2]:
#Imports and Environment Setup
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import datetime
from collections import deque
import warnings
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from IPython.display import display, clear_output, HTML
import ipywidgets as widgets

warnings.filterwarnings('ignore')

In [3]:
# Set PySpark environment
# These variables are written before pyspark is imported below so that they are
# already in place when the Spark session is created.
# NOTE(review): the Kafka connector coordinates pin Scala 2.12 / Spark 3.2.0 —
# confirm they match the pyspark version installed in this kernel.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'
# Use the interpreter running this kernel for both the workers and the driver.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, col

In [None]:
# Kafka Configuration
# Broker address ("host:port") and topic the streaming reader subscribes to.
KAFKA_BROKER = "localhost:9092"
KAFKA_TOPIC = "drift_stream"

# Window Management
# Bounds and step used by AdaptiveWindowManager: the window starts at
# INITIAL_WINDOW_SIZE and is kept between MIN_WINDOW_SIZE and MAX_WINDOW_SIZE.
# MIN_WINDOW_SIZE is also the number of samples a feature window must hold
# before it is split into reference/current halves.
INITIAL_WINDOW_SIZE = 200
MIN_WINDOW_SIZE = 150
MAX_WINDOW_SIZE = 500
# Fraction by which the window is shrunk/grown per adjustment (before smoothing).
WINDOW_ADJUSTMENT_FACTOR = 0.1

# Drift Detection Parameters
# WARNING_THRESHOLD_MULTIPLIER / DRIFT_THRESHOLD_MULTIPLIER are compared against
# score/threshold ratios when deciding on model adaptation;
# DRIFT_THRESHOLD_MULTIPLIER is also the number of standard deviations added to
# the mean score when computing the adaptive threshold.
WARNING_THRESHOLD_MULTIPLIER = 0.8
DRIFT_THRESHOLD_MULTIPLIER = 1.0
# NOTE(review): not referenced anywhere in the visible part of this notebook.
MIN_SAMPLES_FOR_DETECTION = 30

# Performance Parameters
# Trigger interval of the streaming query.
BATCH_INTERVAL = "2 seconds"
# NOTE(review): this directory is created by create_results_directory() but is
# not passed to the stream writer in the visible code — confirm whether a
# checkpointLocation option was intended.
CHECKPOINT_LOCATION = "/tmp/drift_checkpoint"
# Upper bound on Kafka offsets consumed per micro-batch.
MAX_OFFSETS_PER_TRIGGER = 1000

In [5]:
# Spark Session Creation
def create_optimized_spark_session():
    """Build (or reuse) the SparkSession used by the streaming pipeline.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()`` for the
        application name "AdaptiveDriftDetection" with the options listed below.
    """
    # (key, value) pairs applied to the builder in this order.
    spark_options = (
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
        ("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true"),
        ("spark.sql.shuffle.partitions", "20"),
        ("spark.streaming.backpressure.enabled", "true"),
        ("spark.ui.showConsoleProgress", "false"),
    )

    builder = SparkSession.builder.appName("AdaptiveDriftDetection")
    for option_key, option_value in spark_options:
        builder = builder.config(option_key, option_value)
    return builder.getOrCreate()

In [6]:
# Hoeffding Tree Implementation
class HoeffdingTree:
    """Simplified Hoeffding Tree for streaming classification.

    The tree consists of a single leaf that keeps per-class counts, so
    predictions are always the most frequently seen label (or '0' before any
    sample has been observed). ``grace_period`` and ``split_confidence`` are
    stored but not used by the methods below.
    """

    def __init__(self, grace_period=200, split_confidence=0.95):
        self.grace_period = grace_period
        self.split_confidence = split_confidence
        # Root node: a leaf holding label counts and the number of samples seen.
        self.tree = {'is_leaf': True, 'class_counts': {}, 'total_count': 0}
        # Rolling record of the accuracies returned by get_accuracy().
        self.accuracy_history = deque(maxlen=100)

    def partial_fit(self, X, y):
        """Feed every (X[i], y[i]) pair into the root leaf's statistics."""
        n_samples = len(X)
        for idx in range(n_samples):
            self._update_leaf(self.tree, X[idx], y[idx])

    def _update_leaf(self, node, x, y):
        """Increment the count for label ``y`` and the node's sample total."""
        counts = node['class_counts']
        counts[y] = counts.get(y, 0) + 1
        node['total_count'] += 1

    def predict(self, X):
        """Return a numpy array with one predicted label per element of ``X``."""
        return np.array([self._predict_single(self.tree, sample) for sample in X])

    def _predict_single(self, node, x):
        """Return the majority label stored in ``node``; '0' when none exists."""
        if not node['is_leaf']:
            return '0'
        counts = node['class_counts']
        if not counts:
            return '0'
        return max(counts, key=counts.get)

    def get_accuracy(self, X, y):
        """Fraction of ``X`` predicted as ``y``; 1.0 for an empty ``X``.

        Non-empty results are also appended to ``accuracy_history``.
        """
        if len(X) == 0:
            return 1.0
        hit_rate = np.mean(self.predict(X) == y)
        self.accuracy_history.append(hit_rate)
        return hit_rate

In [7]:
# Adaptive Window Manager
class AdaptiveWindowManager:
    """Manages adaptive sliding windows for drift detection.

    One bounded deque is kept per feature. The target length of those deques
    (``window_size``) is adjusted by ``update_window_size`` and applied lazily
    the next time data is added for a feature.
    """

    def __init__(self, initial_size=INITIAL_WINDOW_SIZE):
        self.window_size = initial_size
        self.min_size = MIN_WINDOW_SIZE
        self.max_size = MAX_WINDOW_SIZE
        self.adjustment_factor = WINDOW_ADJUSTMENT_FACTOR
        # feature name -> deque of the most recent values for that feature
        self.data_windows = {}

    def update_window_size(self, drift_detected, confidence):
        """Dynamically adjust window size based on drift detection.

        Args:
            drift_detected: True to shrink the window (more sensitivity),
                False to grow it (more stability).
            confidence: Only used when growing; the closer to 1 the smaller
                the growth step. Values outside [0, 1] are clamped so the
                growth branch can never shrink the window.
        """
        if drift_detected:
            # Decrease window size for better sensitivity
            new_size = max(self.min_size,
                           int(self.window_size * (1 - self.adjustment_factor)))
        else:
            # Increase window size for stability
            bounded = min(1.0, max(0.0, confidence))
            growth = 1 + self.adjustment_factor * (1 - bounded)
            new_size = min(self.max_size, int(self.window_size * growth))

        # Smooth transition: move 30% of the way towards the new size.
        self.window_size = int(0.7 * self.window_size + 0.3 * new_size)

    def add_data(self, feature, values):
        """Append ``values`` to the window of ``feature``.

        The window is (re)built whenever its capacity differs from the current
        ``window_size``. Capacity (``maxlen``) is compared rather than the
        current length, so a partially filled window is not rebuilt on every
        call and a resize is never missed because the length happens to equal
        the new size. When shrinking, the most recent values are kept.
        """
        window = self.data_windows.get(feature)
        if window is None or window.maxlen != self.window_size:
            window = deque(window or (), maxlen=self.window_size)
            self.data_windows[feature] = window

        window.extend(values)

    def get_reference_and_current(self, feature):
        """Split a feature window into (older half, newer half).

        Returns ``(None, None)`` when the feature is unknown or its window
        holds fewer than ``min_size`` values.
        """
        window = self.data_windows.get(feature)
        if window is None or len(window) < self.min_size:
            return None, None

        data = list(window)
        split_point = len(data) // 2
        return data[:split_point], data[split_point:]

In [8]:
# Drift Detection Algorithms
class DriftDetector:
    """Base class for drift detectors.

    Holds the shared window manager, a rolling history of drift scores and the
    threshold logic used by the concrete detectors.
    """

    def __init__(self, window_manager):
        self.window_manager = window_manager
        # Most recent drift scores; basis for the adaptive threshold.
        self.drift_scores = deque(maxlen=50)
        # Fixed threshold used until enough history exists (or when the
        # adaptive threshold is switched off).
        self.threshold = 0.1
        self.adaptive_threshold = True

    def calculate_threshold(self):
        """Return the threshold a score must exceed to count as drift.

        The fixed ``self.threshold`` is returned when ``adaptive_threshold`` is
        False or fewer than 10 scores have been recorded. Otherwise the
        threshold is ``mean + DRIFT_THRESHOLD_MULTIPLIER * std`` of the
        recorded scores.
        """
        # Honour the adaptive_threshold flag (it was previously ignored).
        if not self.adaptive_threshold or len(self.drift_scores) < 10:
            return self.threshold

        scores = list(self.drift_scores)
        mean = np.mean(scores)
        std = np.std(scores)
        return mean + DRIFT_THRESHOLD_MULTIPLIER * std

class KLDivergenceDetector(DriftDetector):
    """KL Divergence for sudden drift detection"""

    def __init__(self, window_manager):
        super().__init__(window_manager)
        self.name = "KL Divergence"
        self.drift_type = "sudden"

    def detect(self, features):
        """Detect drift using KL divergence.

        Args:
            features: Iterable of feature names whose windows are compared.

        Returns:
            ``(drift_detected, avg_score, details)``. ``details`` contains
            ``'threshold'`` and ``'feature_scores'``; when no feature has
            enough data the result is ``(False, 0.0, {})``.
        """
        scores = []

        for feature in features:
            ref_data, curr_data = self.window_manager.get_reference_and_current(feature)
            if ref_data is None or curr_data is None:
                continue

            scores.append(self._calculate_kl_divergence(ref_data, curr_data))

        if not scores:
            return False, 0.0, {}

        # Plain Python scalars keep the results JSON/format friendly.
        avg_score = float(np.mean(scores))
        self.drift_scores.append(avg_score)

        threshold = float(self.calculate_threshold())
        drift_detected = bool(avg_score > threshold)

        return drift_detected, avg_score, {
            'threshold': threshold,
            'feature_scores': scores
        }

    def _calculate_kl_divergence(self, ref_data, curr_data, n_bins=10):
        """Calculate KL(ref || curr) over a shared ``n_bins``-bin histogram.

        Non-finite values (NaN/inf, e.g. from null feature values) are ignored
        because they would otherwise corrupt the bin edges. If either sample
        has no finite values the divergence is reported as 0.0.
        """
        ref = np.asarray(ref_data, dtype=float)
        curr = np.asarray(curr_data, dtype=float)
        ref = ref[np.isfinite(ref)]
        curr = curr[np.isfinite(curr)]
        if ref.size == 0 or curr.size == 0:
            return 0.0

        # Shared bin edges spanning both samples
        min_val = min(ref.min(), curr.min())
        max_val = max(ref.max(), curr.max())
        bins = np.linspace(min_val, max_val, n_bins + 1)

        ref_hist, _ = np.histogram(ref, bins=bins)
        curr_hist, _ = np.histogram(curr, bins=bins)

        # Normalize; the small constant avoids log(0) and division by zero.
        ref_hist = ref_hist + 1e-10
        curr_hist = curr_hist + 1e-10
        ref_hist = ref_hist / ref_hist.sum()
        curr_hist = curr_hist / curr_hist.sum()

        # Calculate KL divergence
        kl = np.sum(ref_hist * np.log(ref_hist / curr_hist))

        return float(kl)

class HellingerDistanceDetector(DriftDetector):
    """Hellinger Distance for gradual/incremental drift detection"""

    def __init__(self, window_manager):
        super().__init__(window_manager)
        self.name = "Hellinger Distance"
        self.drift_type = "gradual/incremental"

    def detect(self, features):
        """Detect drift using Hellinger distance.

        Args:
            features: Iterable of feature names whose windows are compared.

        Returns:
            ``(drift_detected, avg_score, details)``. ``details`` contains
            ``'threshold'`` and ``'feature_scores'``; when no feature has
            enough data the result is ``(False, 0.0, {})``.
        """
        scores = []

        for feature in features:
            ref_data, curr_data = self.window_manager.get_reference_and_current(feature)
            if ref_data is None or curr_data is None:
                continue

            scores.append(self._calculate_hellinger_distance(ref_data, curr_data))

        if not scores:
            return False, 0.0, {}

        # Plain Python scalars keep the results JSON/format friendly.
        avg_score = float(np.mean(scores))
        self.drift_scores.append(avg_score)

        threshold = float(self.calculate_threshold())
        drift_detected = bool(avg_score > threshold)

        return drift_detected, avg_score, {
            'threshold': threshold,
            'feature_scores': scores
        }

    def _calculate_hellinger_distance(self, ref_data, curr_data, n_bins=10):
        """Calculate the Hellinger distance over a shared ``n_bins``-bin histogram.

        Non-finite values (NaN/inf, e.g. from null feature values) are ignored
        because they would otherwise corrupt the bin edges. If either sample
        has no finite values the distance is reported as 0.0.
        """
        ref = np.asarray(ref_data, dtype=float)
        curr = np.asarray(curr_data, dtype=float)
        ref = ref[np.isfinite(ref)]
        curr = curr[np.isfinite(curr)]
        if ref.size == 0 or curr.size == 0:
            return 0.0

        # Shared bin edges spanning both samples
        min_val = min(ref.min(), curr.min())
        max_val = max(ref.max(), curr.max())
        bins = np.linspace(min_val, max_val, n_bins + 1)

        ref_hist, _ = np.histogram(ref, bins=bins)
        curr_hist, _ = np.histogram(curr, bins=bins)

        # Normalize to probability vectors (same smoothing as the KL detector).
        ref_hist = ref_hist + 1e-10
        curr_hist = curr_hist + 1e-10
        ref_hist = ref_hist / ref_hist.sum()
        curr_hist = curr_hist / curr_hist.sum()

        # Calculate Hellinger distance
        hellinger = np.sqrt(0.5 * np.sum((np.sqrt(ref_hist) - np.sqrt(curr_hist))**2))

        return float(hellinger)


In [9]:
# Model Management System
class ModelManager:
    """Manages base learner models and handles retraining.

    Keeps the model currently in use, an optional candidate model that is
    trained in parallel while in warning mode, and a rolling accuracy history.
    """

    def __init__(self):
        self.current_model = HoeffdingTree()
        self.candidate_model = None
        self.model_version = 0
        self.performance_history = deque(maxlen=100)
        self.warning_mode = False

    def update_model(self, X, y):
        """Score the current model on a batch, then train on it.

        Accuracy is measured BEFORE the batch is learned (test-then-train), so
        the reported value reflects performance on unseen data instead of
        data the model has just been fitted on.

        Returns:
            The accuracy of the current model on ``(X, y)`` prior to training.
        """
        accuracy = self.current_model.get_accuracy(X, y)
        self.performance_history.append(accuracy)

        self.current_model.partial_fit(X, y)

        # Update candidate model if in warning mode
        if self.warning_mode and self.candidate_model:
            self.candidate_model.partial_fit(X, y)

        return accuracy

    def handle_warning(self):
        """Enter warning mode and start a fresh candidate model (idempotent)."""
        if not self.warning_mode:
            self.warning_mode = True
            self.candidate_model = HoeffdingTree()
            print("Warning mode activated - training candidate model")

    def handle_drift(self):
        """Replace the current model after drift confirmation.

        Promotes the candidate model when one exists, otherwise starts a new
        model from scratch. Either way the version counter is incremented and
        warning mode is left.
        """
        if self.candidate_model:
            self.current_model = self.candidate_model
            self.candidate_model = None
            self.model_version += 1
            self.warning_mode = False
            print(f"Model switched to version {self.model_version}")
        else:
            # Retrain from scratch
            self.current_model = HoeffdingTree()
            self.model_version += 1
            # Keep both paths consistent: a confirmed drift ends warning mode.
            self.warning_mode = False
            print(f"Model retrained - version {self.model_version}")

In [10]:
# Performance Tracker
class PerformanceTracker:
    """Tracks and analyzes system performance metrics"""

    def __init__(self):
        # Rolling metric histories (each keeps at most the last 100 values).
        self.metrics = {
            'drift_detection_delay': deque(maxlen=100),
            'false_positive_rate': deque(maxlen=100),
            'false_negative_rate': deque(maxlen=100),
            'accuracy_before_drift': deque(maxlen=100),
            'accuracy_after_adaptation': deque(maxlen=100),
            'processing_time': deque(maxlen=100),
            'memory_usage': deque(maxlen=100),
            # Per-batch accuracy passed to record_batch(); basis for the AUPC.
            'accuracy': deque(maxlen=100)
        }
        self.drift_history = []
        self.batch_count = 0

    def record_batch(self, batch_id, drift_detected, drift_type, detector_name,
                    accuracy, processing_time):
        """Record metrics for a batch.

        Stores the processing time and accuracy, logs a drift event when
        ``drift_detected`` is true, and returns the AUPC (Area Under
        Performance Curve): the trapezoidal integral, with unit spacing, over
        the recorded per-batch accuracies. Returns 0.0 until at least two
        accuracies are available.
        """
        self.batch_count += 1

        if drift_detected:
            self.drift_history.append({
                'batch_id': batch_id,
                'drift_type': drift_type,
                'detector': detector_name,
                'timestamp': datetime.now()
            })

        self.metrics['processing_time'].append(processing_time)
        # The accuracy argument used to be dropped, leaving the AUPC at 0.0.
        if accuracy is not None:
            self.metrics['accuracy'].append(float(accuracy))

        history = list(self.metrics['accuracy'])
        if len(history) < 2:
            return 0.0

        # np.trapz was renamed to np.trapezoid in NumPy 2; support both.
        integrate = getattr(np, 'trapezoid', None) or np.trapz
        return float(integrate(history))

    def get_summary(self):
        """Return totals, drift rate, mean processing time and drift-type counts."""
        processing_times = self.metrics['processing_time']
        summary = {
            'total_batches': self.batch_count,
            'total_drifts': len(self.drift_history),
            # max(1, ...) avoids division by zero before the first batch
            'drift_rate': len(self.drift_history) / max(1, self.batch_count),
            'avg_processing_time': float(np.mean(processing_times)) if processing_times else 0,
            'drift_types': {}
        }

        # Count drift types
        for drift in self.drift_history:
            dtype = drift['drift_type']
            summary['drift_types'][dtype] = summary['drift_types'].get(dtype, 0) + 1

        return summary


In [11]:
# Real-time Dashboard
class DriftDetectionDashboard:
    """Interactive dashboard for real-time monitoring.

    Builds an ipywidgets layout (status banner, metrics table, summary panel
    and plots) and redraws it on every call to ``update``. Plot histories are
    limited to the last 100 batches.
    """

    def __init__(self):
        self.setup_ui()
        self.plot_data = {
            'batches': deque(maxlen=100),
            'kl_scores': deque(maxlen=100),
            'hellinger_scores': deque(maxlen=100),
            'accuracy': deque(maxlen=100),
            'window_size': deque(maxlen=100)
        }
        # (batch_id, detector_key) pairs for batches in which drift was detected
        self.drift_markers = []

    def setup_ui(self):
        """Create dashboard UI components"""
        # Status display
        self.status_label = widgets.HTML(
            value=self._create_status_html("Initializing...", "info")
        )

        # Metrics display
        self.metrics_output = widgets.Output()

        # Plots
        self.plot_output = widgets.Output()

        # Performance summary
        self.summary_output = widgets.Output()

        # Control buttons
        # NOTE(review): no click handlers are attached to these buttons in the
        # visible code — confirm whether they are wired up elsewhere.
        self.pause_button = widgets.Button(
            description='Pause',
            button_style='warning',
            icon='pause'
        )

        self.save_button = widgets.Button(
            description='Save Results',
            button_style='success',
            icon='save'
        )

        # Layout
        self.dashboard = widgets.VBox([
            widgets.HTML("<h2 style='text-align: center; color: #2c3e50;'>🚀 Adaptive Drift Detection Dashboard</h2>"),
            self.status_label,
            widgets.HBox([self.pause_button, self.save_button]),
            widgets.HBox([
                widgets.VBox([
                    widgets.HTML("<h3>📊 Real-time Metrics</h3>"),
                    self.metrics_output
                ], layout=widgets.Layout(width='40%')),
                widgets.VBox([
                    widgets.HTML("<h3>📈 Performance Summary</h3>"),
                    self.summary_output
                ], layout=widgets.Layout(width='60%'))
            ]),
            widgets.HTML("<h3>📉 Drift Detection Visualization</h3>"),
            self.plot_output
        ])

    def _create_status_html(self, message, status_type="info"):
        """Return the status banner HTML; unknown types use the 'info' colour."""
        colors = {
            'info': '#3498db',
            'warning': '#f39c12',
            'danger': '#e74c3c',
            'success': '#27ae60'
        }

        return f"""
        <div style='background-color: {colors.get(status_type, colors['info'])};
                    color: white; padding: 15px; border-radius: 10px;
                    text-align: center; font-size: 16px; margin: 10px 0;'>
            {message}
        </div>
        """

    def update(self, batch_id, drift_results, model_accuracy, window_size, performance_summary):
        """Update dashboard with new data.

        Args:
            batch_id: Identifier plotted on the x axis.
            drift_results: Mapping with 'kl' and 'hellinger' entries, each
                providing 'detected', 'score', 'threshold' and 'type'.
            model_accuracy: Accuracy value for this batch.
            window_size: Current adaptive window size.
            performance_summary: Mapping as produced by
                ``PerformanceTracker.get_summary()``.
        """
        # Update plot data
        self.plot_data['batches'].append(batch_id)
        self.plot_data['kl_scores'].append(drift_results['kl']['score'])
        self.plot_data['hellinger_scores'].append(drift_results['hellinger']['score'])
        self.plot_data['accuracy'].append(model_accuracy)
        self.plot_data['window_size'].append(window_size)

        # Check for drift
        drift_detected = any(result['detected'] for result in drift_results.values())
        if drift_detected:
            # First detector (in mapping order) that reported drift
            drift_info = next((k, v) for k, v in drift_results.items() if v['detected'])
            self.drift_markers.append((batch_id, drift_info[0]))
            status_msg = f"⚠️ DRIFT DETECTED - Type: {drift_info[1]['type']} | Detector: {drift_info[0].upper()}"
            status_type = "danger"
        else:
            status_msg = f"✅ STABLE - Batch {batch_id} | Accuracy: {model_accuracy:.3f}"
            status_type = "success"

        # Drop markers for batches that have scrolled out of the plot window so
        # the list does not grow without bound on a long-running stream.
        visible_batches = set(self.plot_data['batches'])
        self.drift_markers = [m for m in self.drift_markers if m[0] in visible_batches]

        self.status_label.value = self._create_status_html(status_msg, status_type)

        # Update metrics
        with self.metrics_output:
            clear_output(wait=True)
            self._display_metrics(drift_results, model_accuracy, window_size)

        # Update plots
        with self.plot_output:
            clear_output(wait=True)
            self._create_plots()

        # Update summary
        with self.summary_output:
            clear_output(wait=True)
            self._display_summary(performance_summary)

    def _display_metrics(self, drift_results, accuracy, window_size):
        """Render the current metrics as an HTML table"""
        html = f"""
        <table style='width: 100%; border-collapse: collapse;'>
            <tr style='background-color: #34495e; color: white;'>
                <th style='padding: 10px; text-align: left;'>Metric</th>
                <th style='padding: 10px; text-align: right;'>Value</th>
            </tr>
            <tr style='background-color: #ecf0f1;'>
                <td style='padding: 10px;'>Model Accuracy</td>
                <td style='padding: 10px; text-align: right;'>{accuracy:.3f}</td>
            </tr>
            <tr>
                <td style='padding: 10px;'>Window Size</td>
                <td style='padding: 10px; text-align: right;'>{window_size}</td>
            </tr>
            <tr style='background-color: #ecf0f1;'>
                <td style='padding: 10px;'>KL Score</td>
                <td style='padding: 10px; text-align: right;'>{drift_results['kl']['score']:.4f}</td>
            </tr>
            <tr>
                <td style='padding: 10px;'>KL Threshold</td>
                <td style='padding: 10px; text-align: right;'>{drift_results['kl']['threshold']:.4f}</td>
            </tr>
            <tr style='background-color: #ecf0f1;'>
                <td style='padding: 10px;'>Hellinger Score</td>
                <td style='padding: 10px; text-align: right;'>{drift_results['hellinger']['score']:.4f}</td>
            </tr>
            <tr>
                <td style='padding: 10px;'>Hellinger Threshold</td>
                <td style='padding: 10px; text-align: right;'>{drift_results['hellinger']['threshold']:.4f}</td>
            </tr>
        </table>
        """
        display(HTML(html))

    def _display_summary(self, summary):
        """Render the performance summary panel"""
        drift_types_html = ""
        for dtype, count in summary.get('drift_types', {}).items():
            drift_types_html += f"<li>{dtype}: {count}</li>"

        html = f"""
        <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px;'>
            <div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;'>
                <div>
                    <h4 style='color: #2c3e50; margin: 0;'>Total Batches</h4>
                    <p style='font-size: 24px; margin: 5px 0; color: #3498db;'>{summary['total_batches']}</p>
                </div>
                <div>
                    <h4 style='color: #2c3e50; margin: 0;'>Total Drifts</h4>
                    <p style='font-size: 24px; margin: 5px 0; color: #e74c3c;'>{summary['total_drifts']}</p>
                </div>
                <div>
                    <h4 style='color: #2c3e50; margin: 0;'>Drift Rate</h4>
                    <p style='font-size: 24px; margin: 5px 0; color: #f39c12;'>{summary['drift_rate']:.2%}</p>
                </div>
                <div>
                    <h4 style='color: #2c3e50; margin: 0;'>Avg Processing Time</h4>
                    <p style='font-size: 24px; margin: 5px 0; color: #27ae60;'>{summary['avg_processing_time']:.3f}s</p>
                </div>
            </div>
            <div style='margin-top: 15px;'>
                <h4 style='color: #2c3e50;'>Drift Types Detected:</h4>
                <ul style='margin: 5px 0;'>{drift_types_html if drift_types_html else '<li>None yet</li>'}</ul>
            </div>
        </div>
        """
        display(HTML(html))

    def _create_plots(self):
        """Draw drift scores, accuracy and window size for the retained batches.

        Nothing is drawn until at least two batches are available. The figure
        is always closed after being shown so that repeated updates do not
        accumulate open figures in memory.
        """
        if len(self.plot_data['batches']) < 2:
            return

        fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 10), sharex=True)

        try:
            batches = list(self.plot_data['batches'])

            # Plot 1: Drift Scores
            ax1.plot(batches, list(self.plot_data['kl_scores']),
                    'r-', label='KL Divergence', linewidth=2)
            ax1.plot(batches, list(self.plot_data['hellinger_scores']),
                    'b-', label='Hellinger Distance', linewidth=2)

            # Add drift markers (red for KL, blue for any other detector)
            for batch, detector in self.drift_markers:
                if batch in batches:
                    color = 'r' if detector == 'kl' else 'b'
                    ax1.axvline(x=batch, color=color, linestyle='--', alpha=0.5)

            ax1.set_ylabel('Drift Score', fontsize=12)
            ax1.set_title('Real-time Drift Detection Scores', fontsize=14, fontweight='bold')
            ax1.legend(loc='upper right')
            ax1.grid(True, alpha=0.3)

            # Plot 2: Model Accuracy
            ax2.plot(batches, list(self.plot_data['accuracy']),
                    'g-', label='Model Accuracy', linewidth=2)
            ax2.fill_between(batches, list(self.plot_data['accuracy']),
                            alpha=0.3, color='green')
            ax2.set_ylabel('Accuracy', fontsize=12)
            ax2.set_title('Model Performance', fontsize=14, fontweight='bold')
            ax2.set_ylim(0, 1.1)
            ax2.legend(loc='upper right')
            ax2.grid(True, alpha=0.3)

            # Plot 3: Adaptive Window Size
            ax3.plot(batches, list(self.plot_data['window_size']),
                    'm-', label='Window Size', linewidth=2)
            ax3.set_xlabel('Batch Number', fontsize=12)
            ax3.set_ylabel('Window Size', fontsize=12)
            ax3.set_title('Adaptive Window Size', fontsize=14, fontweight='bold')
            ax3.legend(loc='upper right')
            ax3.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()
        finally:
            # Release the figure even if drawing failed.
            plt.close(fig)

    def display(self):
        """Display the dashboard"""
        display(self.dashboard)


In [12]:
# Main Drift Detection System
class AdaptiveDriftDetectionSystem:
    """Main system orchestrating all components.

    Wires together the Spark session, the adaptive window manager, both drift
    detectors, the model manager, the performance tracker and the dashboard,
    and drives them from a Kafka structured-streaming query.
    """

    def __init__(self):
        self.spark = create_optimized_spark_session()
        self.window_manager = AdaptiveWindowManager()
        self.kl_detector = KLDivergenceDetector(self.window_manager)
        self.hellinger_detector = HellingerDistanceDetector(self.window_manager)
        self.model_manager = ModelManager()
        self.performance_tracker = PerformanceTracker()
        self.dashboard = DriftDetectionDashboard()
        # Number of batches processed successfully (not Spark's own batch id).
        self.batch_id = 0
        self.is_running = True
        # Feature columns found in the first non-empty batch; None until then.
        self.detected_features = None

    def detect_features(self, df):
        """Determine which of the columns at1..at6 carry non-null values.

        Detection runs once; later calls return the cached list. A column that
        cannot be inspected is skipped with a message instead of being
        silently ignored.
        """
        if self.detected_features is None:
            active = []
            for i in range(1, 7):  # Check up to 6 attributes
                col_name = f"at{i}"
                try:
                    if df.select(col_name).filter(col(col_name).isNotNull()).count() > 0:
                        active.append(col_name)
                except Exception as exc:
                    # Best effort: keep going, but say why the column was skipped.
                    print(f"Skipping feature {col_name}: {exc}")
            # Assign only after the scan finished so an interrupted scan is retried.
            self.detected_features = active
            print(f"Detected features: {self.detected_features}")
        return self.detected_features

    def process_batch(self, batch_df, batch_id):
        """Process a single micro-batch (the ``foreachBatch`` callback).

        Args:
            batch_df: Spark DataFrame with the parsed records of the batch.
            batch_id: Spark's batch id; the system keeps its own counter in
                ``self.batch_id`` and does not use this value.

        Errors are caught and printed so that one bad batch does not stop the
        stream.
        """
        start_time = time.time()

        try:
            # Skip if empty batch
            if batch_df.count() == 0:
                return

            # Detect features once, on the first non-empty batch that gets here.
            if self.detected_features is None:
                self.detect_features(batch_df)

            # Convert to pandas for processing
            columns = self.detected_features + ['cl']
            pdf = batch_df.select(columns).toPandas()

            # Rows with a missing feature or label would put NaN/None into the
            # windows and the model, so drop them up front.
            pdf = pdf.dropna(subset=columns)
            if pdf.empty:
                return

            # Prepare data
            X = pdf[self.detected_features].values
            y = pdf['cl'].values

            # Update windows
            for feature in self.detected_features:
                self.window_manager.add_data(feature, pdf[feature].values)

            # Update model and get accuracy
            accuracy = self.model_manager.update_model(X, y)

            # Run drift detection
            drift_results = self._run_drift_detection()

            # Handle drift if detected; otherwise let the window grow again.
            drift_detected = any(result['detected'] for result in drift_results.values())
            if drift_detected:
                self._handle_drift_detection(drift_results)
            else:
                self._grow_window_when_stable(drift_results)

            # Update performance tracker with the first detector that fired
            drift_type = None
            detector_name = None
            for name, result in drift_results.items():
                if result['detected']:
                    drift_type = result['type']
                    detector_name = name
                    break

            processing_time = time.time() - start_time
            self.performance_tracker.record_batch(
                self.batch_id, drift_detected, drift_type,
                detector_name, accuracy, processing_time
            )

            # Update dashboard
            self.dashboard.update(
                self.batch_id,
                drift_results,
                accuracy,
                self.window_manager.window_size,
                self.performance_tracker.get_summary()
            )

            self.batch_id += 1

        except Exception as e:
            print(f"Error processing batch {self.batch_id}: {e}")

    def _grow_window_when_stable(self, drift_results):
        """Grow the adaptive window after a batch without drift.

        The score/threshold ratio closest to a detection is used as the
        confidence (clamped to [0, 1]), so the window grows more slowly the
        nearer the scores are to their thresholds. Nothing happens while the
        detectors have not produced a positive threshold yet.
        """
        ratios = [
            result['score'] / result['threshold']
            for result in drift_results.values()
            if result['threshold'] > 0
        ]
        if not ratios:
            return
        confidence = min(1.0, max(0.0, max(ratios)))
        self.window_manager.update_window_size(False, confidence)

    def _run_drift_detection(self):
        """Run all drift detectors and collect their results.

        Returns:
            Mapping with keys 'kl' and 'hellinger'; each value holds
            'detected', 'score', 'threshold' (0 when the detector returned no
            details), 'type' and the raw 'details'.
        """
        results = {}

        # KL Divergence Detection
        kl_detected, kl_score, kl_details = self.kl_detector.detect(self.detected_features)
        results['kl'] = {
            'detected': kl_detected,
            'score': kl_score,
            'threshold': kl_details.get('threshold', 0),
            'type': self.kl_detector.drift_type,
            'details': kl_details
        }

        # Hellinger Distance Detection
        hell_detected, hell_score, hell_details = self.hellinger_detector.detect(self.detected_features)
        results['hellinger'] = {
            'detected': hell_detected,
            'score': hell_score,
            'threshold': hell_details.get('threshold', 0),
            'type': self.hellinger_detector.drift_type,
            'details': hell_details
        }

        return results

    def _handle_drift_detection(self, drift_results):
        """Shrink the window and adapt the model after a detected drift.

        The detector with the highest score/threshold ratio is treated as the
        primary one.
        """
        # Determine primary drift type
        primary_detector = None
        max_confidence = 0

        for name, result in drift_results.items():
            if result['detected']:
                confidence = result['score'] / result['threshold'] if result['threshold'] > 0 else 0
                if confidence > max_confidence:
                    max_confidence = confidence
                    primary_detector = name

        if primary_detector:
            print(f"Primary drift detected by {primary_detector}: {drift_results[primary_detector]['type']}")

            # Update window size
            self.window_manager.update_window_size(True, max_confidence)

            # Handle model adaptation
            # NOTE(review): a detection means score > threshold, so with the
            # configured multipliers (0.8 / 1.0) both branches fire in the same
            # batch and the candidate model is promoted untrained — confirm
            # whether a separate warning stage was intended.
            if max_confidence > WARNING_THRESHOLD_MULTIPLIER:
                self.model_manager.handle_warning()
            if max_confidence > DRIFT_THRESHOLD_MULTIPLIER:
                self.model_manager.handle_drift()

    def start_streaming(self):
        """Show the dashboard, start the Kafka stream and block until it ends.

        A KeyboardInterrupt stops the query and saves the results.
        """
        # Display dashboard
        self.dashboard.display()

        # Define schema for Kafka messages: at1..at6 as doubles plus label 'cl'
        schema = StructType([
            StructField(f"at{i}", DoubleType(), True) for i in range(1, 7)
        ] + [StructField("cl", StringType(), True)])

        # Create Kafka stream
        kafka_stream = self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", KAFKA_BROKER) \
            .option("subscribe", KAFKA_TOPIC) \
            .option("startingOffsets", "latest") \
            .option("maxOffsetsPerTrigger", MAX_OFFSETS_PER_TRIGGER) \
            .load()

        # Parse JSON messages
        parsed_stream = kafka_stream \
            .selectExpr("CAST(value AS STRING)") \
            .select(from_json(col("value"), schema).alias("data")) \
            .select("data.*")

        # Process stream
        query = parsed_stream \
            .writeStream \
            .foreachBatch(self.process_batch) \
            .trigger(processingTime=BATCH_INTERVAL) \
            .outputMode("update") \
            .start()

        print(f"""
        ========================================
        Adaptive Drift Detection System Started
        ========================================
        Kafka Broker: {KAFKA_BROKER}
        Topic: {KAFKA_TOPIC}
        Initial Window Size: {INITIAL_WINDOW_SIZE}
        Batch Interval: {BATCH_INTERVAL}

        Press Ctrl+C to stop...
        ========================================
        """)

        try:
            query.awaitTermination()
        except KeyboardInterrupt:
            print("\nStopping stream...")
            query.stop()
            self.save_results()

    def save_results(self):
        """Write drift history (CSV), summary (JSON) and metrics (JSON).

        Files are placed in the ``results`` directory and share a timestamp
        suffix. The CSV is only written when at least one drift was recorded.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Create results directory
        os.makedirs('results', exist_ok=True)

        # Save drift history
        results_data = []
        for i, drift in enumerate(self.performance_tracker.drift_history):
            results_data.append({
                'drift_id': i + 1,
                'batch_id': drift['batch_id'],
                'drift_type': drift['drift_type'],
                'detector': drift['detector'],
                'timestamp': drift['timestamp']
            })

        if results_data:
            df = pd.DataFrame(results_data)
            filename = f"results/drift_detection_results_{timestamp}.csv"
            df.to_csv(filename, index=False)
            print(f"Results saved to {filename}")

        # Save performance summary
        summary = self.performance_tracker.get_summary()
        summary_filename = f"results/drift_detection_summary_{timestamp}.json"
        with open(summary_filename, 'w') as f:
            # default=str covers values json cannot serialise natively
            json.dump(summary, f, indent=4, default=str)
        print(f"Summary saved to {summary_filename}")

        # Save detailed metrics
        metrics_data = {
            'processing_times': list(self.performance_tracker.metrics['processing_time']),
            'drift_detection_delays': list(self.performance_tracker.metrics['drift_detection_delay']),
            'total_batches': self.batch_id,
            'final_window_size': self.window_manager.window_size,
            'model_version': self.model_manager.model_version
        }

        metrics_filename = f"results/drift_detection_metrics_{timestamp}.json"
        with open(metrics_filename, 'w') as f:
            json.dump(metrics_data, f, indent=4)
        print(f"Metrics saved to {metrics_filename}")


In [13]:
# Utility Functions
def check_kafka_connection(broker=None):
    """Check whether a Kafka broker accepts TCP connections.

    This is only a reachability probe: it opens a TCP connection to the
    broker address and closes it again, without speaking the Kafka protocol.

    Args:
        broker: Address in ``host:port`` form. Defaults to ``KAFKA_BROKER``.

    Returns:
        True if the TCP connection succeeded, False if it was refused, timed
        out, or the address could not be parsed or resolved. A status line is
        printed in every case.
    """
    import socket

    if broker is None:
        broker = KAFKA_BROKER

    try:
        host, port_text = broker.split(':')
        # Parse the port before opening the socket so a malformed address
        # never allocates a file descriptor.
        port = int(port_text)
        # The context manager closes the socket even when connect_ex raises
        # (e.g. socket.gaierror for an unresolvable host).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(5)
            result = sock.connect_ex((host, port))
    except Exception as e:
        print(f"❌ Error checking Kafka connection: {e}")
        return False

    if result == 0:
        print(f"✅ Kafka broker {broker} is accessible")
        return True

    print(f"❌ Cannot connect to Kafka broker {broker}")
    return False

def create_results_directory():
    """Ensure the results folder and the streaming checkpoint folder exist."""
    for directory in ('results', CHECKPOINT_LOCATION):
        # exist_ok makes repeated notebook runs harmless
        os.makedirs(directory, exist_ok=True)

In [14]:
# Benchmark Testing Functions
class BenchmarkTester:
    """Utilities for benchmarking the drift detection system"""

    @staticmethod
    def generate_synthetic_drift_data(drift_type="sudden", n_samples=1000, n_features=3, seed=42):
        """Generate Gaussian data whose mean drifts in the second half.

        The first ``n_samples // 2`` rows are standard-normal noise; the
        remaining rows are standard-normal noise plus a per-row mean shift
        that depends on ``drift_type``.

        Args:
            drift_type: ``"sudden"`` (constant shift of 2), ``"gradual"``
                (shift ramps linearly from 0 towards 2), ``"incremental"``
                (shift steps up by 0.5 every 100 rows); any other value
                produces no drift.
            n_samples: Total number of rows to return.
            n_features: Number of columns.
            seed: Seed passed to ``np.random.seed`` before sampling. Note that
                this reseeds NumPy's global generator. Pass ``None`` to leave
                the global generator untouched.

        Returns:
            ``np.ndarray`` of shape ``(n_samples, n_features)``.
        """
        if seed is not None:
            np.random.seed(seed)

        pre_count = n_samples // 2
        # Derive the second half from the remainder so odd n_samples does not
        # silently lose a row.
        post_count = n_samples - pre_count

        # Pre-drift data
        pre_drift = np.random.randn(pre_count, n_features)

        # Per-row mean shift for the post-drift segment
        positions = np.arange(post_count)
        if drift_type == "sudden":
            shifts = np.full(post_count, 2.0)
        elif drift_type == "gradual":
            # max(..., 1) only guards the empty-segment case against 0-division
            shifts = 2 * (positions / max(post_count, 1))
        elif drift_type == "incremental":
            shifts = (positions // 100) * 0.5
        else:
            # No drift
            shifts = np.zeros(post_count)

        # One vectorised draw consumes the random stream in the same order as
        # drawing row by row, and keeps the (post_count, n_features) shape even
        # when the segment is empty.
        post_drift = np.random.randn(post_count, n_features) + shifts[:, np.newaxis]

        return np.vstack([pre_drift, post_drift])

    @staticmethod
    def evaluate_detector_performance(detector, data, true_drift_point, window_size=100, step=10):
        """Slide a window over ``data`` and score the detector's alarms.

        For every window end position the first column of the window is pushed
        to ``detector.window_manager.add_data('feature1', ...)`` and
        ``detector.detect(['feature1'])`` is called.

        Args:
            detector: Object exposing ``window_manager.add_data`` and
                ``detect`` (returning a 3-tuple whose first item is the
                detection flag).
            data: 2-D array of samples (rows) by features (columns).
            true_drift_point: Row index at which the drift really starts.
            window_size: Number of rows in each evaluation window.
            step: Stride between consecutive window end positions.

        Returns:
            dict with ``detection_delay`` (rows between the true drift point
            and the first alarm at or after it, or ``len(data) -
            true_drift_point`` when the drift was never flagged),
            ``false_positives`` (alarms raised before the true drift point)
            and ``detected_points`` (all alarm positions).
        """
        detected_points = []

        for window_end in range(window_size, len(data), step):
            window_data = data[window_end - window_size:window_end]
            # Simulate feature data
            detector.window_manager.add_data('feature1', window_data[:, 0])
            detected, _score, _ = detector.detect(['feature1'])

            if detected:
                detected_points.append(window_end)

        # Alarms before the true drift point are false positives and must not
        # be used for the delay, otherwise the delay becomes negative.
        false_alarms = [point for point in detected_points if point < true_drift_point]
        true_alarms = [point for point in detected_points if point >= true_drift_point]

        if true_alarms:
            detection_delay = min(true_alarms) - true_drift_point
        else:
            # Drift never flagged: report the full remaining length as the delay
            detection_delay = len(data) - true_drift_point

        return {
            'detection_delay': detection_delay,
            'false_positives': len(false_alarms),
            'detected_points': detected_points
        }

In [15]:
# Main Execution Function
def main():
    """Entry point: prepare folders, verify Kafka, then run the streaming system.

    Prints a banner, creates the output directories and probes the Kafka
    broker. If the broker is unreachable a hint for creating the topic is
    printed and the function returns without starting anything; otherwise an
    ``AdaptiveDriftDetectionSystem`` is created and ``start_streaming`` is
    called on it.
    """
    rule = "=" * 60
    print(rule)
    print("ADAPTIVE CONCEPT DRIFT DETECTION FRAMEWORK")
    print(rule)

    # Setup
    create_results_directory()

    if check_kafka_connection():
        # Initialize and start system
        AdaptiveDriftDetectionSystem().start_streaming()
        print("\nSystem shutdown complete.")
    else:
        print("\nPlease ensure Kafka is running and the topic exists.")
        print(f"Create topic with: kafka-topics.sh --create --topic {KAFKA_TOPIC} --bootstrap-server {KAFKA_BROKER}")


In [16]:
# Interactive Controls and Quick Start
class QuickStart:
    """Notebook helpers for launching the system interactively."""

    @staticmethod
    def create_control_panel():
        """Build the ipywidgets control panel and wire up its callbacks.

        The panel holds Start / Stop / Benchmark buttons, two sliders (window
        size and drift threshold) and an output area for status messages.
        Start copies the slider values into the module-level
        ``INITIAL_WINDOW_SIZE`` and ``DRIFT_THRESHOLD_MULTIPLIER`` and runs
        ``AdaptiveDriftDetectionSystem.start_streaming`` on a new thread.
        Stop sets ``is_running = False`` on that system instance.

        Returns:
            The ``widgets.VBox`` containing the whole panel.
        """
        # --- Buttons -------------------------------------------------------
        start_button = widgets.Button(description='Start System', button_style='success', icon='play')
        stop_button = widgets.Button(description='Stop System', button_style='danger', icon='stop',
                                     disabled=True)
        benchmark_button = widgets.Button(description='Run Benchmark', button_style='info',
                                          icon='chart-line')

        # --- Configuration sliders ------------------------------------------
        window_size_input = widgets.IntSlider(
            value=INITIAL_WINDOW_SIZE, min=50, max=500, step=10,
            description='Window Size:',
            style={'description_width': 'initial'},
        )
        threshold_input = widgets.FloatSlider(
            value=DRIFT_THRESHOLD_MULTIPLIER, min=0.5, max=2.0, step=0.1,
            description='Drift Threshold:',
            style={'description_width': 'initial'},
        )

        # --- Status area and shared state ------------------------------------
        status_output = widgets.Output()
        active_system = None  # set by the start handler, read by the stop handler

        def _show_running(running):
            """Toggle the buttons: Start is enabled exactly when not running."""
            start_button.disabled = running
            stop_button.disabled = not running

        def _handle_start(_button):
            nonlocal active_system
            global INITIAL_WINDOW_SIZE, DRIFT_THRESHOLD_MULTIPLIER
            with status_output:
                clear_output()
                print("Starting drift detection system...")
                try:
                    # Push the slider values into the module-level configuration
                    INITIAL_WINDOW_SIZE = window_size_input.value
                    DRIFT_THRESHOLD_MULTIPLIER = threshold_input.value

                    active_system = AdaptiveDriftDetectionSystem()
                    _show_running(True)

                    # Stream on a worker thread so the widget UI stays responsive
                    import threading
                    worker = threading.Thread(target=active_system.start_streaming)
                    worker.start()
                except Exception as e:
                    print(f"Error starting system: {e}")
                    _show_running(False)

        def _handle_stop(_button):
            with status_output:
                clear_output()
                print("Stopping system...")
                if active_system:
                    active_system.is_running = False
                _show_running(False)

        def _handle_benchmark(_button):
            with status_output:
                clear_output()
                print("Running benchmark tests...")
                run_benchmark_tests()

        for button, handler in (
            (start_button, _handle_start),
            (stop_button, _handle_stop),
            (benchmark_button, _handle_benchmark),
        ):
            button.on_click(handler)

        # --- Layout -----------------------------------------------------------
        panel_children = [
            widgets.HTML("<h2 style='text-align: center;'>🎛️ Control Panel</h2>"),
            widgets.HBox([start_button, stop_button, benchmark_button]),
            widgets.HTML("<h3>Configuration</h3>"),
            window_size_input,
            threshold_input,
            widgets.HTML("<h3>Status</h3>"),
            status_output,
        ]
        panel_layout = widgets.Layout(
            padding='20px',
            border='2px solid #ddd',
            border_radius='10px'
        )

        return widgets.VBox(panel_children, layout=panel_layout)

def run_benchmark_tests():
    """Benchmark the KL and Hellinger detectors on synthetic drift data.

    For each drift type (sudden, gradual, incremental) a synthetic data set
    is generated and both detectors are evaluated on it with
    ``BenchmarkTester.evaluate_detector_performance``. Detection delays are
    printed and then shown as a grouped bar chart.

    Returns:
        dict mapping drift type -> ``{'kl': ..., 'hellinger': ...}``, where
        each value is the dict returned by ``evaluate_detector_performance``.
    """
    tester = BenchmarkTester()

    # Test different drift types
    drift_types = ['sudden', 'gradual', 'incremental']
    detector_classes = {
        'kl': KLDivergenceDetector,
        'hellinger': HellingerDistanceDetector,
    }
    results = {}

    for drift_type in drift_types:
        print(f"\nTesting {drift_type} drift...")

        # Generate synthetic data
        data = tester.generate_synthetic_drift_data(drift_type=drift_type)
        # The generator switches regime halfway through, so derive the true
        # drift point from the data instead of hard-coding it.
        true_drift_point = len(data) // 2

        detector_results = {}
        for detector_key, detector_class in detector_classes.items():
            # Each detector gets its own window manager: the evaluation feeds
            # data into detector.window_manager, so sharing one instance would
            # start the second detector on state left behind by the first.
            detector = detector_class(AdaptiveWindowManager())
            detector_results[detector_key] = tester.evaluate_detector_performance(
                detector, data, true_drift_point
            )

        results[drift_type] = detector_results

        print(f"KL Divergence - Detection Delay: {detector_results['kl']['detection_delay']}")
        print(f"Hellinger Distance - Detection Delay: {detector_results['hellinger']['detection_delay']}")

    # Visualize results
    plt.figure(figsize=(10, 6))

    drift_types_list = list(results.keys())
    kl_delays = [results[dt]['kl']['detection_delay'] for dt in drift_types_list]
    hell_delays = [results[dt]['hellinger']['detection_delay'] for dt in drift_types_list]

    x = np.arange(len(drift_types_list))
    width = 0.35  # bar width; the two series sit either side of each tick

    plt.bar(x - width/2, kl_delays, width, label='KL Divergence', color='red', alpha=0.7)
    plt.bar(x + width/2, hell_delays, width, label='Hellinger Distance', color='blue', alpha=0.7)

    plt.xlabel('Drift Type')
    plt.ylabel('Detection Delay (samples)')
    plt.title('Drift Detection Performance Comparison')
    plt.xticks(x, drift_types_list)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return results

In [17]:
#Create and display control panel
if __name__ == "__main__":
    # Render the interactive panel first so it is visible above the run output
    print("Creating control panel...")
    control_panel = QuickStart.create_control_panel()
    display(control_panel)

    usage_lines = (
        "\nUse the control panel above to:",
        "1. Start/Stop the drift detection system",
        "2. Adjust configuration parameters",
        "3. Run benchmark tests",
    )
    for usage_line in usage_lines:
        print(usage_line)

    # run
    main()

Creating control panel...


VBox(children=(HTML(value="<h2 style='text-align: center;'>🎛️ Control Panel</h2>"), HBox(children=(Button(butt…


Use the control panel above to:
1. Start/Stop the drift detection system
2. Adjust configuration parameters
3. Run benchmark tests
ADAPTIVE CONCEPT DRIFT DETECTION FRAMEWORK
✅ Kafka broker localhost:9092 is accessible


25/07/11 11:09:36 WARN Utils: Your hostname, djkumar143-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/07/11 11:09:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/djkumar143/spark/spark-3.2.0-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/djkumar143/.ivy2/cache
The jars for the packages stored in: /home/djkumar143/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9175595b-43a3-4b0f-9801-431d99fdd3b9;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in

VBox(children=(HTML(value="<h2 style='text-align: center; color: #2c3e50;'>🚀 Adaptive Drift Detection Dashboar…

25/07/11 11:09:44 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1aa8df62-d854-4ff5-864e-b03654ac25c1. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/11 11:09:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.



        Adaptive Drift Detection System Started
        Kafka Broker: localhost:9092
        Topic: drift_detection_topic
        Initial Window Size: 200
        Batch Interval: 5 seconds

        Press Ctrl+C to stop...
        
Detected features: ['at1', 'at2', 'at3', 'at4']


25/07/11 11:09:51 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 6160 milliseconds


Primary drift detected by kl: sudden
Model switched to version 1


Primary drift detected by kl: sudden
Model switched to version 2


Primary drift detected by hellinger: gradual/incremental
Model switched to version 3


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/djkumar143/Desktop/kafka-pyspark/concept_drift/adaptive_drift_detection/concept_drift_env/lib/python3.11/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=66>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/djkumar143/Desktop/kafka-pyspark/concept_drift/adaptive_drift_detection/concept_drift_env/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/djkumar143/Desktop/kafka-pyspark/concept_drift/adaptive_drift_detection/concept_drift_env/lib/python3.11/site-packages/py4j/clientserver.py", line 503, in send_command
    

Py4JError: An error occurred while calling o79.awaitTermination


        Adaptive Drift Detection System Started
        Kafka Broker: localhost:9092
        Topic: drift_detection_topic
        Initial Window Size: 200
        Batch Interval: 5 seconds

        Press Ctrl+C to stop...
        
Detected features: ['at1', 'at2', 'at3', 'at4']
