# Naolib Streaming Analysis

This notebook performs streaming analysis on real-time Naolib transportation data using a simplified approach.

In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import time
import json
import re
from kafka import KafkaConsumer
from IPython.display import clear_output
import matplotlib.pyplot as plt
import seaborn as sns

# Create a SparkSession
spark = SparkSession.builder \
    .appName('NaolibStreamingAnalysis') \
    .master('local[*]') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/25 01:00:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/25 01:00:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/25 01:00:41 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/03/25 01:00:41 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## 1. Function to Collect Data from Kafka

In [2]:
# Define Kafka configurations
kafka_topic = "naolib_realtime"
kafka_server = "kafka1:9092"

# Function to convert wait time text to minutes
def convert_wait_time(wt):
    if pd.isna(wt):
        return None
    if wt == "proche":
        return 0
    # Try to extract numbers
    if isinstance(wt, str):
        # If it's just a number
        if wt.isdigit():
            return int(wt)
        # If it's in format "XYmn"
        match = re.search(r'(\d+)', wt)
        if match:
            return int(match.group(1))
    return None

# Function to collect real-time data from Kafka
def collect_realtime_data(max_messages=50, timeout=10):
    """
    Collect real-time data from Kafka
    max_messages: Maximum number of messages to collect
    timeout: Time to wait for messages in seconds
    """
    print(f"Collecting up to {max_messages} messages from Kafka...")
    consumer = KafkaConsumer(
        kafka_topic,
        bootstrap_servers=kafka_server,
        auto_offset_reset='earliest',
        consumer_timeout_ms=timeout*1000
    )
    
    messages = []
    expanded_rows = []
    
    start_time = time.time()
    
    for message in consumer:
        try:
            msg_data = json.loads(message.value.decode('utf-8'))
            messages.append(msg_data)
            
            # Extract data from the message
            timestamp = msg_data.get('timestamp')
            stop_code = msg_data.get('stop_code')
            stop_name = msg_data.get('stop_name')
            
            # Process arrivals array
            arrivals = msg_data.get('arrivals', [])
            for arrival in arrivals:
                new_row = {
                    'timestamp': timestamp,
                    'stop_code': stop_code,
                    'stop_name': stop_name,
                    'direction': arrival.get('sens'),
                    'terminus': arrival.get('terminus'),
                    'wait_time_text': arrival.get('temps'),
                    'is_real_time': arrival.get('tempsReel'),
                    'line_number': arrival.get('ligne', {}).get('numLigne'),
                    'processing_time': pd.Timestamp.now()
                }
                expanded_rows.append(new_row)
        except Exception as e:
            print(f"Error processing message: {str(e)}")
        
        if len(messages) >= max_messages:
            break
            
        if time.time() - start_time > timeout:
            break
    
    consumer.close()
    
    # Convert to DataFrame
    if expanded_rows:
        df = pd.DataFrame(expanded_rows)
        df['wait_time_minutes'] = df['wait_time_text'].apply(convert_wait_time)
        print(f"Collected {len(messages)} messages with {len(expanded_rows)} arrivals")
        return df
    else:
        print("No data collected")
        return pd.DataFrame()

## 2. Streaming Analysis 1: Real-time Average Wait Times

Our first streaming analysis calculates real-time average wait times with a 10-minute sliding window.

In [5]:
def analyze_realtime_wait_times():
    """Analyse des temps d'attente en temps réel avec meilleure visualisation"""
    
    # 1. Collecte des données
    data = collect_realtime_data(max_messages=50, timeout=10)
    
    if data.empty:
        print("⚠️ Aucune donnée reçue de Kafka")
        return None
    
    # 2. Nettoyage et conversion
    data['wait_time_minutes'] = data['temps'].apply(convert_wait_time)
    data = data.dropna(subset=['wait_time_minutes'])
    
    # 3. Analyse par ligne
    analysis = data.groupby('numLigne').agg(
        avg_wait=('wait_time_minutes', 'mean'),
        std_wait=('wait_time_minutes', 'std'),
        min_wait=('wait_time_minutes', 'min'),
        max_wait=('wait_time_minutes', 'max'),
        count=('numLigne', 'count')
    ).sort_values('avg_wait', ascending=False)
    
    # 4. Affichage détaillé
    print(f"\n📊 Analyse des temps d'attente ({len(data)} observations valides)")
    print(f"Période : {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*50)
    
    if not analysis.empty:
        # Affichage tableau
        display(analysis.style
                .background_gradient(cmap='YlOrRd', subset=['avg_wait'])
                .format({'avg_wait': '{:.1f} min', 'std_wait': '{:.1f}'}))
        
        # Visualisation
        plt.figure(figsize=(12, 6))
        sns.boxplot(
            x='numLigne', 
            y='wait_time_minutes',
            data=data,
            palette='viridis'
        )
        plt.title("Distribution des temps d'attente par ligne", pad=20)
        plt.xlabel("Ligne de bus")
        plt.ylabel("Temps d'attente (minutes)")
        plt.grid(axis='y', alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("Aucune donnée valide après filtrage")
    
    return analysis

# Exécution
wait_stats = analyze_realtime_wait_times()

Collecting up to 50 messages from Kafka...
Collected 6 messages with 70 arrivals


KeyError: 'temps'

## 3. Continuous Monitoring

Let's run the analysis continuously to simulate streaming with sliding windows.

In [6]:
# Number of iterations to run
num_iterations = 3
interval_seconds = 60

try:
    for i in range(num_iterations):
        clear_output(wait=True)
        print(f"Iteration {i+1}/{num_iterations}")
        analyze_realtime_wait_times()
        
        if i < num_iterations - 1:
            print(f"\nWaiting {interval_seconds} seconds for next analysis...")
            time.sleep(interval_seconds)
except KeyboardInterrupt:
    print("\nMonitoring stopped by user.")

Iteration 1/3
Collecting up to 50 messages from Kafka...
Collected 6 messages with 70 arrivals


KeyError: 'temps'

## 4. Streaming Analysis 2: Delay Detection

Our second streaming analysis detects unusual delays using a 15-minute window.

In [7]:
# Define baseline wait time for delay detection
typical_wait_time = 10  # minutes

# Function to detect delays
def detect_delays():
    """Detect unusual delays in real-time"""
    
    # Collect data
    data = collect_realtime_data(max_messages=100, timeout=15)
    if data.empty:
        return
    
    # Mark delays - consider waits 50% above typical as delays
    data['is_delayed'] = data['wait_time_minutes'] > (typical_wait_time * 1.5)
    data['delay_minutes'] = data.apply(
        lambda x: x['wait_time_minutes'] - typical_wait_time if x['is_delayed'] else 0, 
        axis=1
    )
    
    # Group by line and stop to detect patterns
    grouped = data.groupby(['line_number', 'stop_name']).agg(
        max_wait_time=('wait_time_minutes', 'max'),
        avg_wait_time=('wait_time_minutes', 'mean'),
        observation_count=('wait_time_minutes', 'count'),
        delayed_count=('is_delayed', 'sum'),
        avg_delay_minutes=('delay_minutes', 'mean')
    ).reset_index()
    
    # Filter for significant delays (at least 2 delayed observations)
    significant = grouped[grouped['delayed_count'] >= 2]
    
    # Add severity classification
    significant['delay_severity'] = 'MINOR'
    significant.loc[significant['avg_delay_minutes'] > 10, 'delay_severity'] = 'MODERATE'
    significant.loc[significant['avg_delay_minutes'] > 20, 'delay_severity'] = 'SEVERE'
    
    # Sort by severity
    significant = significant.sort_values('avg_delay_minutes', ascending=False)
    
    # Print results
    now = pd.Timestamp.now()
    print(f"\n=== Real-time Delay Detection (15-minute window) ===")
    print(f"Analysis time: {now}")
    print(f"Total delays detected: {data['is_delayed'].sum()}")
    
    if not significant.empty:
        print("\nSignificant Delays:")
        for _, row in significant.iterrows():
            print(f"DELAY ALERT: Line {row['line_number']} at {row['stop_name']} - {row['delay_severity']} delay of"
                  f" {row['avg_delay_minutes']:.1f} minutes")
    else:
        print("\nNo significant delays detected.")
        
    return significant

# Run the delay detection
delays = detect_delays()

Collecting up to 100 messages from Kafka...
Collected 6 messages with 70 arrivals

=== Real-time Delay Detection (15-minute window) ===
Analysis time: 2025-03-25 01:09:51.525010
Total delays detected: 0

No significant delays detected.


## 5. Continuous Delay Monitoring

Let's monitor for delays continuously to simulate a real-time alert system.

In [8]:
# Number of iterations to run
num_iterations = 3
interval_seconds = 120

try:
    for i in range(num_iterations):
        clear_output(wait=True)
        print(f"Iteration {i+1}/{num_iterations}")
        detect_delays()
        
        if i < num_iterations - 1:
            print(f"\nWaiting {interval_seconds} seconds for next detection...")
            time.sleep(interval_seconds)
except KeyboardInterrupt:
    print("\nMonitoring stopped by user.")

Iteration 3/3
Collecting up to 100 messages from Kafka...


NoBrokersAvailable: NoBrokersAvailable