In [23]:
# Basic imports
import requests
import json
from kafka import KafkaProducer


In [None]:
# Cell 2: Fetch and Send TAN Stop Data to Kafka
# Kafka configuration
kafka_config = {
    'bootstrap_servers': 'kafka1:9092',  # Kafka broker address
}

# Initialize Kafka Producer
producer = KafkaProducer(
    bootstrap_servers=kafka_config['bootstrap_servers'],
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# API URL for the stop data with coordinates for Nantes
latitude = "47.264"
longitude = "-1.585"
api_url = f"https://open.tan.fr/ewp/arrets.json/{latitude}/{longitude}"

# Field mappings from API response to our data schema
fields = {
    "codeLieu": "stop_code",
    "libelle": "stop_name",
    "distance": "stop_distance",
}

# Fetch data from TAN API
response = requests.get(api_url)

if response.status_code == 200:
    data = response.json()
    
    # Process and send each entry to Kafka
    for entry in data:
        # Map fields based on the provided mapping
        for field in fields:
            entry[fields[field]] = entry.pop(field, None)
        
        # Send the data to Kafka
        producer.send("tan_stops", value=entry)
        print(f"Sent: {entry}")
    
    # Ensure all messages are sent
    producer.flush()
    print(f"Sent {len(data)} records.")
else:
    print(f"Failed to fetch data: {response.status_code}, {response.text}")

# print(response.text)

Sent: {'ligne': [{'numLigne': '109'}, {'numLigne': '116'}, {'numLigne': '2'}, {'numLigne': '2B'}, {'numLigne': '50'}, {'numLigne': '59'}, {'numLigne': '89'}, {'numLigne': 'C2'}], 'stop_code': 'LCAR', 'stop_name': 'Le Cardo', 'stop_distance': '256 m'}
Sent: {'ligne': [{'numLigne': '59'}], 'stop_code': 'AURR', 'stop_name': 'Aurore', 'stop_distance': '324 m'}
Sent: {'ligne': [{'numLigne': '116'}, {'numLigne': '2B'}, {'numLigne': '50'}, {'numLigne': '59'}, {'numLigne': 'C2'}], 'stop_code': 'LRHE', 'stop_name': 'Les Roches', 'stop_distance': '428 m'}
Sent: {'ligne': [{'numLigne': '109'}, {'numLigne': '116'}, {'numLigne': '2B'}, {'numLigne': '50'}, {'numLigne': '89'}], 'stop_code': 'CORA', 'stop_name': 'Conraie', 'stop_distance': '430 m'}
Sent: {'ligne': [{'numLigne': '59'}], 'stop_code': 'BDLA', 'stop_name': 'Bout des Landes', 'stop_distance': '432 m'}
Sent 5 records.
[{"codeLieu":"LCAR","libelle":"Le Cardo","distance":"256 m","ligne":[{"numLigne":"109"},{"numLigne":"116"},{"numLigne":"2"},

In [None]:
# Cell 3: Batch Processing with Kafka Consumer
# Import additional libraries for data processing
from kafka import KafkaConsumer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Create a Kafka Consumer for retrieving batch data
# Adding timeout to avoid hanging in the notebook
consumer = KafkaConsumer(
    "tan_stops", 
    bootstrap_servers="kafka1:9092",
    value_deserializer=lambda x: json.loads(x.decode('utf-8')),
    consumer_timeout_ms=10000  # Stop after 10 seconds of no new messages
)

# Collect data from Kafka
data = [message.value for message in consumer]

# Convert to Pandas DataFrame
df = pd.DataFrame(data)

# Display the data structure
print("Data structure:")
print(df.head())

# Save to CSV for later analysis (optional)
df.to_csv("tan_stops_data.csv", index=False)
print("Data saved to tan_stops_data.csv")


Data structure:
Empty DataFrame
Columns: []
Index: []
Data saved to tan_stops_data.csv


In [16]:
# Cell 4: Data Visualization - Stop Distance Distribution
# Set figure size for better visibility
plt.figure(figsize=(10, 6))

# Create a histogram of stop distances
sns.histplot(df["stop_distance"].str.replace(' m', '').astype(float), bins=30, kde=True)
plt.xlabel("Distance (mètres)")
plt.ylabel("Nombre d'arrêts")
plt.title("Répartition des arrêts par distance")
plt.show()

KeyError: 'stop_distance'

<Figure size 1000x600 with 0 Axes>

In [17]:
# Cell 5: Data Visualization - Top Stops
# Find the top 10 most frequent stops
top_stops = df["stop_name"].value_counts().head(10)

# Create a bar chart
plt.figure(figsize=(12, 6))
sns.barplot(x=top_stops.index, y=top_stops.values, palette="viridis")
plt.xticks(rotation=45, ha='right')
plt.xlabel("Nom des arrêts")
plt.ylabel("Nombre d'apparitions")
plt.title("Top 10 des arrêts les plus fréquents")
plt.tight_layout()
plt.show()

KeyError: 'stop_name'

In [18]:
# Cell 6: Analysis of Bus Lines per Stop
# Extract and count bus lines per stop
df['line_count'] = df['ligne'].apply(lambda x: len(x))

# Create a scatter plot of distance vs number of lines
plt.figure(figsize=(10, 6))
plt.scatter(
    df["stop_distance"].str.replace(' m', '').astype(float),
    df['line_count'],
    alpha=0.6
)
plt.xlabel("Distance (mètres)")
plt.ylabel("Nombre de lignes desservant l'arrêt")
plt.title("Relation entre distance et nombre de lignes par arrêt")
plt.grid(True, alpha=0.3)
plt.show()

KeyError: 'ligne'

In [19]:
# Cell 7: Streaming Processing Setup with Spark
# Import Spark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StructType, StringType

# Create Spark session
spark = SparkSession.builder \
    .appName("TAN_Data_Streaming") \
    .getOrCreate()

print("Spark session created successfully")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/23 23:43:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session created successfully


In [20]:
# Cell 8: Streaming Processing for Stop Data
# Define the schema for stop data
schema = StructType() \
    .add("stop_code", StringType()) \
    .add("stop_name", StringType()) \
    .add("stop_distance", StringType())

# Set up streaming from Kafka
df_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("subscribe", "tan_stops") \
    .load()

# Parse the JSON data
df_parsed = df_stream \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Create a time window to analyze stop data
df_windowed = df_parsed \
    .withWatermark("timestamp", "5 minutes") \
    .groupBy(
        window(col("timestamp"), "5 minutes"), 
        col("stop_name")
    ) \
    .count()

# Display the results in real-time (comment out to prevent execution)
"""
query = df_windowed.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query.awaitTermination(60)  # Run for 60 seconds
"""
print("Spark streaming query defined (not executed)")

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.

In [21]:
# Cell 9: Streaming Analysis of Wait Times
# Schema for wait time data
schema_wait = StructType() \
    .add("codeArret", StringType()) \
    .add("temps", StringType()) \
    .add("numLigne", StringType())

# Stream wait time data from Kafka
df_wait_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("subscribe", "tan_wait_times") \
    .load()

# Parse the JSON data
df_wait_parsed = df_wait_stream \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema_wait).alias("data")) \
    .select("data.*")

# Create a time window to analyze average wait times by line
df_wait_windowed = df_wait_parsed \
    .withWatermark("timestamp", "10 minutes") \
    .groupBy(
        window(col("timestamp"), "10 minutes"), 
        col("numLigne")
    ) \
    .avg("temps")

# Display the results in real-time (comment out to prevent execution)
"""
query_wait = df_wait_windowed.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query_wait.awaitTermination(60)  # Run for 60 seconds
"""
print("Wait times streaming query defined (not executed)")

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.

In [22]:
# Cell 10: Summary and Next Steps
"""
# Summary

This notebook has demonstrated a complete data pipeline for TAN public transportation data:

1. **Data Collection**: We fetched bus/tram stop data from the TAN API
2. **Data Ingestion**: We sent the data to Kafka topics
3. **Batch Analysis**: We processed the data with Pandas and created visualizations
4. **Streaming Setup**: We defined Spark streaming jobs for real-time analysis

## Next Steps

1. Collect real-time arrival data using the TAN API
2. Develop predictive models for bus arrival times
3. Create a dashboard to visualize the data in real-time
4. Extend the analysis to include historical patterns and trends
"""

'\n# Summary\n\nThis notebook has demonstrated a complete data pipeline for TAN public transportation data:\n\n1. **Data Collection**: We fetched bus/tram stop data from the TAN API\n2. **Data Ingestion**: We sent the data to Kafka topics\n3. **Batch Analysis**: We processed the data with Pandas and created visualizations\n4. **Streaming Setup**: We defined Spark streaming jobs for real-time analysis\n\n## Next Steps\n\n1. Collect real-time arrival data using the TAN API\n2. Develop predictive models for bus arrival times\n3. Create a dashboard to visualize the data in real-time\n4. Extend the analysis to include historical patterns and trends\n'