In [None]:
import requests
from kafka import KafkaProducer
import json

def send_tan_to_kafka(topic, api_url, fields=None):
    """Fetch records from the TAN API and publish each one to a Kafka topic.

    Args:
        topic: Name of the Kafka topic the records are sent to.
        api_url: URL queried with an HTTP GET; the JSON body is iterated
            entry by entry, each entry being treated as a dict.
        fields: Optional mapping ``{api_key: kafka_key}``. For every pair the
            value stored under ``api_key`` is moved to ``kafka_key``; a key
            missing from an entry yields ``None`` under the new name.
            ``None`` (the default) or an empty mapping leaves entries as is.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        Whatever ``requests.get`` or ``KafkaProducer`` raise (connection
        errors, timeouts, ...); these are not caught here.
    """
    # A fresh mapping per call instead of a shared mutable default argument.
    field_map = fields if fields is not None else {}

    # Fetch first so that a failed API call never opens a Kafka connection.
    # The timeout keeps an unresponsive server from hanging the cell forever.
    response = requests.get(api_url, timeout=10)

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        return

    data = response.json()

    producer = KafkaProducer(
        bootstrap_servers='kafka1:9092',  # Update with your Kafka broker
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    )
    try:
        for entry in data:
            # Rename keys according to the mapping (missing keys become None)
            for source_key, target_key in field_map.items():
                entry[target_key] = entry.pop(source_key, None)

            producer.send(topic, value=entry)
            print(f"Sent: {entry}")

        # send() only queues the record; flush() waits for delivery
        producer.flush()
        print(f"Sent {len(data)} records.")
    finally:
        # Release sockets and background threads even if sending failed
        producer.close()

# Latitude / longitude that get interpolated into the stop-lookup URL
latitude, longitude = "47.264", "-1.585"
api_url = f"https://open.tan.fr/ewp/arrets.json/{latitude}/{longitude}"

# Rename map: key as found in the API response -> key used in the Kafka record
fields = dict(
    codeLieu="stop_code",
    libelle="stop_name",
    distance="stop_distance",
)

# Fetch the stops from the URL above and publish them to the "tan_stops" topic
send_tan_to_kafka(topic="tan_stops", api_url=api_url, fields=fields)


Sent: {'ligne': [{'numLigne': '109'}, {'numLigne': '116'}, {'numLigne': '2'}, {'numLigne': '2B'}, {'numLigne': '50'}, {'numLigne': '59'}, {'numLigne': '89'}, {'numLigne': 'C2'}], 'stop_code': 'LCAR', 'stop_name': 'Le Cardo', 'stop_distance': '256 m'}
Sent: {'ligne': [{'numLigne': '59'}], 'stop_code': 'AURR', 'stop_name': 'Aurore', 'stop_distance': '324 m'}
Sent: {'ligne': [{'numLigne': '116'}, {'numLigne': '2B'}, {'numLigne': '50'}, {'numLigne': '59'}, {'numLigne': 'C2'}], 'stop_code': 'LRHE', 'stop_name': 'Les Roches', 'stop_distance': '428 m'}
Sent: {'ligne': [{'numLigne': '109'}, {'numLigne': '116'}, {'numLigne': '2B'}, {'numLigne': '50'}, {'numLigne': '89'}], 'stop_code': 'CORA', 'stop_name': 'Conraie', 'stop_distance': '430 m'}
Sent: {'ligne': [{'numLigne': '59'}], 'stop_code': 'BDLA', 'stop_name': 'Bout des Landes', 'stop_distance': '432 m'}
Sent 5 records.


In [None]:
from kafka import KafkaConsumer
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Kafka consumer used to pull the stop records as a finite batch
consumer = KafkaConsumer(
    "tan_stops",
    bootstrap_servers="kafka1:9092",
    # With no committed offset the default policy is "latest", which would
    # skip every record the producer cell has already published.
    auto_offset_reset="earliest",
    # End the iteration after 5 s without a new record. By default iterating
    # a KafkaConsumer blocks forever, so this cell would never finish.
    consumer_timeout_ms=5000,
    value_deserializer=lambda x: json.loads(x.decode('utf-8'))
)

# Collect the decoded Kafka messages into a list, then release the connection
try:
    data = [message.value for message in consumer]
finally:
    consumer.close()

# Convert to a pandas DataFrame
df = pd.DataFrame(data)

# Check the structure of the data
print(df.head())

# Save as CSV for later analysis (optional)
df.to_csv("tan_stops_data.csv", index=False)


In [None]:
# stop_distance is text with a unit suffix (the recorded producer output shows
# values such as '256 m'), so a direct astype(float) raises ValueError.
# Keep only the leading number; anything unparsable becomes NaN and is dropped.
# NOTE(review): every recorded sample is in metres; if the API can also return
# another unit the values would need rescaling - confirm.
distance_m = pd.to_numeric(
    df["stop_distance"].astype(str).str.extract(r"(\d+(?:\.\d+)?)", expand=False),
    errors="coerce",
).dropna()

sns.histplot(distance_m, bins=30, kde=True)
plt.xlabel("Distance (mètres)")
plt.ylabel("Nombre d'arrêts")
plt.title("Répartition des arrêts par distance")
plt.show()


In [None]:
# Ten most frequent stop names and how many times each one appears in df
top_stops = df["stop_name"].value_counts().head(10)

# One bar per stop name, drawn on an explicitly created 10x5 figure
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=top_stops.index, y=top_stops.values, palette="viridis", ax=ax)

# Tilt the stop names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel("Nom des arrêts")
ax.set_ylabel("Nombre d'apparitions")
ax.set_title("Top 10 des arrêts les plus fréquents")
plt.show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StructType, StringType

# Create the Spark session
spark = SparkSession.builder \
    .appName("KafkaSparkStreaming") \
    .getOrCreate()

# Schema of the JSON payload carried in the Kafka message value
schema = StructType() \
    .add("stop_code", StringType()) \
    .add("stop_name", StringType()) \
    .add("stop_distance", StringType())

# Streaming read from Kafka
df_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("subscribe", "tan_stops") \
    .load()

# Parse the JSON value. The JSON schema has no time field, so the Kafka
# record's own `timestamp` column is carried along explicitly; selecting only
# "data.*" would drop it and the watermark/window below would fail on an
# unresolved column.
df_parsed = df_stream \
    .selectExpr("CAST(value AS STRING) AS value", "timestamp") \
    .select(from_json(col("value"), schema).alias("data"), col("timestamp")) \
    .select("data.*", "timestamp")

# Step 2: 5-minute time window over the stops, counted per stop name
df_windowed = df_parsed \
    .withWatermark("timestamp", "5 minutes") \
    .groupBy(
        window(col("timestamp"), "5 minutes"),
        col("stop_name")
    ) \
    .count()

# Print the results to the console as they are updated.
# Note: in "complete" mode the whole result table is re-emitted on every
# trigger, so the watermark is not used to discard old windows.
query = df_windowed.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

# Blocks until the query is stopped or fails; interrupt the cell to move on
query.awaitTermination()


In [None]:
# Schema of the wait-time JSON payload
schema_wait = StructType() \
    .add("codeArret", StringType()) \
    .add("temps", StringType()) \
    .add("numLigne", StringType())

# Streaming read of the wait times
df_wait_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("subscribe", "tan_wait_times") \
    .load()

# Parse the JSON value and keep the Kafka record `timestamp`, which the
# watermark and window below need (the JSON schema has no time field).
# `temps` is declared as a string and grouped avg() only accepts numeric
# columns, so its leading number is extracted and cast to double.
# NOTE(review): the producer for "tan_wait_times" is not visible here; values
# without any digits become null and are ignored by avg() - confirm the format.
df_wait_parsed = df_wait_stream \
    .selectExpr("CAST(value AS STRING) AS value", "timestamp") \
    .select(from_json(col("value"), schema_wait).alias("data"), col("timestamp")) \
    .select("data.*", "timestamp") \
    .selectExpr(
        "codeArret",
        "CAST(regexp_extract(temps, '([0-9]+(?:[.][0-9]+)?)', 1) AS DOUBLE) AS temps",
        "numLigne",
        "timestamp"
    )

# 10-minute time window: average wait time per line number
df_wait_windowed = df_wait_parsed \
    .withWatermark("timestamp", "10 minutes") \
    .groupBy(
        window(col("timestamp"), "10 minutes"),
        col("numLigne")
    ) \
    .avg("temps")

# Streaming output to the console
query_wait = df_wait_windowed.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

# Blocks until the query is stopped or fails
query_wait.awaitTermination()
