# Analyse du trafic cycliste à Paris en fonction de la météo

Ce mini-projet d'extraction et d'analyse de données autour du trafic cycliste à Paris a pour objectif :
- Comprendre et appliquer le concept d'ETL (Extract, Trasnform, Load)
- Explorer des données ouvertes (open data)
- Croiser deux sources de données (API + scraping)
- Stocker les données dans une base SQL (SQLite3)
- Réaliser une visualisation (Streamlit) et/ou un modèle simple

Précisions quant au concept d'ETL :
- **Extract**: récupérer des données depuis une source (API, fichier, web...)
- **Transform**: nettoyer, reformater, enrichir les données
- **Load**: stocker les données dans une format structuré (CSV, DB, ...)

On souhaite répondre à la problématique suivante :

**Comment la météo influence-t-elle l'utilisation des pistes cyclables à Paris ?**

---

## Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import  datetime
from dateutil.relativedelta import relativedelta

## Extraction des données
Sources :

- [Données météo station Tour Eiffel](https://www.meteociel.fr/climatologie/obs_villes.php)
- [Données compteurs de vélos pour le site de comptage nommé 36 quai de Grenelle](https://opendata.paris.fr/explore/dataset/comptage-velo-donnees-compteurs/api/?disjunctive.id_compteur&disjunctive.nom_compteur&disjunctive.id&disjunctive.name)


### Scraping Meteociel

In [None]:
class MeteocielProcessor:
    def __init__(self):
        self.base_url = "https://www.meteociel.fr/climatologie/obs_villes.php?code2=75107005"
    
    def get_daily_data_from_period(self, start_month:int, start_year:int, end_month:int, end_year:int):
        # Init on start date
        current_date = datetime.date(year=start_year, month=start_month, day=1)
        end_date = datetime.date(year=end_year, month=end_month, day=1)
        try:
            df = self.get_daily_data_from_date(current_date.month, current_date.year)
        except:
            df = pd.DataFrame()
            
        # Loop until end date is reached
        while current_date < end_date:
            current_date = current_date + relativedelta(months=1)
            try:
                df = pd.concat([df, self.get_daily_data_from_date(current_date.month, current_date.year)], axis=0)
            except:
                print(f"Failed to load data from {current_date.month}-{current_date.year}")
        return df
    
    def get_daily_data_from_date(self, month:int, year:int):
        page = self.fetch_page_from_date(month, year)
        df = self.get_daily_data_from_page(page, month, year)
        return df
    
    def fetch_page_from_date(self, month:int, year:int):
        """
        Fetch the content of the the meteociel website for station 'Eiffel Tower' for a given month & year.as_integer_ratio

        :
        """
        # Error handling
        if month not in [i for i in range(1,13)]:
            raise ValueError(f"Month must be an `int` from 1 to 12, was given {month} instead.")
        if year not in [i for i in range(1996, 2026)]:
            raise ValueError(f"Year must be an ``int from 1996 to 2025, was given {year} instead.")
        
        # Fetch data
        url = f"{self.base_url}&mois={month}&annee={year}"
        response = requests.get(url)

        if response.status_code == 200:
            #print(f"Content from {month}-{year} successfully fetched.")
            return BeautifulSoup(response.content, "html.parser")
        else:
            print(f"Failed to load content from {month}-{year}.")
            print(response.text)
    
    def get_daily_data_from_page(self, page, month:int, year:int):
        data = {"date": [],
        "temp_max": [],
        "temp_min": []
        }

        # Retrieve dates
        dates_content = (
            page
            .find_all("table", cellpadding="2")[0]
            .find_all("td", bgcolor="#FFFFCC")
            )
        for el in dates_content:
            data["date"].append(f"{el.text.split(" ")[-1]}-{month}-{year}")

        # Retrieve max temp
        maxtemp_content = (
            page
            .find_all("table", cellpadding="2")[0]
            .find_all("td", bgcolor="#FFDDDD")
        )
        for el in maxtemp_content:
            temp = el.text.split()[0]
            if temp == "---":
                temp = None
            else:
                temp = float(temp)
            data["temp_max"].append(temp)

        # Retrieve min temp
        mintemp_content = (
            page
            .find_all("table", cellpadding="2")[0]
            .find_all("td", bgcolor="#DDDDFF")
        )
        for el in mintemp_content:
            temp = el.text.split()[0]
            if temp == "---":
                temp = None
            else:
                temp = float(temp)
            data["temp_min"].append(temp)
        
        # Transform to dataframe and return
        df = pd.DataFrame(data)
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
        df = df.set_index(keys="date")
        return df

In [None]:
response = meteociel_processor.fetch_page_from_date(2, 2024)
response

In [139]:
meteociel_processor = MeteocielProcessor()
start_date = datetime.date(year=2024, month=1, day=1)
end_date = datetime.date(year=2025, month=12, day=1)

df = meteociel_processor.get_daily_data_from_period(start_date.month, start_date.year, end_date.month, end_date.year)
df.head()

Failed to load data from 2-2024
Failed to load data from 3-2024
Failed to load data from 4-2024
Failed to load data from 5-2024
Failed to load data from 6-2024
Failed to load data from 7-2024
Failed to load data from 8-2024
Failed to load data from 9-2024
Failed to load data from 10-2024
Failed to load data from 11-2024
Failed to load data from 12-2024
Failed to load data from 1-2025
Failed to load data from 2-2025
Failed to load data from 3-2025
Failed to load data from 4-2025
Failed to load data from 5-2025
Failed to load data from 6-2025
Failed to load data from 7-2025
Failed to load data from 8-2025
Failed to load data from 9-2025
Failed to load data from 10-2025
Failed to load data from 11-2025
Failed to load data from 12-2025


In [None]:
df_meteociel.to_csv("../data/daily-temperatures-2024-2025.csv")

### API Open Data

In [98]:
import json

def update_cyclist_data(data, offset):
    url = "https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/comptage-velo-donnees-compteurs/records"
    params = {
        "select": ["sum_counts","date"],
        "limit": 100,
        "offset": offset,
        "refine": ['nom_compteur:36 quai de Grenelle NE-SO']
        }
    response = json.loads(
        requests.get(url, params).content
        )
    
    data["date"] = data["date"] + [dic['date'] for dic in response["results"]]
    data["sum_count"] = data["sum_count"] + [dic['sum_counts'] for dic in response["results"]]

def get_total_count():
    url = "https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/comptage-velo-donnees-compteurs/records"
    offset = 0
    params = {
        "select": ["sum_counts","date"],
        "limit": 0,
        "offset": offset,
        "refine": ['nom_compteur:36 quai de Grenelle NE-SO']
        }
    response = json.loads(
        requests.get(url, params).content
        )
    return response['total_count']

In [99]:
data = {
    "date": [],
    "sum_count": []
    }
offset = 0
while offset <= get_total_count():
    update_cyclist_data(data, offset)
    offset += 100

In [128]:
df_cyclist = pd.DataFrame(data)
df_cyclist["date"] = pd.to_datetime(df_cyclist["date"])
df_cyclist = df_cyclist.set_index("date")

## Transformation des données

Les données de comptage de vélo sont échantillonnées toutes les heures, là où les données de températures sont quotidiennes.
- Nous commençons donc par récupérer le compte journalier de vélo avec un `groupby().sum()`
- Avant de pouvoir `join` les deux tables.

In [129]:
df_cyclist = df_cyclist.groupby(df_cyclist.index.date).sum()
df_cyclist.head()

Unnamed: 0,sum_count
2024-11-01,1389
2024-11-02,1601
2024-11-03,1409
2024-11-04,2947
2024-11-05,3125


In [130]:
df = df_cyclist.join(df_meteociel, how='left')
df.head()

NameError: name 'df_meteociel' is not defined