Import necessary libraries

In [1]:
import pandas as pd
import requests
import os
from datetime import datetime
import numpy as np



# Downloading data (Traffic and Air quality). 
Automatically downloads from the following dates :

In [2]:

download_from = datetime(2024, 1, 1)  # Example: January 2023
download_until = datetime(2024, 4, 30) # Or set a specific end date

Then run the following to actually download the files.

In [3]:

# Catalan month names
catalan_months = {
    1: "Gener",
    2: "Febrer",
    3: "Marc",
    4: "Abril",
    5: "Maig",
    6: "Juny",
    7: "Juliol",
    8: "Agost",
    9: "Setembre",
    10: "Octubre",
    11: "Novembre",
    12: "Desembre",
}

# Trams transit relacio (relation between ids and locations)
trams_relacio_url = "https://opendata-ajuntament.barcelona.cat/data/dataset/1090983a-1c40-4609-8620-14ad49aae3ab/resource/1d6c814c-70ef-4147-aa16-a49ddb952f72/download/transit_relacio_trams.csv"
trams_relacio_path = "./data/transit_relacio_trams.csv"

# Air quality stations info (including lat and long)
# (The stations are unchanged since 2023 so downloading only the 2025 version is enough)
air_stations_info_url = "https://opendata-ajuntament.barcelona.cat/data/dataset/4dff88b1-151b-48db-91c2-45007cd5d07a/resource/d1aa40d7-66f9-451b-85f8-955b765fdc2f/download/2025_qualitat_aire_estacions.csv"
air_stations_info_path = "./data/air_stations_info.csv"

def generate_urls(download_from, download_until):
    urls = []
    current_date = datetime(download_from.year, download_from.month, 1)
    end_date = datetime(download_until.year, download_until.month, 1)

    while current_date <= end_date:
        year = current_date.year
        month = current_date.month
        month_name = catalan_months[month]

        # Generate URLs for both datasets
        tram_url = f"https://opendata-ajuntament.barcelona.cat/resources/auto/transit/{year}_{month:02d}_{month_name}_TRAMS_TRAMS.csv"
        aire_url = f"https://opendata-ajuntament.barcelona.cat/resources/bcn/QualitatAire/{year}_{month:02d}_{month_name}_qualitat_aire_BCN.csv"

        tram_filename = f"data/TRAMS_{year}_{month:02d}_{month_name}.csv"
        aire_filename = f"data/QualitatAire_{year}_{month:02d}_{month_name}.csv"

        urls.append((tram_url, tram_filename))
        urls.append((aire_url, aire_filename))

        # Move to the next month
        if current_date.month == 12:
            current_date = datetime(current_date.year + 1, 1, 1)
        else:
            current_date = datetime(current_date.year, current_date.month + 1, 1)

    return urls

def download_file(url, filename):
    if os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return

    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {filename}")
    except Exception as e:
        print(f"Failed to download {filename}: {e}")


# Create a "data" folder:
if not os.path.exists("./data"):
    os.mkdir("./data")

urls = generate_urls(download_from, download_until)

print("Dwonloading files...")
for url, filename in urls:
    download_file(url, filename)


# Then download the TRAMS relacio file
if not os.path.exists(trams_relacio_path):
    download_file(trams_relacio_url, trams_relacio_path)


# And the air stations info 
if not os.path.exists(air_stations_info_path):
    download_file(air_stations_info_url, air_stations_info_path)

Dwonloading files...
File already exists: data/TRAMS_2024_01_Gener.csv. Skipping download.
File already exists: data/QualitatAire_2024_01_Gener.csv. Skipping download.
File already exists: data/TRAMS_2024_02_Febrer.csv. Skipping download.
File already exists: data/QualitatAire_2024_02_Febrer.csv. Skipping download.
File already exists: data/TRAMS_2024_03_Marc.csv. Skipping download.
File already exists: data/QualitatAire_2024_03_Marc.csv. Skipping download.
File already exists: data/TRAMS_2024_04_Abril.csv. Skipping download.
File already exists: data/QualitatAire_2024_04_Abril.csv. Skipping download.


# Data treatment and cleaning


## Creating a merged dataset

### Combining all the air quality files

In [4]:
## Create dataframe of measurements

air_quality_files = [filename for url, filename in urls if "QualitatAire" in filename]
#print(air_quality_files)

# Covnert to pandas dataframes
air_quality_dfs = [pd.read_csv(file) for file in air_quality_files]
# Combine them into a single dataframe
air_quality_combined = pd.concat(air_quality_dfs, ignore_index=True)
#print(air_quality_combined.head())


## Create dataframe of station info
air_station_info = pd.read_csv(air_stations_info_path)
# Only keep location data for each station
air_station_locations = air_station_info[['Estacio', 'Latitud', 'Longitud']].drop_duplicates(subset=['Estacio'])
#print(air_station_locations.head(n=20))

## Now merge the air_quality measurements and the location

air_quality = air_quality_combined.merge(
    right=air_station_locations,
    left_on='ESTACIO',
    right_on='Estacio',
    how='left'
).drop(columns=['Estacio'])

print(air_quality.head())

   CODI_PROVINCIA  PROVINCIA  CODI_MUNICIPI   MUNICIPI  ESTACIO  \
0               8  Barcelona             19  Barcelona        4   
1               8  Barcelona             19  Barcelona        4   
2               8  Barcelona             19  Barcelona        4   
3               8  Barcelona             19  Barcelona        4   
4               8  Barcelona             19  Barcelona        4   

   CODI_CONTAMINANT   ANY  MES  DIA   H01  ...   H21  V21   H22  V22   H23  \
0                 7  2024    1    1   3.0  ...  15.0    V  44.0    V  20.0   
1                 7  2024    1    2  14.0  ...   2.0    V   3.0    V   6.0   
2                 7  2024    1    3   2.0  ...   2.0    V   1.0    V   2.0   
3                 7  2024    1    4  20.0  ...   2.0    V  29.0    V  57.0   
4                 7  2024    1    5  29.0  ...   1.0    V   2.0    V   1.0   

   V23   H24  V24  Latitud  Longitud  
0    V  12.0    V  41.4039    2.2045  
1    V   2.0    V  41.4039    2.2045  
2    V   3.

### Combining traffic files

In [5]:
## Create dataframe of measurements

traffic_files = [filename for url, filename in urls if "TRAMS" in filename]

# Covnert to pandas dataframes
traffic_dfs = [pd.read_csv(file) for file in traffic_files]
#print(traffic_dfs[0].head())

# Combine them into a single dataframe
traffic_combined = pd.concat(traffic_dfs, ignore_index=True)
print(traffic_combined.head())


## Create dataframe of station info
trams_info = pd.read_csv(trams_relacio_path)
# Only keep location data for each station
trams_locations = trams_info[['Tram', 'Coordenades']]

# Only keep one location point per section (tram=section)
def mean_coordinate(row):
    # print("entering function mean")
    # print(row)
    coord_text=row['Coordenades']
    numbers=coord_text.split(',')
    assert len(numbers) % 2 == 0
    lats = [float(x) for x in numbers[::2]]
    lons = [float(x) for x in numbers[1::2]]
    # print("infos")
    # print(lats)
    # print(type(lats[0]))
    lat = np.mean(lats)
    lon = np.mean(lons)
    return pd.Series(
        {
            'lat': lat,
            'lon': lon
        }
    )


trams_locations [['lat' , 'lon']] = trams_locations.apply(mean_coordinate, axis=1)
#print(trams_locations.head())


## Now merge the traffic measurements and the location
traffic = traffic_combined.merge(
    right=trams_locations,
    left_on='idTram',
    right_on='Tram',
    how='left'
)

traffic = traffic.drop(['Coordenades', 'Tram', 'estatPrevist'], axis=1)

print("RESULT FINA")
print(traffic.head())

   idTram            data  estatActual  estatPrevist
0       1  20240101000552            0             0
1       2  20240101000552            0             0
2       3  20240101000552            0             0
3       4  20240101000552            0             0
4       5  20240101000552            0             0
RESULT FINA
   idTram            data  estatActual       lat        lon
0       1  20240101000552            0  2.106769  41.382911
1       2  20240101000552            0  2.106769  41.383167
2       3  20240101000552            0  2.117372  41.385579
3       4  20240101000552            0  2.117281  41.385824
4       5  20240101000552            0  2.125109  41.387561


## Dataset issue

For some reason, the city does not provide information for the 'trams' with id greater than 527. In the `traffic` dataframe, there are some records with idTram between 535 and 539. They don't have a latitude/longitude. Let us drop them.

In [6]:
traffic = traffic.dropna(subset=['lat', 'lon'])

# Defining a traffic grid

Now, `traffic` contains the state (from 0-no car to 6-congestioned) of plenty of coordinates in Barcelona. But the density is not homogeneous, hence i will cut the city in a grid and compute an average value for traffic congestion.

In [8]:
# Getting the boundaries of the traffic information
minlat = traffic['lat'].min()
maxlat = traffic['lat'].max()
minlon = traffic['lon'].min()
maxlon = traffic['lon'].max()

#TODO add prints of the distance of the box here


nb_horizontal = 8
nb_vertical = 8

vertical_step = (maxlat - minlat) / nb_vertical
horizontal_step = (maxlon - minlon) / nb_horizontal


# Create new datafram from traffic

""" def getZone(row):
    lat = (row['lat'] - minlat) // vertical_step
    lon = (row['lon'] - minlon) // horizontal_step
    zone_number = (lat * nb_horizontal) + lon


traffic['zone'] = traffic.apply(getZone, axis=1) """
# BETTER WAY (vectorized)
# Create nb_vertical * nb_horizontal zones and assign every row a zone:
lat_idx = np.clip(np.floor((traffic['lat'] - minlat) / vertical_step).astype(int), 0, nb_vertical - 1)
lon_idx = np.clip(np.floor((traffic['lon'] - minlon) / horizontal_step).astype(int), 0, nb_horizontal - 1)
traffic['zone'] = lat_idx * nb_horizontal + lon_idx

assert traffic['zone'].max() <= nb_horizontal * nb_vertical

# Drop lat/lon as we will now use the zone
traffic.drop(['lat', 'lon'], axis=1)


print(traffic.head())
print(air_quality.head())

   idTram            data  estatActual       lat        lon  zone
0       1  20240101000552            0  2.106769  41.382911     2
1       2  20240101000552            0  2.106769  41.383167     2
2       3  20240101000552            0  2.117372  41.385579     3
3       4  20240101000552            0  2.117281  41.385824     3
4       5  20240101000552            0  2.125109  41.387561    11
   CODI_PROVINCIA  PROVINCIA  CODI_MUNICIPI   MUNICIPI  ESTACIO  \
0               8  Barcelona             19  Barcelona        4   
1               8  Barcelona             19  Barcelona        4   
2               8  Barcelona             19  Barcelona        4   
3               8  Barcelona             19  Barcelona        4   
4               8  Barcelona             19  Barcelona        4   

   CODI_CONTAMINANT   ANY  MES  DIA   H01  ...   H21  V21   H22  V22   H23  \
0                 7  2024    1    1   3.0  ...  15.0    V  44.0    V  20.0   
1                 7  2024    1    2  14.0  ..