## Config

In [1]:
# parameters
SAVE_OUTPUT = True
SIZE_PLOTS = (12,10)
BUFFER_SIZE = 402.336 

#Location of the data
INPUT_DATA_PATH = "../data/raw/bicizen"
OUTPUT_DATA_PATH = "../data/interim/bicizen"
INPUT_PROCESSED_DATA_PATH = "../data/processed/"


In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go


# Load data

In [3]:
counts=pd.read_csv(Path(INPUT_DATA_PATH) / "Counts_BiciZen.csv", sep=';')
locations=pd.read_csv(Path(INPUT_DATA_PATH) /"Station_Data_BiciZen.csv",sep=';')
counters=gpd.read_parquet(Path(INPUT_PROCESSED_DATA_PATH) /"013_oriented_stations2023.parquet")

# Data management

## Manage stations data

### General adjustements

In [4]:
#Remove the duplicated rows
print("Remove the duplicated rows")
print(locations.shape)
locations=locations.drop_duplicates()
print(locations.shape)

# Drop columns where all values are NaN
print("Drop columns where all values are NaN")
locations = locations.dropna(axis=1, how='all')
print(locations.shape)

#Make geodataframe
locations = gpd.GeoDataFrame(locations, geometry=gpd.points_from_xy(locations.Longitud, locations.Latitud))
locations.set_crs("epsg:4326",inplace=True)
locations.head(2)


Remove the duplicated rows
(929, 16)
(55, 16)
Drop columns where all values are NaN
(55, 10)


Unnamed: 0,ROOT_ID,Location,Street,Direcction,Cardinals,Infraestrucutura,Codigo Infraestructura,Equip de mesura,Latitud,Longitud,geometry
0,1060335,C/ Bilbao (Sentit Muntanya),C/ Bilbao,(Sentit Muntanya),(Sentit Muntanya),Protected separator,7,BiciZen,41.40476,2.20095,POINT (2.20095 41.40476)
1,1048229,C/ de Bilbao 117 (Sentit Trànsit Rodat),C/ de Bilbao 117,(Sentit Trànsit Rodat),Sentit Mar,Protected separator,7,BiciZen,41.40476,2.20095,POINT (2.20095 41.40476)


### Refine Attributes 

In [5]:
# Replace typos in the "Cardinals" column
locations["Cardinals"].replace({
    'noroeste': 'NW',
    'Este': 'E',
    'sureste': 'SE',
    'oeste': 'W',
    '(Nord)': 'N',
    'nordeste': 'NE',
    'suroeste': 'SW',
    'Oeste': 'W',
    'Norte': 'N',
    '(Sud)': 'S',
    'Sur': 'S',
    'sur': 'S'
}, inplace=True)

def assign_direction(text):
    if text is None:
        return 'No category'  # Or any other value you want to assign for None
    elif 'Llobregat' in text:
        return "SW"
    elif 'mar' in text or 'baixada' in text or 'Mar' in text:
        return "SE"
    elif 'muntanya' in text or 'pujada' in text or 'montanya' in text or 'Muntanya' in text:
        return "NW"
    elif 'Besòs' in text:
        return "NE"
    elif 'carril BICI' == text:
        return 'Unidirectional'
    else:
        return text  # Default category if no keyword matches
    
locations['direction_counter'] = locations['Cardinals'].apply(assign_direction)
locations['direction_counter'].value_counts()

direction_counter
SE    14
NW    12
NE     8
SW     8
W      4
N      3
E      3
S      3
Name: count, dtype: int64

In [6]:
locations=locations[['ROOT_ID','Street', 'Codigo Infraestructura', 'direction_counter',  'geometry']]
locations.head()

Unnamed: 0,ROOT_ID,Street,Codigo Infraestructura,direction_counter,geometry
0,1060335,C/ Bilbao,7,NW,POINT (2.20095 41.40476)
1,1048229,C/ de Bilbao 117,7,SE,POINT (2.20095 41.40476)
4,1048249,Rambla de l'Onze de Setembre 2,3,W,POINT (2.19482 41.43044)
6,1048273,C/ de Pi i Margall 114,1,N,POINT (2.16471 41.41152)
7,1048896,C/ Jocs Florals 175,1,SE,POINT (2.13685 41.37023)


### Add real counters id

In [7]:
#Project both to local coordinates
locations.to_crs("epsg:25831",inplace=True)
counters.to_crs("epsg:25831",inplace=True)

#Make a spatial Join
locations_buffer = locations.copy()
locations_buffer["geometry"] = locations_buffer["geometry"].buffer(50)

joined_buffer = gpd.sjoin(locations_buffer, counters, how="inner",predicate="intersects")
print(joined_buffer.shape)

#Keep the ones that match
joined_match = joined_buffer[joined_buffer['direction_counter_left'] == joined_buffer['direction_counter_right']]
print(joined_match.shape)

(28, 11)
(14, 11)


In [8]:

id_to_root_id = {
    #The ones found manually
    1051865: 20348,  # RONDA GUINARDO - TORRENT MELIS 2 (sentit Llobregat) NE
    1048223: 20248,  # MERIDIANA - CLOT #fix
    1048244: 20227,  # GARCILASO - AV. MERIDIANA
    #The ones from the spatial join
    1048258: 20194,  # DE PI FERRER - C/DEL GARROFERS (carril BICI se...)
    1053952: 20195,  # DE PI FERRER - ALSAMORA (carril BICI sentit mar)
    1048280: 20169,  # AV. MARIA CRISTINA - PL. ESPANYA (carril BICI ...)
    1057494: 20168,  # AV. MARIA CRISTINA - RIUS I TAULET (carril BIC...)
    1051840: 20113,  # Pg. Pujades - Picasso (carril BICI sentit Besòs)
    1047953: 20112,  # Pg. Pujajdes - Lluis Companys (carril BICI sen...)
    1048272: 20241,  # ARISTIDES MAILLOL - CARDENAL REIG (carril BICI)
    1054514: 20242,  # ARISTIDES MAILLOL - AV. DOCTOR MARAÑON (carril...)
    1048274: 20099,  # TRAV. DE GRACIA - EN GRASSOT (sentit Besòs)
    1049380: 20098,  # TRAV. DE GRACIA - HIPOLIT LAZARO (carril BICI)
    1048268: 20347,  # RONDA GUINARDO - TORRENT MELIS 2 (sentit Llobr...)
    1057984: 20325,  # CALABRIA - VALENCIA (sentit mar)
    1029389: 20326,  # CALABRIA - ARAGO (sentit muntanya)
    1046328: 20404,  # VIA AUGUSTA -PG. DE LA BONANOVA (carril bici -...)
}

# Convert the dictionary to a DataFrame
id_to_root_id_df = pd.DataFrame(list(id_to_root_id.items()), columns=['ROOT_ID', 'id'])
id_to_root_id_df.shape


(17, 2)

In [9]:
locations=locations.merge(id_to_root_id_df, on="ROOT_ID", how="left")
locations.head()

Unnamed: 0,ROOT_ID,Street,Codigo Infraestructura,direction_counter,geometry,id
0,1060335,C/ Bilbao,7,NW,POINT (433211.182 4583998.871),
1,1048229,C/ de Bilbao 117,7,SE,POINT (433211.182 4583998.871),
2,1048249,Rambla de l'Onze de Setembre 2,3,W,POINT (432725.306 4586854.582),
3,1048273,C/ de Pi i Margall 114,1,N,POINT (430189.262 4584777.934),
4,1048896,C/ Jocs Florals 175,1,SE,POINT (427815.131 4580216.804),


# Manage count

### Group or scale by hour

In [10]:
counts["TIME_MINUTES"].describe()

count    929.000000
mean      10.322928
std        2.522127
min       10.000000
25%       10.000000
50%       10.000000
75%       10.000000
max       30.000000
Name: TIME_MINUTES, dtype: float64

In [11]:
#Sum all into hour
counts=counts.fillna(0)
counts["count"]=counts["NUMBER_BICYCLES"]+counts["NUMBER_WOMEN_CYCLIST"]+counts["NUMBER_CHILDREN"]+counts["NUMBER_SENIORS"]+counts["NUMBER_SCOOTERS"]
counts_by_hour = counts.groupby(['ROOT_ID', 'Date', 'Hour'])[['count', 'TIME_MINUTES']].sum().reset_index()
print(counts.shape)
print(counts_by_hour.shape)
counts_by_hour.head()

(929, 19)
(722, 5)


Unnamed: 0,ROOT_ID,Date,Hour,count,TIME_MINUTES
0,747957,02/11/2024,12,19.0,10
1,1029389,05/11/2024,15,28.0,10
2,1029389,06/11/2024,8,58.0,10
3,1029389,06/11/2024,16,33.0,10
4,1029389,07/11/2024,14,26.0,10


In [12]:
counts_by_hour["count"].describe()

count    722.000000
mean      17.322715
std       21.276197
min        0.000000
25%        4.000000
50%       10.000000
75%       23.750000
max      215.000000
Name: count, dtype: float64

In [13]:
#Make factor
counts_by_hour["Factor"]=60/counts_by_hour["TIME_MINUTES"]
print(counts_by_hour["Factor"].value_counts())
print("\n")
#Apply factor
counts_by_hour["count"]=counts_by_hour["count"]*counts_by_hour["Factor"]
print(counts_by_hour["count"].describe())

#Check it adds up
print("VAlues are",(counts_by_hour["TIME_MINUTES"]*counts_by_hour["Factor"]).unique())

Factor
6.000000    632
3.000000     46
1.000000     26
2.000000     11
1.500000      3
0.666667      2
1.200000      1
0.545455      1
Name: count, dtype: int64


count    722.000000
mean      79.634441
std       78.507556
min        0.000000
25%       24.000000
50%       54.000000
75%      114.000000
max      684.000000
Name: count, dtype: float64
VAlues are [60. 60.]


In [14]:
attributes = ['ROOT_ID', 'Date','Hour','count']
df = counts_by_hour[attributes].copy()

df["date"] = pd.to_datetime(df["Date"], dayfirst=True)

# Extract Year, Month, Weekday, and Day from date
df["Year"] = df["date"].dt.year
df["Month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.weekday
df["Day"] = df["date"].dt.day

#Make 'Year', 'Month', 'Day', 'Hour' int
# Fill NaN values with 0 before converting to int
df[['Year', 'Month', 'Day', 'Hour']] = df[['Year', 'Month', 'Day', 'Hour']].astype(int)
df=df.rename(columns={"count":"Count"})
df

Unnamed: 0,ROOT_ID,Date,Hour,Count,date,Year,Month,weekday,Day
0,747957,02/11/2024,12,114.0,2024-11-02,2024,11,5,2
1,1029389,05/11/2024,15,168.0,2024-11-05,2024,11,1,5
2,1029389,06/11/2024,8,348.0,2024-11-06,2024,11,2,6
3,1029389,06/11/2024,16,198.0,2024-11-06,2024,11,2,6
4,1029389,07/11/2024,14,156.0,2024-11-07,2024,11,3,7
...,...,...,...,...,...,...,...,...,...
717,1060375,03/12/2024,9,30.0,2024-12-03,2024,12,1,3
718,1060375,13/12/2024,16,0.0,2024-12-13,2024,12,4,13
719,1060375,29/11/2024,17,6.0,2024-11-29,2024,11,4,29
720,1061432,03/12/2024,9,90.0,2024-12-03,2024,12,1,3


### Join with location

In [15]:
gdf = df.merge(locations[["ROOT_ID", "geometry",]], on="ROOT_ID", how="left")
gdf = gpd.GeoDataFrame(gdf, geometry='geometry')
print(gdf.isna().sum().sum())
print(gdf.shape)
gdf.head()

0
(722, 10)


Unnamed: 0,ROOT_ID,Date,Hour,Count,date,Year,Month,weekday,Day,geometry
0,747957,02/11/2024,12,114.0,2024-11-02,2024,11,5,2,POINT (426930.939 4582277.492)
1,1029389,05/11/2024,15,168.0,2024-11-05,2024,11,1,5,POINT (429028.861 4581597.123)
2,1029389,06/11/2024,8,348.0,2024-11-06,2024,11,2,6,POINT (429028.861 4581597.123)
3,1029389,06/11/2024,16,198.0,2024-11-06,2024,11,2,6,POINT (429028.861 4581597.123)
4,1029389,07/11/2024,14,156.0,2024-11-07,2024,11,3,7,POINT (429028.861 4581597.123)


In [16]:
gdf["Count"].describe()

count    722.000000
mean      79.634441
std       78.507556
min        0.000000
25%       24.000000
50%       54.000000
75%      114.000000
max      684.000000
Name: Count, dtype: float64

### Metadata counters

In [17]:
#Get number of observations by location
observations = gdf.groupby("ROOT_ID")["Count"].count().reset_index()
observations.columns = ["ROOT_ID", "Observations"]

# Get number of days with data per location
days_with_data = gdf.groupby("ROOT_ID")["date"].nunique().reset_index()
days_with_data.columns = ["ROOT_ID", "DaysWithData"]

# Get if location has weekend data or not
weekend_data = gdf[gdf["weekday"].isin([5, 6])].groupby("ROOT_ID")["weekday"].nunique().reset_index()
weekend_data["HasWeekendData"] = weekend_data["weekday"] > 0
weekend_data = weekend_data[["ROOT_ID", "HasWeekendData"]]

# Merge the new attributes
locations = locations.merge(days_with_data, on="ROOT_ID", how="left")
locations = locations.merge(weekend_data, on="ROOT_ID", how="left")
locations=locations.merge(observations, on="ROOT_ID", how="left")
locations["HasWeekendData"] = locations["HasWeekendData"].fillna(False)



## Plot observations 

In [None]:



# Group data by date and calculate the total number of observations per day
observations_by_day = gdf.groupby("date")["Count"].count().reset_index()
observations_by_day["weekday"] = observations_by_day["date"].dt.weekday
observations_by_day["is_weekend"] = observations_by_day["weekday"].isin([5, 6])
observations_by_day["Month"] = observations_by_day["date"].dt.month

# Create the plot
fig = px.bar(
    observations_by_day,
    x="date",
    y="Count",
    color="is_weekend",
    color_discrete_map={True: "red", False: "blue"},
    labels={"is_weekend": "Weekend"},
    title="Number of Observations by Day (Weekends Highlighted)"
)

fig.update_layout(xaxis_title="Date", yaxis_title="Number of Observations")

# Add background highlights for each month with a different color
month_colors = ['LightGreen', 'Orange', 'LightCoral']  # Replaced 'LightBlue' with 'Orange'
month_name = ['October','November','December']
for i, (month, month_data) in enumerate(observations_by_day.groupby("Month")):
    start_date = month_data['date'].min()
    end_date = month_data['date'].max()
    fig.add_shape(
        type="rect",
        x0=start_date,
        x1=end_date,
        y0=0,
        y1=observations_by_day['Count'].max(),
        fillcolor=month_colors[i % len(month_colors)],
        opacity=0.2,
        layer="below",
        line_width=0
    )

# Add month colors to the legend
for i, color in enumerate(month_colors):
    fig.add_trace(
        go.Scatter(
            x=[None], y=[None], mode='markers',
            marker=dict(size=10, color=color),
            name=month_name[i]
        )
    )

# Show the updated figure
fig.show()


In [None]:

# Create the bar chart
fig = px.bar(
    counters_by_month,
    x='Month',
    y='Number of Counters',
    title="Number of Counters with More Than 10 Observations by Month",
    labels={'Month': 'Month', 'Number of Counters': 'Number of Counters'},
    color_continuous_scale='Blues'  # Optional: Use a color scale
)

fig.update_layout(
    xaxis=dict(tickmode='linear'),  # Ensure all months are shown on the x-axis
    yaxis_title="Number of Counters",
    xaxis_title="Month"
)

# Show the plot
fig.show()

## Save output

In [None]:
if SAVE_OUTPUT:
    gdf.to_parquet(f'{OUTPUT_DATA_PATH}/bicizen.parquet')
    locations.to_parquet(f'{OUTPUT_DATA_PATH}/bicizen_stations.parquet')


## Watermark

In [None]:
!python -m pip install watermark --quiet

In [None]:
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [None]:
%watermark

Last updated: 2025-03-28T12:33:56.334882+01:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 25 Model 68 Stepping 1, AuthenticAMD
CPU cores   : 16
Architecture: 64bit



In [None]:
%watermark --iversions

numpy    : 1.24.4
plotly   : 5.24.1
pandas   : 2.0.3
geopandas: 0.13.2



In [None]:
!lsb_release -a

"lsb_release" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
