## Config

In [182]:
# parameters
SAVE_OUTPUT = True
SIZE_PLOTS = (12,10)
BUFFER_SIZE = 402.336 

#Location of the data
INPUT_DATA_PATH = "../data/raw/bicizen"
OUTPUT_DATA_PATH = "../data/interim/bicizen"
INPUT_PROCESSED_DATA_PATH = "../data/processed/"


In [183]:
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path


# Load data

In [184]:
counts=pd.read_csv(Path(INPUT_DATA_PATH) / "Counts_BiciZen.csv", sep=';')
locations=pd.read_csv(Path(INPUT_DATA_PATH) /"Station_Data_BiciZen.csv",sep=';')
counters=gpd.read_parquet(Path(INPUT_PROCESSED_DATA_PATH) /"013_oriented_stations2023.parquet")

# Data management

## Manage stations data

### General adjustements

In [185]:
#Remove the duplicated rows
print(locations.shape)
locations=locations.drop_duplicates()
print(locations.shape)

(929, 16)
(55, 16)


In [186]:
# Drop columns where all values are NaN
locations = locations.dropna(axis=1, how='all')

print(locations.columns)
locations.head(2)

Index(['ROOT_ID', 'Location', 'Street', 'Direcction', 'Cardinals',
       'Infraestrucutura', 'Codigo Infraestructura', 'Equip de mesura',
       'Latitud', 'Longitud'],
      dtype='object')


Unnamed: 0,ROOT_ID,Location,Street,Direcction,Cardinals,Infraestrucutura,Codigo Infraestructura,Equip de mesura,Latitud,Longitud
0,1060335,C/ Bilbao (Sentit Muntanya),C/ Bilbao,(Sentit Muntanya),(Sentit Muntanya),Protected separator,7,BiciZen,41.40476,2.20095
1,1048229,C/ de Bilbao 117 (Sentit Trànsit Rodat),C/ de Bilbao 117,(Sentit Trànsit Rodat),Sentit Mar,Protected separator,7,BiciZen,41.40476,2.20095


In [187]:
#Make geodataframe
locations = gpd.GeoDataFrame(locations, geometry=gpd.points_from_xy(locations.Longitud, locations.Latitud))
locations.set_crs("epsg:4326",inplace=True)
locations.head(2)

Unnamed: 0,ROOT_ID,Location,Street,Direcction,Cardinals,Infraestrucutura,Codigo Infraestructura,Equip de mesura,Latitud,Longitud,geometry
0,1060335,C/ Bilbao (Sentit Muntanya),C/ Bilbao,(Sentit Muntanya),(Sentit Muntanya),Protected separator,7,BiciZen,41.40476,2.20095,POINT (2.20095 41.40476)
1,1048229,C/ de Bilbao 117 (Sentit Trànsit Rodat),C/ de Bilbao 117,(Sentit Trànsit Rodat),Sentit Mar,Protected separator,7,BiciZen,41.40476,2.20095,POINT (2.20095 41.40476)


### Refine Attributes 

In [188]:
locations["Cardinals"].replace({
    'noroeste': 'NW',
    'Este': 'E',
    'sureste': 'SE',
    'oeste': 'W',
    '(Nord)': 'N',
    'nordeste': 'NE',
    'suroeste': 'SW',
    'Oeste': 'W',
    'Norte': 'N',
    '(Sud)': 'S',
    'Sur': 'S',
    'sur': 'S'
}, inplace=True)
def assign_direction(text):
    if text is None:
        return 'No category'  # Or any other value you want to assign for None
    elif 'Llobregat' in text:
        return "SW"
    elif 'mar' in text or 'baixada' in text or 'Mar' in text:
        return "SE"
    elif 'muntanya' in text or 'pujada' in text or 'montanya' in text or 'Muntanya' in text:
        return "NW"
    elif 'Besòs' in text:
        return "NE"
    elif 'carril BICI' == text:
        return 'Unidirectional'
    else:
        return text  # Default category if no keyword matches
    
locations['direction_counter'] = locations['Cardinals'].apply(assign_direction)
locations['direction_counter'].value_counts()

direction_counter
SE    14
NW    12
NE     8
SW     8
W      4
N      3
E      3
S      3
Name: count, dtype: int64

In [189]:
locations=locations[['ROOT_ID','Street', 'Codigo Infraestructura', 'direction_counter',  'geometry']]
locations.head()

Unnamed: 0,ROOT_ID,Street,Codigo Infraestructura,direction_counter,geometry
0,1060335,C/ Bilbao,7,NW,POINT (2.20095 41.40476)
1,1048229,C/ de Bilbao 117,7,SE,POINT (2.20095 41.40476)
4,1048249,Rambla de l'Onze de Setembre 2,3,W,POINT (2.19482 41.43044)
6,1048273,C/ de Pi i Margall 114,1,N,POINT (2.16471 41.41152)
7,1048896,C/ Jocs Florals 175,1,SE,POINT (2.13685 41.37023)


## Real counter

In [191]:
#Project both to local coordinates
locations.to_crs("epsg:25831",inplace=True)
counters.to_crs("epsg:25831",inplace=True)


In [192]:
#Make a spatial Join
locations_buffer = locations.copy()
locations_buffer["geometry"] = locations_buffer["geometry"].buffer(50)

joined_buffer = gpd.sjoin(locations_buffer, counters, how="inner",predicate="intersects")
print(joined_buffer.shape)

#Keep the ones that match
joined_match = joined_buffer[joined_buffer['direction_counter_left'] == joined_buffer['direction_counter_right']]
print(joined_match.shape)

(28, 11)
(14, 11)


In [193]:
#Fixes:
#Fixes:
id_to_root_id = {
    #The ones found manually
    1051865: 20348,  # RONDA GUINARDO - TORRENT MELIS 2 (sentit Llobregat) NE
    1048223: 20248,  # MERIDIANA - CLOT #fix
    1048244: 20227,  # GARCILASO - AV. MERIDIANA
    #The ones from the spatial join
    1048258: 20194,  # DE PI FERRER - C/DEL GARROFERS (carril BICI se...)
    1053952: 20195,  # DE PI FERRER - ALSAMORA (carril BICI sentit mar)
    1048280: 20169,  # AV. MARIA CRISTINA - PL. ESPANYA (carril BICI ...)
    1057494: 20168,  # AV. MARIA CRISTINA - RIUS I TAULET (carril BIC...)
    1051840: 20113,  # Pg. Pujades - Picasso (carril BICI sentit Besòs)
    1047953: 20112,  # Pg. Pujajdes - Lluis Companys (carril BICI sen...)
    1048272: 20241,  # ARISTIDES MAILLOL - CARDENAL REIG (carril BICI)
    1054514: 20242,  # ARISTIDES MAILLOL - AV. DOCTOR MARAÑON (carril...)
    1048274: 20099,  # TRAV. DE GRACIA - EN GRASSOT (sentit Besòs)
    1049380: 20098,  # TRAV. DE GRACIA - HIPOLIT LAZARO (carril BICI)
    1048268: 20347,  # RONDA GUINARDO - TORRENT MELIS 2 (sentit Llobr...)
    1057984: 20325,  # CALABRIA - VALENCIA (sentit mar)
    1029389: 20326,  # CALABRIA - ARAGO (sentit muntanya)
    1046328: 20404,  # VIA AUGUSTA -PG. DE LA BONANOVA (carril bici -...)
}

# Convert the dictionary to a DataFrame
id_to_root_id_df = pd.DataFrame(list(id_to_root_id.items()), columns=['ROOT_ID', 'id'])
id_to_root_id_df.shape


(17, 2)

In [207]:
locations=locations.merge(id_to_root_id_df, on="ROOT_ID", how="left")
locations.head()

Unnamed: 0,ROOT_ID,Street,Codigo Infraestructura,direction_counter,geometry,id_x,id_y
0,1060335,C/ Bilbao,7,NW,POINT (433211.182 4583998.871),,
1,1048229,C/ de Bilbao 117,7,SE,POINT (433211.182 4583998.871),,
2,1048249,Rambla de l'Onze de Setembre 2,3,W,POINT (432725.306 4586854.582),,
3,1048273,C/ de Pi i Margall 114,1,N,POINT (430189.262 4584777.934),,
4,1048896,C/ Jocs Florals 175,1,SE,POINT (427815.131 4580216.804),,


## Explore data

In [195]:
#Explore stations
locations.drop_duplicates(inplace=True)
print(locations.shape)
print(locations.columns)
# print(locations.dtypes)
# locations.head()

(55, 6)
Index(['ROOT_ID', 'Street', 'Codigo Infraestructura', 'direction_counter',
       'geometry', 'id'],
      dtype='object')


In [196]:
#Explore counts
print(counts.shape)
print(counts.columns)
print(counts.dtypes)
counts.tail()

(929, 18)
Index(['ROOT_ID', 'TIME_MINUTES', 'Factor', 'Volumen Bicicletas', 'SPOTTED_AT',
       'Date', 'Weekday', 'Time', 'Year', 'Month', 'Day', 'Hour',
       'NUMBER_BICYCLES', 'NUMBER_WOMEN_CYCLIST', 'NUMBER_CHILDREN',
       'NUMBER_SENIORS', 'NUMBER_SCOOTERS', 'NUMBER_VEHICLES'],
      dtype='object')
ROOT_ID                   int64
TIME_MINUTES              int64
Factor                    int64
Volumen Bicicletas        int64
SPOTTED_AT               object
Date                     object
Weekday                  object
Time                     object
Year                      int64
Month                     int64
Day                       int64
Hour                      int64
NUMBER_BICYCLES         float64
NUMBER_WOMEN_CYCLIST    float64
NUMBER_CHILDREN         float64
NUMBER_SENIORS          float64
NUMBER_SCOOTERS         float64
NUMBER_VEHICLES         float64
dtype: object


Unnamed: 0,ROOT_ID,TIME_MINUTES,Factor,Volumen Bicicletas,SPOTTED_AT,Date,Weekday,Time,Year,Month,Day,Hour,NUMBER_BICYCLES,NUMBER_WOMEN_CYCLIST,NUMBER_CHILDREN,NUMBER_SENIORS,NUMBER_SCOOTERS,NUMBER_VEHICLES
924,1047975,10,6,270,2024-10-28 09:03:00,28/10/2024,lunes,09:03:00,2024,10,28,9,45.0,16.0,,4.0,4.0,
925,1047953,10,6,192,2024-10-28 08:23:00,28/10/2024,lunes,08:23:00,2024,10,28,8,32.0,9.0,,,1.0,
926,1046343,10,6,18,2024-10-25 13:47:00,25/10/2024,viernes,13:47:00,2024,10,25,13,3.0,1.0,,,,
927,1046328,10,6,24,2024-10-25 13:04:00,25/10/2024,viernes,13:04:00,2024,10,25,13,4.0,,,,,
928,1046310,10,6,18,2024-10-25 12:39:00,25/10/2024,viernes,12:39:00,2024,10,25,12,3.0,1.0,,,,


In [197]:
counts.describe()

Unnamed: 0,ROOT_ID,TIME_MINUTES,Factor,Volumen Bicicletas,Year,Month,Day,Hour,NUMBER_BICYCLES,NUMBER_WOMEN_CYCLIST,NUMBER_CHILDREN,NUMBER_SENIORS,NUMBER_SCOOTERS,NUMBER_VEHICLES
count,929.0,929.0,929.0,929.0,929.0,929.0,929.0,929.0,906.0,799.0,407.0,438.0,811.0,370.0
mean,1049844.0,10.322928,5.935414,46.557589,2024.0,11.051668,16.155005,15.218515,8.249448,2.846058,0.253071,0.349315,3.086313,0.435135
std,11195.06,2.522127,0.504425,45.792297,0.0,0.333993,8.533509,4.414494,8.717267,3.504037,0.763924,0.788294,3.231779,1.684412
min,747957.0,10.0,2.0,0.0,2024.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1048268.0,10.0,6.0,18.0,2024.0,11.0,8.0,10.0,3.0,1.0,0.0,0.0,1.0,0.0
50%,1048955.0,10.0,6.0,30.0,2024.0,11.0,16.0,17.0,6.0,2.0,0.0,0.0,2.0,0.0
75%,1052805.0,10.0,6.0,60.0,2024.0,11.0,23.0,18.0,11.0,4.0,0.0,0.0,4.0,0.0
max,1061432.0,30.0,6.0,450.0,2024.0,12.0,31.0,22.0,129.0,44.0,11.0,5.0,37.0,16.0


### Explore counts

In [198]:
counts["SPOTTED_AT"] = pd.to_datetime(counts["SPOTTED_AT"])
counts["weekday"] = counts["SPOTTED_AT"].dt.weekday
counts["weekday"].value_counts()

weekday
3    188
2    169
4    164
0    162
1    160
5     52
6     34
Name: count, dtype: int64

In [199]:
test = counts.groupby('ROOT_ID').agg({"TIME_MINUTES": "sum", "NUMBER_BICYCLES": "sum"})
print(test.shape)
test.sort_values(by='TIME_MINUTES').head(10)
# test.describe()

(55, 2)


Unnamed: 0_level_0,TIME_MINUTES,NUMBER_BICYCLES
ROOT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
747957,10,8.0
1050644,10,14.0
1048276,10,12.0
1048263,10,0.0
1048244,20,7.0
1061432,20,25.0
1060375,30,4.0
1052622,30,10.0
1052618,30,12.0
1046310,30,10.0


In [200]:
attributes = ['ROOT_ID', 'Year', 'Month','weekday', 'Day', 'Hour','Volumen Bicicletas']
df = counts[attributes].copy()
#Make 'Year', 'Month', 'Day', 'Hour' int
# Fill NaN values with 0 before converting to int
df[['Year', 'Month', 'Day', 'Hour']] = df[['Year', 'Month', 'Day', 'Hour']].astype(int)
df = df.rename(columns={'Volumen Bicicletas': 'Count'})
df

Unnamed: 0,ROOT_ID,Year,Month,weekday,Day,Hour,Count
0,1060335,2024,11,4,29,17,52
1,1048229,2024,11,4,29,17,70
2,1060335,2024,11,4,29,17,42
3,1048229,2024,11,4,29,17,68
4,1048249,2024,12,6,1,17,54
...,...,...,...,...,...,...,...
924,1047975,2024,10,0,28,9,270
925,1047953,2024,10,0,28,8,192
926,1046343,2024,10,4,25,13,18
927,1046328,2024,10,4,25,13,24


### Join with location

In [204]:
gdf = df.merge(locations[["ROOT_ID", "geometry",]], on="ROOT_ID", how="left")
gdf = gpd.GeoDataFrame(gdf, geometry='geometry')
print(gdf.isna().sum().sum())
print(gdf.shape)
gdf.head()

0
(929, 8)


Unnamed: 0,ROOT_ID,Year,Month,weekday,Day,Hour,Count,geometry
0,1060335,2024,11,4,29,17,52,POINT (433211.182 4583998.871)
1,1048229,2024,11,4,29,17,70,POINT (433211.182 4583998.871)
2,1060335,2024,11,4,29,17,42,POINT (433211.182 4583998.871)
3,1048229,2024,11,4,29,17,68,POINT (433211.182 4583998.871)
4,1048249,2024,12,6,1,17,54,POINT (432725.306 4586854.582)


## Save output

In [208]:
if SAVE_OUTPUT:
    gdf.to_parquet(f'{OUTPUT_DATA_PATH}/bicizen.parquet')
    locations.to_parquet(f'{OUTPUT_DATA_PATH}/bicizen_stations.parquet')


## Watermark

In [14]:
!python -m pip install watermark --quiet

In [15]:
%load_ext watermark

In [16]:
%watermark

Last updated: 2025-03-21T11:50:06.008930+01:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 25 Model 68 Stepping 1, AuthenticAMD
CPU cores   : 16
Architecture: 64bit



In [17]:
%watermark --iversions

geopandas: 0.13.2
pandas   : 2.0.3
numpy    : 1.24.4



In [18]:
!lsb_release -a

"lsb_release" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
