## Config

In [23]:
# parameters
SAVE_OUTPUT = True
SIZE_PLOTS = (12,10)
BUFFER_SIZE = 402.336 

#Location of the data
INPUT_DATA_PATH = "../data/raw/bacc"
OUTPUT_DATA_PATH = "../data/interim/bacc"

In [24]:
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path


# Load data

In [25]:
counts=pd.read_parquet(Path(INPUT_DATA_PATH) / "data_bacc.parquet")
counts['Location'] = counts['Location'].apply(lambda x: x.split('_')[1])

In [32]:
counts["Location"].value_counts()

Location
DiagonalAmb5D          64
MeridianaAmbAragó      64
DiagonalAmbGanduxer    64
DiagonalAmbGlories     64
GranVíaAmbLaBordeta    64
AragoAmbRCatalunya     32
Name: count, dtype: int64

# Data management

## Explore data

In [26]:
#Explore counts
print(counts.shape)
print(counts.columns)
print(counts.dtypes)
counts.tail()

(352, 22)
Index(['Location', 'date', 'Procesado video', 'Start time', 'End time',
       'Total vehicles', 'Bicing Mechanical Men', 'Bicing Mechanical Women',
       'Bicing Electric Men', 'Bicing Electric Women', 'Urban Mechanical Men',
       'Urban Mechanical Women', 'Urban Electric Men', 'Urban Electric Women',
       'Scooter Electric Men', 'Scooter Electric Women',
       'Foldable Mechanical Men', 'Foldable Mechanical Women',
       'Foldable Electric Men', 'Foldable Electric Women', 'Cargo-Bike',
       'Others'],
      dtype='object')
Location                     object
date                         object
Procesado video              object
Start time                   object
End time                     object
Total vehicles                int64
Bicing Mechanical Men         int64
Bicing Mechanical Women       int64
Bicing Electric Men           int64
Bicing Electric Women         int64
Urban Mechanical Men          int64
Urban Mechanical Women        int64
Urban Electric Men

Unnamed: 0,Location,date,Procesado video,Start time,End time,Total vehicles,Bicing Mechanical Men,Bicing Mechanical Women,Bicing Electric Men,Bicing Electric Women,...,Urban Electric Men,Urban Electric Women,Scooter Electric Men,Scooter Electric Women,Foldable Mechanical Men,Foldable Mechanical Women,Foldable Electric Men,Foldable Electric Women,Cargo-Bike,Others
347,DiagonalAmbGanduxer,2024-09-26,4,18:30:00,18:45:00,182,1,1,36,5,...,26,5,34,13,7,3,4,7,2,3
348,DiagonalAmbGanduxer,2024-09-26,5,18:45:00,19:00:00,186,1,0,37,13,...,23,7,36,9,9,2,7,3,0,5
349,DiagonalAmbGanduxer,2024-09-26,6,19:00:00,19:15:00,166,1,0,31,14,...,25,7,28,8,2,2,8,3,1,2
350,DiagonalAmbGanduxer,2024-09-26,7,19:15:00,19:30:00,210,1,0,41,20,...,21,1,32,14,9,5,8,6,1,4
351,DiagonalAmbGanduxer,2024-09-26,8,19:30:00,19:45:00,135,0,1,24,10,...,12,3,26,8,4,0,7,1,2,5


## Group or scale it by hour


In [27]:
#Handle temporal data
counts["datetime"] = pd.to_datetime(counts["date"].astype(str) + ' ' + counts["Start time"].astype(str))
counts["end_datetime"] = pd.to_datetime(counts["date"].astype(str) + ' ' + counts["End time"].astype(str))
# Calculate the difference between End time and Start time in minutes
counts["minutes"] = (counts["end_datetime"] - counts["datetime"]).dt.total_seconds() / 60

counts["Hour"] = counts["datetime"].dt.hour
counts[['Location', 'datetime', 'Hour', "minutes"]].head(2)

Unnamed: 0,Location,datetime,Hour,minutes
0,AragoAmbRCatalunya,2023-09-28 07:45:00,7,15.0
1,AragoAmbRCatalunya,2023-09-28 08:00:00,8,15.0


In [28]:
#Group by hour
counts_by_hour = counts.groupby(['Location', 'date', 'Hour'])[['Total vehicles', 'minutes']].sum().reset_index()
counts_by_hour.rename(columns={'Total vehicles': 'Count'}, inplace=True)
print(counts.shape)
print(counts_by_hour.shape)
counts_by_hour.head()

(352, 26)
(132, 5)


Unnamed: 0,Location,date,Hour,Count,minutes
0,AragoAmbRCatalunya,2023-09-07,7,56,15.0
1,AragoAmbRCatalunya,2023-09-07,8,222,60.0
2,AragoAmbRCatalunya,2023-09-07,9,141,45.0
3,AragoAmbRCatalunya,2023-09-07,17,54,15.0
4,AragoAmbRCatalunya,2023-09-07,18,230,60.0


In [29]:
#Make factor
counts_by_hour["Factor"]=60/counts_by_hour["minutes"]
counts_by_hour["Factor"].value_counts()

#Apply factor
counts_by_hour["Count"]=counts_by_hour["Count"]*counts_by_hour["Factor"]
counts_by_hour["Count"].describe()

count    132.000000
mean     505.477273
std      204.591010
min      144.000000
25%      362.000000
50%      483.166667
75%      678.083333
max      978.000000
Name: Count, dtype: float64

In [30]:
test = counts_by_hour.copy()
test = test.groupby('Location').agg({'Count': 'mean'}).reset_index()
test 


Unnamed: 0,Location,Count
0,AragoAmbRCatalunya,229.416667
1,DiagonalAmb5D,505.083333
2,DiagonalAmbGanduxer,780.472222
3,DiagonalAmbGlories,625.027778
4,GranVíaAmbLaBordeta,380.222222
5,MeridianaAmbAragó,374.611111


In [8]:
#Most of the locations are bidirectional, except 
counts_by_hour.loc[counts_by_hour['Location'] != 'DiagonalAmb5D', 'Count'] = counts_by_hour['Count'] / 2
counts_by_hour["Count"].describe()

count    132.000000
mean     298.655303
std      145.955804
min       72.000000
25%      184.000000
50%      304.000000
75%      388.166667
max      696.000000
Name: Count, dtype: float64

## Add location & BiciZen ID

In [9]:
locations = {
    'DiagonalAmb5D': {'lat': 41.396333, 'lon': 2.159083, 'ROOT_ID': 1095689},
    'DiagonalAmbGanduxer': {'lat': 41.391639, 'lon': 2.139417, 'ROOT_ID': 1095690},
    'MeridianaAmbAragó': {'lat': 41.409056, 'lon': 2.186861, 'ROOT_ID': 1095691},
    'DiagonalAmbGlories': {'lat': 41.402722, 'lon': 2.183528, 'ROOT_ID': 1095692},
    'GranVíaAmbLaBordeta': {'lat': 41.373639, 'lon': 2.147028, 'ROOT_ID': 1095693},
    'AragoAmbRCatalunya': {'lat': 41.391139, 'lon': 2.163694, 'ROOT_ID': 1095694}
}

# Join count with locations
locations_df = pd.DataFrame.from_dict(locations, orient='index').reset_index()
locations_df.columns = ['Location', 'Latitud', 'Longitud','ROOT_ID']
df = counts_by_hour.merge(locations_df, on='Location', how='left')

df.columns



Index(['Location', 'date', 'Hour', 'Count', 'minutes', 'Factor', 'Latitud',
       'Longitud', 'ROOT_ID'],
      dtype='object')

### Deal with date and Join with location

In [10]:
# Convert date column to datetime
df["date"] = pd.to_datetime(df["date"])

# Extract Year, Month, Weekday, and Day from date
df["Year"] = df["date"].dt.year
df["Month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.weekday
df["Day"] = df["date"].dt.day



In [11]:
df.columns

Index(['Location', 'date', 'Hour', 'Count', 'minutes', 'Factor', 'Latitud',
       'Longitud', 'ROOT_ID', 'Year', 'Month', 'weekday', 'Day'],
      dtype='object')

In [12]:
df=df[['ROOT_ID', 'Year', 'Month', 'weekday', 'Day', 'Hour', 'Count',
       'Latitud', 'Longitud']]

In [13]:
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitud, df.Latitud))
print(gdf.isna().sum().sum())
print(gdf.shape)
gdf.head()

0
(132, 10)


Unnamed: 0,ROOT_ID,Year,Month,weekday,Day,Hour,Count,Latitud,Longitud,geometry
0,1095694,2023,9,3,7,7,112.0,41.391139,2.163694,POINT (2.16369 41.39114)
1,1095694,2023,9,3,7,8,111.0,41.391139,2.163694,POINT (2.16369 41.39114)
2,1095694,2023,9,3,7,9,94.0,41.391139,2.163694,POINT (2.16369 41.39114)
3,1095694,2023,9,3,7,17,108.0,41.391139,2.163694,POINT (2.16369 41.39114)
4,1095694,2023,9,3,7,18,115.0,41.391139,2.163694,POINT (2.16369 41.39114)


# Station data

In [37]:
stations_details = {
    1095689: {'Desc': 'Diagonal Plaza 5 Oros', 'Direction': 'NE', 'Directionality': 'Unidirectional'},
    1095690: {'Desc': 'Diagonal Amb Ganduxer', 'Direction': 'SW', 'Directionality': 'Bidirectional'},
    1095691: {'Desc': 'Meridiana Amb Aragó', 'Direction': 'N', 'Directionality': 'Bidirectional'},
    1095692: {'Desc': 'Diagonal Amb Glories', 'Direction': 'NE', 'Directionality': 'Bidirectional'},
    1095693: {'Desc': 'Gran Via Amb La Bordeta', 'Direction': 'SW', 'Directionality': 'Bidirectional'},
    1095694: {'Desc': 'Aragó Amb Ronda Catalunya', 'Direction': 'SW', 'Directionality': 'Unidirectional'}
}
stations = pd.DataFrame.from_dict(stations_details, orient='index').reset_index()
#Rename indes to ROOT_ID
stations.rename(columns={'index': 'ROOT_ID'}, inplace=True)

stations=pd.merge(locations_df,stations, on='ROOT_ID', how='inner')
stations


Unnamed: 0,Location,Latitud,Longitud,ROOT_ID,Desc,Direction,Directionality
0,DiagonalAmb5D,41.396333,2.159083,1095689,Diagonal Plaza 5 Oros,NE,Unidirectional
1,DiagonalAmbGanduxer,41.391639,2.139417,1095690,Diagonal Amb Ganduxer,SW,Bidirectional
2,MeridianaAmbAragó,41.409056,2.186861,1095691,Meridiana Amb Aragó,N,Bidirectional
3,DiagonalAmbGlories,41.402722,2.183528,1095692,Diagonal Amb Glories,NE,Bidirectional
4,GranVíaAmbLaBordeta,41.373639,2.147028,1095693,Gran Via Amb La Bordeta,SW,Bidirectional
5,AragoAmbRCatalunya,41.391139,2.163694,1095694,Aragó Amb Ronda Catalunya,SW,Unidirectional


## Save output

In [38]:
if SAVE_OUTPUT:
    gdf.to_parquet(f'{OUTPUT_DATA_PATH}/BACC.parquet')
    stations.to_parquet(f'{OUTPUT_DATA_PATH}/bacc_stations.parquet')

## Watermark

In [139]:
!python -m pip install watermark --quiet

In [140]:
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [141]:
%watermark

Last updated: 2025-03-21T20:50:22.194862+01:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 25 Model 68 Stepping 1, AuthenticAMD
CPU cores   : 16
Architecture: 64bit



In [142]:
%watermark --iversions

geopandas: 0.13.2
pandas   : 2.0.3
numpy    : 1.24.4



In [143]:
!lsb_release -a

"lsb_release" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
