# Config

In [42]:
# Notebook-wide settings: edit them here, then re-run every cell below.
SAVE_OUTPUT = True        # gate for the parquet export in the "Save output" cell
SIZE_PLOTS = (12, 10)     # not referenced by any cell visible in this notebook
# 402.336 == 1609.344 / 4, i.e. a quarter of a mile expressed in metres.
# NOTE(review): presumably a buffer radius in metres; no visible cell uses it - confirm.
BUFFER_SIZE = 402.336

# Folders holding the raw BiciZen inputs and receiving the processed output
INPUT_DATA_PATH = "../data/raw/bicizen"
OUTPUT_DATA_PATH = "../data/interim/bicizen"

In [4]:
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path


# Load data

In [11]:
# Read the two semicolon-delimited BiciZen CSV files from the raw-data folder.
input_dir = Path(INPUT_DATA_PATH)
counts = pd.read_csv(input_dir / "Counts_BiciZen.csv", sep=";")
locations = pd.read_csv(input_dir / "Station_Data_BiciZen.csv", sep=";")


# Data management

## Explore data

In [40]:
# Station table: discard fully duplicated rows, then report its shape and columns.
# Rebinding the name (rather than inplace=True) keeps the cell safe to re-run.
locations = locations.drop_duplicates()
print(locations.shape, locations.columns, sep="\n")

(55, 16)
Index(['ROOT_ID', 'Ubication', 'Street', 'Direcction', 'Cardinals',
       'Infraestrucutura', 'Codigo Infraestructura', 'Num_carrils',
       'Codi_districte', 'Codi_equip_mesura', 'Equip de mesura', 'Latitud',
       'Longitud', 'X_ETRS89', 'Y_ETRS89', 'geometry'],
      dtype='object')


In [None]:
# Count table: shape, column names and dtypes, followed by the last rows.
print(counts.shape, counts.columns, counts.dtypes, sep="\n")
counts.tail()

(929, 18)
Index(['ROOT_ID', 'TIME_MINUTES', 'Factor', 'Volumen Bicicletas', 'SPOTTED_AT',
       'Date', 'Weekday', 'Time', 'Year', 'Month', 'Day', 'Hour',
       'NUMBER_BICYCLES', 'NUMBER_WOMEN_CYCLIST', 'NUMBER_CHILDREN',
       'NUMBER_SENIORS', 'NUMBER_SCOOTERS', 'NUMBER_VEHICLES'],
      dtype='object')
ROOT_ID                   int64
TIME_MINUTES              int64
Factor                    int64
Volumen Bicicletas        int64
SPOTTED_AT               object
Date                     object
Weekday                  object
Time                     object
Year                      int64
Month                     int64
Day                       int64
Hour                      int64
NUMBER_BICYCLES         float64
NUMBER_WOMEN_CYCLIST    float64
NUMBER_CHILDREN         float64
NUMBER_SENIORS          float64
NUMBER_SCOOTERS         float64
NUMBER_VEHICLES         float64
dtype: object


Unnamed: 0,ROOT_ID,TIME_MINUTES,Factor,Volumen Bicicletas,SPOTTED_AT,Date,Weekday,Time,Year,Month,Day,Hour,NUMBER_BICYCLES,NUMBER_WOMEN_CYCLIST,NUMBER_CHILDREN,NUMBER_SENIORS,NUMBER_SCOOTERS,NUMBER_VEHICLES
924,1047975,10,6,270,2024-10-28 09:03:00,28/10/2024,lunes,09:03:00,2024,10,28,9,45.0,16.0,,4.0,4.0,
925,1047953,10,6,192,2024-10-28 08:23:00,28/10/2024,lunes,08:23:00,2024,10,28,8,32.0,9.0,,,1.0,
926,1046343,10,6,18,2024-10-25 13:47:00,25/10/2024,viernes,13:47:00,2024,10,25,13,3.0,1.0,,,,
927,1046328,10,6,24,2024-10-25 13:04:00,25/10/2024,viernes,13:04:00,2024,10,25,13,4.0,,,,,
928,1046310,10,6,18,2024-10-25 12:39:00,25/10/2024,viernes,12:39:00,2024,10,25,12,3.0,1.0,,,,


In [29]:
# Summary statistics of the numeric columns. The `count` row gives the number of
# non-null values per column, which shows how sparsely the NUMBER_* fields are filled.
counts.describe()

Unnamed: 0,ROOT_ID,TIME_MINUTES,Factor,Volumen Bicicletas,Year,Month,Day,Hour,NUMBER_BICYCLES,NUMBER_WOMEN_CYCLIST,NUMBER_CHILDREN,NUMBER_SENIORS,NUMBER_SCOOTERS,NUMBER_VEHICLES
count,929.0,929.0,929.0,929.0,929.0,929.0,929.0,929.0,906.0,799.0,407.0,438.0,811.0,370.0
mean,1049844.0,10.322928,5.935414,46.557589,2024.0,11.051668,16.155005,15.218515,8.249448,2.846058,0.253071,0.349315,3.086313,0.435135
std,11195.06,2.522127,0.504425,45.792297,0.0,0.333993,8.533509,4.414494,8.717267,3.504037,0.763924,0.788294,3.231779,1.684412
min,747957.0,10.0,2.0,0.0,2024.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1048268.0,10.0,6.0,18.0,2024.0,11.0,8.0,10.0,3.0,1.0,0.0,0.0,1.0,0.0
50%,1048955.0,10.0,6.0,30.0,2024.0,11.0,16.0,17.0,6.0,2.0,0.0,0.0,2.0,0.0
75%,1052805.0,10.0,6.0,60.0,2024.0,11.0,23.0,18.0,11.0,4.0,0.0,0.0,4.0,0.0
max,1061432.0,30.0,6.0,450.0,2024.0,12.0,31.0,22.0,129.0,44.0,11.0,5.0,37.0,16.0


### Select and rename count columns

In [26]:
# Keep only the station id, the date parts and the bicycle volume.
date_parts = ['Year', 'Month', 'Day', 'Hour']
attributes = ['ROOT_ID', *date_parts, 'Volumen Bicicletas']

# Cast the date parts to int and expose the volume under the shorter name 'Count'.
# No NaN filling happens here: astype(int) raises if a date part is missing.
# NOTE(review): in the rows shown by counts.tail(), 'Volumen Bicicletas' equals
# NUMBER_BICYCLES * Factor (e.g. 45 * 6 = 270) - presumably an hourly
# extrapolation of the observed count; confirm with the data provider.
df = (
    counts[attributes]
    .astype({part: int for part in date_parts})
    .rename(columns={'Volumen Bicicletas': 'Count'})
)
df

Unnamed: 0,ROOT_ID,Year,Month,Day,Hour,Count
0,1060335,2024,11,29,17,52
1,1048229,2024,11,29,17,70
2,1060335,2024,11,29,17,42
3,1048229,2024,11,29,17,68
4,1048249,2024,12,1,17,54
...,...,...,...,...,...,...
924,1047975,2024,10,28,9,270
925,1047953,2024,10,28,8,192
926,1046343,2024,10,25,13,18
927,1046328,2024,10,25,13,24


### Join with location

In [41]:
# Attach the station coordinates to every count record. The left join keeps all
# rows of `df`; unmatched ROOT_IDs would show up as NaN coordinates.
# Despite its name, `gdf` is a plain pandas DataFrame (no geometry column is built).
station_coords = locations[["ROOT_ID", "Latitud", "Longitud"]]
gdf = pd.merge(df, station_coords, on="ROOT_ID", how="left")

# Total number of missing cells after the join, then the shape of the joined table.
missing_cells = gdf.isna().sum().sum()
print(missing_cells, gdf.shape, sep="\n")
gdf.head()

0
(929, 8)


Unnamed: 0,ROOT_ID,Year,Month,Day,Hour,Count,Latitud,Longitud
0,1060335,2024,11,29,17,52,41.40476,2.20095
1,1048229,2024,11,29,17,70,41.40476,2.20095
2,1060335,2024,11,29,17,42,41.40476,2.20095
3,1048229,2024,11,29,17,68,41.40476,2.20095
4,1048249,2024,12,1,17,54,41.43044,2.19482


## Save output

In [43]:
if SAVE_OUTPUT:
    # to_parquet does not create missing folders, so make sure the interim
    # directory exists (e.g. on a fresh checkout) before writing.
    output_dir = Path(OUTPUT_DATA_PATH)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Persist `gdf` (counts joined with Latitud/Longitud). Writing `df` here
    # discarded the coordinates produced by the "Join with location" step.
    gdf.to_parquet(output_dir / 'bicizen.parquet')

## Watermark

In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; `!python` resolves to whichever interpreter is first on PATH.
%pip install --quiet watermark

In [None]:
# Register the %watermark magic used by the cells below.
%load_ext watermark

In [None]:
# Print the run timestamp, Python/IPython versions and machine details.
# NOTE(review): the stored output is dated 2024-08-23, earlier than the
# Oct-Dec 2024 records loaded above, so it comes from an older run - re-execute
# this section before sharing the notebook.
%watermark

Last updated: 2024-08-23T15:55:33.641180+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [None]:
# List the versions of the packages imported in this session.
%watermark --iversions

json  : 2.0.9
pandas: 2.1.4
google: 2.0.3
numpy : 1.26.4



In [None]:
# Show the Linux distribution of the host; this shell command only exists on
# Linux systems that ship `lsb_release`.
!lsb_release -a

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.3 LTS
Release:	22.04
Codename:	jammy
