## Config

In [4]:
# parameters
# Where the data lives, relative to this notebook.
INPUT_DATA_PATH = "../data/raw/"
OUTPUT_DATA_PATH = "../data/interim/"

# Years to process. Full range: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
YEARS = [2024]

# Whether outputs should be saved, and (presumably) the figure size for plots.
SAVE_OUTPUT = True
SIZE_PLOTS = (30, 30)

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import papermill as pm

import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Execute notebooks

## 1. Get data as geoparquet

### 1.1 Store station data as parquet

In [6]:
# Execute the station-data notebook once per configured year.
# NOTE: input and output paths are the same file, so every run overwrites the
# notebook in place and only the last year's executed copy remains on disk.
for year in YEARS:
    pm.execute_notebook(
        '011-luca_data-management_station-data.ipynb',
        '011-luca_data-management_station-data.ipynb',
        # Forward the notebook-level flag instead of a hard-coded True so the
        # SAVE_OUTPUT setting in the config cell actually takes effect.
        parameters=dict(YEAR=year, SAVE_OUTPUT=SAVE_OUTPUT),
    )


Executing:   0%|          | 0/22 [00:00<?, ?cell/s]

### 1.2  Store counters data as parquet

In [7]:
# Execute the counters-data notebook once per configured year.
# NOTE: input and output paths are the same file, so every run overwrites the
# notebook in place and only the last year's executed copy remains on disk.
for year in YEARS:
    pm.execute_notebook(
        '011-luca_data-management_counters-data.ipynb',
        '011-luca_data-management_counters-data.ipynb',
        # Forward the notebook-level flag instead of a hard-coded True so the
        # SAVE_OUTPUT setting in the config cell actually takes effect.
        parameters=dict(YEAR=year, SAVE_OUTPUT=SAVE_OUTPUT),
    )

Executing:   0%|          | 0/45 [00:00<?, ?cell/s]

## 2. Clean data

### 2.1 Remove observations where the error is not 0

In [9]:
# Execute the counters-cleaning notebook a single time; the whole YEARS list is
# passed as one parameter. Input and output paths are the same file, so the
# notebook is overwritten in place with its executed copy.
pm.execute_notebook(
    '012-luca_data-cleaning_counters.ipynb',
    '012-luca_data-cleaning_counters.ipynb',
    # Forward the notebook-level flag instead of a hard-coded True so the
    # SAVE_OUTPUT setting in the config cell actually takes effect.
    parameters=dict(YEARS=YEARS, SAVE_OUTPUT=SAVE_OUTPUT),
)

Executing:   0%|          | 0/38 [00:00<?, ?cell/s]

PapermillExecutionError: 
---------------------------------------------------------------------------
Exception encountered at "In [22]":
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[22], line 1
----> 1 stations_data_clean=counters_data_clean[['Id_aforament','Desc_aforament', 'Num_carrils', 'Codi_districte', 'Codi_barri',
      2        'Codi_tipus_equip_mesura', 'Desc_tipus_equip_mesura', 'geometry']]
      3 stations_data_clean=stations_data_clean.drop_duplicates()
      4 print(stations_data_clean.shape)

File ~\anaconda3\envs\ridership_model\Lib\site-packages\geopandas\geodataframe.py:1475, in GeoDataFrame.__getitem__(self, key)
   1469 def __getitem__(self, key):
   1470     """
   1471     If the result is a column containing only 'geometry', return a
   1472     GeoSeries. If it's a DataFrame with any columns of GeometryDtype,
   1473     return a GeoDataFrame.
   1474     """
-> 1475     result = super().__getitem__(key)
   1476     # Custom logic to avoid waiting for pandas GH51895
   1477     # result is not geometry dtype for multi-indexes
   1478     if (
   1479         pd.api.types.is_scalar(key)
   1480         and key == ""
   (...)
   1483         and not is_geometry_type(result)
   1484     ):

File ~\anaconda3\envs\ridership_model\Lib\site-packages\pandas\core\frame.py:3767, in DataFrame.__getitem__(self, key)
   3765     if is_iterator(key):
   3766         key = list(key)
-> 3767     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   3769 # take() does not accept boolean indexers
   3770 if getattr(indexer, "dtype", None) == bool:

File ~\anaconda3\envs\ridership_model\Lib\site-packages\pandas\core\indexes\base.py:5877, in Index._get_indexer_strict(self, key, axis_name)
   5874 else:
   5875     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5877 self._raise_if_missing(keyarr, indexer, axis_name)
   5879 keyarr = self.take(indexer)
   5880 if isinstance(key, Index):
   5881     # GH 42790 - Preserve name from an Index

File ~\anaconda3\envs\ridership_model\Lib\site-packages\pandas\core\indexes\base.py:5941, in Index._raise_if_missing(self, key, indexer, axis_name)
   5938     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   5940 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5941 raise KeyError(f"{not_found} not in index")

KeyError: "['Codi_barri'] not in index"


### 2.2 Remove observations with strange patterns

In [None]:
# Execute the counter-flagging notebook once per configured year.
# NOTE: input and output paths are the same file, so every run overwrites the
# notebook in place and only the last year's executed copy remains on disk.
for year in YEARS:
    pm.execute_notebook(
        '012-luca_flag_counters.ipynb',
        '012-luca_flag_counters.ipynb',
        # Forward the notebook-level flag instead of a hard-coded True so the
        # SAVE_OUTPUT setting in the config cell actually takes effect.
        parameters=dict(YEAR=year, SAVE_OUTPUT=SAVE_OUTPUT),
    )

## 3. Traffic indices

### Calculate traffic indices

In [None]:
# Execute the traffic-indices notebook once per configured year.
# NOTE: input and output paths are the same file, so every run overwrites the
# notebook in place and only the last year's executed copy remains on disk.
for year in YEARS:
    pm.execute_notebook(
        '013-luca_traffic_indices_1.ipynb',
        '013-luca_traffic_indices_1.ipynb',
        # Forward the notebook-level flag instead of a hard-coded True so the
        # SAVE_OUTPUT setting in the config cell actually takes effect.
        parameters=dict(YEAR=year, SAVE_OUTPUT=SAVE_OUTPUT),
    )

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

Executing:   0%|          | 0/35 [00:00<?, ?cell/s]

### Cluster by traffic indices

In [None]:
# Execute the clustering notebook a single time; the whole YEARS list is passed
# as one parameter. Input and output paths are the same file, so the notebook
# is overwritten in place with its executed copy.
# NOTE(review): this file name uses a '02_' prefix while the other notebooks in
# this pipeline use '0NN-'; confirm the name matches the file on disk.
pm.execute_notebook(
    '02_luca_traffic_indices_1.ipynb',
    '02_luca_traffic_indices_1.ipynb',
    # Forward the notebook-level flag instead of a hard-coded True so the
    # SAVE_OUTPUT setting in the config cell actually takes effect.
    parameters=dict(YEARS=YEARS, SAVE_OUTPUT=SAVE_OUTPUT),
)

## 4. Clean explanatory variables data

### Population

In [None]:
# Execute the population notebook once per configured year.
# NOTE: input and output paths are the same file, so every run overwrites the
# notebook in place and only the last year's executed copy remains on disk.
for year in YEARS:
    pm.execute_notebook(
        '001-luca-explore-population.ipynb',
        '001-luca-explore-population.ipynb',
        # Forward the notebook-level flag instead of a hard-coded True so the
        # SAVE_OUTPUT setting in the config cell actually takes effect.
        parameters=dict(YEAR=year, SAVE_OUTPUT=SAVE_OUTPUT),
    )

## Save output

## Watermark

In [None]:
!python -m pip install watermark --quiet

In [None]:
# Load the watermark IPython extension, which provides the %watermark magic
# used in the following cells.
%load_ext watermark

In [None]:
# Print watermark's default report for this run (no options given; see the
# watermark documentation for the exact fields).
%watermark

In [None]:
# Print the versions of the packages imported in this notebook session.
%watermark --iversions

In [None]:
# Report the operating system through the standard library: `lsb_release` is a
# Linux-only command and is unavailable on Windows or macOS.
import platform

print(platform.platform())