## Config

In [28]:
# parameters
SAVE_OUTPUT = True
SIZE_PLOTS = (12,10)
YEAR = 2021

#Location of the data
INPUT_DATA_PATH = "../data/raw/"
INPUT_PROCESSED_DATA_PATH = "../data/processed/"
OUTPUT_DATA_PATH = "../data/interim/"


In [29]:
import numpy as np
import pandas as pd
import geopandas as gpd

from pathlib import Path
import json

## Load data

In [30]:

if YEAR != 2023:
    colnames=['id', 'date', 'weekday', 'hour', 'intensity', 'error'] 
    counters_data = pd.read_csv(f"{INPUT_DATA_PATH}/{YEAR}.csv", sep=";",names=colnames, header=None)
else: 
    counters_data = pd.read_csv(f"{INPUT_DATA_PATH}/{YEAR}.csv", sep=";")

## Data management

In [31]:
print(counters_data.shape)
print(counters_data.columns)
counters_data.head()

(9156288, 6)
Index(['id', 'date', 'weekday', 'hour', 'intensity', 'error'], dtype='object')


Unnamed: 0,id,date,weekday,hour,intensity,error
0,20001,2021-01-01,Domingo y Festivos,00:00:00.0000000,7,0.0
1,20001,2021-01-01,Domingo y Festivos,00:15:00.0000000,19,0.0
2,20001,2021-01-01,Domingo y Festivos,00:30:00.0000000,27,0.0
3,20001,2021-01-01,Domingo y Festivos,00:45:00.0000000,69,0.0
4,20001,2021-01-01,Domingo y Festivos,01:00:00.0000000,42,0.0


In [32]:
counters_data.dtypes

id             int64
date          object
weekday       object
hour          object
intensity      int64
error        float64
dtype: object

### Duplicates

In [33]:
counters_data = counters_data[~counters_data[["id", "date", "hour"]].duplicated()]
counters_data.shape

(9156288, 6)

In [34]:
len(counters_data["id"].unique())

268

### Errors

In [35]:
counters_data_clean=counters_data
# print(counters_data["error"].value_counts(dropna=False))
# errors=counters_data[(counters_data["error"]!=0.0)&(counters_data["intensity"].notna())].shape[0]/counters_data.shape[0]

# print(f"Percentage of observations where the couter was malfunctioning: {round(errors,3)}%")

In [36]:
# counters_data_clean=counters_data[(counters_data["error"]==0.0)|(counters_data["intensity"].isna())].copy()
# counters_data_clean=counters_data_clean.drop(columns=["error"])


### Get weekday

In [37]:
counters_data_clean['date'] = pd.to_datetime(counters_data_clean['date'])
counters_data_clean["weekday"]=counters_data_clean["date"].dt.day_name()
counters_data_clean["weekday"].value_counts()

weekday
Friday       1329792
Thursday     1306176
Monday       1305984
Saturday     1304064
Sunday       1304064
Tuesday      1303104
Wednesday    1303104
Name: count, dtype: int64

## Estimate DBT, MBT, AADBT


### DBT


In [38]:
def flag_insuficient_daily_data(df):
    df['duration']=0.25
    df_day = df.groupby(['id', 'date'])['duration'].sum().reset_index()
    df_day=df_day.rename(columns={'duration':'Number_of_hrs_data_collected'})
    print(df_day['Number_of_hrs_data_collected'].describe())

    df=pd.merge(df, df_day, on=['id', 'date'], how='inner')

    flagged_rows = df.loc[(df['Number_of_hrs_data_collected'] <= 23)]
    non_flagged_rows = df.loc[(df['Number_of_hrs_data_collected'] > 23)]
    print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
    return non_flagged_rows,flagged_rows 


In [39]:
non_flagged_rows,flagged_quarters = flag_insuficient_daily_data(counters_data_clean.copy())


count    95378.0
mean        24.0
std          0.0
min         24.0
25%         24.0
50%         24.0
75%         24.0
max         24.0
Name: Number_of_hrs_data_collected, dtype: float64
Number of flagged rows: 0 Number of non-flagged rows: 9156288 Total rows: 9156288 The sum of flagged and non-flagged rows is equal to the total rows: True


In [40]:
DBT = non_flagged_rows.groupby(['id', 'date','weekday'])['intensity'].sum().reset_index()
DBT=DBT.rename(columns={'intensity':'DBT'})
DBT.head()

Unnamed: 0,id,date,weekday,DBT
0,20001,2021-01-01,Friday,2927
1,20001,2021-01-02,Saturday,4825
2,20001,2021-01-03,Sunday,4381
3,20001,2021-01-04,Monday,5759
4,20001,2021-01-05,Tuesday,6378


### MBT

In [41]:
# def flag_insuficient_monthly_data(df):
#     df['duration']=1

#     df['day']=df['date'].dt.day
#     df['month']=df['date'].dt.month
#     df_day = df.groupby(['id', 'month', 'day'])['duration'].sum().reset_index()
#     df_month = df_day.groupby(['id', 'month'])['day'].count().reset_index()
#     df_month=df_month.rename(columns={'day':'Number_of_days_data_collected'})


#     df=pd.merge(df, df_month, on=['id', 'month'], how='inner')

#     flagged_rows = df.loc[(df['Number_of_days_data_collected'] <= 25)]
#     non_flagged_rows = df.loc[(df['Number_of_days_data_collected'] > 25)]
#     print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
#     return non_flagged_rows,flagged_rows 


In [42]:
# non_flagged_rows,flagged_days=flag_insuficient_monthly_data(DBT)


In [43]:
# MBT = non_flagged_rows.groupby(['id','month'])['DBT'].sum().reset_index()
# MBT=MBT.rename(columns={'DBT':'MBT'})
# print(MBT.shape)
# MBT.head()


### ABT

In [44]:
# def flag_insuficient_annual_data(df):
#     df['duration']=1

#     df_year = df.groupby(['id'])['duration'].sum().reset_index()
#     df_year=df_year.rename(columns={'duration':'Number_of_days_data_collected'})


#     df=pd.merge(df, df_year, on=['id'], how='inner')

#     flagged_rows = df.loc[(df['Number_of_days_data_collected'] <= 11)]
#     non_flagged_rows = df.loc[(df['Number_of_days_data_collected'] > 11)]
#     print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
#     return non_flagged_rows,flagged_rows 


In [45]:
# non_flagged_rows,flagged_months=flag_insuficient_annual_data(MBT)


In [46]:
# ABT = non_flagged_rows.groupby(['id'])['MBT'].sum().reset_index()
# ABT=ABT.rename(columns={'MBT':'ABT'})
# print(ABT.shape)
# ABT.head()


### AADBT

In [47]:
AADBT=DBT.groupby(['id'])['DBT'].mean().round().reset_index()
AADBT=AADBT.rename(columns={'DBT':'AADBT'})
AADBT.head()

Unnamed: 0,id,AADBT
0,20001,7663.0
1,20002,10436.0
2,20003,8714.0
3,20005,2345.0
4,20006,1084.0


## Save output

In [48]:
if SAVE_OUTPUT:
    counters_data_clean.to_parquet(f'{OUTPUT_DATA_PATH}/counters_data{YEAR}.parquet')
    # DBT.to_parquet(f'{OUTPUT_DATA_PATH}/DBT{YEAR}.parquet')
    # MBT.to_parquet(f'{OUTPUT_DATA_PATH}/MBT{YEAR}.parquet')
    # ABT.to_parquet(f'{OUTPUT_DATA_PATH}/ABT{YEAR}.parquet')
    AADBT.to_parquet(f'{OUTPUT_DATA_PATH}/AADBT{YEAR}.parquet')
    

In [49]:
# Define the new row data
new_row = {
    'Year': YEAR,
    'Columns': counters_data.shape[1],
    'Rows': counters_data.shape[0],
    'Rows after cleaning': counters_data_clean.shape[0],
    'Deleted rows':  counters_data.shape[0]-counters_data_clean.shape[0],
    # 'Percentage': errors,
    'Bike Counters': counters_data_clean['id'].nunique(),
    'Flagged hours': flagged_quarters.shape[0]/4,
    # 'Flagged days': flagged_days.shape[0],
    # 'Flagged months': flagged_months.shape[0],
}

# Convert the new row data to a DataFrame
new_row_df = pd.DataFrame([new_row])

# Define the path to the CSV file
csv_file_path = f"{INPUT_PROCESSED_DATA_PATH}/log_observations.csv"

# Check if the CSV file exists
if Path(csv_file_path).is_file():
    # If the file exists, append the new row
    new_row_df.to_csv(csv_file_path, mode='a', header=False, index=False)
else:
    # If the file does not exist, create it with the new row
    new_row_df.to_csv(csv_file_path, mode='w', header=True, index=False)

## Watermark

In [50]:
!python -m pip install watermark --quiet

In [51]:
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [52]:
%watermark

Last updated: 2024-10-20T23:34:39.580835+02:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 25 Model 68 Stepping 1, AuthenticAMD
CPU cores   : 16
Architecture: 64bit



In [53]:
%watermark --iversions

pandas   : 2.0.3
json     : 2.0.9
numpy    : 1.24.4
geopandas: 0.13.2



In [54]:
!lsb_release -a

"lsb_release" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
