## Config

In [1]:
# parameters
# SHAREDRIVE_PATH = "/content/drive/Shareddrives/Idealista/"
INPUT_DATA_PATH = "../data/interim/"
INPUT_PROCESSED_DATA_PATH = "../data/processed/"

YEAR = 2017
SAVE_OUTPUT = True
OUTPUT_DATA_PATH = "../data/interim/"

In [2]:
# Parameters
YEAR = 2023
SAVE_OUTPUT = True


In [3]:
# import necessary library
import pandas as pd
import geopandas as gpd
import geoarrow.pandas as _
from pathlib import Path
import plotly.express as px
from matplotlib import pyplot as plt

from shapely.geometry import shape
from shapely.ops import nearest_points



In [4]:
#Useful functions
# Define an empty DataFrame to store the summary
summary_df = pd.DataFrame(columns=['DataFrame Name', 'Number of Stations', 'Number of Observations', 'Flagged observations'])

def calculate_station_observation_summary(df,flagged, df_name):
    global summary_df  # Access the global DataFrame
    
    # Calculate the number of stations
    num_stations = df['Id_aforament'].nunique()
    
    # Calculate the number of observations
    num_flagged = flagged.shape[0]
    num_observations = df.shape[0]
    
    # Append the summary information to the DataFrame


    # Inserting the new row
    summary_df.loc[len(summary_df)] = {'DataFrame Name': df_name, 'Number of Stations': num_stations,'Number of Observations': num_observations, 'Flagged observations': num_flagged}
    # Print the summary
    print('DataFrame Name: ',df_name)
    print('Number of stations: ',num_stations)
    print('Number of observations: ',num_observations)
    #if(num_flagged!=0 and num_observations!=0):
    print('Percetage of flagged observations',round((num_flagged/num_observations)*100,4),'%')


def interquantile_range(serie):
    # Calculate quartiles
    Q1 = serie.quantile(0.25)
    Q3 = serie.quantile(0.75)

    # Calculate IQR
    IQR = Q3 - Q1

    # Define thresholds for mild outliers
    mild_lower_bound = Q1 - 1.5 * IQR
    mild_upper_bound = Q3 + 1.5 * IQR

    # Define thresholds for severe outliers
    severe_lower_bound = Q1 - 3 * IQR
    severe_upper_bound = Q3 + 3 * IQR
    print('Mild lower bound:',mild_lower_bound,'Mild upper bound:',mild_upper_bound,'Severe lower bound:',severe_lower_bound,'Severe upper bound:',severe_upper_bound)
    return mild_lower_bound, mild_upper_bound, severe_lower_bound, severe_upper_bound

## Load data

In [5]:
# load data
observations = pd.read_parquet(Path(INPUT_DATA_PATH) /'observations_clean.parquet')

# Data management


In [6]:
observations = observations[observations['date'].dt.year == YEAR]
observations["year"] = observations['date'].dt.year 


In [7]:
# aggregate by hour
observations_number= observations.shape[0]
observations['hour_count'] = 1 
observations['hour'] = observations['hour'].apply(lambda x: x.split(':')[0])
observations = observations.groupby(['Id_aforament','date','hour']).agg({'intensity':'sum',"hour_count":'sum'}).reset_index()
hour_count_4 = observations[observations["hour_count"] == 4].shape[0]
hour_count_3 = observations[observations["hour_count"] == 3].shape[0]
hour_count_2 = observations[observations["hour_count"] == 2].shape[0]
hour_count_1 = observations[observations["hour_count"] == 1].shape[0]
print(observations["hour_count"].value_counts())

observations=observations[observations["hour_count"] >= 2]
observations.drop(columns=["hour_count"],inplace=True)


hour_count
4    2817817
3      29359
2      15974
1       6544
Name: count, dtype: int64


In [8]:
print(observations.shape)
print(observations.columns)
observations.head()
# 393210.25

(2863150, 4)
Index(['Id_aforament', 'date', 'hour', 'intensity'], dtype='object')


Unnamed: 0,Id_aforament,date,hour,intensity
0,20001,2023-01-01,0,147
1,20001,2023-01-01,1,146
2,20001,2023-01-01,2,150
3,20001,2023-01-01,3,141
4,20001,2023-01-01,4,64


## 1: Site-hours with Prolonged Zero Counts

After it, it has been plotted and it has been decided to use **90h** as the threshold. It has been decided to do so after comparing the following tresholds and counts.

### Objective
Flag site-hours with X hours or more of consecutive zeros.

In [9]:
#The data has been 
data1=observations.copy()
print(data1.shape)
data1.head()


(2863150, 4)


Unnamed: 0,Id_aforament,date,hour,intensity
0,20001,2023-01-01,0,147
1,20001,2023-01-01,1,146
2,20001,2023-01-01,2,150
3,20001,2023-01-01,3,141
4,20001,2023-01-01,4,64


In [10]:
def flag_rolling_cumulative_zeros(df, time_column, value_column, site_column, threshold=48):

    # Initialize a flag column to mark rows where the rolling cumulative sum of the previous 25 or more rows has consecutive zeros
    df['flag'] = False

    # Group DataFrame by site and year
    grouped_df = df.groupby([site_column])

    for (site), group_data in grouped_df:
        rolling_cumulative_zeros = group_data[value_column].rolling(window=threshold, min_periods=1).sum()

        # Update the flag column based on the rolling cumulative sum
        df.loc[rolling_cumulative_zeros[rolling_cumulative_zeros == 0].index, 'flag'] = True

    # Return flagged and non-flagged rows
    flagged_rows = df[df['flag'] == True].copy()  # Use copy to avoid SettingWithCopyWarning
    non_flagged_rows = df[df['flag'] == False].copy()  # Use copy to avoid SettingWithCopyWarning

    # Drop the intermediate columns if needed
    df = df.drop([ 'flag'], axis=1)
    #flagged_rows = flagged_rows.drop(['year', 'flag'], axis=1)
    #non_flagged_rows = non_flagged_rows.drop(['year', 'flag'], axis=1)

    return non_flagged_rows,flagged_rows

In [11]:
non_flagged_rows,flagged_rows = flag_rolling_cumulative_zeros(data1, 'timestamp', 'intensity', 'Id_aforament', threshold=90)
# lets calculate how many hours of data flagged
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task1')

DataFrame Name:  Task1
Number of stations:  369
Number of observations:  2828044
Percetage of flagged observations 1.2414 %


In [12]:

print("\n")
# Print the number of flagged and non-flagged rows
print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', data1.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == data1.shape[0])
print("\n")

#Check if the number of unique stations is the same
print( 'The number of unique stations in the flagged rows:', flagged_rows['Id_aforament'].nunique(), 'The number of unique stations in the non-flagged rows:', non_flagged_rows['Id_aforament'].nunique(), 'The number of unique stations in the original data:', data1['Id_aforament'].nunique())
print("\n")

# Print the summary statistics for the 'intensity' column
print(non_flagged_rows['intensity'].describe())
print("\n")

print(flagged_rows['intensity'].describe())



Number of flagged rows: 35106 Number of non-flagged rows: 2828044 Total rows: 2863150 The sum of flagged and non-flagged rows is equal to the total rows: True


The number of unique stations in the flagged rows: 80 The number of unique stations in the non-flagged rows: 369 The number of unique stations in the original data: 369


count    2.828044e+06
mean     1.977756e+02
std      3.766258e+02
min      0.000000e+00
25%      3.600000e+01
50%      1.080000e+02
75%      2.500000e+02
max      1.567500e+04
Name: intensity, dtype: float64




count    35106.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: intensity, dtype: float64



## 2: Site-hours with Repeated Identical Values

### Objective
Flag site-hours with six or more identical hourly values exceeding five.




In [13]:
# we will use the dataframe obtained from previous task for this task
data2=non_flagged_rows.drop([ 'flag'],axis=1).copy()
print(data2.shape)
data2.head()

(2828044, 4)


Unnamed: 0,Id_aforament,date,hour,intensity
0,20001,2023-01-01,0,147
1,20001,2023-01-01,1,146
2,20001,2023-01-01,2,150
3,20001,2023-01-01,3,141
4,20001,2023-01-01,4,64


In [14]:
# Function to flag 6 repeated consecutive identical values greater than 10 by each site
def flag_values_equal_consecutive_values(df, hours, value):
    df['flagged'] = False

    for site in df['Id_aforament'].unique():
        site_data = df[df['Id_aforament'] == site]
        identical = (site_data['intensity'] == site_data['intensity'].shift(1)) & (site_data['intensity'] > value)
        consecutive_counts = identical.rolling(window=hours, min_periods=hours).sum() == hours
        
        # Find the indices to flag
        flag_indices = site_data[consecutive_counts].index

        # Flag all positions in the window of 'hours' before and including the flagged positions
        for idx in flag_indices:
            df.loc[idx-hours+1:idx, 'flagged'] = True

    flagged_rows = df[df['flagged'] == True].copy()
    non_flagged_rows = df[df['flagged'] == False].copy()

    df.drop(['flagged'], axis=1, inplace=True)

    return non_flagged_rows,flagged_rows

In [15]:
non_flagged_rows,flagged_rows= flag_values_equal_consecutive_values(data2.copy(), 6, 5)
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task2')


DataFrame Name:  Task2
Number of stations:  369
Number of observations:  2827784
Percetage of flagged observations 0.0092 %


## 3: Site-days with Time Discrepancies

### Objective
Examine site-days where 3 a.m. counts surpass counts at 3 p.m., provided the 3 p.m. count exceeds 2.



In [16]:
# we will use the dataframe obtained from previous task for this task
data3=non_flagged_rows.drop(['flagged'],axis=1).copy()
print(data3.shape)
data3.head()

(2827784, 4)


Unnamed: 0,Id_aforament,date,hour,intensity
0,20001,2023-01-01,0,147
1,20001,2023-01-01,1,146
2,20001,2023-01-01,2,150
3,20001,2023-01-01,3,141
4,20001,2023-01-01,4,64


In [17]:
# Function to flag 6 repeated consecutive identical values greater than 10 by each site
def flag_values_3am_3pm(df):
    # Filter the DataFrame to keep only rows where the time is '3:00 AM'
    df_3am = df[df['hour'] == 3]

    # Filter the DataFrame to keep only rows where the time is '3:00 PM' and the count is greater than 2
    df_3pm = df[(df['hour'] == 15) & (df['intensity'] > 2)]

    # Merge the filtered DataFrames on 'day', 'month', 'year', and 'site'
    merged_data = pd.merge(df_3am, df_3pm, on=['Id_aforament',  'date'], suffixes=('_3am', '_3pm'))

    # Filter the merged DataFrame to keep only rows where the count at 3 AM is greater than the count at 3 PM
    flagged_rows = merged_data[merged_data['intensity_3am'] > merged_data['intensity_3pm']]

    # Get the date of the flagges rows
    flagged_sites=flagged_rows[['Id_aforament', 'date']].copy()
    flagged_sites['flagged']=True

    # Merge the flagged sites with the original DataFrame
    results=pd.merge(df,flagged_sites, on=['Id_aforament', 'date'], how='outer')
    results['flagged'] = results['flagged'].fillna(False)

    # Return the flagged rows
    flagged_rows = results[results['flagged'] == True].copy()

    # Return the non-flagged rows
    non_flagged_rows = results[results['flagged'] == False].copy()
    
    return non_flagged_rows,flagged_rows



In [18]:
non_flagged_rows,flagged_rows = flag_values_3am_3pm(data3.copy())
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task3')

DataFrame Name:  Task3
Number of stations:  369
Number of observations:  2827784
Percetage of flagged observations 0.0 %


## 4: Unusual Hourly Count Discrepancies

### Objective
Flag site hours with an hourly count at least 10 times larger than the count of the previous hour, provided the previous hour's count exceeds 15.


In [19]:
data4=non_flagged_rows.drop(['flagged'],axis=1).copy()
print(data4.shape)
data4.head()

(2827784, 4)


Unnamed: 0,Id_aforament,date,hour,intensity
0,20001,2023-01-01,0,147
1,20001,2023-01-01,1,146
2,20001,2023-01-01,2,150
3,20001,2023-01-01,3,141
4,20001,2023-01-01,4,64


In [20]:
def flag_unusual_increases(df):

    # Sort the DataFrame by 'Id_aforament', 'date'', and 'hour'
    df.sort_values(by=['Id_aforament','date', 'hour'], inplace=True)
    # Calculate the previous hour's count
    df['prev_hour_count'] = df.groupby(['Id_aforament','date', 'hour'])['intensity'].shift(1)

    # Flag the rows where one hourly count is at least ten times larger than the previous hour’s count
    df['flagged'] = (df['intensity'] >= 10 * df['prev_hour_count']) & (df['prev_hour_count'] > 15)

    # Display the flagged rows
    flagged_rows = df.loc[df['flagged']==True]
    non_flagged_rows = df.loc[df['flagged']==False]
    print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
    return non_flagged_rows,flagged_rows


In [21]:
non_flagged_rows,flagged_rows = flag_unusual_increases(data4.copy())
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task4')


Number of flagged rows: 0 Number of non-flagged rows: 2827784 Total rows: 2827784 The sum of flagged and non-flagged rows is equal to the total rows: True
DataFrame Name:  Task4
Number of stations:  369
Number of observations:  2827784
Percetage of flagged observations 0.0 %


## 5: Threshold for High Hourly Counts

### Objective
Flag site-hourly counts exceeding X bicyclists.

We evaluated the upper bounds using the elbow method and determined 885 as a reasonable threshold. The lower bounds were not considered due to negative values.


In [22]:
data5=non_flagged_rows.drop(['flagged'],axis=1).copy()
print(data5.shape)
data5.head()
mild_lower_bound, mild_upper_bound, severe_lower_bound, severe_upper_bound=interquantile_range(data5['intensity'])

(2827784, 5)
Mild lower bound: -285.0 Mild upper bound: 571.0 Severe lower bound: -606.0 Severe upper bound: 892.0


In [23]:
def flag_outliers_hour(df,treshold):
    # Flag the rows where the intensity is greater than or equal to the threshold
    flagged_rows = df.loc[(df['intensity'] >= treshold)]
    non_flagged_rows = df.loc[ (df['intensity'] < treshold)]

    print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
    
    return non_flagged_rows,flagged_rows 

In [24]:
non_flagged_rows,flagged_rows = flag_outliers_hour(data5.copy(),885)
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task5')


Number of flagged rows: 55846 Number of non-flagged rows: 2771938 Total rows: 2827784 The sum of flagged and non-flagged rows is equal to the total rows: True
DataFrame Name:  Task5
Number of stations:  369
Number of observations:  2771938
Percetage of flagged observations 2.0147 %


## 6: Threshold for High Daily Counts

### Objective
Flag site days with total counts exceeding a specified number of bicyclists.

The severe upper bound was discarded due to being higher than the maximum value. The mild upper bound was evaluated, and 11225 was determined as a good threshold.

In [25]:
data6=non_flagged_rows.copy()
print(data6.shape)
data6.head()

df_day=data6.groupby(['Id_aforament', 'date'])['intensity'].sum().reset_index()
df_day=df_day.rename(columns={'intensity':'Daily_Counts'})
mild_lower_bound, mild_upper_bound, severe_lower_bound, severe_upper_bound=interquantile_range(df_day['Daily_Counts'])

(2771938, 5)


Mild lower bound: -4118.0 Mild upper bound: 11202.0 Severe lower bound: -9863.0 Severe upper bound: 16947.0


In [26]:
def flag_outliers_day(df, treshold):
    df_day=df.groupby(['Id_aforament', 'date'])['intensity'].sum().reset_index()
    df_day=df_day.rename(columns={'intensity':'Daily_Counts'})


    df=pd.merge(df, df_day, on=['Id_aforament', 'date'], how='inner')

    flagged_rows = df.loc[(df['Daily_Counts'] >= treshold)]
    non_flagged_rows = df.loc[(df['Daily_Counts'] < treshold)]


    print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
    return non_flagged_rows,flagged_rows 


In [27]:
non_flagged_rows,flagged_rows =flag_outliers_day(data6.copy(),11225.625 )
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task6')

Number of flagged rows: 13886 Number of non-flagged rows: 2758052 Total rows: 2771938 The sum of flagged and non-flagged rows is equal to the total rows: True
DataFrame Name:  Task6
Number of stations:  369
Number of observations:  2758052
Percetage of flagged observations 0.5035 %


## 7: Insufficient Daily Data

### Objective
Exclude site-daily counts collected over less than 22 valid hours.



In [28]:
data7=non_flagged_rows
print(data7.shape)
data7.head()

(2758052, 6)


Unnamed: 0,Id_aforament,date,hour,intensity,prev_hour_count,Daily_Counts
0,20001,2023-01-01,0,147,,4633
1,20001,2023-01-01,1,146,,4633
2,20001,2023-01-01,2,150,,4633
3,20001,2023-01-01,3,141,,4633
4,20001,2023-01-01,4,64,,4633


In [29]:
def flag_insuficient_daily_data(df):
    df['duration']=1
    df_day = df.groupby(['Id_aforament', 'date'])['duration'].sum().reset_index()
    df_day=df_day.rename(columns={'duration':'Number_of_hrs_data_collected'})
    print(df_day['Number_of_hrs_data_collected'].describe())

    df=pd.merge(df, df_day, on=['Id_aforament', 'date'], how='inner')

    flagged_rows = df.loc[(df['Number_of_hrs_data_collected'] <= 23)]
    non_flagged_rows = df.loc[(df['Number_of_hrs_data_collected'] > 23)]
    print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
    return non_flagged_rows,flagged_rows 


In [30]:
non_flagged_rows,flagged_rows = flag_insuficient_daily_data(data7.copy())
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task7')

count    119209.000000
mean         23.136273
std           2.961448
min           1.000000
25%          24.000000
50%          24.000000
75%          24.000000
max          24.000000
Name: Number_of_hrs_data_collected, dtype: float64


Number of flagged rows: 310340 Number of non-flagged rows: 2447712 Total rows: 2758052 The sum of flagged and non-flagged rows is equal to the total rows: True
DataFrame Name:  Task7
Number of stations:  369
Number of observations:  2447712
Percetage of flagged observations 12.6788 %


## 8: Inadequate Monthly Coverage

### Objective
Flag site months with valid days totaling less than three weeks (21 days out of 31).

No changes have been made to the criteria.

### Flagging Data
- Percentage of data flagged: 6.05%
- DataFrame Name: Task8
- Number of stations: 352
- Number of observations: 2,501,544
- Flagged observations: 151,416
- Deleted: 6.05%

In [31]:
data8=non_flagged_rows
print(data8.shape)
data8.head()

(2447712, 8)


Unnamed: 0,Id_aforament,date,hour,intensity,prev_hour_count,Daily_Counts,duration,Number_of_hrs_data_collected
0,20001,2023-01-01,0,147,,4633,1,24
1,20001,2023-01-01,1,146,,4633,1,24
2,20001,2023-01-01,2,150,,4633,1,24
3,20001,2023-01-01,3,141,,4633,1,24
4,20001,2023-01-01,4,64,,4633,1,24


In [32]:
def flag_insuficient_monthly_data(df):
    df['day']=df['date'].dt.day
    df['month']=df['date'].dt.month

    df_day = df.groupby(['Id_aforament', 'month', 'day'])['duration'].sum().reset_index()
    df_month = df_day.groupby(['Id_aforament', 'month'])['day'].count().reset_index()
    df_month=df_month.rename(columns={'day':'Number_of_days_data_collected'})


    df=pd.merge(df, df_month, on=['Id_aforament', 'month'], how='inner')

    flagged_rows = df.loc[(df['Number_of_days_data_collected'] <= 21)]
    non_flagged_rows = df.loc[(df['Number_of_days_data_collected'] > 21)]
    print('Number of flagged rows:', flagged_rows.shape[0], 'Number of non-flagged rows:', non_flagged_rows.shape[0], 'Total rows:', df.shape[0], 'The sum of flagged and non-flagged rows is equal to the total rows:', flagged_rows.shape[0] + non_flagged_rows.shape[0] == df.shape[0])
    return non_flagged_rows,flagged_rows 


In [33]:
non_flagged_rows,flagged_rows=flag_insuficient_monthly_data(data8)
calculate_station_observation_summary(non_flagged_rows,flagged_rows,'Task8')

Number of flagged rows: 180576 Number of non-flagged rows: 2267136 Total rows: 2447712 The sum of flagged and non-flagged rows is equal to the total rows: True
DataFrame Name:  Task8
Number of stations:  347
Number of observations:  2267136
Percetage of flagged observations 7.9649 %


## Final results

In [34]:
summary_df["Deleted"]=round((summary_df["Flagged observations"]/summary_df["Number of Observations"])*100,2)
summary_df

Unnamed: 0,DataFrame Name,Number of Stations,Number of Observations,Flagged observations,Deleted
0,Task1,369,2828044,35106,1.24
1,Task2,369,2827784,260,0.01
2,Task3,369,2827784,0,0.0
3,Task4,369,2827784,0,0.0
4,Task5,369,2771938,55846,2.01
5,Task6,369,2758052,13886,0.5
6,Task7,369,2447712,310340,12.68
7,Task8,347,2267136,180576,7.96


# Calculations
* DBT (Daily Bicycle Traffic): This is the count of bicycles passing a point in a single day.
* ADBT (Average Daily Bicycle Traffic): This is the average number of bicycles passing a point per day over a specified period.

In [35]:
data10=non_flagged_rows
non_flagged_rows.head()



Unnamed: 0,Id_aforament,date,hour,intensity,prev_hour_count,Daily_Counts,duration,Number_of_hrs_data_collected,day,month,Number_of_days_data_collected
0,20001,2023-01-01,0,147,,4633,1,24,1,1,28
1,20001,2023-01-01,1,146,,4633,1,24,1,1,28
2,20001,2023-01-01,2,150,,4633,1,24,1,1,28
3,20001,2023-01-01,3,141,,4633,1,24,1,1,28
4,20001,2023-01-01,4,64,,4633,1,24,1,1,28


### DBT

In [36]:
DBT = data10.groupby(['Id_aforament', 'date'])['intensity'].sum().reset_index()
DBT=DBT.rename(columns={'intensity':'DBT'})
DBT.head()
#gDBT.explore()

Unnamed: 0,Id_aforament,date,DBT
0,20001,2023-01-01,4633
1,20001,2023-01-02,6969
2,20001,2023-01-03,7848
3,20001,2023-01-04,8792
4,20001,2023-01-05,9068


### AADBT


In [37]:
AADBT=DBT.groupby(['Id_aforament']).agg({'DBT':'mean',"date":'count'}).reset_index()
AADBT=AADBT.rename(columns={'DBT':'AADBT','date':'Number_of_days'})
print(AADBT.shape)
AADBT=AADBT[AADBT['Number_of_days']>=120]
print(AADBT.shape)
AADBT.head()

(347, 3)
(306, 3)


Unnamed: 0,Id_aforament,AADBT,Number_of_days
0,20001,7577.685976,328
2,20003,2700.275362,138
3,20005,1415.30597,268
4,20006,3479.024768,323
5,20007,4406.002976,336


# Save output

In [38]:
if SAVE_OUTPUT:
    non_flagged_rows.to_parquet(f'{OUTPUT_DATA_PATH}/cleaned_data{YEAR}.parquet', index=False)
    AADBT.to_parquet(f'{OUTPUT_DATA_PATH}/flagged/AADBT{YEAR}.parquet')
    DBT.to_parquet(f'{OUTPUT_DATA_PATH}/flagged/DBT{YEAR}.parquet')



In [39]:
# Define the new row data
new_row = {
    'Year': YEAR,
    'Rows': observations_number,
    'Hour_count_4': hour_count_4,
    'Hour_count_3': hour_count_3,
    'Hour_count_2': hour_count_2,
    'Hour_count_1': hour_count_1,
    'Valid data:' :hour_count_4+hour_count_3+hour_count_2,
    'Task1': data2.shape[0],
    'Task2': data3.shape[0],
    'Task3': data4.shape[0],
    'Task4': data5.shape[0],
    'Task5': data6.shape[0],
    'Task6': data7.shape[0],
    'Task7': data8.shape[0],
    'Task8': data10.shape[0],
    'Total': data10.shape[0],
    'DBT': DBT.shape[0],
    'AADBT': AADBT.shape[0]
    

}


# Convert the new row data to a DataFrame
new_row_df = pd.DataFrame([new_row])

# Define the path to the CSV file
csv_file_path = f"{INPUT_PROCESSED_DATA_PATH}/log_flagging.csv"

# Check if the CSV file exists
if Path(csv_file_path).is_file():
    # If the file exists, append the new row
    new_row_df.to_csv(csv_file_path, mode='a', header=False, index=False)
else:
    # If the file does not exist, create it with the new row
    new_row_df.to_csv(csv_file_path, mode='w', header=True, index=False)

## Watermark

In [40]:
!python -m pip install watermark --quiet

In [41]:
%load_ext watermark

In [42]:
%watermark

Last updated: 2024-11-02T14:54:02.593390+01:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 25 Model 68 Stepping 1, AuthenticAMD
CPU cores   : 16
Architecture: 64bit



In [43]:
%watermark --iversions

plotly    : 5.22.0
matplotlib: 3.8.4
pandas    : 2.0.3
geopandas : 0.13.2



In [44]:
!lsb_release -a

"lsb_release" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
