## Config

In [1]:
# Default parameters for this notebook.
# Folder that holds the cleaned hourly observations (cleaned_data{YEAR}.parquet)
INPUT_DATA_PATH = "../data/interim/"
# NOTE(review): not referenced by any cell shown in this notebook
INPUT_PROCESSED_DATA_PATH = "../data/processed/"

# Year of observations to process; also written to the 'year' column of the output
YEAR = 2017
# When True, the computed indices are written to indices.csv
SAVE_OUTPUT = True
# Folder where indices.csv is created or extended
OUTPUT_DATA_PATH = "../data/interim/"


In [2]:
# Parameters
# These assignments run after the defaults above and replace YEAR and SAVE_OUTPUT.
# NOTE(review): looks like a cell injected by a parameterised run (e.g. papermill) - confirm
YEAR = 2024
SAVE_OUTPUT = True


In [3]:
# Import all necessary libraries
import time
# Wall-clock timestamp taken when this cell runs; it is not read in any cell shown here
start = time.time()
import geopandas as gpd
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
import warnings
from pathlib import Path


# Load data

In [4]:
# Load the cleaned hourly observations for the configured year
counters_hour = pd.read_parquet(Path(INPUT_DATA_PATH).joinpath(f'cleaned_data{YEAR}.parquet'))

# Quick look at the data: size, column names and the first rows
print(counters_hour.shape, counters_hour.columns, sep="\n")
counters_hour.head()

(2484288, 11)
Index(['Id_aforament', 'date', 'hour', 'intensity', 'prev_hour_count',
       'Daily_Counts', 'duration', 'Number_of_hrs_data_collected', 'day',
       'month', 'Number_of_days_data_collected'],
      dtype='object')


Unnamed: 0,Id_aforament,date,hour,intensity,prev_hour_count,Daily_Counts,duration,Number_of_hrs_data_collected,day,month,Number_of_days_data_collected
0,20001,2024-01-01,0,126.0,,3461.0,1,24,1,1,25
1,20001,2024-01-01,1,138.0,,3461.0,1,24,1,1,25
2,20001,2024-01-01,2,103.0,,3461.0,1,24,1,1,25
3,20001,2024-01-01,3,95.0,,3461.0,1,24,1,1,25
4,20001,2024-01-01,4,76.0,,3461.0,1,24,1,1,25


## Fix formats

In [5]:
# Inspect the column dtypes of the loaded frame
counters_hour.dtypes

Id_aforament                              int32
date                             datetime64[ns]
hour                                     object
intensity                               float64
prev_hour_count                         float64
Daily_Counts                            float64
duration                                  int64
Number_of_hrs_data_collected              int64
day                                       int32
month                                     int32
Number_of_days_data_collected             int64
dtype: object

In [6]:
# Normalise names and dtypes in one pass:
#   - the counter identifier column becomes 'id'
#   - 'intensity' and 'hour' are cast to integers
#   - 'date' is parsed as datetime, then 'weekday' and 'year' are derived from it
counters_hour = (
    counters_hour
    .rename(columns={'Id_aforament': 'id'})
    .astype({"intensity": int, "hour": int})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        weekday=lambda frame: frame["date"].dt.weekday,
        year=lambda frame: frame["date"].dt.year,
    )
)

In [7]:
# Columns required by the index calculations below
essential_attributes = ['id', 'date', 'weekday', 'hour', 'intensity', 'month', 'day']

# Independent copy restricted to those columns
data_indices = counters_hour.loc[:, essential_attributes].copy()

# Unique counter ids.
# NOTE(review): `indices` is reassigned in the "Summary Indices" cell before it is read
# in any cell shown here - confirm whether this Series is still needed.
indices = counters_hour["id"].drop_duplicates().reset_index(drop=True)

In [8]:
# Inspect the column dtypes of the reduced frame
data_indices.dtypes

id                    int32
date         datetime64[ns]
weekday               int32
hour                  int32
intensity             int32
month                 int32
day                   int32
dtype: object

# Traffic Indices Of Permanent Counters


## Calculate Bicycle Traffic Indices

### WWI

The weekend/weekday index (WWI) is the ratio of the average weekend daily traffic to a particular
site's average weekday daily traffic.

In [9]:
def calculate_wwi(df):
    """Compute the weekend/weekday index (WWI) for every counter.

    WWI is the mean daily traffic on weekend days divided by the mean daily
    traffic on weekdays, rounded to 2 decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly observations with the columns 'id', 'date', 'weekday'
        (Monday=0 ... Sunday=6) and 'intensity'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per counter with the columns 'id' and 'WWI'. A counter with no
        weekend observations gets 0; a counter with weekend but no weekday
        observations gets inf (division by a zero-filled mean).
    """
    # Build the weekend flag on a working copy so the caller's frame does not
    # gain an 'is_weekend' column as a side effect.
    flagged = df[['id', 'date', 'intensity']].assign(
        is_weekend=df['weekday'].isin([5, 6])  # Saturday=5, Sunday=6
    )

    # Total traffic per counter and day, then the mean per counter and day type
    daily_traffic = flagged.groupby(['id', 'date', 'is_weekend'])['intensity'].sum().reset_index()
    avg_traffic = daily_traffic.groupby(['id', 'is_weekend'])['intensity'].mean().unstack()

    # reindex guarantees both the weekday (False) and weekend (True) columns exist,
    # even when the data holds only one kind of day.
    avg_traffic = (
        avg_traffic
        .reindex(columns=[False, True])
        .rename(columns={True: 'weekend_avg', False: 'weekday_avg'})
        .fillna(0)
    )

    avg_traffic['WWI'] = (avg_traffic['weekend_avg'] / avg_traffic['weekday_avg']).round(2)

    indices = avg_traffic.reset_index()[['id', 'WWI']]
    return indices.reset_index(drop=True)

# Compute the WWI per counter, then show the table size and the distribution of the index
WWI_indices = calculate_wwi(data_indices)
print(WWI_indices.shape, WWI_indices["WWI"].describe(), sep="\n")


(364, 2)
count    364.000000
mean       0.762555
std        0.110293
min        0.410000
25%        0.700000
50%        0.760000
75%        0.820000
max        1.190000
Name: WWI, dtype: float64


### AMI


The AM to midday index (AMI) is the ratio of the average weekday volume observed between 7 and 9 am to the average weekday volume between 11 am and 1 pm. 

In [10]:
def calculate_ami(df, am_hours=(7, 9), midday_hours=(11, 13)):
    """Compute the AM-to-midday index (AMI) for every counter.

    AMI is the mean weekday volume in the AM window divided by the mean weekday
    volume in the midday window, rounded to 2 decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly observations with the columns 'id', 'date', 'weekday'
        (Monday=0 ... Sunday=6), 'hour' and 'intensity'.
    am_hours : tuple of (int, int), optional
        First and last 'hour' value of the AM window. Both bounds are
        inclusive, so the default selects the hour values 7, 8 and 9.
    midday_hours : tuple of (int, int), optional
        First and last 'hour' value of the midday window, inclusive
        (default: hour values 11, 12 and 13).

    Returns
    -------
    pandas.DataFrame
        One row per counter, sorted by id, with the columns 'id' and 'AMI'.
        A counter without AM observations gets 0; a counter without midday
        observations gets inf (division by a zero-filled mean).
    """
    # Keep Monday (0) to Friday (4) only
    weekday_df = df[df['weekday'].between(0, 4)]

    def _mean_daily_volume(hours):
        # Sum the window per counter and day, then average those daily sums per counter
        in_window = weekday_df[weekday_df['hour'].between(*hours)]
        daily = in_window.groupby(['id', 'date'])['intensity'].sum()
        return daily.groupby(level='id').mean()

    # Building the frame from Series only aligns both means on the union of ids.
    # Passing the AM ids as a raw 'id' column raised ValueError whenever a counter
    # had midday observations but no AM observations.
    ami_df = pd.DataFrame({
        'AM_mean': _mean_daily_volume(am_hours),
        'Midday_mean': _mean_daily_volume(midday_hours),
    }).fillna(0)

    ami_df['AMI'] = (ami_df['AM_mean'] / ami_df['Midday_mean']).round(2)

    return ami_df.rename_axis('id').reset_index()[['id', 'AMI']]

# Compute the AMI per counter, then show the table size and the distribution of the index
AMI_indices = calculate_ami(data_indices)
print(AMI_indices.shape, AMI_indices["AMI"].describe(), sep="\n")


(364, 2)
count    364.000000
mean       1.174038
std        0.383460
min        0.380000
25%        0.920000
50%        1.110000
75%        1.370000
max        3.010000
Name: AMI, dtype: float64


### PMI

The PM to midday index (PMI) is the ratio of the average weekday volume observed between 5 pm and 7 pm to the average weekday volume observed between 11 am and 1 pm.

In [11]:
def calculate_pmi(df, pm_hours=(17, 19), midday_hours=(11, 13)):
    """Compute the PM-to-midday index (PMI) for every counter.

    PMI is the mean weekday volume in the PM window divided by the mean weekday
    volume in the midday window, rounded to 2 decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly observations with the columns 'id', 'date', 'weekday'
        (Monday=0 ... Sunday=6), 'hour' and 'intensity'.
    pm_hours : tuple of (int, int), optional
        First and last 'hour' value of the PM window. Both bounds are
        inclusive, so the default selects the hour values 17, 18 and 19.
    midday_hours : tuple of (int, int), optional
        First and last 'hour' value of the midday window, inclusive
        (default: hour values 11, 12 and 13).

    Returns
    -------
    pandas.DataFrame
        One row per counter, sorted by id, with the columns 'id' and 'PMI'.
        A counter without PM observations gets 0; a counter without midday
        observations gets inf (division by a zero-filled mean).
    """
    # Keep Monday (0) to Friday (4) only
    weekday_df = df[df['weekday'].between(0, 4)]

    def _mean_daily_volume(hours):
        # Sum the window per counter and day, then average those daily sums per counter
        in_window = weekday_df[weekday_df['hour'].between(*hours)]
        daily = in_window.groupby(['id', 'date'])['intensity'].sum()
        return daily.groupby(level='id').mean()

    # Building the frame from Series only aligns both means on the union of ids.
    # Passing the PM ids as a raw 'id' column raised ValueError whenever a counter
    # had midday observations but no PM observations.
    pmi_df = pd.DataFrame({
        'PM_mean': _mean_daily_volume(pm_hours),
        'Midday_mean': _mean_daily_volume(midday_hours),
    }).fillna(0)

    pmi_df['PMI'] = (pmi_df['PM_mean'] / pmi_df['Midday_mean']).round(2)

    return pmi_df.rename_axis('id').reset_index()[['id', 'PMI']]
# Compute the PMI per counter, then show the table size and the distribution of the index
PMI_indices = calculate_pmi(data_indices)
print(PMI_indices.shape, PMI_indices["PMI"].describe(), sep="\n")

(364, 2)


count    364.000000
mean       1.583104
std        0.263192
min        0.670000
25%        1.417500
50%        1.560000
75%        1.740000
max        2.480000
Name: PMI, dtype: float64


### PHI



The peak hour index (PHI) is introduced here as the maximum value of the AMI and PMI for a given site. This index was created to help classify sites where a given “count station” might only capture one predominant direction of travel, such as in the case of a one-way couplet, where the morning commute volume peak might be observed by a different counter than that observing the evening commute volume peak. By considering the maximum value, the degree to which a given site has “commute-related” peaking can be better represented.


In [12]:
def calculate_phi(ami_df, pmi_df):
    """Compute the peak hour index (PHI): the larger of AMI and PMI per counter.

    Parameters
    ----------
    ami_df : pandas.DataFrame
        Frame with the columns 'id' and 'AMI'.
    pmi_df : pandas.DataFrame
        Frame with the columns 'id' and 'PMI'.

    Returns
    -------
    pandas.DataFrame
        One row per id found in either input, with the columns 'id' and 'PHI'
        (rounded to 2 decimal places). An index missing for an id counts as 0.
    """
    # Outer join keeps counters that appear in only one of the two tables
    combined = pd.merge(ami_df, pmi_df, how='outer', on='id')
    combined = combined.fillna(0)

    # Row-wise maximum of the two indices
    peak = combined.loc[:, ['AMI', 'PMI']].max(axis=1).round(2)
    combined = combined.assign(PHI=peak).reset_index(drop=True)

    return combined.loc[:, ['id', 'PHI']]

# Combine AMI and PMI into the PHI, then show the table size and the distribution of the index
PHI_indices = calculate_phi(AMI_indices, PMI_indices)
print(PHI_indices.shape, PHI_indices["PHI"].describe(), sep="\n")

(364, 2)
count    364.000000
mean       1.644863
std        0.280117
min        0.790000
25%        1.457500
50%        1.610000
75%        1.810000
max        3.010000
Name: PHI, dtype: float64


### SF


Seasonal Factor (SF): This study created seasonal factors by dividing the average daily volume in June, July, and August by the average daily volume in December, January, and February. The SF is useful for identifying sites that have seasonal variations in activity. The factor ranges from zero to infinity. If the factor is calculated for multiple years at a single site, the average factor is taken to decide the final seasonality of that site. This factor was developed to separate the month-of-year (MOY) pattern.

In [13]:
def calculate_sf(df):
    """Compute the seasonal factor (SF) for every counter.

    SF is the mean daily volume in June-August divided by the mean daily volume
    in December, January and February, rounded to 3 decimal places. Counters
    lacking data in either season are left out of the result.

    The number of counters per season is printed before and after that filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly observations with the columns 'id', 'date', 'month' and 'intensity'.

    Returns
    -------
    pandas.DataFrame
        One row per retained counter with the columns 'id' and 'SF'.
    """
    summer_months = [6, 7, 8]
    winter_months = [12, 1, 2]

    # Total volume per counter and day (month is carried along for the season split)
    per_day = df.groupby(['id', 'date', 'month'])['intensity'].sum().reset_index()

    def season_mean(months):
        # Mean daily volume per counter over the given months
        in_season = per_day[per_day['month'].isin(months)]
        return in_season.groupby('id')['intensity'].mean()

    summer_mean = season_mean(summer_months)
    winter_mean = season_mean(winter_months)
    print(summer_mean.shape)
    print(winter_mean.shape)

    # Restrict both series to counters observed in both seasons
    summer_mean = summer_mean.loc[summer_mean.index.isin(winter_mean.index)]
    winter_mean = winter_mean.loc[winter_mean.index.isin(summer_mean.index)]
    print(summer_mean.shape)
    print(winter_mean.shape)

    seasonal = pd.DataFrame(
        {'id': summer_mean.index, 'Summer_mean': summer_mean, 'Winter_mean': winter_mean}
    )
    seasonal = seasonal.fillna(0)

    # Ratio of summer to winter volume, kept to 3 decimal places
    seasonal['SF'] = (seasonal['Summer_mean'] / seasonal['Winter_mean']).round(3)

    return seasonal.reset_index(drop=True)[['id', 'SF']]
# Compute the SF per counter, then show the table size and the distribution of the factor
SF_indices = calculate_sf(data_indices)
print(SF_indices.shape, SF_indices["SF"].describe(), sep="\n")

(347,)
(349,)
(332,)
(332,)
(332, 2)
count    332.000000
mean       1.138614
std        0.543820
min        0.046000
25%        0.986000
50%        1.090500
75%        1.209250
max        9.557000
Name: SF, dtype: float64


### SRR (pending)

The Strava recreational ratio (SRR) is the ratio of Strava annual recreational trips to Strava annual total trips. The ratio varies from 0 to 1. A higher value indicates more recreational activity at that site. SRR was developed to separate the weekend hour-of-day (HOD) factor pattern.

## Summary Indices

In [14]:
# Merge all indices by id
indices = reduce(lambda left, right: pd.merge(left, right, on='id', how='outer'), [WWI_indices, AMI_indices, PMI_indices, PHI_indices, SF_indices])
indices.describe()

Unnamed: 0,id,WWI,AMI,PMI,PHI,SF
count,364.0,364.0,364.0,364.0,364.0,332.0
mean,20219.629121,0.762555,1.174038,1.583104,1.644863,1.138614
std,126.561203,0.110293,0.38346,0.263192,0.280117,0.54382
min,20001.0,0.41,0.38,0.67,0.79,0.046
25%,20111.75,0.7,0.92,1.4175,1.4575,0.986
50%,20212.5,0.76,1.11,1.56,1.61,1.0905
75%,20319.25,0.82,1.37,1.74,1.81,1.20925
max,20446.0,1.19,3.01,2.48,3.01,9.557


# Save output

In [15]:
# Tag every row with the year these indices were computed for
indices['year'] = YEAR

# Keep the indices of all years in one CSV file. The file is created on the first run.
# On later runs the rows of other years are kept and the rows of YEAR are replaced,
# so re-running the notebook for the same year does not duplicate its rows.
if SAVE_OUTPUT:
    output_file = Path(OUTPUT_DATA_PATH) / 'indices.csv'
    if output_file.exists():
        # round_trip parsing keeps previously stored floats unchanged when rewritten
        previous = pd.read_csv(output_file, float_precision='round_trip')
        previous = previous[previous['year'] != YEAR]
        # concat aligns on column names, so the result does not depend on column order
        combined = indices if previous.empty else pd.concat([previous, indices], ignore_index=True)
    else:
        combined = indices
    combined.to_csv(output_file, index=False)
    print("Indices saved to csv")


Indices saved to csv
