In [1]:
import pandas as pd
import numpy as np

# Data processing. Stops

**Summary:**

This notebook contains the steps needed to process the thermal survey data (the votes of each participant at each stop in the public space). First, all individual responses (votes) are digitized from the heat perception surveys, along with the driver sheets corresponding to each thermal walk. Then, three basic statistics are computed: the median, the mode and the interquartile range (IQR). Finally, the average temperature and humidex are computed at each stop, using the time window of the stop (this last step requires the corresponding sensor trajectory dataset to compute the averaged quantities).


**Index:**

    1. Median, Mode and IQR
    2. Average Temperature and Humidex
    
  

<br>

## 1. Median, Mode and IQR
Functions to obtain the Median, the Mode and the IQR for a given DataFrame (survey dataset). These functions need to be applied for each "thermal question" separately (therefore three times, for thermal comfort, thermal sensation and walking thermal comfort votes). 



In [2]:
def compute_median(df, stop_codes):
    '''
    Function that calculates the median of votes for each stop of a survey.

    The function assumes that the categories (tags) are **ordered from worst to best**
    (e.g., "very uncomfortable" to "very comfortable"), i.e. the row order of `df`
    is the category order.

    For each stop:
        1. It sums all the votes.
        2. Computes the cumulative sum across ordered options.
        3. Finds the first category whose cumulative count reaches the rank of the
           median vote: N/2 for an even number of votes N (lower median),
           (N+1)/2 for an odd number of votes.

    The category is located by row *position*, so the result does not depend on
    the index labels of `df` (it also works on filtered / re-indexed DataFrames).

    Parameters:
        df (pd.DataFrame): DataFrame with a 'tags' column (ordered options) and one column per stop with vote counts.
        stop_codes (list): List of strings, each representing a column name (stop code).

    Returns:
        list: One median category per stop. If the stop has no votes
              (all values missing, or the counts sum to zero), returns np.nan.
    '''

    tags = df['tags']
    medians = []

    for code in stop_codes:  # Iterate over all stops
        votes = df[code]

        if votes.isnull().all():  # All values are missing (no votes)
            medians.append(np.nan)
            continue

        total_votes = votes.sum()  # Total number of votes at this stop (NaN are skipped)
        if total_votes <= 0:       # Only zero counts: there is no median vote either
            medians.append(np.nan)
            continue

        # Rank (1-based) of the median vote within the ordered votes
        if total_votes % 2 == 0:
            threshold = total_votes / 2
        else:
            threshold = (total_votes + 1) / 2

        # Row positions where the cumulative count has reached the median rank.
        # NaN entries compare as False, so categories with missing counts are skipped.
        reached = (votes.cumsum() >= threshold).to_numpy()
        median_position = np.flatnonzero(reached)[0]

        # Positional lookup: .iloc must be fed a position, never an index label
        medians.append(tags.iloc[median_position])

    return medians



def compute_mode(df, stop_codes):
    '''
    Function that calculates the mode(s) of votes for each stop of a survey.

    For each stop:
        1. Finds the maximum number of votes.
        2. Returns all categories (tags) that have that maximum count
           (several categories are returned when there is a tie).
        3. Stores the result as a dictionary: {category: vote_count}

    Parameters:
        df (pd.DataFrame): DataFrame with a 'tags' column (categories) and one column per stop with vote counts.
        stop_codes (list): List of strings, each representing a column name (stop code).

    Returns:
        list: A list with one entry per stop. Each entry is a dictionary whose keys are
              the most voted tags and whose values are their vote counts.
              If the stop has no votes (all values missing, or no count above zero),
              the entry is np.nan.
    '''

    mode_list = []

    for code in stop_codes:
        votes = df[code]

        if votes.isnull().all():  # No votes (all values missing)
            mode_list.append(np.nan)
            continue

        max_votes = votes.max()  # Highest vote count (NaN are skipped)
        if max_votes <= 0:
            # Every category has zero votes: reporting all of them as "modes"
            # would be misleading, so treat the stop as having no votes.
            mode_list.append(np.nan)
            continue

        is_mode = votes == max_votes  # Boolean mask of the most voted categories
        mode_tags = df.loc[is_mode, 'tags'].tolist()
        mode_counts = votes[is_mode].tolist()

        mode_list.append(dict(zip(mode_tags, mode_counts)))

    return mode_list




def compute_iqr(df, stop_codes):
    '''
    Function that calculates the Interquartile Range (IQR) of votes for each stop of a survey.

    The IQR is reported as the pair of categories holding the 25th percentile (Q1)
    and the 75th percentile (Q3). The categories (rows of `df`) are assumed to be
    ordered, since the quartiles are read from the cumulative sum in row order.

    For each stop:
        1. Computes the cumulative sum of votes across ordered categories.
        2. Finds the category where the cumulative sum first reaches or exceeds 25% of total votes (Q1).
        3. Repeats for 75% (Q3).

    The categories are located by row *position*, so the result does not depend on
    the index labels of `df` (it also works on filtered / re-indexed DataFrames).

    Parameters:
        df (pd.DataFrame): DataFrame with a 'tags' column (ordered categories) and one column per stop with vote counts.
        stop_codes (list): List of strings, each representing a column name (stop code).

    Returns:
        list: A list of [Q1_tag, Q3_tag] pairs (one per stop). If the stop has no votes
              (all values missing, or the counts sum to zero), returns [np.nan, np.nan].
    '''

    tags = df['tags']
    iqr_list = []

    for code in stop_codes:
        votes = df[code]

        if votes.isnull().all():  # No votes (all values missing)
            iqr_list.append([np.nan, np.nan])
            continue

        total_votes = votes.sum()
        if total_votes <= 0:  # Only zero counts: quartiles are undefined
            iqr_list.append([np.nan, np.nan])
            continue

        cumsum = votes.cumsum()

        quartile_tags = []
        for fraction in (0.25, 0.75):  # Q1, then Q3
            # Row positions where the cumulative count has reached the quartile.
            # NaN entries compare as False, so categories with missing counts are skipped.
            reached = (cumsum >= total_votes * fraction).to_numpy()
            position = np.flatnonzero(reached)[0]
            # Positional lookup: .iloc must be fed a position, never an index label
            quartile_tags.append(tags.iloc[position])

        iqr_list.append(quartile_tags)

    return iqr_list



<br>

## 2. Average temperature and humidex
Using the arrival and departure times of each stop, we obtain the average temperature and humidex (as well as the average latitude and longitude coordinates). We need the processed trajectory files together with the surveys (to combine the arrival/departure times with the average values).

The average values are stored as new columns of the survey DataFrames.

In [4]:
def average_values_T_HDX(df_trajectory, df_survey):
    '''
    Function that calculates the average temperature, humidex, and derived quantities
    for each stop in the survey using the time window between arrival and departure
    (both ends included).

    Per the original notes, it is meant to work with the raw (non-smoothed) data only.

    Stops are processed in row order, independently of the index labels of `df_survey`,
    so the function also works on filtered / re-indexed survey DataFrames.
    A stop whose time window contains no trajectory samples gets NaN averages.

    Side effects (kept for backward compatibility):
        - `df_trajectory['Time']` is converted to datetime in place.
        - The new columns are added to `df_survey` in place; the same object is returned.

    Parameters:
        df_trajectory (pd.DataFrame): DataFrame with trajectory data. Must contain:
            - 'Time', 'Temp[°C]', 'HDX[°C]', 'T-T_fixed', 'HDX-HDX_fixed',
              'T-T_fixed+<T>', 'HDX-HDX_fixed+<HDX>', 'Lat', 'Lon'
            'Time' must be datetime or strings formatted as '%Y-%m-%d %H:%M:%S'.

        df_survey (pd.DataFrame): Survey data. Must contain:
            - 'date' ('%Y-%m-%d'), 'arrival_time' ('%H:%M'), 'departure_time' ('%H:%M')
              (all as strings)

    Returns:
        pd.DataFrame: The survey DataFrame with new columns added:
            - '<T>', '<HDX>', '<T-T_fixed>', '<HDX-HDX_fixed>',
              '<T-T_fixed+<T>>', '<HDX-HDX_fixed+<HDX>>': averages over the stop window
            - '<latitude>', '<longitude>': average coordinates over the stop window
    '''

    # Trajectory column -> name of the survey column that stores its per-stop average.
    # The dict order defines the order in which the new columns are added.
    averaged_columns = {
        'Temp[°C]': '<T>',
        'HDX[°C]': '<HDX>',
        'T-T_fixed': '<T-T_fixed>',
        'HDX-HDX_fixed': '<HDX-HDX_fixed>',
        'T-T_fixed+<T>': '<T-T_fixed+<T>>',
        'HDX-HDX_fixed+<HDX>': '<HDX-HDX_fixed+<HDX>>',
        'Lat': '<latitude>',
        'Lon': '<longitude>',
    }

    # Ensure Time column is in datetime format
    df_trajectory['Time'] = pd.to_datetime(df_trajectory['Time'], format='%Y-%m-%d %H:%M:%S')
    times = df_trajectory['Time']

    # One list of per-stop averages for each output column
    averages = {target: [] for target in averaged_columns.values()}

    # Iterate over the stops by row order (zip), not by index label, so that a
    # survey DataFrame without a default 0..n-1 index is handled correctly.
    stops = zip(df_survey['date'], df_survey['arrival_time'], df_survey['departure_time'])
    for date, arrival_time, departure_time in stops:

        # Construct full datetimes for the stop window
        arrival_dt = pd.to_datetime(date + ' ' + arrival_time, format='%Y-%m-%d %H:%M')
        departure_dt = pd.to_datetime(date + ' ' + departure_time, format='%Y-%m-%d %H:%M')

        # Filter trajectory data for the time window of this stop
        df_window = df_trajectory[(times >= arrival_dt) & (times <= departure_dt)]

        # Compute average values and append to lists
        for source, target in averaged_columns.items():
            averages[target].append(df_window[source].mean())

    # Add computed averages as new columns to the survey DataFrame
    # (lists are assigned positionally, one value per survey row)
    for target, values in averages.items():
        df_survey[target] = values

    return df_survey
