In [14]:
import pandas as pd
import urllib.parse

# Load data
file_path = './dataset/Logging (Responses) - Form responses 1.csv'
df = pd.read_csv(file_path)

# Convert Timestamp to datetime and add entry index to maintain sequence
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')
df['entry_index'] = df.index

# Step 1: Filter out entries before 14 April 2025
df_filtered = df[df['Timestamp'] >= '2025-04-14'].copy()

# Step 2: Extract participant_id from URL query string in the 'state' column
def extract_participant_id(url):
    try:
        parsed_url = urllib.parse.urlparse(url)
        query_params = urllib.parse.parse_qs(parsed_url.query)
        return query_params.get('participant_id', [None])[0]
    except:
        return None

df_filtered['participant_id'] = df_filtered['state'].apply(extract_participant_id)

# Step 3: Filter out rows where participant_id is null
df_filtered_nonull = df_filtered[df_filtered['participant_id'].notnull()].copy()

# Step 4: Define trial start based on 'target' field (p.login-link a or button.login-link)
df_filtered_nonull['is_login_target'] = df_filtered_nonull['target'].isin(['p.login-link a', 'button.login-link'])
df_filtered_nonull['prev_is_login_target'] = df_filtered_nonull.groupby('uid')['is_login_target'].shift(1)
df_filtered_nonull['is_start_target_based'] = df_filtered_nonull['is_login_target'] & (~df_filtered_nonull['prev_is_login_target'].fillna(False))

# Step 5: Calculate trial numbers based on adjusted start logic
df_filtered_nonull['trial_num_target_based'] = df_filtered_nonull.groupby('uid')['is_start_target_based'].cumsum()

# Step 6: Aggregate metrics per trial
df_trials_target_based = df_filtered_nonull[df_filtered_nonull['trial_num_target_based'] > 0].copy()

agg_metrics_target_based = df_trials_target_based.groupby(['uid', 'participant_id', 'trial_num_target_based']).agg(
    first_entry_index=('entry_index', 'first'),
    time_of_first_entry=('Timestamp', 'first'),
    task_time=('time', lambda x: (x.iloc[-1] - x.iloc[0]) / 1000),
    number_of_clicks=('time', 'count'),
    last_url=('state', 'last')
).reset_index()

# Step 7: Add HTML pages visited in each trial
def extract_page_names(state_series):
    page_names = state_series.dropna().apply(lambda x: urllib.parse.urlparse(x).path.split('/')[-1])
    html_pages = page_names[page_names.str.endswith('.html')].unique()
    return ', '.join(html_pages)

agg_metrics_target_based['pages_visited'] = df_trials_target_based.groupby(
    ['uid', 'participant_id', 'trial_num_target_based']
)['state'].apply(extract_page_names).reset_index(drop=True)

# Step 8: Add the 'variant' column based on query string
def extract_variant(state_series):
    def map_variant(url):
        try:
            parsed_url = urllib.parse.urlparse(url)
            query_params = urllib.parse.parse_qs(parsed_url.query)
            variant_value = query_params.get('variant', [None])[0]
            if variant_value == 'login-a-home-a':
                return 'A'
            elif variant_value == 'login-a-home-b':
                return 'B'
            elif variant_value == 'login-b-home-a':
                return 'C'
            elif variant_value == 'login-b-home-b':
                return 'D'
            else:
                return None
        except:
            return None
    
    variants = state_series.dropna().apply(map_variant).dropna().unique()
    return variants[0] if len(variants) > 0 else None

agg_metrics_target_based['variant'] = df_trials_target_based.groupby(
    ['uid', 'participant_id', 'trial_num_target_based']
)['state'].apply(extract_variant).reset_index(drop=True)

# Step 9: Filter aggregated records where pages_visited includes required pages
required_pages = ['personal-info.html', 'manage-student-finance.html', 'view-classes.html']

def contains_required_pages(pages_visited):
    if pd.isna(pages_visited):
        return False
    return any(page in pages_visited for page in required_pages)

agg_metrics_filtered = agg_metrics_target_based[
    agg_metrics_target_based['pages_visited'].apply(contains_required_pages)
].reset_index(drop=True)

  df_filtered_nonull['is_start_target_based'] = df_filtered_nonull['is_login_target'] & (~df_filtered_nonull['prev_is_login_target'].fillna(False))


In [15]:
agg_metrics_filtered.to_csv('agg_metrics_logging.csv', index=False)