In [2]:
import pandas as pd
import urllib.parse

# Load the raw logging responses CSV into `df`.
# The path is relative, so it resolves against the notebook's working directory.
file_path = './dataset/Logging (Responses) - Form responses 1.csv'
df = pd.read_csv(file_path)
# Last expression of the cell: shows (rows, columns) of the raw export.
df.shape

(1839, 8)

In [None]:

# Record each row's original position so the logged sequence can be recovered
# after filtering and grouping.
df['entry_index'] = df.index
# Parse the day-first 'Timestamp' strings (dd/mm/YYYY HH:MM:SS) into datetimes.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')

# Step 1: keep only the entries logged on or after 14 April 2025
df_filtered = df.loc[df['Timestamp'] >= '2025-04-14'].copy()
df_filtered.shape

(1810, 9)

In [5]:
# Step 2: Extract participant_id from URL query string in the 'state' column
def extract_participant_id(url):
    """Return the ``participant_id`` query parameter of ``url``, or ``None``.

    Parameters
    ----------
    url : str or any
        A URL such as ``'https://host/page.html?participant_id=123'``.
        Non-string values (``None``, ``NaN`` from an empty CSV cell, ...) are
        treated as "no id".

    Returns
    -------
    str or None
        The first ``participant_id`` value, or ``None`` when the parameter is
        absent or blank, or the URL cannot be parsed.
    """
    # Empty cells arrive as NaN/None rather than text; there is nothing to parse.
    if not isinstance(url, str):
        return None
    try:
        query_params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    except ValueError:
        # urlparse raises ValueError for a malformed network location
        # (for example an unclosed '[' in the host part).
        return None
    return query_params.get('participant_id', [None])[0]

# Derive one participant_id per row from the URL stored in the 'state' column.
df_filtered['participant_id'] = df_filtered['state'].map(extract_participant_id)

# Step 3: keep only the rows for which a participant_id could be extracted
df_filtered_nonull = df_filtered.loc[df_filtered['participant_id'].notna()].copy()
df_filtered_nonull.shape

# NOTE(review): this writes df_filtered (rows with a null participant_id are
# still included), not df_filtered_nonull -- confirm that this is intended.
df_filtered.to_csv('./filtered_data.csv', index=False)

In [29]:
file_path_qualtrics = './dataset/EduRec_cleaned.csv'
# Read the Qualtrics export, parse 'timestamp' (unparseable values become NaT),
# then drop the rows labelled 0 and 1 in a single call.
df_qualtrics = (
    pd.read_csv(file_path_qualtrics)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], format='mixed', errors='coerce'))
    .drop(index=[0, 1])
)
print(df_qualtrics.shape)
df_qualtrics.head()

(14, 123)


Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,Q47_7,Q78,Q6,Q6_6_TEXT,Q55,Q56.1,Q57.1,Random ID,timestamp,arrangement
2,14/4/2025 12:22,14/4/2025 12:37,0,138.75.115.88,100,926,1,14/4/2025 12:37,R_4dGIlBgbqgxz3No,,...,6,1,5,,i like the top nav bar. it is easier to toggle...,"some buttons dont work , \r\ne.g. the home but...","not sure if my computer issue, but some time I...",11096,2025-04-14 12:24:50+08:00,1
3,15/4/2025 20:16,15/4/2025 20:54,0,137.132.26.210,100,2327,1,15/4/2025 20:54,R_9jWbHzaptt7ptew,,...,4,2,1,,Fix the layout of the financials page. Fix the...,The financials layout and the interactions in ...,No,94531,2025-04-15 20:32:23+08:00,3
4,15/4/2025 22:22,15/4/2025 22:41,0,58.96.212.227,100,1148,1,15/4/2025 22:41,R_9CjqogT8bwG4580,,...,3,1,1,,Not sure.,"no, i wish i didnt have to open a new tab ever...","Nope, thanks!",70610,2025-04-15 22:26:27+08:00,2
5,15/4/2025 23:20,15/4/2025 23:55,0,103.6.151.219,100,2147,1,15/4/2025 23:56,R_9kL17GI2jB1NgdK,,...,3,1,1,,'- Bigger text for the items instead of icons ...,'- Link is broken can't go back to home page h...,,68221,2025-04-15 23:37:55+08:00,2
6,18/4/2025 20:24,18/4/2025 20:38,0,42.61.233.210,100,863,1,18/4/2025 20:38,R_9JYtw7vrZpSyBt7,,...,5,1,1,,smaller text size for tiles for portal B/D,Cannot click home button on contacts page,no,47477,2025-04-18 20:28:43+08:00,3


In [15]:
# Keep only the two columns needed to match log entries against the survey:
# the Qualtrics 'timestamp' and the participant's 'Random ID'.
df_qualtrics = df_qualtrics.loc[:, ['timestamp', 'Random ID']].copy()
df_qualtrics

Unnamed: 0,timestamp,Random ID
2,2025-04-14T12:24:50+08:00,11096
3,2025-04-15T20:32:23+08:00,94531
4,2025-04-15T22:26:27+08:00,70610
5,2025-04-15T23:37:55+08:00,68221
6,2025-04-18T20:28:43+08:00,47477
7,2025-04-18T22:56:10+08:00,51736
8,2025-04-19T04:20:27+08:00,88314
9,2025-04-19T15:34:09+08:00,74407
10,2025-04-19T17:28:16+08:00,23083
11,2025-04-20T01:00:48+08:00,77207


In [None]:
# for each row in df_filtered, check if Timestamp column is after timestamp column in df_qualtrics
def _normalise_id(value):
    """Return an id as stripped text ('11096' for 11096, 11096.0 or ' 11096'), or None if missing."""
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def check_timestamp(row, qualtrics=None):
    """Return True when a log entry was recorded after its participant's Qualtrics timestamp.

    Parameters
    ----------
    row : mapping-like
        One log entry; must provide ``'Timestamp'`` and ``'participant_id'``.
    qualtrics : pandas.DataFrame, optional
        Frame with ``'Random ID'`` and ``'timestamp'`` columns. Defaults to the
        notebook-level ``df_qualtrics``.

    Returns
    -------
    bool
        ``False`` when the participant id is missing or unknown, when either
        timestamp is missing, or when the entry is not strictly later than the
        Qualtrics timestamp.
    """
    if qualtrics is None:
        qualtrics = df_qualtrics

    participant_id = _normalise_id(row['participant_id'])
    if participant_id is None:
        return False

    # Ids parsed from a URL are text while a CSV column may be numeric, so both
    # sides are compared as normalised text.
    id_matches = qualtrics['Random ID'].map(_normalise_id) == participant_id
    if not id_matches.any():
        return False

    # .iloc keeps the pandas Timestamp (and its UTC offset); .values would hand
    # back a bare numpy datetime64 converted to UTC.
    qualtrics_timestamp = pd.Timestamp(qualtrics.loc[id_matches, 'timestamp'].iloc[0])
    entry_timestamp = pd.Timestamp(row['Timestamp'])
    if pd.isna(qualtrics_timestamp) or pd.isna(entry_timestamp):
        return False

    # When exactly one side carries a UTC offset, compare wall-clock times.
    # NOTE(review): this assumes the naive timestamps were recorded in the same
    # local time zone as the offset-aware ones -- confirm against the data source.
    if qualtrics_timestamp.tzinfo is not None and entry_timestamp.tzinfo is None:
        qualtrics_timestamp = qualtrics_timestamp.tz_localize(None)
    elif qualtrics_timestamp.tzinfo is None and entry_timestamp.tzinfo is not None:
        entry_timestamp = entry_timestamp.tz_localize(None)

    return bool(entry_timestamp > qualtrics_timestamp)

# Flag every log row recorded after its participant's Qualtrics timestamp
df_filtered['is_after_qualtrics'] = df_filtered.apply(check_timestamp, axis=1)

# Keep only the flagged rows, then persist them.
# NOTE(review): this rebinds df_filtered and rewrites './filtered_data.csv';
# the later trial steps read df_filtered_nonull, which was built before this
# filter -- confirm whether they should use this result instead.
df_filtered = df_filtered.loc[df_filtered['is_after_qualtrics']].copy()
df_filtered.to_csv('./filtered_data.csv', index=False)

In [None]:
# Step 4: Define trial start based on 'target' field (p.login-link a or button.login-link)
df_filtered_nonull['is_login_target'] = df_filtered_nonull['target'].isin(['p.login-link a', 'button.login-link'])
# Flag of the previous row within the same uid; the first row of each uid has
# no predecessor, so the shift leaves NaN there.
df_filtered_nonull['prev_is_login_target'] = df_filtered_nonull.groupby('uid')['is_login_target'].shift(1)
# A trial starts on a login click that does not directly follow another login click.
# `.eq(True)` turns the shifted column (True/False/NaN) into a genuine boolean
# mask with NaN -> False, so `~` is a logical NOT and the result does not rely
# on fillna() implicitly downcasting an object column to bool.
df_filtered_nonull['is_start_target_based'] = (
    df_filtered_nonull['is_login_target']
    & ~df_filtered_nonull['prev_is_login_target'].eq(True)
)

# Step 5: Calculate trial numbers based on adjusted start logic
# Running count of trial starts per uid; rows before a uid's first start get 0.
df_filtered_nonull['trial_num_target_based'] = df_filtered_nonull.groupby('uid')['is_start_target_based'].cumsum()

# Step 6: Aggregate metrics per trial
# Rows with trial number 0 precede any login click and belong to no trial.
df_trials_target_based = df_filtered_nonull[df_filtered_nonull['trial_num_target_based'] > 0].copy()

agg_metrics_target_based = df_trials_target_based.groupby(['uid', 'participant_id', 'trial_num_target_based']).agg(
    first_entry_index=('entry_index', 'first'),
    time_of_first_entry=('Timestamp', 'first'),
    # Last minus first 'time' value of the trial, divided by 1000
    # (presumably milliseconds -> seconds; confirm the unit of 'time').
    task_time=('time', lambda x: (x.iloc[-1] - x.iloc[0]) / 1000),
    # Counts only 'mousedown' events, not every logged row of the trial.
    number_of_clicks=('eventName', lambda x: (x == 'mousedown').sum()),
    last_url=('state', 'last')
).reset_index()

# Step 7: Add HTML pages visited in each trial
def extract_page_names(state_series):
    """Return the ``.html`` pages seen in ``state_series`` as one ``', '``-joined string.

    Parameters
    ----------
    state_series : pandas.Series
        URLs in visiting order; missing values are ignored.

    Returns
    -------
    str
        The last path segment of every URL that ends with ``.html``, in
        order and with repeats kept; ``''`` when there is none.
    """
    def page_name(url):
        # Non-text cells and unparseable URLs contribute no page.
        if not isinstance(url, str):
            return None
        try:
            return urllib.parse.urlparse(url).path.split('/')[-1]
        except ValueError:
            return None

    # Plain iteration avoids the `.str` accessor, which raises on an empty or
    # all-NaN series whose dtype is not object.
    page_names = (page_name(url) for url in state_series.dropna())
    return ', '.join(name for name in page_names if name and name.endswith('.html'))

# Build the page list for every (uid, participant_id, trial) group and attach it
# as the 'pages_visited' column.
# reset_index(drop=True) discards the group keys, so the values are matched to
# agg_metrics_target_based by its default 0..n-1 index, i.e. by row position.
# NOTE(review): this relies on this groupby yielding the groups in the same
# order as the aggregation above -- confirm, or merge on the keys instead.
agg_metrics_target_based['pages_visited'] = df_trials_target_based.groupby(
    ['uid', 'participant_id', 'trial_num_target_based']
)['state'].apply(extract_page_names).reset_index(drop=True)

# Step 8: Add the 'variant' column based on query string
def extract_variant(state_series):
    """Return the label of the first recognised ``variant`` query value in ``state_series``.

    Parameters
    ----------
    state_series : pandas.Series
        URLs whose query string may carry a ``variant`` parameter; missing
        values are ignored.

    Returns
    -------
    str or None
        ``'A'``, ``'B'``, ``'C'`` or ``'D'`` for the first URL with a known
        variant; ``None`` when no URL carries one.
    """
    # Query value -> reported label. The pairing (a/b -> 'C', b/a -> 'D',
    # b/b -> 'B') is kept exactly as in the original if/elif chain.
    variant_labels = {
        'login-a-home-a': 'A',
        'login-a-home-b': 'C',
        'login-b-home-a': 'D',
        'login-b-home-b': 'B',
    }

    def map_variant(url):
        # Non-text cells and unparseable URLs carry no variant.
        if not isinstance(url, str):
            return None
        try:
            query_params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        except ValueError:
            return None
        return variant_labels.get(query_params.get('variant', [None])[0])

    # The first recognised label wins, so stop scanning as soon as one is found.
    for url in state_series.dropna():
        label = map_variant(url)
        if label is not None:
            return label
    return None

# Determine one variant label per (uid, participant_id, trial) group and attach
# it as the 'variant' column.
# reset_index(drop=True) discards the group keys, so the values are matched to
# agg_metrics_target_based by its default 0..n-1 index, i.e. by row position.
# NOTE(review): this relies on this groupby yielding the groups in the same
# order as the aggregation above -- confirm, or merge on the keys instead.
agg_metrics_target_based['variant'] = df_trials_target_based.groupby(
    ['uid', 'participant_id', 'trial_num_target_based']
)['state'].apply(extract_variant).reset_index(drop=True)

# Step 9: Filter aggregated records where pages_visited includes required pages
# A trial is kept when it visited at least one of these pages.
required_pages = ['personal-info.html', 'manage-student-finance.html', 'view-classes.html']

def contains_required_pages(pages_visited):
    """Return True when ``pages_visited`` names at least one of ``required_pages``.

    Parameters
    ----------
    pages_visited : str or missing
        Page names joined with ``', '`` (e.g. ``'login.html, view-classes.html'``).
        ``None``/``NaN`` or any other non-string value counts as "nothing visited".

    Returns
    -------
    bool
    """
    if not isinstance(pages_visited, str):
        return False
    # Compare whole page names: a plain substring test would also accept
    # e.g. 'old-view-classes.html' for 'view-classes.html'.
    visited = {name.strip() for name in pages_visited.split(',')}
    return any(page in visited for page in required_pages)

# Keep only the trials whose page list passes contains_required_pages, then
# renumber the surviving rows from 0.
agg_metrics_filtered = (
    agg_metrics_target_based[agg_metrics_target_based['pages_visited'].map(contains_required_pages)]
    .reset_index(drop=True)
)

In [2]:
# Write the filtered per-trial metrics to a CSV in the working directory,
# without the DataFrame index.
agg_metrics_filtered.to_csv('agg_metrics_logging.csv', index=False)