# Location and Attendance Analysis

In [None]:
import sys
from pathlib import Path

base_dir = Path.cwd().parent
sys.path.insert(0, str(base_dir))

import os
import pickle as pkl
import numpy as np
import pandas as pd
import utils

### EDIT PATHS ###
base_dir = Path.cwd().parent
image_dir = "/path/to/preprocessed_images"
attendance_path = "/path/to/attendance.xlsx"  # (not public)

# Repo-relative paths (do not change)
data_dir = base_dir / "data"
cache_dir = base_dir / "outputs" / "cache"
labels_path = data_dir / "cluster_labels.xlsx"
metadata_path = data_dir / "image_preprocessing_labels.xlsx"
# File name (looked up inside cache_dir) of the pickled HDBSCAN model.
hdbscan_path = "hdbscan_l2norm_90pca_6components_100nn_0dist_cosine_42randseed_100_minclustsize_22minsamples.pkl"

# Directory listing of the image folder. The order is whatever the OS
# returns; it is later paired positionally with the model's labels, so it
# must not be sorted or filtered here.
# NOTE(review): presumably the model was fitted on this same listing order — confirm.
filenames = os.listdir(image_dir)

# NOTE: unpickling can execute arbitrary code; only load cache files that
# were produced by this project.
model_file = cache_dir / hdbscan_path
with model_file.open("rb") as file:
    hdbscan_model = pkl.load(file)

# Replace the raw labels with the output of utils.relabel_by_size.
relabeled = utils.relabel_by_size(hdbscan_model.labels_)
hdbscan_model.labels_ = relabeled

## Location Frequencies

In [None]:
# Load the per-image metadata and flag the rows whose image file is present
# in image_dir.
metadata_labeled_df = pd.read_excel(metadata_path, sheet_name="Sheet1")
metadata_labeled_df["is_inlier"] = metadata_labeled_df["filename"].isin(filenames)

inlier_counts = metadata_labeled_df["is_inlier"].value_counts()
print("METADATA: ", inlier_counts)

# NO METADATA FOR 67 IMAGES

# Location counts and shares, restricted to images present on disk.
# Missing locations are not counted here (value_counts skips NaN).
inlier_locations = metadata_labeled_df.loc[metadata_labeled_df["is_inlier"], "location"]
location_frequencies = inlier_locations.value_counts()
location_total = location_frequencies.sum()
location_frequencies_percentage = location_frequencies / location_total * 100
print("Location Frequencies:\n", location_frequencies)
print("Location Frequencies (Percentage):\n", location_frequencies_percentage)

# From here on, a missing location is the explicit category "Unknown".
metadata_labeled_df["location"] = metadata_labeled_df["location"].fillna("Unknown")
print(metadata_labeled_df["location"].value_counts())

## Assign Clusters to Metadata

In [None]:
# Attach a cluster label to every image file.
#   * metadata rows whose filename is in `filenames` get that file's label;
#   * files with no metadata row are appended as new rows (only "filename"
#     and "cluster" are filled, every other column is NaN).
new_labels = utils.relabel_by_size(hdbscan_model.labels_)

# zip() silently truncates to the shorter input, which would mislabel or
# drop images if the folder content and the model disagree. Fail loudly.
if len(filenames) != len(new_labels):
    raise ValueError(
        f"Found {len(filenames)} files in image_dir but {len(new_labels)} "
        "cluster labels; they must correspond one-to-one."
    )

label_by_filename = {
    filename: int(label) for filename, label in zip(filenames, new_labels)
}

# Label all matching metadata rows in one vectorised assignment instead of
# building a boolean mask per file.
has_label = metadata_labeled_df["filename"].isin(filenames)
metadata_labeled_df.loc[has_label, "cluster"] = (
    metadata_labeled_df.loc[has_label, "filename"].map(label_by_filename)
)

# Append the images that have no metadata row with a single concat
# (concatenating inside a loop copies the whole frame each time).
known_filenames = set(metadata_labeled_df["filename"])
missing_filenames = [name for name in filenames if name not in known_filenames]
if missing_filenames:
    missing_rows = pd.DataFrame(
        {
            "filename": missing_filenames,
            "cluster": [label_by_filename[name] for name in missing_filenames],
        }
    )
    metadata_labeled_df = pd.concat(
        [metadata_labeled_df, missing_rows],
        ignore_index=True,
    )

# Nullable integer dtype: rows without an image on disk keep <NA>.
metadata_labeled_df["cluster"] = metadata_labeled_df["cluster"].astype("Int64")

display(metadata_labeled_df)
print(metadata_labeled_df["cluster"].value_counts(sort=False))
print(metadata_labeled_df["cluster"].count())

## Chi-Squared Tests


### Not including unknown location

In [None]:
from scipy.stats import chi2_contingency

# Contingency table of cluster (rows) vs location (columns), restricted to
# rows that received a cluster label.
filtered_df = metadata_labeled_df[metadata_labeled_df["cluster"].notna()]
contingency = pd.crosstab(filtered_df["cluster"], filtered_df["location"], dropna=False)

# Chi-squared test of independence between cluster and location
chi2, p, dof, expected = chi2_contingency(contingency)
print(f"Chi-squared: {chi2}, p-value: {p}, Degrees of freedom: {dof}")

# Counts with marginal totals. The "Sum" row is computed AFTER the "Sum"
# column has been added so that the bottom-right cell holds the grand total;
# assigning the column sums of the bare table would leave that cell NaN
# because .loc aligns the assigned Series on column labels.
contingency_totals = contingency.copy()
contingency_totals["Sum"] = contingency.sum(axis=1)
contingency_totals.loc["Sum"] = contingency_totals.sum(axis=0)
print("\nContingency Table with Totals:")
display(contingency_totals)

# Row-normalised table: each cell is the share (in %) of its cluster that
# falls into the given location.
percentage_contingency = pd.crosstab(filtered_df["cluster"], filtered_df["location"], dropna=False, normalize="index") * 100
percentage_contingency = percentage_contingency.round(2)

# Margins are shares of the table's own grand total (the same n used for
# Cramer's V), so they add up to the 100 % shown in the corner cell.
overall_total = contingency.to_numpy().sum()
row_pct = ((contingency.sum(axis=1) / overall_total) * 100).round(2)
col_pct = ((contingency.sum(axis=0) / overall_total) * 100).round(2)
percentage_contingency["RowTotal"] = row_pct
# Key must match the "RowTotal" column; any other key is dropped when the
# Series is aligned to the columns and the corner cell becomes NaN.
col_pct["RowTotal"] = 100.0
percentage_contingency.loc["Sum"] = col_pct

print("\nPercentage Contingency Table with Totals:")
display(percentage_contingency)

### Including Unknown Location

In [None]:
from scipy.stats import chi2_contingency

# Contingency table of cluster (rows) vs location (columns). With the
# default dropna=True, rows whose cluster or location is missing are left
# out of the table.
contingency = pd.crosstab(metadata_labeled_df['cluster'], metadata_labeled_df['location'])

# Chi-squared test of independence between cluster and location
chi2, p, dof, expected = chi2_contingency(contingency)
print(f"Chi-squared: {chi2}, p-value: {p}, Degrees of freedom: {dof}")


# Counts with marginal totals. The 'Sum' row is computed AFTER the 'Sum'
# column has been added so that the bottom-right cell holds the grand total;
# assigning the column sums of the bare table would leave that cell NaN
# because .loc aligns the assigned Series on column labels.
contingency_totals = contingency.copy()
contingency_totals['Sum'] = contingency.sum(axis=1)
contingency_totals.loc['Sum'] = contingency_totals.sum(axis=0)
print("\nContingency Table with Totals:")
display(contingency_totals)

# Row-normalised table: each cell is the share (in %) of its cluster that
# falls into the given location.
percentage_contingency = pd.crosstab(metadata_labeled_df['cluster'], metadata_labeled_df['location'], normalize='index') * 100
percentage_contingency = percentage_contingency.round(2)

# Margins are shares of the table's own grand total. Counting every row
# that has a cluster would also count rows the crosstab dropped (missing
# location), and the margins would no longer add up to the 100 % shown in
# the corner cell.
overall_total = contingency.to_numpy().sum()
row_pct = ((contingency.sum(axis=1) / overall_total) * 100).round(2)
col_pct = ((contingency.sum(axis=0) / overall_total) * 100).round(2)
percentage_contingency['RowTotal'] = row_pct
# Key must match the 'RowTotal' column; any other key is dropped when the
# Series is aligned to the columns and the corner cell becomes NaN.
col_pct['RowTotal'] = 100.0
percentage_contingency.loc['Sum'] = col_pct

print("\nPercentage Contingency Table with Totals:")
display(percentage_contingency)

## Adjusted Pearson (Standardized) Residuals

Resource: https://cscu.cornell.edu/wp-content/uploads/conttableresid.pdf

In [None]:
import seaborn as sns
from scipy.stats import norm
import matplotlib.pyplot as plt

# Adjusted (standardized) Pearson residuals:
#     (observed - expected) / sqrt(expected * (1 - row_share) * (1 - col_share))
# `contingency` and `expected` are whatever the most recently executed
# chi-squared cell left behind.
grand_total = contingency.values.sum()
row_share = contingency.sum(axis=1).values[:, None] / grand_total
col_share = contingency.sum(axis=0).values[None, :] / grand_total
residuals = (contingency - expected) / np.sqrt(
    expected * (1 - row_share) * (1 - col_share)
)

plt.figure(figsize=(6, 8))
# center=0 pins the neutral colour of the diverging map to a residual of 0,
# so red/blue always mean over-/under-representation.
sns.heatmap(residuals.round(2), annot=True, cmap="coolwarm", center=0)
plt.title("Standardized Residuals Heatmap")
plt.show()

# Bonferroni correction: one two-sided test per cell of the table.
n_cells = contingency.shape[0] * contingency.shape[1]
bonferroni_threshold = 0.05 / n_cells
# Misspelled name kept as an alias in case other cells still reference it.
bonferrioni_threshold = bonferroni_threshold
# Adjusted residuals are compared against a standard-normal quantile, so the
# cut-off is a z value, not a chi-squared value.
critical_value = norm.ppf(1 - bonferroni_threshold / 2)
print(f"Bonferroni-corrected significance threshold: {bonferroni_threshold}")
print(f"Critical z-value (two-sided) for Bonferroni correction: {critical_value}")

## Effect Size (Cramer's V)

In [None]:
# Cramer's V = sqrt(chi2 / (n * (k - 1))), where n is the number of
# observations in the table and k is the smaller of its two dimensions.
n = contingency.values.sum()
n_rows, n_cols = contingency.shape
smaller_dim = min(n_rows, n_cols)
cramer_v = np.sqrt(chi2 / (n * (smaller_dim - 1)))
print("Cramer's V:", cramer_v)

## Attendance Analysis (Not Public)

### HS & DL Sessions

In [None]:
# Session Attendance Analysis
if attendance_path is None:
    raise FileNotFoundError("Set attendance_path to run this section.")

session_attendance_df = pd.read_excel(attendance_path, sheet_name='Sheet1')

# Number of rows per session type, and each type's share of all rows
session_counts = session_attendance_df['session'].value_counts()
print("Session Counts:", session_counts)
print("Total:", session_attendance_df['session'].count())
session_percentage = session_counts / session_counts.sum() * 100
print("Session Percentage:")
print(session_percentage)

# Attendance (sum of 'scanned') per session type.
# NOTE(review): assumes 'scanned' holds attendance counts/flags — confirm.
session_type_counts = session_attendance_df.groupby('session')['scanned'].sum()
# The total must be the SUM of 'scanned' (the denominator of the percentages
# below); .count() would report the number of rows instead.
total_scanned = session_attendance_df['scanned'].sum()
print("\nAttendance per Session Counts:", session_type_counts)
print("Total:", total_scanned)
print("Attendance per Session Percentage:")
print(session_type_counts / total_scanned * 100)

### By Location

In [None]:
# Number of rows per location, and each location's share of all rows
location_counts = session_attendance_df['location'].value_counts()
print("Session Counts:", location_counts)
print("Total:", session_attendance_df['location'].count())
location_percentage = location_counts / location_counts.sum() * 100
print("Session Percentage:")
print(location_percentage)

# Attendance (sum of 'scanned') per location. The variable keeps its
# original name even though the grouping here is by location.
# NOTE(review): assumes 'scanned' holds attendance counts/flags — confirm.
session_type_counts = session_attendance_df.groupby('location')['scanned'].sum()
# The total must be the SUM of 'scanned' (the denominator of the percentages
# below); .count() would report the number of rows instead.
total_scanned = session_attendance_df['scanned'].sum()
print("\nAttendance per Location Counts:", session_type_counts)
print("Total:", total_scanned)
print("Attendance per Location Percentage:")
print(session_type_counts / total_scanned * 100)