# Assignment of labs to new enumerators.

In [1]:
# Run configuration for this batch of new enumerators.
# NEW_ENUMERATORS_DATE selects the input file "enumerators_new_<date>.xlsx" and
# names the per-batch output file "assignedlabs_<date>.csv".
NEW_ENUMERATORS_DATE = "2025_10_07" # Date of new enumerators list, in YYYY_MM_DD format
# FIRST_NEW_ENUMS switches the source of the full enumerator list:
#   True  -> read "enumerators_list.xlsx" and look up the existing enum_ids by email
#   False -> read the previously saved "all_enumerators.csv"
FIRST_NEW_ENUMS = False # Set to False if new enumerators have already been added before

In [2]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[1]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill
import os
from lab_assignment import assign_enumerators

In [3]:
# Load datasets
# Lab lists live under config.LABS_LIST, enumerator files under config.ENUMERATORS.
labs_dir = config.LABS_LIST
enumerators_dir = config.ENUMERATORS

labs = pd.read_csv(labs_dir / "LabsList_Randomized_Locations.csv")
existing_assignments = pd.read_csv(enumerators_dir / "assignedlabs.csv")
new_enumerators = pd.read_excel(enumerators_dir / f"enumerators_new_{NEW_ENUMERATORS_DATE}.xlsx")
unassigned_labs = pd.read_csv(labs_dir / "LabsList_Unassigned.csv")

# The full enumerator list comes from the original Excel list on the first run
# and from the CSV written by this notebook on every later run.
all_enumerators = (
    pd.read_excel(enumerators_dir / "enumerators_list.xlsx")
    if FIRST_NEW_ENUMS
    else pd.read_csv(enumerators_dir / "all_enumerators.csv")
)

In [4]:
# Check whether these enumerators are new; abort if any of them is already known.
# Matching is done on the "foldername" column: the new enumerators do not have
# an enum_id yet at this point (it is drawn in a later cell).
duplicates = new_enumerators["foldername"].isin(all_enumerators["foldername"])

if duplicates.any():
    duplicate_foldernames = new_enumerators.loc[duplicates, "foldername"].tolist()
    # Name the offending folders so the input file can be corrected directly.
    raise ValueError(
        "Error: At least some of the enumerators have already been assigned labs: "
        f"{duplicate_foldernames}"
    )

In [5]:
# Create df of new enumerators and assign new enum_ids (randomly chosen, not already in use)
np.random.seed(config.SEED) # For (secure) reproducibility
existing_enum_ids = existing_assignments["enum_id"].unique() # enum_ids that already have labs assigned
n_new_enums = len(new_enumerators)

# NOTE(review): np.arange(10, 99) yields 10..98, so 99 is never drawn although it
# is a 2-digit ID. Left unchanged because 99 may be reserved -- confirm.
possible_ids = np.arange(10, 99)
available_ids = np.setdiff1d(possible_ids, existing_enum_ids) # IDs not already in use

# Also exclude IDs of enumerators that are registered in the enumerator list but
# have no row in assignedlabs.csv; otherwise their ID could be handed out twice.
# The column is absent when the list was read from enumerators_list.xlsx without
# enum_ids, hence the guard. When both sources agree this is a no-op.
if "enum_id" in all_enumerators.columns:
    registered_ids = all_enumerators["enum_id"].dropna().unique()
    available_ids = np.setdiff1d(available_ids, registered_ids)

if len(available_ids) < n_new_enums:
    raise ValueError(f"Not enough available IDs for {n_new_enums} new enumerators") # Check if enough 2-digit IDs available

# Draw without replacement so every new enumerator gets a distinct ID.
new_enumerators["id"] = np.random.choice(available_ids, size=n_new_enums, replace=False) # Assign new enum_ids
# The ID is stored under both "id" and "enum_id"; "id" is dropped again before
# all_enumerators.csv is written (presumably assign_enumerators reads it -- confirm).
new_enumerators["enum_id"] = new_enumerators["id"]

In [6]:
# Update list of all enumerators
if FIRST_NEW_ENUMS:
    # First batch only: the original list has no enum_id yet, so look it up by
    # email in the existing assignments. That table has one row per assigned lab,
    # hence the drop_duplicates() after the merge.
    id_by_email = existing_assignments[["enum_id", "enum_email"]]
    all_enumerators = (
        all_enumerators
        .merge(id_by_email, left_on="email_cleaned", right_on="enum_email", how="left")
        .drop(columns=["enum_email"])
        .drop_duplicates()
    )

# Append the new batch; the helper column "id" duplicates enum_id and is not saved.
all_enumerators = (
    pd.concat([all_enumerators, new_enumerators], ignore_index=True)
    .drop(columns=["id"])
)
all_enumerators.to_csv(config.ENUMERATORS / "all_enumerators.csv", index=False)

In [7]:
# Combining data and defining no of treated and control labs to assign

# Work on a copy so the quota columns do not end up in new_enumerators itself.
enums_for_assignment = new_enumerators.copy()

# Make sure both quota columns exist; missing ones start out as all-NA.
for quota_col in ("n_treated", "n_control"):
    if quota_col in enums_for_assignment.columns:
        continue
    enums_for_assignment[quota_col] = pd.NA

# Replace with desired/2 (research team) if existing
def fill_treat_control(row):
    """Return the ``(n_treated, n_control)`` pair for one enumerator row.

    If the row has a non-missing "desired labs" value, that total is split in
    two: the floor half goes to treated and the remainder to control, so an odd
    total gives control the extra lab. Otherwise the row's own "n_treated" and
    "n_control" values are returned unchanged (``None`` where a key is absent).

    ``row`` only needs a ``.get`` method, e.g. a pandas Series or a dict.
    """
    total = row.get("desired labs", pd.NA)

    # No requested total: keep whatever quotas the row already carries.
    if pd.isna(total):
        return row.get("n_treated"), row.get("n_control")

    treated = total // 2
    control = total - treated  # the remainder absorbs the odd lab
    return treated, control

# Compute the (treated, control) pair row by row; result_type="expand" turns the
# returned tuples into two columns, which then replace the quota columns.
treat_control_split = enums_for_assignment.apply(
    fill_treat_control, axis=1, result_type="expand"
)
enums_for_assignment[["n_treated", "n_control"]] = treat_control_split

In [8]:
# Run assignment
# NOTE(review): n_treat / n_control are presumably the per-enumerator defaults used
# when a row has no n_treated / n_control value -- confirm against
# lab_assignment.assign_enumerators. The seed is hard-coded here and is not
# config.SEED, which the ID draw uses.
assignments, leftover_treatment, leftover_control = assign_enumerators(
    labs_df = unassigned_labs,
    enum_df = enums_for_assignment,
    n_treat = 3,
    n_control = 3,
    seed = 110
)

# Check for duplicate assignments: every labgroupid may appear only once.
# keep=False marks all rows of a duplicated lab, so every affected enumerator is shown.
duplicates = assignments[assignments.duplicated(subset="labgroupid", keep=False)]
if not duplicates.empty:
    print("Duplicate labgroupids found:")
    print(duplicates[["labgroupid", "Lab Group", "enum_firstname", "enum_lastname"]])
    # Stop here: continuing would save these duplicates into the assignment files.
    raise ValueError(
        "Duplicate labgroupids found in the new assignments; "
        "aborting before the assignment files are saved."
    )
print("No duplicates of labgroupid found.")

# Order assignments by enumerator id and labgroupid
assignments = assignments.sort_values(by=["enum_id", "labgroupid"])
assignments = assignments.reset_index(drop=True)

# Make sure both the existing and the new assignments carry an "out_of_sample"
# column; where it is missing it is created with 0 for every row.
for frame in (existing_assignments, assignments):
    if "out_of_sample" not in frame.columns:
        frame["out_of_sample"] = 0

# Reorder columns for saving assignments file
assignments_order = [
    "labgroupid", "Lab Group", "Faculty", "Institute", 
    "Professor", "Email", "Source", "Treatment Status", 
    "enum_id", "enum_firstname", "enum_lastname", 
    "enum_email", "out_of_sample"
]

# Columns of the per-batch file: the ordered columns the new assignments actually have
cols_to_save = [col for col in assignments_order if col in assignments.columns]

# Existing + new assignments. The column list is derived from the combined frame
# so that a column present only in the existing assignments is not silently
# dropped from assignedlabs.csv. When both frames have the same columns, this
# equals cols_to_save.
all_assignments = pd.concat([existing_assignments, assignments]).reset_index(drop=True)
all_cols_to_save = [col for col in assignments_order if col in all_assignments.columns]

# Combine leftover labs
unassigned_labs = pd.concat([leftover_treatment, leftover_control]).reset_index(drop=True)

# Check that no labgroupid is in both assignments and leftover labs.
# This runs BEFORE any file is written: all three files below are overwritten in
# place, so an inconsistent result must not reach the disk.
assigned_ids = set(assignments["labgroupid"])
unassigned_ids = set(unassigned_labs["labgroupid"])
overlap = assigned_ids & unassigned_ids
if overlap:
    print("Error: Some labgroupids are in both assignments and unassigned_labs:")
    print(overlap)
    raise ValueError(
        "Some labgroupids are in both assignments and unassigned_labs; "
        "no assignment files were written by this step."
    )
print("All labgroupids are correctly assigned or unassigned.")

# Save the assignments file for this batch
assignments.to_csv(config.ENUMERATORS / f"assignedlabs_{NEW_ENUMERATORS_DATE}.csv", index=False, columns=cols_to_save)

# Save the combined assignments (overwrites the file the existing assignments were read from)
all_assignments.to_csv(config.ENUMERATORS / "assignedlabs.csv", index = False, columns = all_cols_to_save)

# Save unassigned labs (overwrites the input list with the labs still left over)
unassigned_labs.to_csv(config.LABS_LIST / "LabsList_Unassigned.csv", index=False)

No duplicates of labgroupid found.
All labgroupids are correctly assigned or unassigned.


In [9]:
# Create assignments file for each enumerator

# Rename columns for clarity (both renames in a single call)
assignments = assignments.rename(
    columns={
        "Professor": "Lab Responsible Person",
        "Source": "Website",
    }
)

# Columns to include in the enumerator's file, in the order they appear in the sheet
cols_to_include = [
    "labgroupid",
    "Lab Group",
    "Faculty",
    "Institute",
    "Lab Responsible Person",
    "Email",
    "Website",
    "Treatment Status",
]

# Fill colour per (lower-cased) treatment status
fill_colors = dict(
    treatment="FFFF00",  # Yellow
    control="FFFFFF",    # No color (white)
)

# Position of the status column inside each worksheet row. The sheet is written
# from cols_to_include without an index column, so list position == cell position.
status_col_idx = cols_to_include.index("Treatment Status")

# Write one formatted "lab_assignment.xlsx" per enumerator into that enumerator's
# "<foldername>_data" folder below config.SWITCHDRIVE_ROOT.
# NOTE(review): the folder is not created here, so it is expected to exist already.
for enum_id, enum_data in assignments.groupby("enum_id"):

    # Get enumerator info (all rows of a group belong to the same enumerator)
    name = enum_data["enum_foldername"].iloc[0]
    folder_name = f"{name}_data"

    # Columns to include
    labs_for_enum = enum_data[cols_to_include].copy()

    # Create excel path
    filename = os.path.join(config.SWITCHDRIVE_ROOT, folder_name, "lab_assignment.xlsx")

    # Save first without formatting
    labs_for_enum.to_excel(filename, index=False)

    # Load workbook and select active sheet
    wb = load_workbook(filename)
    ws = wb.active
    ws.title = "Lab Assignments"

    # Bold header row
    for cell in ws[1]:
        cell.font = Font(bold=True)

    # Adjust column widths to the longest cell text plus a small padding
    for col in ws.columns:
        column = col[0].column_letter  # Get the column name
        max_length = max(
            (len(str(cell.value)) for cell in col if cell.value is not None),
            default=0,  # column without any value
        )
        ws.column_dimensions[column].width = max_length + 2

    # Fill colors based on treatment status (only the status cell is coloured)
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=ws.min_column, max_col=ws.max_column):
        status_cell = row[status_col_idx]
        # Normalise case and surrounding whitespace before the lookup
        status = str(status_cell.value).strip().lower()
        if status in fill_colors:
            status_cell.fill = PatternFill(start_color=fill_colors[status], end_color=fill_colors[status], fill_type="solid")

    # Save workbook
    wb.save(filename)