# Reassignment of labs to enumerators, following discovery of new non-responsive or non-existent lab.

In [1]:
# Set date of reassignment
REASSIGNMENT_DATE = "2025_10_02" # Date of running this code, in YYYY_MM_DD format

In [2]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[1]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill
import os
from lab_assignment import assign_enumerators

In [3]:
# Load datasets
existing_assignments = pd.read_csv(config.ENUMERATORS / "assignedlabs.csv")
unassigned_labs = pd.read_csv(config.LABS_LIST / "LabsList_Unassigned.csv")
all_enumerators = pd.read_csv(config.ENUMERATORS / "all_enumerators.csv")
non_existent_responsive_labs = pd.read_excel(config.LABS_LIST / "non_existent_responsive_labs.xlsx")

In [4]:
# Flag labs as out of sample and flag enumerators in need of reassignment

# Add a new column if it doesn’t already exist
if "out_of_sample" not in existing_assignments.columns:
    existing_assignments["out_of_sample"] = 0

# Flag non-responsive or non-existent labs
existing_assignments.loc[
    existing_assignments["labgroupid"].isin(non_existent_responsive_labs["labgroupid"]),
    "new_out_of_sample"] = 1
    
# Flag enumerators that need reassigning and how many new labs they need
new_reassignments = existing_assignments[
    (existing_assignments["out_of_sample"] == 0) & (existing_assignments["new_out_of_sample"] == 1)]
    
enum_ids_to_reassign = (
    new_reassignments
    .groupby(["enum_id", "Treatment Status"])
    .size()
    .reset_index(name="count")
    .pivot(index="enum_id", columns="Treatment Status", values="count")
    .fillna(0)
    .reset_index()
)

# Rename columns for clarity
enum_ids_to_reassign = enum_ids_to_reassign.rename(
    columns={
        "treatment": "n_treated",
        "control": "n_control"
    }
)

# Create necessary id variable
all_enumerators["id"] = all_enumerators["enum_id"]

# Merge enums_to_reassign with the all_enumerators
enums_to_reassign = all_enumerators.merge(enum_ids_to_reassign,
        left_on="enum_id", right_on="enum_id", how="inner")

# Replace old out of sample with new values
existing_assignments["out_of_sample"] = existing_assignments["new_out_of_sample"]

In [6]:
# Check whether have any new reassignments to do
if new_reassignments.empty:
    raise ValueError("Error: No new reassignments to do.")

ValueError: Error: No new reassignments to do.

In [5]:
# Defining no of treated and control labs to assign

enums_for_assignment = enums_to_reassign.copy()

# Create n_treat and n_control columns if don't already exist
for col in ["n_treated", "n_control"]:
    if col not in enums_for_assignment:
        enums_for_assignment[col] = pd.NA

# Replace with 0 if missing
def fill_treat_control(row):
    n_t = row.get("n_treated", 0)
    n_c = row.get("n_control", 0)

    # Fill missing values with 0
    n_t = n_t if pd.notna(n_t) else 0
    n_c = n_c if pd.notna(n_c) else 0

    return n_t, n_c

enums_for_assignment[["n_treated", "n_control"]] = (
    enums_for_assignment.apply(fill_treat_control, axis=1, result_type="expand")
)

ValueError: Columns must be same length as key

In [6]:
# Run assignment
assignments, leftover_treatment, leftover_control = assign_enumerators(
    labs_df = unassigned_labs,
    enum_df = enums_for_assignment,
    n_treat = 3,
    n_control = 3,
    seed = 110
)

# Check for duplicate assignments
duplicates = assignments[assignments.duplicated(subset="labgroupid", keep=False)]
if not duplicates.empty:
    print("Duplicate labgroupids found:")
    print(duplicates[["labgroupid", "Lab Group", "enum_firstname", "enum_lastname"]])
else:
    print("No duplicates of labgroupid found.")

# Order assignments by enumerator id and labgroupid
assignments = assignments.sort_values(by=["enum_id", "labgroupid"]).reset_index(drop=True)

# Add a new column if it doesn’t already exist
if "out_of_sample" not in assignments.columns:
    assignments["out_of_sample"] = 0

# Reorder columns for saving assignments file
assignments_order = [
    "labgroupid", "Lab Group", "Faculty", "Institute", 
    "Professor", "Email", "Source", "Treatment Status", 
    "enum_id", "enum_firstname", "enum_lastname", 
    "enum_email", "out_of_sample"
]

# Save the assignments file
cols_to_save = [col for col in assignments_order if col in assignments.columns]
assignments.to_csv(config.ENUMERATORS / f"reassignedlabs_{REASSIGNMENT_DATE}.csv", index=False, columns=cols_to_save)

all_assignments = pd.concat([existing_assignments, assignments]).reset_index(drop=True)
all_assignments.to_csv(config.ENUMERATORS / "assignedlabs.csv", index = False, columns = cols_to_save)

# Combine leftover labs
unassigned_labs = pd.concat([leftover_treatment, leftover_control]).reset_index(drop=True)

# Check that no labgroupid is in both assignments and leftover labs
assigned_ids = set(assignments["labgroupid"])
unassigned_ids = set(unassigned_labs["labgroupid"])
overlap = assigned_ids & unassigned_ids
if overlap:
    print("Error: Some labgroupids are in both assignments and unassigned_labs:")
    print(overlap)
else:
    print("All labgroupids are correctly assigned or unassigned.")

# Save unassigned labs
unassigned_labs.to_csv(config.LABS_LIST / "LabsList_Unassigned.csv", index=False)

No duplicates of labgroupid found.
All labgroupids are correctly assigned or unassigned.


In [7]:
# Create reassignments file for each enumerator

# Rename columns for clarity
assignments = assignments.rename(columns={"Professor": "Lab Responsible Person"})
assignments = assignments.rename(columns={"Source": "Website"})

# Columns to include in the enumerator's file
cols_to_include = [
    "labgroupid", "Lab Group", "Faculty", "Institute", 
    "Lab Responsible Person", "Email", "Website", "Treatment Status"
]

# Color treatment yellow and control no color
fill_colors = {
    "treatment": "FFFF00",  # Yellow
    "control": "FFFFFF"     # No color (white)
}

for enum_id, enum_data in assignments.groupby("enum_id"):

    # Get enumerator info
    id = enum_data["enum_id"].iloc[0]
    name = enum_data["enum_foldername"].iloc[0]
    folder_name = f"{name}_data"

    # Columns to include
    labs_for_enum = enum_data[cols_to_include].copy()

    # Create excel path
    filename = os.path.join(config.SWITCHDRIVE_ROOT, folder_name, f"new_lab_assignment_{REASSIGNMENT_DATE}.xlsx")

    # Save first without formatting
    labs_for_enum.to_excel(filename, index=False)

    # Load workbook and select active sheet
    wb = load_workbook(filename)
    ws = wb.active
    ws.title = "Lab Assignments"

    # Bold header row
    for cell in ws[1]:
        cell.font = Font(bold=True)

    # Adjust column widths
    for col in ws.columns:
        max_length = 0
        column = col[0].column_letter  # Get the column name
        for cell in col:
            try:
                if cell.value:
                    max_length = max(max_length, len(str(cell.value)))
            except:
                pass
        adjusted_width = (max_length + 2)
        ws.column_dimensions[column].width = adjusted_width

    # Fill colors based on treatment status
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=ws.min_column, max_col=ws.max_column):
        status_cell = row[cols_to_include.index("Treatment Status")]
        status = str(status_cell.value).lower()
        if status in fill_colors:
            status_cell.fill = PatternFill(start_color=fill_colors[status], end_color=fill_colors[status], fill_type="solid")
    
    # Save workbook
    wb.save(filename)