# Randomization of lab groups into control and treated groups

In [2]:
# Set date of new lab list
# This value is interpolated into the input file name (newlab_<date>.xlsx) and into
# the names of the per-batch output files written further down.
NEWLAB_DATE = "2025_10_28" # Date of running this code, in YYYY_MM_DD format

In [3]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# CODE_ROOT is the directory two levels above the current working directory.
# NOTE(review): this depends on where the kernel was started - confirm the notebook
# is always run from its own folder.
CODE_ROOT = Path.cwd().parents[1]
# Make CODE_ROOT importable; this must happen before `import config` below.
sys.path.append(str(CODE_ROOT))
# Project configuration; this notebook reads config.LABS_LIST and config.SEED from it.
import config

In [4]:
# Load datasets
# All inputs live in the same folder, so resolve it once.
labs_dir = config.LABS_LIST

# New batch of labs to randomize (Excel file named after the batch date)
new_labs_path = labs_dir / f"newlab_{NEWLAB_DATE}.xlsx"
labs = pd.read_excel(new_labs_path)

# Lists produced by earlier runs: randomized labs (without / with locations)
# and the unassigned labs
existing_labs = pd.read_csv(labs_dir / "LabsList_Randomized.csv")
existing_labs_locations = pd.read_csv(labs_dir / "LabsList_Randomized_Locations.csv")
existing_unassigned_labs = pd.read_csv(labs_dir / "LabsList_Unassigned.csv")

In [5]:
# Build the two exclusion criteria once and reuse them for counting and flagging
is_enumerator_lab = labs["Enumerator lab"] == 1
has_exclusion_notice = labs["Exclusion Notice"].notna()

# Labs in which enumerators work
count_enum1 = is_enumerator_lab.sum()
print(f"Number of labs where enumerators work: {count_enum1}")

# Labs that carry an exclusion reason
count_exclusion_notice = has_exclusion_notice.sum()
print(f"Number of labs with an exclusion notice: {count_exclusion_notice}")

# A lab is excluded as soon as either criterion applies
labs["Exclude"] = is_enumerator_lab | has_exclusion_notice
count_exclude = labs["Exclude"].sum()
print(f"Number of labs to exclude: {count_exclude}")

# Keep only the labs that are not flagged (copy so later edits don't touch `labs`)
labs_sample = labs.loc[~labs["Exclude"]].copy()


Number of labs where enumerators work: 0
Number of labs with an exclusion notice: 0
Number of labs to exclude: 0


In [6]:
# Clean the labs data
# This cell fills in the lab group name, prints a set of data-quality counts
# (they only report, they do not stop execution) and drops helper columns.

# Create a lab group name column
# Fallback order: "Lab Group" -> "Lab Group alt" -> "Professor"
labs_sample.loc[:, "Lab Group"] = (
    labs_sample["Lab Group"]
    .fillna(labs_sample["Lab Group alt"])
    .fillna(labs_sample["Professor"])
)

# Check for missing lab groups names
labs_missing_name = labs_sample[labs_sample["Lab Group"].isna()]
print(f"Lab groups without a name: {len(labs_missing_name)}")

# Check for duplicate lab group names
# keep=False flags every member of a duplicate set, not only the repeats
duplicate_labs = labs_sample[labs_sample.duplicated(subset=["Lab Group"], keep=False)]
print(f"Duplicate lab group names: {len(duplicate_labs)}")

# Check for duplicate professors
duplicate_profs = labs_sample[labs_sample.duplicated(subset=["Professor"], keep=False)]
print(f"Duplicate professors: {len(duplicate_profs)}")

# Check for missing lab group professors
labs_missing_prof = labs_sample[labs_sample["Professor"].isna()]
print(f"Lab groups without a prof: {len(labs_missing_prof)}")

# Check for missing lab group emails
labs_missing_email = labs_sample[labs_sample["Email"].isna()]
print(f"Lab groups without an email: {len(labs_missing_email)}")

# Clean faculty names
# Only the exact string "MeF, MNF" is recoded to the joint-faculty label
labs_sample.loc[labs_sample["Faculty"].isin(["MeF, MNF"]), "Faculty"] = "Joint MNF/MeF"
print(labs_sample["Faculty"].value_counts(dropna=False))

# Check for missing lab group websites
labs_missing_website = labs_sample[labs_sample["Source"].isna()]
print(f"Lab groups without a website: {len(labs_missing_website)}")

# Drop unnecessary columns
# First remove columns that are empty for every lab in this batch ...
labs_sample = labs_sample.dropna(axis=1, how="all")
# ... then the helper columns. "Lab Group alt" has already been removed by the
# dropna above whenever no lab in the batch has an alternative name, so a missing
# column must not raise a KeyError here.
labs_sample = labs_sample.drop(columns=["Lab Group alt", "Exclude"], errors="ignore")

Lab groups without a name: 0
Duplicate lab group names: 0
Duplicate professors: 0
Lab groups without a prof: 0
Lab groups without an email: 0
Faculty
MeF    1
Name: count, dtype: int64
Lab groups without a website: 0


In [7]:
# Draw a random, previously unused id for every new lab group
np.random.seed(config.SEED)

# Ids already present in the existing randomized list must not be reused
existing_lab_ids = existing_labs["labgroupid"].unique()

# NOTE(review): np.arange(100, 999) stops at 998, so 999 is never handed out -
# confirm whether that is intended. Left unchanged because altering the pool
# would change the seeded draws.
possible_ids = np.arange(100, 999)
available_ids = np.setdiff1d(possible_ids, existing_lab_ids)

# Sample without replacement so that no two new labs share an id
n_labs = len(labs_sample)
labs_sample["labgroupid"] = np.random.choice(available_ids, size=n_labs, replace=False)

# Put the columns into the preferred order, skipping any that are absent
order = [
    "labgroupid",
    "Lab Group",
    "Faculty",
    "Institute",
    "Professor",
    "Email",
    "Source",
    "Location SCH",
    "Location BOT",
]
present_columns = set(labs_sample.columns)
new_order = [col for col in order if col in present_columns]
labs_sample = labs_sample[new_order]

In [None]:
# Randomize lab groups into treatment and control (50/50, stratified by faculty)
np.random.seed(2810) # date of running code in DDMM format (28 October, cf. NEWLAB_DATE)
def stratified_randomize(df, group_col, treatment_col="Treatment Status"):
    """Assign "treatment"/"control" labels 50/50 within each stratum.

    Within every stratum defined by ``group_col`` half of the rows are labelled
    "treatment" and half "control". When a stratum has an odd number of rows,
    the label of the extra row is drawn at random.

    Rows whose ``group_col`` value is missing are randomized together as a
    stratum of their own; a plain ``groupby`` would silently leave them out.

    Randomness comes from NumPy's global random state, so call
    ``np.random.seed(...)`` beforehand for a reproducible assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per unit to randomize. It is not modified.
    group_col : str or list of str
        Column(s) defining the strata.
    treatment_col : str, default "Treatment Status"
        Name of the column that receives the assignment.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in its original row order and with its original index,
        plus ``treatment_col``.
    """
    randomized = df.copy()

    # Work on a 0..n-1 index so that the group members' index values are row
    # positions; this keeps the assignment correct even if df has duplicate
    # or non-default index labels.
    by_position = df.reset_index(drop=True)
    assignments = np.empty(len(by_position), dtype=object)

    # sort=True visits strata in sorted key order (missing keys last), which
    # fixes the order in which random draws are consumed.
    for _, members in by_position.groupby(group_col, dropna=False, sort=True):
        n = len(members)
        labels = ["treatment"] * (n // 2) + ["control"] * (n // 2)
        if n % 2 == 1:  # odd stratum: draw the label of the extra row at random
            labels.append(np.random.choice(["treatment", "control"]))
        np.random.shuffle(labels)
        assignments[members.index.to_numpy()] = labels

    randomized[treatment_col] = assignments
    return randomized

# Run the randomization, using the faculty as stratum
labs_sample = stratified_randomize(labs_sample, group_col="Faculty")

# Overall number of treatment and control labs
status_counts = labs_sample["Treatment Status"].value_counts()
print(status_counts)

# Number of treatment and control labs within each faculty
status_by_faculty = labs_sample[["Treatment Status", "Faculty"]].value_counts()
print(status_by_faculty)

# Write the newly assigned labs without the location columns ...
location_cols = ["Location SCH", "Location BOT"]
cols_to_save = [col for col in labs_sample.columns if col not in location_cols]
labs_sample.to_csv(
    config.LABS_LIST / f"LabsList_Randomized_New_{NEWLAB_DATE}.csv",
    index=False,
    columns=cols_to_save,
)

# ... and once more with every column, locations included
labs_sample.to_csv(
    config.LABS_LIST / f"LabsList_Randomized_Locations_New_{NEWLAB_DATE}.csv",
    index=False,
)

Treatment Status
treatment    1
Name: count, dtype: int64
Treatment Status  Faculty
treatment         MeF        1
Name: count, dtype: int64


  return df.groupby(group_col, group_keys=False).apply(assign)


In [10]:
# Append the newly randomized labs to the lists from earlier runs.
# NOTE(review): the combined frames are later written back to the same files the
# existing lists were read from, so running the notebook twice for the same batch
# would append these labs a second time - confirm this is guarded elsewhere.
new_labs = labs_sample

# Existing randomized labs (version with locations) + new labs
all_randomized_labs = pd.concat(
    [existing_labs_locations, new_labs],
    ignore_index=True,
)

# Existing unassigned labs + new labs
# NOTE(review): the new labs already carry an id and a treatment status at this
# point - verify that they belong in the unassigned list.
all_unassigned_labs = pd.concat(
    [existing_unassigned_labs, new_labs],
    ignore_index=True,
)

In [11]:
# Write the combined lists back to the labs-list folder
output_dir = config.LABS_LIST

# Randomized labs without the location columns
location_cols = ("Location SCH", "Location BOT")
cols_to_save = [col for col in all_randomized_labs.columns if col not in location_cols]
all_randomized_labs.to_csv(
    output_dir / "LabsList_Randomized.csv",
    index=False,
    columns=cols_to_save,
)

# Randomized labs with every column, locations included
all_randomized_labs.to_csv(output_dir / "LabsList_Randomized_Locations.csv", index=False)

# Unassigned labs
all_unassigned_labs.to_csv(output_dir / "LabsList_Unassigned.csv", index=False)
