# Randomization of new labs at institute into T/C if not already randomized

In [1]:
# Set date of new lab list
NEWLAB_DATE = "2025_12_09" # Date of running this code, in YYYY_MM_DD format

In [2]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[1]
sys.path.append(str(CODE_ROOT))
import config

In [3]:
# Load datasets
institute_labs = pd.read_excel(config.LABS_LIST / f"institute_labs.xlsx")
existing_labs = pd.read_csv(config.LABS_LIST / "LabsList_Randomized.csv")
existing_labs_locations = pd.read_csv(config.LABS_LIST / "LabsList_Randomized_Locations.csv")

In [4]:
# Merge datasets to find matches
merged = institute_labs.merge(
    existing_labs_locations,
    how="left",
    on="Professor",
    indicator=True,
    suffixes=("_new", "_old")
)

# Separate matched and unmatched labs
already_assigned = merged[merged["_merge"] == "both"].copy()
new_labs = merged[merged["_merge"] == "left_only"].copy()

print(f"Already assigned labs: {already_assigned.shape[0]}")
print(f"New labs needing randomization: {new_labs.shape[0]}")

Already assigned labs: 5
New labs needing randomization: 2


In [5]:
# Update emails for already assigned labs
existing_labs_locations = existing_labs_locations.merge(
    institute_labs[["Professor", "Email"]],
    on="Professor",
    how="left",
    suffixes=("", "_new")
)

existing_labs_locations["Email"] = existing_labs_locations["Email_new"].fillna(existing_labs["Email"])
existing_labs_locations = existing_labs_locations.drop(columns=["Email_new"])

In [6]:
# Set random labgroupid for new labs (exclude existing labgroupids)
np.random.seed(config.SEED)
existing_lab_ids = existing_labs["labgroupid"].unique()
n_labs = len(new_labs)
possible_ids = np.arange(100, 999)
available_ids = np.setdiff1d(possible_ids, existing_lab_ids) # Available IDs excluding existing ones
new_labs["labgroupid"] = np.random.choice(available_ids, size=n_labs, replace=False)

# Rename columns
new_labs = new_labs.rename(columns={
    "Lab Group_new": "Lab Group",
    "Faculty_new": "Faculty",
    "Institute_new": "Institute",
    "Professor": "Professor",
    "Email_new": "Email",
    "Source_new": "Source",
    "Location SCH_new": "Location SCH",
    "Location BOT_new": "Location BOT"
})

# Reorder columns
order = [
    "labgroupid", "Lab Group", "Faculty", "Institute", 
    "Professor", "Email", "Source", "Location SCH", "Location BOT"
]
new_order = [col for col in order if col in new_labs.columns]
new_labs = new_labs[new_order]

In [7]:
# Randomize lab groups into treatment and control (50/50, stratified by faculty)
np.random.seed(912) # date of running code in MMDD format
def stratified_randomize(df, group_col, treatment_col="Treatment Status"):
    """Randomly assign 50/50 treatment and control within each group,
       randomly assigning the extra lab if group size is odd."""
    def assign(group):
        n = len(group)
        labels = ["treatment"] * (n // 2) + ["control"] * (n // 2)
        if n % 2 == 1:  # randomly assign extra lab if odd
            labels.append(np.random.choice(["treatment", "control"]))
        np.random.shuffle(labels)
        group[treatment_col] = labels
        return group
    return df.groupby(group_col, group_keys=False).apply(assign)

new_labs = stratified_randomize(new_labs, group_col="Faculty")

# See how many treatment and control labs
print(new_labs["Treatment Status"].value_counts())

Treatment Status
treatment    1
control      1
Name: count, dtype: int64


  return df.groupby(group_col, group_keys=False).apply(assign)


In [8]:
# Save the newly randomized labs list (no locations)
cols_to_save = [col for col in new_labs.columns if col not in ["Location SCH", "Location BOT"]]
new_labs.to_csv(config.LABS_LIST / f"LabsList_Randomized_New_{NEWLAB_DATE}.csv", index=False, columns=cols_to_save)

# Save the assigned labs list with locations
new_labs.to_csv(config.LABS_LIST / f"LabsList_Randomized_Locations_New_{NEWLAB_DATE}.csv", index=False)

In [9]:
# Append newly randomized labs to existing labs and save
updated_labs = pd.concat([existing_labs_locations, new_labs], ignore_index=True)

# Save (no locations)
cols_to_save = [col for col in updated_labs.columns if col not in ["Location SCH", "Location BOT"]]
updated_labs.to_csv(config.LABS_LIST / f"LabsList_Randomized.csv", index=False, columns=cols_to_save)

# Save (with locations)
updated_labs.to_csv(config.LABS_LIST / f"LabsList_Randomized_Locations.csv", index=False)

In [10]:
# Save the institute labs list (both existing and newly randomized)
institute_randomized = updated_labs[
    updated_labs["Professor"].isin(institute_labs["Professor"])
].copy()

institute_randomized.to_csv(config.LABS_LIST / f"institute_labs_randomized.csv", index=False)