# Update the finalized list of sample labs

In [14]:
# Set up: imports, project path, and warning filters used by the cells below
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# CODE_ROOT is two directory levels above the current working directory, so its
# value depends on the directory the kernel was started in.
CODE_ROOT = Path.cwd().parents[1]
# Add CODE_ROOT to the import search path before importing the project `config`
# module (presumably config.py lives in CODE_ROOT - TODO confirm).
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
import os
import warnings

# Suppress irrelevant conditional formatting warnings when reading Excel files
# (only warnings whose message starts with this text are silenced).
warnings.filterwarnings("ignore", message="Conditional Formatting extension is not supported")

In [15]:
# Load datasets
# Both input files are read from the enumerators directory defined in config.
enumerators_dir = config.ENUMERATORS
existing_assignments = pd.read_csv(enumerators_dir / "assignedlabs.csv")
all_enumerators = pd.read_csv(enumerators_dir / "all_enumerators.csv")

In [16]:
# Merge to get enum_foldername
# Left join keeps every assignment row and attaches the enumerator's folder name.
# validate="many_to_one" makes pandas raise a MergeError if an enum_id appears more
# than once in all_enumerators, instead of silently duplicating assignment rows.
assignments = existing_assignments.merge(
    all_enumerators[["enum_id", "foldername"]],
    on="enum_id",
    how="left",
    validate="many_to_one",
)

In [17]:
# Create file containing all checked labs
# For every enumerator, read the "Lab Assignments" sheet of their
# check_lab_assignment.xlsx workbook and stack the relevant columns.

# Columns kept from each enumerator's workbook (enum_id is added below)
checked_columns = [
    "labgroupid", "Visited?",
    "Contact person (if different)",
    "Contact email (if different)",
    "Comments?",
    "enum_id"
]
checked_all = []

for enum_id, enum_data in assignments.groupby("enum_id"):

    # Get enumerator info (foldername is missing if the enum_id had no match
    # in all_enumerators during the left merge)
    name = enum_data["foldername"].iloc[0]
    if pd.isna(name):
        raise ValueError(
            f"No foldername found for enum_id {enum_id}; cannot locate the checked file."
        )
    folder_name = f"{name}_data"

    # Create excel path
    filename = os.path.join(config.SWITCHDRIVE_ROOT, folder_name, "check_lab_assignment.xlsx")

    # Load the file as dataframe
    checked_file = pd.read_excel(filename, sheet_name="Lab Assignments")

    # Tag every row with the enumerator it came from so it can be merged back
    checked_file["enum_id"] = enum_id
    checked_all.append(checked_file[checked_columns])

# Combine all checked files
# Fail loudly when nothing was loaded: otherwise combined_checked would be left
# undefined (or stale from an earlier run) and the next cell would misbehave.
if not checked_all:
    raise ValueError("No checked lab assignment files were loaded (assignments has no enumerators).")
combined_checked = pd.concat(checked_all, ignore_index=True)

In [18]:
# Merge with assignments on labgroupid and enum_id
# Left join: all rows of assignments are kept; the enumerator's answers are
# attached where the checked data has a row with the same key pair.
# Overlapping non-key columns from the checked data get the "_checked" suffix.
merge_keys = ["labgroupid", "enum_id"]
updated_assignments = pd.merge(
    assignments,
    combined_checked,
    how="left",
    on=merge_keys,
    suffixes=("", "_checked"),
)

In [19]:
# Check how many enumerators have updated their file
# A row counts as updated when the contact person or comments field is filled in,
# or when "Visited?" disagrees with out_of_sample ("Yes" with 1, or "No" with 0).

visited = updated_assignments["Visited?"]
out_flag = updated_assignments["out_of_sample"]

visited_yes_out1 = visited.eq("Yes") & out_flag.eq(1)
visited_no_out0 = visited.eq("No") & out_flag.eq(0)
contact_or_comments = (
    updated_assignments["Contact person (if different)"].notna()
    | updated_assignments["Comments?"].notna()
)

# enum_id 84 is always treated as updated (they informed us separately)
informed_separately = updated_assignments["enum_id"].eq(84)

# Combine all conditions into one row-level flag
updated_assignments["has_update"] = (
    contact_or_comments | visited_yes_out1 | visited_no_out0 | informed_separately
)

# Per enumerator: True if at least one of their rows has an update
checked_summary = updated_assignments.groupby("enum_id")["has_update"].any()
num_checked = checked_summary.sum()
print(f"{num_checked} enumerators have at least one update in their file.")

# Enumerators who have NOT checked anything
unchecked_enum = checked_summary.loc[~checked_summary].index.tolist()
print(f"{len(unchecked_enum)} enumerators have not checked their file yet.")
print("Enumerators:", unchecked_enum)


26 enumerators have at least one update in their file.
8 enumerators have not checked their file yet.
Enumerators: [14, 22, 46, 66, 67, 76, 83, 92]


In [20]:
# Replace one lab with correct info (comments from enumerator) - make list that can be easily modified
# Every labgroupid listed here has its "Visited?" value overwritten with "No".
correction_list = [409] # labgroupids to correct

for lab_to_update in correction_list:
    mask = updated_assignments["labgroupid"] == lab_to_update
    if not mask.any():
        # Do not report success when no row matched (e.g. a typo in the id)
        print(f"WARNING: labgroupid {lab_to_update} not found; nothing updated")
        continue
    updated_assignments.loc[mask, "Visited?"] = "No"
    print(f"Updated labgroupid {lab_to_update}: Visited? to No")

Updated labgroupid 409: Visited? to No


In [21]:
# Check how many labs in our sample (Visited? = Yes)
# Build the "visited" mask once and reuse it for the treatment/control split.
is_visited = updated_assignments["Visited?"] == "Yes"
treatment_status = updated_assignments["Treatment Status"]

num_visited = int(is_visited.sum())
print(f"{num_visited} labs have been marked as visited in total.")

# Check how many treatment and control labs in our sample
num_treatment_visited = int((is_visited & (treatment_status == "treatment")).sum())
num_control_visited = int((is_visited & (treatment_status == "control")).sum())
print(f"{num_treatment_visited} treatment labs have been marked as visited.")
print(f"{num_control_visited} control labs have been marked as visited.")

166 labs have been marked as visited in total.
83 treatment labs have been marked as visited.
83 control labs have been marked as visited.


In [22]:
# Rename columns and drop unneeded columns in one chain
# NOTE: this reassigns updated_assignments; running the cell a second time without
# re-running the cells above raises a KeyError because has_update is already gone.
updated_assignments = (
    updated_assignments
    .rename(columns={
        "out_of_sample": "out_of_sample_original",
        "Visited?": "bl_visit",
    })
    .drop(columns=["has_update"])
)

In [23]:
# Create dataset of only those labs marked as visited (sample labs)
in_sample = updated_assignments["bl_visit"].eq("Yes")

# Keep the visited rows and drop columns not needed
# (drop returns a new DataFrame, so updated_assignments itself is not modified)
sample_labs = updated_assignments.loc[in_sample].drop(
    columns=["bl_visit", "out_of_sample_original"]
)

In [24]:
# Save the sample labs
# The same file is written to the raw sample directory and to its backup directory.
for sample_dir in (config.BL_RAW_SAMPLE, config.BL_RAW_SAMPLE_BACKUP):
    sample_labs.to_csv(sample_dir / "final_sample.csv", index=False)

In [25]:
# Create dataset of labs not in sample (Visited? = No)
# Only rows whose bl_visit is exactly "No" are selected; rows with a missing
# bl_visit are not included here.
not_visited = updated_assignments["bl_visit"].eq("No")
out_of_sample_labs = updated_assignments.loc[not_visited].copy()

In [26]:
# Save the out of sample labs, and the full list of all labs (with bl_visit),
# into the labs list directory defined in config.
labs_list_dir = config.LABS_LIST
out_of_sample_labs.to_csv(labs_list_dir / "out_of_sample_labs.csv", index=False)
updated_assignments.to_csv(labs_list_dir / "final_all_labs.csv", index=False)