# Update the finalized list of sample labs with the labs that were visited in November

In [1]:
# Set up: third-party imports, project path bootstrap, and warning filters
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# CODE_ROOT is the directory two levels above the current working directory,
# so it depends on where the kernel was started.
CODE_ROOT = Path.cwd().parents[1]
# Make CODE_ROOT importable so that the project-local `config` import below can
# resolve (assumes config.py lives directly under CODE_ROOT — TODO confirm).
sys.path.append(str(CODE_ROOT))
# `config` supplies the directory paths used by the cells below
# (config.ENUMERATORS, config.BL_RAW_SAMPLE, config.LABS_LIST, ...).
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
import os
import warnings

# Suppress irrelevant conditional formatting warnings when reading Excel files.
# Only warnings whose text starts with this message are ignored; all others still show.
warnings.filterwarnings("ignore", message="Conditional Formatting extension is not supported")

In [2]:
# Read the two enumerator CSVs from config.ENUMERATORS, in this order:
#   existing_assignments <- assignedlabs.csv
#   all_enumerators      <- all_enumerators.csv
existing_assignments, all_enumerators = (
    pd.read_csv(config.ENUMERATORS / csv_name)
    for csv_name in ("assignedlabs.csv", "all_enumerators.csv")
)

In [3]:
# Attach each enumerator's `foldername` to their lab assignments.
# The enumerator roster acts as a lookup table here, so `enum_id` must be unique
# in it: validate="many_to_one" makes pandas raise a MergeError when it is not,
# instead of letting the left join silently duplicate assignment rows (which
# would inflate every count computed further down).
assignments = existing_assignments.merge(
    all_enumerators[["enum_id", "foldername"]],
    on="enum_id",
    how="left",  # keep every assignment, even if its enumerator is missing from the roster
    validate="many_to_one",
)

In [4]:
# Read the "*_original.csv" snapshots.
# Of these three frames, only `original_updated_assignments` is referenced again
# in the cells of this notebook.

# Original final sample (from config.BL_RAW_SAMPLE)
original_final_sample = pd.read_csv(config.BL_RAW_SAMPLE / "final_sample_original.csv")

# Original out-of-sample labs and original list of all labs (both from config.LABS_LIST)
original_out_of_sample_labs, original_updated_assignments = (
    pd.read_csv(config.LABS_LIST / csv_name)
    for csv_name in ("out_of_sample_labs_original.csv", "final_all_labs_original.csv")
)

In [5]:
# Build one table holding every checked lab.

# Columns carried over from a checked file
checked_columns = [
    "labgroupid",
    "Visited?",
    "Contact person (if different)",
    "Contact email (if different)",
    "Comments?",
    "enum_id",
]

# Read the November check workbook (sheet "Sheet1") as a dataframe
nov_checked_file = pd.read_excel(
    config.ENUMERATORS / "check_lab_assignment_nov_labs.xlsx",
    sheet_name="Sheet1",
)

# Collect the relevant columns of each checked file; only the November file for now
checked_all = [nov_checked_file[checked_columns]]

# Stack the collected frames under a fresh 0..n-1 index
combined_checked = pd.concat(checked_all, ignore_index=True)

In [6]:
# Keep only the assignments that appear in the checked file, matching on both
# labgroupid and enum_id. Columns that exist on both sides keep their name for
# the assignments version and get a "_checked" suffix for the checked version.
merge_keys = ["labgroupid", "enum_id"]
updated_assignments = assignments.merge(
    combined_checked,
    on=merge_keys,
    how="inner",
    suffixes=('', '_checked')
)

# An inner join silently discards checked rows whose (labgroupid, enum_id) pair
# is absent from the assignments (e.g. a typo in the check file). Report how many
# were lost so they do not vanish from the final lists unnoticed.
checked_match = combined_checked[merge_keys].merge(
    assignments[merge_keys].drop_duplicates(),
    on=merge_keys,
    how="left",
    indicator=True,  # adds a "_merge" column: "both" or "left_only" for a left join
)
n_unmatched_checked = int((checked_match["_merge"] == "left_only").sum())
if n_unmatched_checked:
    warnings.warn(
        f"{n_unmatched_checked} checked lab row(s) have no matching "
        "(labgroupid, enum_id) pair in the assignments and were dropped by the inner merge."
    )

In [7]:
# Every lab in the original all-labs list whose labgroupid is NOT among the
# November labs merged above forms the "other labs" table.
is_november_labgroup = original_updated_assignments["labgroupid"].isin(
    updated_assignments["labgroupid"]
)

# Rename so the visit column matches the "Visited?" column of the checked labs
# (and, presumably, "out_of_sample" matches the assignments — verify against the CSV).
other_updated_assignments = original_updated_assignments.loc[~is_november_labgroup].rename(
    columns={
        "out_of_sample_original": "out_of_sample",
        "bl_visit": "Visited?",
    }
)

In [8]:
# Flag where each row comes from: True for the November labs, False for the rest
november_labs = updated_assignments.assign(november_lab=True)
other_updated_assignments = other_updated_assignments.assign(november_lab=False)

# Stack the other labs first, then the November labs, under a fresh index.
# NOTE: `updated_assignments` is rebound to the combined table here, so re-running
# this cell without re-running the merge cell first would stack the combined
# table onto the other labs a second time.
updated_assignments = pd.concat(
    [other_updated_assignments, november_labs],
    ignore_index=True,
)

In [9]:
# Size of our sample: rows where "Visited?" is exactly "Yes"
is_visited = updated_assignments["Visited?"] == "Yes"
num_visited = len(updated_assignments.loc[is_visited])
print(f"{num_visited} labs have been marked as visited in total.")

# Split the visited labs by "Treatment Status"
treatment_status = updated_assignments["Treatment Status"]
num_treatment_visited = len(updated_assignments.loc[is_visited & (treatment_status == "treatment")])
num_control_visited = len(updated_assignments.loc[is_visited & (treatment_status == "control")])
print(f"{num_treatment_visited} treatment labs have been marked as visited.")
print(f"{num_control_visited} control labs have been marked as visited.")

147 labs have been marked as visited in total.
75 treatment labs have been marked as visited.
72 control labs have been marked as visited.


In [10]:
# Switch back to the column names used in the original all-labs file
# ("out_of_sample_original" and "bl_visit") before filtering and saving.
updated_assignments = updated_assignments.rename(
    columns={
        "out_of_sample": "out_of_sample_original",
        "Visited?": "bl_visit",
    }
)

In [11]:
# Sample labs = rows marked as visited (bl_visit == "Yes"), without the two
# bookkeeping columns that are not needed in the saved sample.
visited_mask = updated_assignments["bl_visit"] == "Yes"
sample_labs = (
    updated_assignments
    .loc[visited_mask]
    .drop(columns=["bl_visit", "out_of_sample_original"])
)

In [12]:
# Write the sample labs twice under the same file name: once to the main
# sample folder and once to the backup folder (row index omitted).
sample_filename = "final_sample.csv"
sample_labs.to_csv(config.BL_RAW_SAMPLE / sample_filename, index=False)
sample_labs.to_csv(config.BL_RAW_SAMPLE_BACKUP / sample_filename, index=False)

In [13]:
# Out-of-sample labs = rows whose bl_visit is exactly "No".
# Rows with any other bl_visit value (including missing) end up in neither
# this table nor the sample-labs table.
not_visited_mask = updated_assignments["bl_visit"] == "No"
out_of_sample_labs = updated_assignments.loc[not_visited_mask].copy()

In [14]:
# Write both lab lists to config.LABS_LIST (row index omitted):
# the out-of-sample labs and the full list of all labs.
labs_list_dir = config.LABS_LIST
out_of_sample_labs.to_csv(labs_list_dir / "out_of_sample_labs.csv", index=False)
updated_assignments.to_csv(labs_list_dir / "final_all_labs.csv", index=False)