# Save each EL survey Excel file to the server and check whether it has been completed

In [1]:
# Date of transfer
# Used below as the name of the date-stamped backup folder; update it before each transfer run.
# NOTE(review): format looks like YYYY_MM_DD — confirm and keep consistent across runs.
TRANSFER_DATE = "2026_01_30"

In [2]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# CODE_ROOT is two directory levels above the current working directory,
# so its value depends on where the kernel was started.
CODE_ROOT = Path.cwd().parents[1]
# Make CODE_ROOT importable before importing `config`
# (presumably `config` lives directly under CODE_ROOT — TODO confirm).
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
import os
import shutil
import warnings
# Suppress UserWarning messages originating from the openpyxl module
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [3]:
# Read the full lab sample (one CSV that also carries the BL file status columns)
bl_status_csv = config.BL_RAW_SAMPLE / "final_sample_with_BL_file_status.csv"
labs = pd.read_csv(bl_status_csv)

In [4]:
# Pick a backup folder name that does not exist yet: start with the plain
# transfer date and, if taken, try "<date>_1", "<date>_2", ... in turn.
backup_root = config.EL_RAW_SURVEY_BACKUP
date_stamped_backup_folder = backup_root / TRANSFER_DATE

suffix = 1
while date_stamped_backup_folder.exists():
    date_stamped_backup_folder = backup_root / f"{TRANSFER_DATE}_{suffix}"
    suffix += 1

# Create the chosen folder (and any missing parents)
date_stamped_backup_folder.mkdir(parents=True, exist_ok=True)

In [5]:
# For each lab group save the EL survey excel file to the server (main, main backup, date-stamped backup)

# Filter to only labs with filled BL files
labs_el = labs[(labs["file_filled"] == True)]

# Destination folders do not depend on the lab group, so resolve them once.
# Create them up front: shutil.copy2 and DataFrame.to_excel do not create
# missing parent directories and would otherwise fail on a fresh server layout.
main_excel_dir = config.EL_RAW_SURVEY / "1_LabExcels"
backup_excel_dir = config.EL_RAW_SURVEY_BACKUP / "1_LabExcels"
for excel_dir in (main_excel_dir, backup_excel_dir):
    excel_dir.mkdir(parents=True, exist_ok=True)

# Initialize lists to track missing and copied files
missing_files = []
copied_files = []

# For each lab group, copy the EL survey file to the server
for labgroupid, group in labs_el.groupby("labgroupid"):

    # All rows of a lab group are expected to share one folder name; use the first
    name = group["foldername"].iloc[0]
    folder_name = f"{name}_data"
    el_filename = f"EL_{labgroupid}.xlsx"

    # Source file
    src_file = config.SWITCHDRIVE_ROOT / folder_name / "2_EL" / f"EL_{labgroupid}" / el_filename

    # Destination files
    dest_main = main_excel_dir / el_filename
    dest_backup = backup_excel_dir / el_filename
    dest_backup_date_stamped = date_stamped_backup_folder / el_filename

    if src_file.exists():
        # copy2 also preserves file metadata (e.g. modification time)
        shutil.copy2(src_file, dest_main)
        shutil.copy2(src_file, dest_backup)
        shutil.copy2(src_file, dest_backup_date_stamped)
        # Only recorded as copied once all three copies succeeded
        copied_files.append(labgroupid)
    else:
        missing_files.append({"labgroupid": labgroupid, "name": name})

print(f"Files copied: {len(copied_files)}")
if missing_files:
    # Report missing files sorted by folder name, in both main and backup locations
    missing_df = pd.DataFrame(sorted(missing_files, key=lambda x: x["name"]))
    missing_df.to_excel(main_excel_dir / "0_missing_files.xlsx", index=False)
    missing_df.to_excel(backup_excel_dir / "0_missing_files.xlsx", index=False)
    print(f"Files missing: {len(missing_files)}")
else:
    print("All expected files exist.")

Files copied: 138
All expected files exist.


In [6]:
# Add boolean columns marking, per row, whether the lab group's EL file was
# copied or was missing. `assign` returns a new frame, so the filtered view of
# `labs` is not modified.
missing_ids = [entry["labgroupid"] for entry in missing_files]
labs_el = labs_el.assign(
    el_file_copied=labs_el["labgroupid"].isin(copied_files),
    el_file_missing=labs_el["labgroupid"].isin(missing_ids),
)

In [7]:
# Check whether copied files have been completed (have date of visit or have data in EL-only sheet)

# Filter to labs where EL survey copied
labs_to_check = labs_el[labs_el["el_file_copied"] == True]

filled_date_labs = []
missing_date_labs = []
filled_awareness_labs = []
missing_awareness_labs = []
empty_labs = []
error_labs = []

for labgroupid, group in labs_to_check.groupby("labgroupid"):

    # Get enumerator info from current group
    enum_id = group["enum_id"].iloc[0]
    name = group["foldername"].iloc[0]

    filename = config.EL_RAW_SURVEY / "1_LabExcels" / f"EL_{labgroupid}.xlsx"

    # Read cell "B4" of sheet "1. Main" (date of visit) and cell "B5" of
    # sheet "13. Awareness" (awareness question)
    wb = None
    try:
        wb = load_workbook(filename=filename, read_only=True, data_only=True)
        date_of_visit = wb["1. Main"]["B4"].value
        awareness_q = wb["13. Awareness"]["B5"].value

    # Handle any errors during reading (unreadable file, missing sheet, ...)
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        error_labs.append(labgroupid)
        continue

    # Read-only workbooks keep the file open until closed explicitly, so close
    # on the error path as well; otherwise a failed lab leaks a file handle.
    finally:
        if wb is not None:
            wb.close()

    # A cell counts as filled when it holds anything other than None / ""
    date_filled = date_of_visit is not None and date_of_visit != ""
    awareness_filled = awareness_q is not None and awareness_q != ""

    if date_filled:
        filled_date_labs.append(labgroupid)
    else:
        missing_date_labs.append(labgroupid)

    if awareness_filled:
        filled_awareness_labs.append(labgroupid)
    else:
        missing_awareness_labs.append(labgroupid)

    # If both date and awareness question are missing, add to empty labs
    # (decided from the per-lab flags rather than by searching the result lists)
    if not date_filled and not awareness_filled:
        empty_labs.append({"labgroupid": labgroupid, "name": name})

# Summary of checking
print(f"Labs with date completed: {len(filled_date_labs)}")
print(f"Labs missing date: {len(missing_date_labs)}")
print(f"Labs with awareness question completed: {len(filled_awareness_labs)}")
print(f"Labs missing awareness question: {len(missing_awareness_labs)}")
print(f"Labs with both date and awareness question missing: {len(empty_labs)}")
print(f"Labs with errors during checking: {len(error_labs)}")


# Summary of empty files
if empty_labs:
    empty_df = pd.DataFrame(empty_labs)
    empty_df.to_excel(config.EL_RAW_SURVEY/"1_LabExcels"/"0_empty_files.xlsx", index=False)
    empty_df.to_excel(config.EL_RAW_SURVEY_BACKUP/"1_LabExcels"/"0_empty_files.xlsx", index=False)
    print(f"Labs with empty files: {len(empty_labs)}")
else:
    print("All expected files filled.")

Labs with date completed: 91
Labs missing date: 47
Labs with awareness question completed: 91
Labs missing awareness question: 47
Labs with both date and awareness question missing: 47
Labs with errors during checking: 0
Labs with empty files: 47


In [8]:
# Flag, per row, whether the lab group's date of visit and awareness question were filled
labs_el = labs_el.assign(
    el_date_filled=labs_el["labgroupid"].isin(filled_date_labs),
    el_awareness_filled=labs_el["labgroupid"].isin(filled_awareness_labs),
)

# Write the updated labs dataframe to the main sample folder, then to its backup
for sample_dir in (config.EL_RAW_SAMPLE, config.EL_RAW_SAMPLE_BACKUP):
    labs_el.to_csv(sample_dir / "final_sample_with_EL_file_status.csv", index=False)

In [9]:
# Number of rows with the awareness question filled, split by treatment arm
awareness_filled_mask = labs_el["el_awareness_filled"] == True
treatment_status = labs_el["Treatment Status"]

treatment_filled = len(labs_el[awareness_filled_mask & (treatment_status == "treatment")])
control_filled = len(labs_el[awareness_filled_mask & (treatment_status == "control")])

print(f"Treatment labs with awareness question filled: {treatment_filled}")
print(f"Control labs with awareness question filled: {control_filled}")

Treatment labs with awareness question filled: 45
Control labs with awareness question filled: 46
