# Save each BL survey Excel file to the server, then check whether each file exists and whether it has been filled in

In [1]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# CODE_ROOT is the directory two levels above the current working directory.
# It is appended to sys.path before `import config`, so the result depends on
# where the kernel was started.
# NOTE(review): presumably config.py lives in CODE_ROOT - confirm.
CODE_ROOT = Path.cwd().parents[1]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
import os
import shutil
import warnings
# Suppress UserWarning messages that originate from the openpyxl module
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [2]:
# Read the final sample of labs into a dataframe
sample_csv = config.BL_RAW_SAMPLE / "final_sample.csv"
labs = pd.read_csv(sample_csv)

In [3]:
# For each lab group, copy its BL survey Excel file to the server.
#
# Results are collected in two lists that later cells rely on:
#   copied_files  - labgroupids whose file was found and copied
#   missing_files - dicts {"labgroupid", "name"} for files that were not found

missing_files = []
copied_files = []

# Destination folder for all lab Excel files. shutil.copy2 does not create
# missing parent directories, so make sure the folder exists before copying.
lab_excel_dir = Path(config.BL_RAW_SURVEY) / "1_LabExcels"
lab_excel_dir.mkdir(parents=True, exist_ok=True)

for labgroupid, group in labs.groupby("labgroupid"):

    # The folder name is taken from the first row of the group
    name = group["foldername"].iloc[0]
    folder_name = f"{name}_data"

    # Source file path
    filename = os.path.join(
        config.SWITCHDRIVE_ROOT,
        folder_name,
        "1_BL",
        f"BL_{labgroupid}",
        f"BL_{labgroupid}.xlsx"
    )

    # Destination file path
    destfile = lab_excel_dir / f"BL_{labgroupid}.xlsx"

    # Check if file exists
    if not os.path.exists(filename):
        print(f"File does not exist for labgroupid {labgroupid}")
        missing_files.append({"labgroupid": labgroupid, "name": name})
    else:
        # Copy the file (with its metadata) to the destination
        shutil.copy2(filename, destfile)
        copied_files.append(labgroupid)

# Summary of copying
print(f"Files copied: {len(copied_files)}")

# Summary of missing files, written to Excel sorted by folder name
if missing_files:
    missing_df = pd.DataFrame(sorted(missing_files, key=lambda x: x["name"]))
    missing_df.to_excel(lab_excel_dir / "0_missing_files.xlsx", index=False)
    print(f"Files missing: {len(missing_files)}")
else:
    print("All expected files exist.")

File does not exist for labgroupid 112
File does not exist for labgroupid 219
File does not exist for labgroupid 221
File does not exist for labgroupid 288
File does not exist for labgroupid 450
File does not exist for labgroupid 585
File does not exist for labgroupid 639
File does not exist for labgroupid 841
File does not exist for labgroupid 967
Files copied: 138
Files missing: 9


In [4]:
# Add boolean columns to labs marking whether each lab group's file was copied or missing
missing_ids = [entry["labgroupid"] for entry in missing_files]
labs["file_copied"] = labs["labgroupid"].isin(copied_files)
labs["file_missing"] = labs["labgroupid"].isin(missing_ids)

In [5]:
# For each labgroupid whose file was copied, open the BL survey file and check
# whether the sheet "1. Characteristics" has data in cells C5:C7.
#
# Results are collected in three lists:
#   filled_labs - labgroupids with at least one non-blank value in C5:C7
#   empty_labs  - dicts {"labgroupid", "name"} for files where C5:C7 are all blank
#   error_labs  - labgroupids whose file could not be opened or read
labs_to_check = labs[labs["file_copied"]]


def _has_value(value):
    """Return True when a cell value is neither None nor a blank/whitespace-only string."""
    if value is None:
        return False
    if isinstance(value, str):
        return value.strip() != ""
    return True


filled_labs = []
empty_labs = []
error_labs = []

for labgroupid, group in labs_to_check.groupby("labgroupid"):

    # The folder name is taken from the first row of the group
    name = group["foldername"].iloc[0]

    filename = config.BL_RAW_SURVEY / "1_LabExcels" / f"BL_{labgroupid}.xlsx"
    try:
        wb = load_workbook(filename=filename, read_only=True, data_only=True)
        try:
            sheet = wb["1. Characteristics"]
            values = [sheet[f"C{row}"].value for row in range(5, 8)]
        finally:
            # Read-only workbooks keep the file open until closed explicitly,
            # so close even when the sheet lookup or cell read fails.
            wb.close()
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        error_labs.append(labgroupid)
        continue

    if any(_has_value(v) for v in values):
        filled_labs.append(labgroupid)
    else:
        empty_labs.append({"labgroupid": labgroupid, "name": name})

# Summary of checking
print(f"Labs with filled data: {len(filled_labs)}")

# Files that could not be read are neither filled nor empty; report them too
if error_labs:
    print(f"Labs with unreadable files: {len(error_labs)}")

# Summary of empty files
if empty_labs:
    empty_df = pd.DataFrame(empty_labs)
    empty_df.to_excel(config.BL_RAW_SURVEY/"1_LabExcels"/"0_empty_files.xlsx", index=False)
    print(f"Labs with empty files: {len(empty_labs)}")
else:
    print("All expected files filled.")

Labs with filled data: 138
All expected files filled.


In [6]:
# Add boolean columns to labs marking whether each lab group's file was filled or empty
empty_ids = [entry["labgroupid"] for entry in empty_labs]
labs["file_filled"] = labs["labgroupid"].isin(filled_labs)
labs["file_empty"] = labs["labgroupid"].isin(empty_ids)

# Write the labs dataframe, including the file-status columns, to CSV
status_csv = config.BL_RAW_SAMPLE / "final_sample_with_BL_file_status.csv"
labs.to_csv(status_csv, index=False)