# Save each of the BL checklist excel files to the server and check whether exists

In [6]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[1]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
import os
import shutil
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [7]:
# Load dataset of all sample labs
labs = pd.read_csv(config.BL_RAW_SAMPLE / "final_sample.csv")

In [8]:
# For each treated lab group save the BL checklist excel file to the server 
treated_labs = labs[labs["Treatment Status"] == "treatment"]

missing_files = []
copied_files = []

for labgroupid, group in treated_labs.groupby("labgroupid"):
    
    # Get enumerator info from current group
    enum_id = group["enum_id"].iloc[0]
    name = group["foldername"].iloc[0]
    folder_name = f"{name}_data"

    # Source file path
    filename = os.path.join(
        config.SWITCHDRIVE_ROOT,
        folder_name,
        "1_BL",
        f"BL_{labgroupid}",
        f"checklist_{labgroupid}.xlsx"
    )

    # Destination file path
    destfile = os.path.join(
        config.BL_RAW_CHECKLIST,
        "1_LabExcels",
        f"checklist_{labgroupid}.xlsx"
    )

     # Check if file exists
    if not os.path.exists(filename):
        print(f"File does not exist for labgroupid {labgroupid}")
        missing_files.append({"labgroupid": labgroupid, "name": name})
    else:
        # Copy the file to the destination
        shutil.copy2(filename, destfile)
        copied_files.append(labgroupid)

# Summary of copying
print(f"Files copied: {len(copied_files)}")

# Summary
if missing_files:
    missing_df = pd.DataFrame(sorted(missing_files, key=lambda x: x["name"]))
    missing_df.to_excel(config.BL_RAW_CHECKLIST/"1_LabExcels"/"0_missing_files.xlsx", index=False)
    print(f"Files missing: {len(missing_files)}")
else:
    print("All expected files exist.")

File does not exist for labgroupid 112
File does not exist for labgroupid 133
File does not exist for labgroupid 175
File does not exist for labgroupid 222
File does not exist for labgroupid 236
File does not exist for labgroupid 258
File does not exist for labgroupid 330
File does not exist for labgroupid 451
File does not exist for labgroupid 468
File does not exist for labgroupid 585
File does not exist for labgroupid 639
File does not exist for labgroupid 689
File does not exist for labgroupid 748
File does not exist for labgroupid 802
File does not exist for labgroupid 841
File does not exist for labgroupid 857
File does not exist for labgroupid 892
File does not exist for labgroupid 967
File does not exist for labgroupid 972
Files copied: 61
Files missing: 19


In [9]:
# Indicate in labs dataframe which files were copied or missing
treated_labs["file_copied"] = treated_labs["labgroupid"].isin(copied_files)
treated_labs["file_missing"] = treated_labs["labgroupid"].isin([m["labgroupid"] for m in missing_files])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_labs["file_copied"] = treated_labs["labgroupid"].isin(copied_files)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_labs["file_missing"] = treated_labs["labgroupid"].isin([m["labgroupid"] for m in missing_files])


In [10]:
# Save updated labs dataframe
treated_labs.to_csv(config.BL_RAW_SAMPLE / "treated_labs_with_checklist_status.csv", index=False)