# Cleaning 1.0 - Combine individual Excel files into an individual-level dataset

In [1]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# Put the parent of the current working directory on sys.path so that the
# project-local `config` module imported below can be resolved.
# NOTE(review): this relies on the notebook being launched from its own folder,
# one level below the code root — confirm.
CODE_ROOT = Path.cwd().parents[0]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
from openpyxl.utils import range_boundaries
import os
import shutil
import matplotlib.pyplot as plt
import string
import warnings
# Silence UserWarning messages that originate from openpyxl modules.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [2]:
# Load data: the lab sample, one row per sampled individual/lab record
labs = pd.read_csv(config.EL_RAW_SAMPLE / "final_sample_with_EL_file_status.csv")

# The survey and checklist roots below all use the same subfolder name
# for their per-lab Excel files.
lab_excels_subfolder = "1_LabExcels"

# Folders holding the BL surveys, EL surveys and BL checklists
bl_surveys_folder = config.BL_RAW_SURVEY / lab_excels_subfolder
el_surveys_folder = config.EL_RAW_SURVEY / lab_excels_subfolder
bl_checklists_folder = config.BL_RAW_CHECKLIST / lab_excels_subfolder

# Folder holding the calculators
calculators_folder = config.CALCULATORS_WITH_TIPS

In [3]:
# Load survey dictionaries (helper for mapping variables to survey files, sheets and cells)
# The workbook is parsed a single time; passing a list of sheet names makes
# pd.read_excel return a dict of DataFrames keyed by sheet name.
survey_dictionary_path = config.SURVEY_DICTIONARIES / "helper_survey_dictionary.xlsx"
survey_dictionary_sheets = pd.read_excel(
    survey_dictionary_path,
    sheet_name=["Other", "Specialized equipment", "Checklist"],
)

other_qs_dict = survey_dictionary_sheets["Other"]
spec_equip_cols_dict = survey_dictionary_sheets["Specialized equipment"]
checklist_dict = survey_dictionary_sheets["Checklist"]

# Not loaded at present (kept for reference):
#   el_check_rows_dict <- sheet "EL check"
#   equip_mappings     <- sheet "Equipment"

## Individual-level dataset: 
- survey dates (BL + EL)
- characteristics (BL)
- sharing and comms (BL)
- consent to merge (BL)
- awareness (EL)
- attitudes (EL)
- waste (BL)
- specialized equipment (BL)
- SPARK (BL spark survey, EL)
- calculator (calc, T only)

In [4]:
# Work on an independent copy of the sample frame
labs = labs.copy()

# Every distinct labgroupid, in order of first appearance
labgroupids = labs["labgroupid"].unique().tolist()

# Same list restricted to rows whose "Treatment Status" is "treatment"
is_treated = labs["Treatment Status"] == "treatment"
treated_labs = labs[is_treated].copy()
labgroupids_t_only = treated_labs["labgroupid"].unique().tolist()

In [5]:
# Function to create variable(s) from cell(s) in individual survey files
def _cell_values(ws, ref):
    """Return the values addressed by ``ref`` on ``ws`` as a flat list.

    Indexing a worksheet can yield a single cell, a flat sequence of cells, or
    a sequence of rows of cells, depending on the reference. All three shapes
    are flattened here, in iteration order.
    """
    selection = ws[ref]

    # A lone cell exposes ``.value`` directly and is not iterable.
    if hasattr(selection, "value"):
        return [selection.value]

    values = []
    for item in selection:
        if hasattr(item, "value"):
            # Flat sequence of cells (e.g. a whole-column reference)
            values.append(item.value)
        else:
            # One row of a rectangular range
            values.extend(c.value for c in item)
    return values


def create_var(ws, labs, mask, var_name,
               cell,
               multiple_cells=False,
               no_variables=False,
               comment_cell=None,
               fc_cell=None) :
    """Copy worksheet cell value(s) into column(s) of ``labs`` for the rows in ``mask``.

    ``labs`` is modified in place; nothing is returned.

    Parameters
    ----------
    ws : worksheet-like
        Object supporting ``ws[ref]`` whose cells expose ``.value``.
    labs : pandas.DataFrame
        Frame that receives the values.
    mask : boolean indexer
        Row selector passed to ``labs.loc``.
    var_name : str
        Base name of the column(s) to write.
    cell : str
        Reference of the answer cell (or cell range when ``multiple_cells``).
    multiple_cells : bool, default False
        Treat every reference as a range rather than a single cell.
    no_variables : bool, default False
        Only used when ``multiple_cells`` is true. If true, each cell in the
        range is written to its own column ``{var_name}_{i}`` (``i`` starting
        at 1); if false, the non-empty values are joined with ";" into a single
        column.
    comment_cell : str, optional
        Reference of the comment cell/range; written to columns suffixed ``_co``.
    fc_cell : str, optional
        Reference of the free-text cell/range; written to columns suffixed ``_fc``.
    """
    # (column suffix, worksheet reference) for the answer, then the optional
    # comment and free-text responses.
    targets = [("", cell)]
    if comment_cell:
        targets.append(("_co", comment_cell))
    if fc_cell:
        targets.append(("_fc", fc_cell))

    for suffix, ref in targets:

        # Single cell
        if not multiple_cells:
            column = f"{var_name}{suffix}" if suffix else var_name
            labs.loc[mask, column] = ws[ref].value
            continue

        values = _cell_values(ws, ref)

        # Create separate variables per cell
        if no_variables:
            for i, v in enumerate(values, start=1):
                labs.loc[mask, f"{var_name}_{i}{suffix}"] = v

        # Create single variable with all cells joined by ";"
        else:
            column = f"{var_name}{suffix}" if suffix else var_name
            # None and "" are dropped before joining; other falsy values (e.g. 0) are kept
            labs.loc[mask, column] = ";".join(str(v) for v in values if v not in (None, ""))


In [6]:
# Pre-create every output column on the main dataframe with object dtype,
# so that values of mixed types can be stored in them later.
cols_to_add = []
seen = set()

def add(col):
    """Queue ``col`` for creation unless it has already been queued."""
    if col in seen:
        return
    seen.add(col)
    cols_to_add.append(col)

# Variables described in the "other questions" dictionary
for _, row in other_qs_dict.iterrows():
    var = row["Variable"]
    has_comment = pd.notna(row["Comment"])
    has_free_text = pd.notna(row["Free text"])

    # Either numbered variables (_1, _2, ...) or the bare variable name
    if pd.notna(row["No variables"]):
        base_names = [f"{var}_{i}" for i in range(1, int(row["No variables"]) + 1)]
    else:
        base_names = [var]

    for base in base_names:
        add(base)
        if has_comment:
            add(f"{base}_co")
        if has_free_text:
            add(f"{base}_fc")

# Specialized equipment: four variables per piece of equipment
for _, row in spec_equip_cols_dict.iterrows():
    var = row["Specialized equipment"]
    for suffix in ("ind", "no", "share", "co"):
        add(f"{var}_{suffix}")

# Checklist variables (16 bronze, 18 silver, 15 gold), each with a BL and an EL
# answer plus a comment column for both
for tier, n_questions in (("bronze", 16), ("silver", 18), ("gold", 15)):
    for i in range(1, n_questions + 1):
        for wave in ("bl", "el"):
            add(f"{tier}_q_{i}_{wave}")
            add(f"{tier}_q_{i}_{wave}_co")

# Only append columns the dataframe does not already have
new_cols = [col for col in cols_to_add if col not in labs.columns]
if new_cols:
    empty_cols = pd.DataFrame(index=labs.index, columns=new_cols, dtype="object")
    labs = pd.concat([labs, empty_cols], axis=1)
    

In [7]:
# Prepare dataset: cast every column to object dtype so that columns can hold
# values of mixed types when they are filled from the survey files below.
labs = labs.astype(object)

In [8]:
# Extract data from individual BL survey sheets

# Dictionary rows whose "Survey" column is "BL"
bl_questions = other_qs_dict[other_qs_dict["Survey"] == "BL"]

# Worksheet row on the specialized-equipment sheet feeding each variable suffix
spec_equip_rows = {"ind": 6, "no": 7, "share": 8, "co": 9}

for labgroupid in labgroupids:

    mask = labs["labgroupid"] == labgroupid
    bl_path = bl_surveys_folder / f"BL_{labgroupid}.xlsx"

    # Nothing to do without matching rows or without a survey file
    if not (mask.any() and bl_path.exists()):
        continue

    treatment_status = labs.loc[mask, "Treatment Status"].iloc[0]
    wb = load_workbook(bl_path, data_only=True)

    # Other qs
    for _, question in bl_questions.iterrows():

        # Questions flagged for one arm only are skipped for the other arm
        restricted_to = question["T or C only"]
        if not (pd.isna(restricted_to) or restricted_to == treatment_status):
            continue

        comment_ref = question["Comment"]
        free_text_ref = question["Free text"]

        create_var(
            ws=wb[question["Sheet"]],
            labs=labs,
            mask=mask,
            var_name=question["Variable"],
            cell=question["Cell(s)"],
            multiple_cells=(question["Multiple cells"] == "Y"),
            no_variables=pd.notna(question["No variables"]),
            comment_cell=comment_ref if pd.notna(comment_ref) else None,
            fc_cell=free_text_ref if pd.notna(free_text_ref) else None,
        )

    # Specialized equipment qs: one worksheet column per piece of equipment
    ws = wb["14. Specialized Equipment"]

    for _, equipment in spec_equip_cols_dict.iterrows():
        for suffix, sheet_row in spec_equip_rows.items():
            create_var(
                ws=ws, labs=labs, mask=mask,
                var_name=f"{equipment['Specialized equipment']}_{suffix}",
                cell=f"{equipment['Column']}{sheet_row}",
            )

In [9]:
# Extract data from individual EL survey sheets

# Dictionary rows whose "Survey" column is "EL"
el_questions = other_qs_dict[other_qs_dict["Survey"] == "EL"]

for labgroupid in labgroupids:

    mask = labs["labgroupid"] == labgroupid
    el_path = el_surveys_folder / f"EL_{labgroupid}.xlsx"

    # Nothing to do without matching rows or without a survey file
    if not (mask.any() and el_path.exists()):
        continue

    treatment_status = labs.loc[mask, "Treatment Status"].iloc[0]
    wb = load_workbook(el_path, data_only=True)

    # Other qs
    for _, question in el_questions.iterrows():

        # Questions flagged for one arm only are skipped for the other arm
        restricted_to = question["T or C only"]
        if not (pd.isna(restricted_to) or restricted_to == treatment_status):
            continue

        comment_ref = question["Comment"]
        free_text_ref = question["Free text"]

        create_var(
            ws=wb[question["Sheet"]],
            labs=labs,
            mask=mask,
            var_name=question["Variable"],
            cell=question["Cell(s)"],
            multiple_cells=(question["Multiple cells"] == "Y"),
            no_variables=pd.notna(question["No variables"]),
            comment_cell=comment_ref if pd.notna(comment_ref) else None,
            fc_cell=free_text_ref if pd.notna(free_text_ref) else None,
        )

    # Checklist qs (T only)
    if treatment_status != "treatment":
        continue

    ws = wb["15. SPARK Checklist"]

    for _, item in checklist_dict.iterrows():
        sheet_row = item["Row"]

        # Answer is read from column C, its comment from column F
        create_var(
            ws=ws, labs=labs, mask=mask,
            var_name=f"{item['Category']}_q_{item['Question number']}_el",
            cell=f"C{sheet_row}",
            comment_cell=f"F{sheet_row}",
        )

In [10]:
# Extract data from BL checklists (T only)
for labgroupid in labgroupids_t_only:

    mask = labs["labgroupid"] == labgroupid
    if not mask.any():
        continue

    checklist_path = bl_checklists_folder / f"checklist_{labgroupid}.xlsx"

    if checklist_path.exists():
        wb = load_workbook(checklist_path, data_only=True)

        # Checklist qs
        ws = wb["SPARK Checklist"]

        for _, item in checklist_dict.iterrows():
            sheet_row = item["Row"]

            # Answer is read from column C, its comment from column F
            create_var(
                ws=ws, labs=labs, mask=mask,
                var_name=f"{item['Category']}_q_{item['Question number']}_bl",
                cell=f"C{sheet_row}",
                comment_cell=f"F{sheet_row}",
            )
    else:
        # Report the gap and move on to the next lab group
        display(f"Missing checklist for labgroup {labgroupid}.")

'Missing checklist for labgroup 468.'

'Missing checklist for labgroup 175.'

'Missing checklist for labgroup 330.'

'Missing checklist for labgroup 451.'

'Missing checklist for labgroup 748.'

'Missing checklist for labgroup 802.'

In [11]:
# TODO: Extract data from calculators (not yet implemented - still needs work)

In [12]:
# Write the processed dataset to disk as CSV, without the index column
processed_output_path = config.PROCESSED_DATA / "individual_processed_1.csv"
labs.to_csv(processed_output_path, index=False)
