# Cleaning 1.0 - Combine individual Excel files into an individual-level dataset

In [1]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# The parent of the current working directory is put on sys.path before the
# project-local imports below (`config`, `create_variable_from_survey`).
# NOTE(review): this depends on the directory the kernel is started from —
# presumably the notebook's own folder, one level below the code root; confirm.
CODE_ROOT = Path.cwd().parents[0]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
import xlwings as xw
import warnings
# Silence UserWarnings raised from the openpyxl module while workbooks are loaded
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
from create_variable_from_survey import create_var

In [2]:
# Load the lab sample; the remaining cells add columns to this frame
labs = pd.read_csv(config.EL_RAW_SAMPLE / "final_sample_with_EL_file_status.csv")

# The BL survey, EL survey and BL checklist workbooks all sit in a
# sub-folder with this name under their respective raw-data roots
LAB_EXCELS_SUBDIR = "1_LabExcels"

# Folders holding one workbook per lab group
bl_surveys_folder = config.BL_RAW_SURVEY / LAB_EXCELS_SUBDIR        # BL surveys
el_surveys_folder = config.EL_RAW_SURVEY / LAB_EXCELS_SUBDIR        # EL surveys
bl_checklists_folder = config.BL_RAW_CHECKLIST / LAB_EXCELS_SUBDIR  # BL checklists

# Calculators folder (one sub-folder per labgroupid)
calculators_folder = config.CALCULATORS_WITH_TIPS

In [3]:
# Load survey dictionaries (helper for mapping variables to survey files, sheets and cells).
# All four sheets live in the same workbook, so it is opened and parsed once;
# passing a list to `sheet_name` returns a dict of DataFrames keyed by sheet name.
survey_dictionary_path = config.SURVEY_DICTIONARIES / "helper_survey_dictionary.xlsx"
survey_dictionary_sheets = pd.read_excel(
    survey_dictionary_path,
    sheet_name=["Other", "Specialized equipment", "Checklist", "Equipment"],
)

other_qs_dict = survey_dictionary_sheets["Other"]
spec_equip_cols_dict = survey_dictionary_sheets["Specialized equipment"]
checklist_dict = survey_dictionary_sheets["Checklist"]
equip_mappings = survey_dictionary_sheets["Equipment"]

# Calculator equipment types: every equipment mapping except the "fc" type
calculator_equip = equip_mappings[equip_mappings["Equipment type"] != "fc"]

In [4]:
# Both sheets come from the same workbook, so read them in a single pass
el_visits_path = config.ENUMERATORS / "EL_visits_completed.xlsx"
el_visit_sheets = pd.read_excel(
    el_visits_path,
    sheet_name=["No visit email conf", "Recovered"],
)

# Email confirmation only labs
email_confirmation = el_visit_sheets["No visit email conf"]
labs["el_email_conf"] = labs["labgroupid"].isin(email_confirmation["labgroupid"]).astype(bool)

# Recovered data (technical issues etc.)
recovered_data = el_visit_sheets["Recovered"]
labs["recovered_data"] = labs["labgroupid"].isin(recovered_data["labgroupid"]).astype(bool)

# Indicator to replace EL data with missing for awareness, attitudes, checklist
# (set for labs whose "To do" entry is "Replace EL qs with missing")
to_replace_ids = recovered_data.loc[
    recovered_data["To do"] == "Replace EL qs with missing", "labgroupid"
]
labs["replace_el_with_missing"] = labs["labgroupid"].isin(to_replace_ids).astype(bool)

## Individual-level dataset: 
- survey dates (BL + EL)
- characteristics (BL)
- sharing and comms (BL)
- consent to merge (BL)
- awareness (EL)
- attitudes (EL)
- waste (BL)
- specialized equipment (BL)
- SPARK (BL checklist, EL)
- calculator (calc)

In [5]:
# All labgroupids, in sample order
labgroupids = labs["labgroupid"].tolist()

labs = labs.copy()

# Treated labs only
is_treated = labs["Treatment Status"].eq("treatment")
treated_labs = labs.loc[is_treated].copy()
labgroupids_t_only = treated_labs["labgroupid"].tolist()

# Labs with EL data collected (el_awareness_filled is True)
has_el_data = labs["el_awareness_filled"].eq(True)
el_done_labs = labs.loc[has_el_data].copy()
labgroupids_el_done = el_done_labs["labgroupid"].tolist()

# Labs whose EL data is to be replaced with missing (from the recovered data sheet)
needs_el_blank = labs["replace_el_with_missing"].eq(True)
replace_el_with_missing_labs = labs.loc[needs_el_blank].copy()
labgroupids_replace_el_missing = replace_el_with_missing_labs["labgroupid"].tolist()

# Labs already known to have no BL checklist
no_bl_checklists = pd.read_excel(config.LABS_LIST / "labs_no_BL_SPARK_checklist.xlsx")
labgroupids_no_bl_checklist = no_bl_checklists["labgroupid"].tolist()

In [6]:
# Pre-create every output variable as an empty column of pandas "string" dtype.
# Names are collected in first-seen order and de-duplicated before being added.
cols_to_add = []
seen = set()

def add(col):
    """Register `col` for creation unless it has already been registered."""
    if col in seen:
        return
    seen.add(col)
    cols_to_add.append(col)

# Variables from the "Other" dictionary sheet
for _, question in other_qs_dict.iterrows():
    var = question["Variable"]
    has_comment = pd.notna(question["Comment"])
    has_free_text = pd.notna(question["Free text"])

    # "No variables" set -> numbered variables (_1, _2, ...); otherwise a single one
    if pd.notna(question["No variables"]):
        base_names = [f"{var}_{i}" for i in range(1, int(question["No variables"]) + 1)]
    else:
        base_names = [var]

    for base in base_names:
        add(base)
        if has_comment:
            add(f"{base}_co")
        if has_free_text:
            add(f"{base}_fc")

# Specialized equipment: four variables per equipment entry
for var in spec_equip_cols_dict["Specialized equipment"]:
    for suffix in ("ind", "no", "share", "co"):
        add(f"{var}_{suffix}")

# Checklist variables (16 bronze, 18 silver, 15 gold), each with BL/EL answer and comment
for tier, n_questions in (("bronze", 16), ("silver", 18), ("gold", 15)):
    for i in range(1, n_questions + 1):
        for suffix in ("bl", "bl_co", "el", "el_co"):
            add(f"{tier}_q_{i}_{suffix}")

# Calculator variables: totals first, then energy/CO2 per equipment type
for total_name in ("calc_total_energy", "calc_total_co2"):
    add(total_name)
for var in calculator_equip["Equipment type"]:
    for metric in ("energy", "co2"):
        add(f"calc_{var}_{metric}")

# Only add columns that are not already present in labs
new_cols = [col for col in cols_to_add if col not in labs.columns]
if new_cols:
    empty_block = pd.DataFrame(columns=new_cols, index=labs.index, dtype="string")
    labs = pd.concat([labs, empty_block], axis=1)
    

In [8]:
# Extract data from individual BL survey workbooks (one file per labgroupid)

# Dictionary rows that refer to the BL survey
bl_questions = other_qs_dict[other_qs_dict["Survey"] == "BL"]

# Variable suffix -> worksheet row on the specialized equipment sheet
BL_SPEC_EQUIP_ROWS = (("ind", 6), ("no", 7), ("share", 8), ("co", 9))

for labgroupid in labgroupids:

    mask = labs["labgroupid"] == labgroupid
    if not mask.any():
        continue

    treatment_status = labs.loc[mask, "Treatment Status"].iloc[0]

    # Labs without a BL workbook are skipped silently
    bl_path = bl_surveys_folder / f"BL_{labgroupid}.xlsx"
    if not bl_path.exists():
        continue

    wb = load_workbook(bl_path, data_only=True)

    # Other qs
    for _, question in bl_questions.iterrows():

        # Questions restricted via "T or C only" apply only to labs with that status
        tc_only = question["T or C only"]
        if not (pd.isna(tc_only) or tc_only == treatment_status):
            continue

        comment_cell = question["Comment"] if pd.notna(question["Comment"]) else None
        fc_cell = question["Free text"] if pd.notna(question["Free text"]) else None

        create_var(
            ws=wb[question["Sheet"]],
            labs=labs,
            mask=mask,
            var_name=question["Variable"],
            cell=question["Cell(s)"],
            multiple_cells=(question["Multiple cells"] == "Y"),
            no_variables=pd.notna(question["No variables"]),
            comment_cell=comment_cell,
            fc_cell=fc_cell
        )

    # Specialized equipment qs: one worksheet column per equipment entry, rows 6-9
    ws = wb["14. Specialized Equipment"]

    for _, equip in spec_equip_cols_dict.iterrows():
        for suffix, sheet_row in BL_SPEC_EQUIP_ROWS:
            create_var(
                ws=ws, labs=labs, mask=mask,
                var_name=f"{equip['Specialized equipment']}_{suffix}",
                cell=f"{equip['Column']}{sheet_row}"
            )

In [9]:
# Extract data from individual EL survey workbooks (only labs where EL data was collected)

# Dictionary rows that refer to the EL survey
el_questions = other_qs_dict[other_qs_dict["Survey"] == "EL"]

for labgroupid in labgroupids_el_done:

    mask = labs["labgroupid"] == labgroupid
    if not mask.any():
        continue

    # For flagged labs, every EL variable except survey_date_el is left unread
    replace_with_missing = labgroupid in labgroupids_replace_el_missing

    treatment_status = labs.loc[mask, "Treatment Status"].iloc[0]

    # Labs without an EL workbook are skipped silently
    el_path = el_surveys_folder / f"EL_{labgroupid}.xlsx"
    if not el_path.exists():
        continue

    wb = load_workbook(el_path, data_only=True)

    # Other qs
    for _, question in el_questions.iterrows():

        # Questions restricted via "T or C only" apply only to labs with that status
        tc_only = question["T or C only"]
        if not (pd.isna(tc_only) or tc_only == treatment_status):
            continue

        ws = wb[question["Sheet"]]
        var_name = question["Variable"]

        # Read the variable unless this lab's EL answers are being left missing;
        # the survey date is always read
        if not replace_with_missing or var_name == "survey_date_el":
            comment_cell = question["Comment"] if pd.notna(question["Comment"]) else None
            fc_cell = question["Free text"] if pd.notna(question["Free text"]) else None
            create_var(
                ws=ws,
                labs=labs,
                mask=mask,
                var_name=var_name,
                cell=question["Cell(s)"],
                multiple_cells=(question["Multiple cells"] == "Y"),
                no_variables=pd.notna(question["No variables"]),
                comment_cell=comment_cell,
                fc_cell=fc_cell
            )

    # Checklist qs: treated labs only, and only when EL answers are not being left missing
    if replace_with_missing or treatment_status != "treatment":
        continue

    ws = wb["15. SPARK Checklist"]

    checklist_items = zip(
        checklist_dict["Category"],
        checklist_dict["Question number"],
        checklist_dict["Row"],
    )
    for category, q_no, row_no in checklist_items:
        # Answer is read from column C, comment from column F of the same row
        create_var(
            ws=ws, labs=labs, mask=mask,
            var_name=f"{category}_q_{q_no}_el",
            cell=f"C{row_no}",
            comment_cell=f"F{row_no}"
        )

In [10]:
# Extract data from BL checklist workbooks (treated labs only)

# labgroupids for which no checklist file was found
missing_checklist = []

for labgroupid in labgroupids_t_only:

    mask = labs["labgroupid"] == labgroupid
    if not mask.any():
        continue

    checklist_path = bl_checklists_folder / f"checklist_{labgroupid}.xlsx"
    if not checklist_path.exists():
        # Record the gap and move on; it is checked against the known list below
        missing_checklist.append(labgroupid)
        continue

    wb = load_workbook(checklist_path, data_only=True)

    # Checklist qs
    ws = wb["SPARK Checklist"]

    checklist_items = zip(
        checklist_dict["Category"],
        checklist_dict["Question number"],
        checklist_dict["Row"],
    )
    for category, q_no, row_no in checklist_items:
        # Answer is read from column C, comment from column F of the same row
        create_var(
            ws=ws, labs=labs, mask=mask,
            var_name=f"{category}_q_{q_no}_bl",
            cell=f"C{row_no}",
            comment_cell=f"F{row_no}"
        )

# Every lab with a missing checklist file must already be on the known list
assert set(missing_checklist).issubset(set(labgroupids_no_bl_checklist))

In [11]:
# Extract data from calculators (have to use xlwings as not loaded previously).
# NOTE(review): presumably the reports hold formulas whose results were never
# cached by Excel, so openpyxl's data_only mode would return nothing — confirm.

# labgroupids for which no calculator report was found
missing_calculator = []

for labgroupid in labgroupids:

    mask = labs["labgroupid"] == labgroupid
    if not mask.any():
        continue

    calculator_path = calculators_folder / str(labgroupid) / "Energy_Use_Report.xlsx"
    if not calculator_path.exists():
        display(f"Missing calculator for labgroup {labgroupid}.")
        missing_calculator.append(labgroupid)
        continue

    wb = xw.Book(calculator_path)
    try:
        # Read total energy use and CO2
        ws = wb.sheets["Introduction"]
        labs.loc[mask, "calc_total_energy"] = ws.range("B12").value
        labs.loc[mask, "calc_total_co2"] = ws.range("B13").value

        # Read equipment energy use and CO2 (cells B2 / B3 of each equipment sheet)
        for _, row in calculator_equip.iterrows():

            sheet = row["Calculator sheet"]
            equip_type = row["Equipment type"]
            var_name_energy = f"calc_{equip_type}_energy"
            var_name_co2 = f"calc_{equip_type}_co2"

            ws = wb.sheets[sheet]
            labs.loc[mask, var_name_energy] = ws.range("B2").value
            labs.loc[mask, var_name_co2] = ws.range("B3").value
    finally:
        # Always close the workbook, even if a sheet is missing or an assignment
        # fails; otherwise it stays open in Excel after the error
        wb.close()

# Check that no groups missing calculator (commented out for now)
#assert not missing_calculator

'Missing calculator for labgroup 541.'

In [12]:
# Check dtypes of the assembled frame (the columns added to labs earlier in
# this notebook were created with dtype="string")
labs.dtypes

labgroupid                     int64
Lab Group                     object
Faculty                       object
Institute                     object
Professor                     object
                           ...      
calc_incubator_co2    string[python]
calc_heater_energy    string[python]
calc_heater_co2       string[python]
calc_it_energy        string[python]
calc_it_co2           string[python]
Length: 408, dtype: object

In [13]:
# Write the combined individual-level dataset to the processed-data folder (row index omitted)
processed_path = config.PROCESSED_DATA / "individual_processed_1.csv"
labs.to_csv(processed_path, index=False)