# Cleaning 2 - Combine individual Excel files into an equipment-type-panel-level dataset

In [1]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[0]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import load_workbook
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
from openpyxl.utils import range_boundaries
import os
import shutil
import matplotlib.pyplot as plt
import string
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
from create_variable_from_survey import create_var

In [2]:
# Load the lab sample and locate the per-lab survey workbooks

# Sub-folder name used under both the BL and the EL raw survey roots
LAB_EXCEL_SUBDIR = "1_LabExcels"

# Lab sample table; later cells read its `labgroupid` and `el_awareness_filled` columns
labs = pd.read_csv(config.EL_RAW_SAMPLE / "final_sample_with_EL_file_status.csv")

# Folders searched for the individual BL_<labgroupid>.xlsx / EL_<labgroupid>.xlsx files
bl_surveys_folder, el_surveys_folder = (
    survey_root / LAB_EXCEL_SUBDIR
    for survey_root in (config.BL_RAW_SURVEY, config.EL_RAW_SURVEY)
)

In [3]:
# Load survey dictionaries (helper for mapping variables to survey files, sheets and cells)
# Both sheets live in the same workbook, so they are requested in a single read_excel
# call (a list of sheet names returns a dict of DataFrames keyed by sheet name) instead
# of opening and parsing the file once per sheet.
_dictionary_sheets = pd.read_excel(
    config.SURVEY_DICTIONARIES / "helper_survey_dictionary.xlsx",
    sheet_name=["EL check", "Equipment"],
)
el_check_rows_dict = _dictionary_sheets["EL check"]
equip_mappings = _dictionary_sheets["Equipment"]

In [4]:
# Build the lists of lab groups that the extraction loops iterate over

# Work on a copy of the sample table
labs = labs.copy()

# Every labgroupid in the sample (used for the BL surveys)
labgroupids = labs["labgroupid"].tolist()

# Subset of labs flagged as having EL data collected (used for the EL surveys)
has_el_data = labs["el_awareness_filled"].eq(True)
el_done_labs = labs.loc[has_el_data].copy()
labgroupids_el_done = el_done_labs["labgroupid"].tolist()

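## Equipment-type-panel-level dataset
One row per lab group, survey round, equipment and type, with the columns:
- labgroupid
- survey: "BL" or "EL"
- equipment
- type_no
- equipment-specific fields, each with a matching `_co` comment column, plus `share` / `share_co`
- el_check / el_check_co: checks for changes (EL only)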

In [5]:
# Normalise the equip_mappings headers: trim surrounding whitespace and turn inner
# spaces into underscores, so the columns can be read as attributes (row.Max_types, ...)
clean_headers = {
    header: header.strip().replace(" ", "_")
    for header in equip_mappings.columns
}
equip_mappings = equip_mappings.rename(columns=clean_headers)

In [6]:
def get_type_columns(type_number):
    """Return the (value, comment) column positions for a type number.

    Positions are 0-based, as used with ``DataFrame.iloc``: type 1 maps to
    (2, 3), i.e. spreadsheet columns C and D, type 2 to (4, 5), and so on,
    with each further type shifted two columns to the right.
    """
    value_col = 2 * type_number
    return value_col, value_col + 1

In [7]:
# Extract data from BL surveys
# Produces bl_updates_df with one row per (labgroupid, equipment, type_no).
# NOTE(review): the EL extraction cell repeats this loop almost line for line;
# a shared helper function would remove the duplication.

# Find the (value, comment) column positions corresponding to each type.
# int() mirrors the int(row.Max_types) cast used inside the loop: if pandas parsed
# the column as float, range() would otherwise raise a TypeError.
max_type_overall = int(equip_mappings["Max_types"].max())
type_cols = {t: get_type_columns(t) for t in range(1, max_type_overall + 1)}

# Initialize for storing data
all_bl_updates = []

# For each labgroupid, read the data from the individual survey file
for labgroupid in labgroupids:

    bl_path = bl_surveys_folder / f"BL_{labgroupid}.xlsx"
    if not bl_path.exists():
        continue  # no BL workbook for this lab group

    # Open the workbook once per lab group. Calling pd.read_excel(path, ...) for
    # every equipment row re-opened and re-parsed the same file each time.
    with pd.ExcelFile(bl_path) as bl_workbook:
        parsed_sheets = {}  # sheet name -> DataFrame; a sheet used by several rows is parsed once

        for row in equip_mappings.itertuples(index=False): # Loop over equipments

            equipment = row.Equipment_type
            max_types = int(row.Max_types)
            no_vars = int(row.No_Vars)

            if row.Survey_sheet not in parsed_sheets:
                parsed_sheets[row.Survey_sheet] = pd.read_excel(bl_workbook, sheet_name=row.Survey_sheet)
            df_sheet = parsed_sheets[row.Survey_sheet]

            for type_no in range(1, max_types + 1): # Loop over types

                updates = {
                    "labgroupid": labgroupid,
                    "survey": "BL",
                    "equipment": equipment,
                    "type_no": type_no
                }

                col_index, co_col_index = type_cols[type_no] # columns

                for v in range(1, no_vars + 1): # Loop through variables
                    var_name = getattr(row, f"Variable_{v}")
                    # Positional row in the parsed sheet (read_excel's default header
                    # row is not counted), not the row number shown in Excel.
                    excel_row = 3 + v

                    val = df_sheet.iloc[excel_row, col_index]
                    co  = df_sheet.iloc[excel_row, co_col_index]

                    updates[var_name] = val
                    updates[f"{var_name}_co"] = co

                # Sharing variable: the row directly after the last variable row
                updates["share"] = df_sheet.iloc[no_vars+4, col_index]
                updates["share_co"] = df_sheet.iloc[no_vars+4, co_col_index]

                # Append to all_bl_updates
                all_bl_updates.append(updates)

# Create dataframe
bl_updates_df = pd.DataFrame(all_bl_updates)

In [8]:
# Extract data from EL surveys
# Produces el_updates_df with one row per (labgroupid, equipment, type_no), for the
# lab groups in labgroupids_el_done only.
# NOTE(review): this loop repeats the BL extraction cell almost line for line (plus
# the el_check fields); a shared helper function would remove the duplication.

# Find the (value, comment) column positions corresponding to each type.
# int() mirrors the int(row.Max_types) cast used inside the loop: if pandas parsed
# the column as float, range() would otherwise raise a TypeError.
max_type_overall = int(equip_mappings["Max_types"].max())
type_cols = {t: get_type_columns(t) for t in range(1, max_type_overall + 1)}

# Initialize for storing data
all_el_updates = []

# For each labgroupid, read the data from the individual survey file
for labgroupid in labgroupids_el_done:

    el_path = el_surveys_folder / f"EL_{labgroupid}.xlsx"
    if not el_path.exists():
        continue  # no EL workbook for this lab group

    # Open the workbook once per lab group. Calling pd.read_excel(path, ...) for
    # every equipment row re-opened and re-parsed the same file each time.
    with pd.ExcelFile(el_path) as el_workbook:
        parsed_sheets = {}  # sheet name -> DataFrame; a sheet used by several rows is parsed once

        for row in equip_mappings.itertuples(index=False): # Loop over equipments

            equipment = row.Equipment_type
            max_types = int(row.Max_types)
            no_vars = int(row.No_Vars)

            if row.Survey_sheet not in parsed_sheets:
                parsed_sheets[row.Survey_sheet] = pd.read_excel(el_workbook, sheet_name=row.Survey_sheet)
            df_sheet = parsed_sheets[row.Survey_sheet]

            for type_no in range(1, max_types + 1): # Loop over types

                updates = {
                    "labgroupid": labgroupid,
                    "survey": "EL",
                    "equipment": equipment,
                    "type_no": type_no
                }

                col_index, co_col_index = type_cols[type_no] # columns

                for v in range(1, no_vars + 1): # Loop through variables
                    var_name = getattr(row, f"Variable_{v}")
                    # Positional row in the parsed sheet (read_excel's default header
                    # row is not counted), not the row number shown in Excel.
                    excel_row = 3 + v

                    val = df_sheet.iloc[excel_row, col_index]
                    co  = df_sheet.iloc[excel_row, co_col_index]

                    updates[var_name] = val
                    updates[f"{var_name}_co"] = co

                # Sharing variable: the row directly after the last variable row
                updates["share"] = df_sheet.iloc[no_vars+4, col_index]
                updates["share_co"] = df_sheet.iloc[no_vars+4, co_col_index]

                # EL-only check field: the row directly after the sharing row
                updates["el_check"] = df_sheet.iloc[no_vars+5, col_index]
                updates["el_check_co"] = df_sheet.iloc[no_vars+5, co_col_index]

                # Append to all_el_updates
                all_el_updates.append(updates)

# Create dataframe
el_updates_df = pd.DataFrame(all_el_updates)

In [9]:
# Stack the BL and EL extracts into one panel and write it to disk

# BL rows first, then EL rows, renumbered with a fresh 0..n-1 index
survey_frames = [bl_updates_df, el_updates_df]
panel_df = pd.concat(survey_frames, ignore_index=True)

# Save the combined panel as CSV in the processed-data folder, without the index column
panel_out_path = config.PROCESSED_DATA / "panel_processed_1.csv"
panel_df.to_csv(panel_out_path, index=False)

In [11]:
# Count the panel rows that carry no survey content at all, i.e. every column other
# than the identifiers is NaN or an empty string (whitespace-only strings do not count)
keep_cols = ["labgroupid", "survey", "equipment", "type_no"]
cols_to_check = panel_df.columns.difference(keep_cols)

content = panel_df[cols_to_check]
missing_everything = content.isna() | content.eq("")

# A row is empty only when all of its content cells are missing
row_is_empty = missing_everything.all(axis=1)
rows_missing_everything = row_is_empty.sum()
print(f"Number of empty rows: {rows_missing_everything}")

Number of empty rows: 11719
