# Cleaning 1.1 - Check for unique values for each variable (including free text and comment) in the individual dataset to inform cleaning procedure

In [1]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[0]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import Workbook
from openpyxl.styles import Alignment
import os

In [2]:
# Load data: read the processed individual-level CSV into a DataFrame
labs_csv_path = config.PROCESSED_DATA / "individual_processed_1.csv"
labs = pd.read_csv(labs_csv_path)

In [3]:
# Load survey dictionaries: only the "Other" sheet of the data dictionary workbook is read here
data_dictionary_path = config.DATA_DICTIONARIES / "data_dictionary.xlsx"
other_qs_data_dict = pd.read_excel(data_dictionary_path, sheet_name="Other")

In [4]:
# List of variables to check unique values for, taken from the dictionary's "Variable" column
variable_column = other_qs_data_dict["Variable"]
variables_to_check = variable_column.tolist()

In [5]:
# Function to build unique value dataframe for a given variable, handling both single and multi-variable cases
def build_unique_df(labs, var_name, comment, free_text, no_variables):
    """Return the distinct value combinations recorded for one variable.

    Args:
        labs: DataFrame holding the columns to inspect.
        var_name: Base column name of the variable.
        comment: "Y" when the companion ``<var_name>_co`` column(s) should be
            included; any other value leaves them out.
        free_text: "Y" when the companion ``<var_name>_fc`` column(s) should be
            included; any other value leaves them out.
        no_variables: ``None`` when the variable is stored in a single column.
            Otherwise the number of numbered columns (``<var_name>_1`` ..
            ``<var_name>_<no_variables>``), which are stacked on top of each
            other before de-duplicating.

    Returns:
        DataFrame with rows that are entirely missing removed, duplicates
        dropped and a fresh 0..n-1 index. In the single-column case the
        original column names are kept; in the stacked case the columns are
        renamed to "raw_value", "comment" and "free_text".
    """
    include_comment = comment == "Y"
    include_free_text = free_text == "Y"

    def _distinct_rows(frame):
        # Shared clean-up: drop all-missing rows, then duplicates, then renumber
        without_blank_rows = frame.dropna(how="all")
        deduplicated = without_blank_rows.drop_duplicates()
        return deduplicated.reset_index(drop=True)

    # Single variable case: select the columns as they are named in the data
    if no_variables is None:
        selected = [var_name]
        if include_comment:
            selected.append(f"{var_name}_co")
        if include_free_text:
            selected.append(f"{var_name}_fc")
        return _distinct_rows(labs[selected])

    # Multi variable case: give every numbered set of columns the same generic
    # names so the sets can be stacked into one table
    pieces = []
    for idx in range(1, no_variables + 1):
        # Insertion order of this mapping is also the column selection order
        generic_names = {f"{var_name}_{idx}": "raw_value"}
        if include_comment:
            generic_names[f"{var_name}_co_{idx}"] = "comment"
        if include_free_text:
            generic_names[f"{var_name}_fc_{idx}"] = "free_text"

        subset = labs[list(generic_names)]
        pieces.append(subset.rename(columns=generic_names))

    return _distinct_rows(pd.concat(pieces, ignore_index=True))

In [6]:
# Export unique combinations for other qs to excel, creating one sheet per var
wb = Workbook()
wb.remove(wb.active)  # remove default sheet; one sheet per variable is added below

# One style object is reused for every wrapped cell instead of building a new one per cell
wrap_alignment = Alignment(wrap_text=True)

# Loop through the data dictionary: each row describes one variable
for _, dict_row in other_qs_data_dict.iterrows():

    var_name = dict_row["Variable"]
    comment = dict_row["Comment"]
    free_text = dict_row["Free text"]
    # A missing "No variables" entry is passed on as None (single-column case)
    no_variables = int(dict_row["No variables"]) if pd.notna(dict_row["No variables"]) else None

    df_unique = build_unique_df(
        labs,
        var_name,
        comment,
        free_text,
        no_variables
    )

    df_unique = df_unique.fillna("")  # replace with empty for better excel display

    sheet_name = var_name[:31]  # sheet name (limited to 31 chars)
    ws = wb.create_sheet(title=sheet_name)  # create sheet for var

    # Write header (row 1) followed by one worksheet row per unique combination
    ws.append(list(df_unique.columns))
    for row_values in df_unique.values:
        ws.append(list(row_values))

    # Adjust column widths (A=50, B=50, C=20)
    # NOTE(review): the previous comment said A=20 while the code set 50; the
    # code's value is kept here - confirm which width was intended.
    ws.column_dimensions['A'].width = 50
    ws.column_dimensions['B'].width = 50
    ws.column_dimensions['C'].width = 20

    # Wrap all columns (data rows only, header is left as is).
    # The loop variable is deliberately not called `row`, so it cannot be
    # confused with the data dictionary row of the outer loop.
    for sheet_row in ws.iter_rows(min_row=2, min_col=1, max_col=ws.max_column, max_row=ws.max_row):
        for cell in sheet_row:
            cell.alignment = wrap_alignment


# Save workbook
output_path = config.DATA_DICTIONARIES / "other_qs_unique_combinations.xlsx"
wb.save(output_path)

In [7]:
# Export unique combinations for checklist qs to excel, combining all into one sheet (for ease of checking)
wb = Workbook()
ws = wb.active
ws.title = "All checklist qs"


def _checklist_unique_parts(data, level, n_questions):
    """Collect unique (value, comment) pairs for every question of one checklist level.

    Args:
        data: DataFrame containing the ``<level>_q_<i>_<suffix>`` columns and
            their ``_co`` companions.
        level: Checklist level prefix used in the column names (e.g. "bronze").
        n_questions: Number of questions in that level; questions are numbered
            from 1 to ``n_questions`` inclusive.

    Returns:
        List with one DataFrame per question and suffix ("bl" then "el"), each
        with the columns "raw_value" and "comment", all-missing rows removed
        and duplicates dropped.
    """
    parts = []
    for question_no in range(1, n_questions + 1):
        for suffix in ["bl", "el"]:
            value_col = f"{level}_q_{question_no}_{suffix}"
            comment_col = f"{value_col}_co"
            part = (
                data[[value_col, comment_col]]
                .dropna(how="all")
                .drop_duplicates()
                .reset_index(drop=True)
            )
            # Generic names so that all questions can be stacked into one table
            part.columns = ["raw_value", "comment"]
            parts.append(part)
    return parts


# Loop through variables (16 bronze, 18 silver, 15 gold) and create combined dataframe for all checklist qs
# (stacking them on top of each other to reduce checking effort)
bronze_qs = _checklist_unique_parts(labs, "bronze", 16)
silver_qs = _checklist_unique_parts(labs, "silver", 18)
gold_qs = _checklist_unique_parts(labs, "gold", 15)

# Combine all checklist qs into one dataframe; duplicates are dropped again
# because the same combination can occur under several questions
df_unique = pd.concat(bronze_qs + silver_qs + gold_qs,
                        ignore_index=True).dropna(how="all").drop_duplicates().reset_index(drop=True)

df_unique = df_unique.fillna("")  # replace with empty for better excel display

# Write header (row 1) followed by one worksheet row per unique combination
ws.append(list(df_unique.columns))
for row_values in df_unique.values:
    ws.append(list(row_values))

# Adjust columns widths (A = 15, B = 50)
ws.column_dimensions['A'].width = 15
ws.column_dimensions['B'].width = 50

# Wrap column B (comment); one style object is reused for every cell
wrap_alignment = Alignment(wrap_text=True)
for sheet_row in ws.iter_rows(min_row=2, min_col=2, max_col=2, max_row=ws.max_row):
    for cell in sheet_row:
        cell.alignment = wrap_alignment

# Save workbook
output_path = config.DATA_DICTIONARIES / "checklist_qs_unique_combinations.xlsx"
wb.save(output_path)