# Cleaning 2.1 - Check for unique values for each variable (including free text and comment) in the panel dataset to inform cleaning procedure

In [1]:
# Set up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
# Add the parent of the current working directory to sys.path before importing
# `config` (presumably that is where the project's config module lives — confirm).
# NOTE(review): this depends on the directory the notebook is launched from.
CODE_ROOT = Path.cwd().parents[0]
sys.path.append(str(CODE_ROOT))
import config  # imported after the sys.path change above
from openpyxl import Workbook
from openpyxl.styles import Alignment
import os

In [2]:
# Read the processed panel CSV into the `equipment` dataframe
panel_csv_path = config.PROCESSED_DATA / "panel_processed_1.csv"
equipment = pd.read_csv(panel_csv_path)

  equipment = pd.read_csv(config.PROCESSED_DATA / "panel_processed_1.csv")


In [3]:
# Read the "Equipment" sheet of the data dictionary workbook
data_dictionary_path = config.DATA_DICTIONARIES / "data_dictionary.xlsx"
equipment_data_dict = pd.read_excel(data_dictionary_path, sheet_name="Equipment")

In [4]:
# Function to build unique value dataframe for a given variable, handling both single and multi-variable cases
def build_unique_df(equipment, var_name, free_text):
    """Return the distinct value combinations recorded for one variable.

    Parameters
    ----------
    equipment : pandas.DataFrame
        Dataset holding the variable's columns.
    var_name : str
        Base name of the variable; column names are derived from it.
    free_text : object
        When equal to ``"Y"`` the columns ``<var>_mc``, ``<var>_fc``,
        ``<var>_mc_co`` and ``<var>_fc_co`` are used; for any other value
        (including missing) the columns ``<var>`` and ``<var>_co`` are used.

    Returns
    -------
    pandas.DataFrame
        The selected columns with rows that are entirely missing removed,
        duplicate rows removed, and a fresh 0-based index.

    Raises
    ------
    KeyError
        If any of the derived column names is absent from ``equipment``.
    """
    if free_text == "Y":
        # Free-text variable: value columns first, then their "_co" companions
        selected = [
            f"{var_name}_mc",
            f"{var_name}_fc",
            f"{var_name}_mc_co",
            f"{var_name}_fc_co",
        ]
    else:
        selected = [var_name, f"{var_name}_co"]

    subset = equipment[selected]
    populated = subset.dropna(how="all")
    distinct = populated.drop_duplicates()
    return distinct.reset_index(drop=True)

In [5]:
# Export unique combinations for other qs to excel, creating one sheet per var


def _write_unique_sheet(workbook, var_name, df_unique, key_col_width):
    """Write ``df_unique`` to a new, formatted worksheet of ``workbook``.

    Parameters
    ----------
    workbook : openpyxl.Workbook
        Workbook that receives the new sheet.
    var_name : str
        Variable name; its first 31 characters become the sheet title.
    df_unique : pandas.DataFrame
        Table to write: column names go to row 1, data from row 2 onwards.
    key_col_width : int
        Width applied to columns A and B (columns C and D are always 50).

    Returns
    -------
    openpyxl.worksheet.worksheet.Worksheet
        The sheet that was created.
    """
    ws = workbook.create_sheet(title=var_name[:31])  # sheet name (limited to 31 chars)

    # Write header
    for col_idx, col_name in enumerate(df_unique.columns, start=1):
        ws.cell(row=1, column=col_idx, value=col_name)

    # Write data
    for row_idx, row_values in enumerate(df_unique.values, start=2):
        for col_idx, value in enumerate(row_values, start=1):
            ws.cell(row=row_idx, column=col_idx, value=value)

    # Adjust column widths (A and B share one width, C and D are fixed at 50)
    for letter, width in zip("ABCD", (key_col_width, key_col_width, 50, 50)):
        ws.column_dimensions[letter].width = width

    # Wrap all data cells (the header row is left as is)
    wrap = Alignment(wrap_text=True)
    for sheet_row in ws.iter_rows(min_row=2, min_col=1, max_col=ws.max_column, max_row=ws.max_row):
        for cell in sheet_row:
            cell.alignment = wrap

    return ws


wb = Workbook()
wb.remove(wb.active)  # remove default sheet

# Loop through variables listed in the data dictionary (A/B width 20)
for _, dict_row in equipment_data_dict.iterrows():
    var_name = dict_row["Variable"]
    free_text = dict_row["Free text"]

    df_unique = build_unique_df(equipment, var_name, free_text)
    df_unique = df_unique.fillna("")  # replace with empty for better excel display

    ws = _write_unique_sheet(wb, var_name, df_unique, key_col_width=20)

# Do the same for variables "share" and "el_check" (neither have free text components; A/B width 30)
for var_name in ["share", "el_check"]:
    df_unique = build_unique_df(equipment, var_name, "N")
    df_unique = df_unique.fillna("")  # replace with empty for better excel display

    ws = _write_unique_sheet(wb, var_name, df_unique, key_col_width=30)

# Save workbook
output_path = config.DATA_DICTIONARIES / "equipment_unique_combinations.xlsx"
wb.save(output_path)