# Cleaning 1.2 - Clean unique values in individual dataset

In [1]:
# Indicator for new variables to be cleaned.
# Intended default is False; set to True when more variables should be cleaned.
# When True, the workbook set-up cell further down calls create_empty_cleaning_sheet
# for every data-dictionary variable and for the shared "Checklist" sheet.
# NOTE: currently switched on (True), i.e. not at the stated default.
new_vars_to_clean = True

In [2]:
# Set-up
import pandas as pd
import numpy as np
import sys
from pathlib import Path
CODE_ROOT = Path.cwd().parents[0]
sys.path.append(str(CODE_ROOT))
import config
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment
import os
from create_empty_cleaning_sheet import create_empty_cleaning_sheet
from unique_values_cleaning import clean_unique_values

In [3]:
# Load data
# Read the processed individual dataset (individual_processed_1.csv) into `labs`
individual_data_path = config.PROCESSED_DATA / "individual_processed_1.csv"
labs = pd.read_csv(individual_data_path)

In [4]:
# Load data dictionaries
# Only the "Other" sheet of the data dictionary workbook is read here
data_dictionary_path = config.DATA_DICTIONARIES / "data_dictionary.xlsx"
other_qs_data_dict = pd.read_excel(data_dictionary_path, sheet_name="Other")

In [5]:
# Create or update the excel workbook with empty sheets.
# Skipped entirely unless new variables have been flagged for cleaning.
if new_vars_to_clean:

    # Path of the cleaning workbook
    file_name = config.CLEANING_WORKBOOKS / "individual_cleaning_workbook.xlsx"

    # Other questions: one sheet per data-dictionary row, named after the variable.
    # The "Comment" / "Free text" columns are turned into booleans ("Y" -> True).
    for _, dict_row in other_qs_data_dict.iterrows():
        create_empty_cleaning_sheet(
            file_name,
            dict_row["Variable"],
            dict_row["Comment"] == "Y",
            dict_row["Free text"] == "Y",
        )

    # Checklist variables: a single shared sheet for all of them
    create_empty_cleaning_sheet(file_name, "Checklist", comment=True)

Sheet survey_date_bl already exists. No changes made.
Sheet survey_date_el already exists. No changes made.
Sheet faculty already exists. No changes made.
Sheet institute already exists. No changes made.
Sheet no_researchers already exists. No changes made.
Sheet no_ft already exists. No changes made.
Sheet no_pt already exists. No changes made.
Sheet share_equip_ind already exists. No changes made.
Sheet share_space_ind already exists. No changes made.
Sheet share_space_freq already exists. No changes made.
Sheet comm_group already exists. No changes made.
Sheet comm_freq_group already exists. No changes made.
Sheet consent_data_merge already exists. No changes made.
Sheet checklist_discussion already exists. No changes made.
Sheet spark_awareness already exists. No changes made.
Sheet spark_awareness_when already exists. No changes made.
Sheet consent_attitudes already exists. No changes made.
Sheet attitude_q already exists. No changes made.
Sheet waste_recycle already exists. No ch

In [6]:
# Run cleaning loop. Per the notebook's notes, each clean_unique_values call will:
# 1. Merge our data to the cleaning workbook
# 2. Create cleaned variables
# 3. Produce a report of the cleaning process
# 4. Update the cleaning workbook with all uncleaned values

file_name = config.CLEANING_WORKBOOKS / "individual_cleaning_workbook.xlsx"

# Other questions: one sheet per data-dictionary row. A row whose "No variables"
# entry is greater than 1 maps to several columns suffixed _1, _2, ... that all
# share the sheet named after the base variable.
for _, dict_row in other_qs_data_dict.iterrows():
    base_name = dict_row["Variable"]
    has_comment = dict_row["Comment"] == "Y"
    has_free_text = dict_row["Free text"] == "Y"
    no_vars = dict_row["No variables"]
    n_copies = int(no_vars) if pd.notna(no_vars) else None

    # A missing (NaN) count compares False against 1, so such rows fall through
    # to the single-variable case.
    if no_vars > 1:
        target_vars = [f"{base_name}_{k}" for k in range(1, n_copies + 1)]
    else:
        target_vars = [base_name]

    for target_var in target_vars:
        labs = clean_unique_values(
            df=labs,
            file_name=file_name,
            sheet_name=base_name,
            var_name=target_var,
            comment=has_comment,
            free_text=has_free_text,
            dtype="string",
            report=True,
        )

# Checklist variables (16 bronze, 18 silver, 15 gold for both bl and el, one sheet for all).
# Each entry: (tier prefix, number of questions, extra keyword arguments for that tier).
# NOTE(review): bronze is the only tier that does not pass mc_fc_vars=False, so it
# relies on clean_unique_values' default -- confirm this difference is intentional.
checklist_tiers = (
    ("bronze", 16, {}),
    ("silver", 18, {"mc_fc_vars": False}),
    ("gold", 15, {"mc_fc_vars": False}),
)

for wave in ("bl", "el"):
    for tier, n_questions, tier_kwargs in checklist_tiers:
        for q in range(1, n_questions + 1):
            labs = clean_unique_values(
                df=labs,
                file_name=file_name,
                sheet_name="Checklist",
                var_name=f"{tier}_q_{q}_{wave}",
                comment=True,
                free_text=False,
                dtype="string",
                report=True,
                **tier_kwargs,
            )

Cleaning progress for survey_date_bl:
Total unique value combinations: 45
Cleaned combinations: 17
Pending combinations: 0
Excluded combinations: 0
Unchecked combinations: 28
Cleaning progress for survey_date_el:
Total unique value combinations: 62
Cleaned combinations: 29
Pending combinations: 3
Excluded combinations: 0
Unchecked combinations: 30
Cleaning progress for faculty:
Total unique value combinations: 11
Cleaned combinations: 11
Pending combinations: 0
Excluded combinations: 0
Unchecked combinations: 0
Cleaning progress for institute:
Total unique value combinations: 83
Cleaned combinations: 83
Pending combinations: 0
Excluded combinations: 0
Unchecked combinations: 0
Cleaning progress for no_researchers:
Total unique value combinations: 53
Cleaned combinations: 0
Pending combinations: 2
Excluded combinations: 0
Unchecked combinations: 51
Cleaning progress for no_ft:
Total unique value combinations: 24
Cleaned combinations: 0
Pending combinations: 0
Excluded combinations: 0
Un