## Dataset Inventory Loader

In [1]:
!pip install rapidfuzz



In [2]:
import os
import re

base_path = r"/Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey"

# Month ordering
month_order = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}

# Patterns
month_pattern = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    re.IGNORECASE
)
year_pattern = re.compile(r"(20\d{2})")

# Detect year folders from drive
year_folders = [
    f for f in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, f)) and f.isdigit()
]

print("Detected year folders:", sorted(year_folders))

inventory = {}

for year in sorted(year_folders):
    year_path = os.path.join(base_path, year)

    # Accept both CSV and XLSX
    data_files = [
        f for f in os.listdir(year_path)
        if f.lower().endswith(".csv") or f.lower().endswith(".xlsx")
    ]

    inventory[year] = {}

    for file in data_files:
        upper = file.upper()

        # Detect type
        if upper.endswith(".XLSX"):
            filetype = "metadata"  # XLSX = metadata
        else:
            filetype = "survey"    # CSV = survey

        # Detect month
        month_match = month_pattern.search(upper)
        month = (
            month_match.group(1).capitalize()
            if month_match
            else "Unmatched"
        )

        # Detect year inside filename
        year_match = year_pattern.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        # Store into inventory
        if month not in inventory[year]:
            inventory[year][month] = []

        inventory[year][month].append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })

# Print clean summary
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for yr in sorted(inventory.keys()):
    print(f"Year {yr}:")

    sorted_months = sorted(
        inventory[yr].keys(),
        key=lambda m: month_order.get(m, 99)
    )

    for month in sorted_months:
        print(f"  {month}:")
        for item in inventory[yr][month]:
            print(f"    {item['filename']} ({item['filetype']})")

    print()


Detected year folders: ['2018', '2019', '2022', '2023', '2024']

=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018_METADATA.xlsx (metadata)
    JANUARY_2018.CSV (survey)
  April:
    APRIL_2018_METADATA.xlsx (metadata)
    APRIL_2018.CSV (survey)
  July:
    JULY_2018_METADATA.xlsx (metadata)
    JULY_2018.CSV (survey)
  October:
    OCTOBER_2018_METADATA.xlsx (metadata)
    OCTOBER_2018.CSV (survey)

Year 2019:
  January:
    JANUARY_2019_METADATA.xlsx (metadata)
    JANUARY_2019.CSV (survey)
  April:
    APRIL_2019_METADATA.xlsx (metadata)
    APRIL_2019.CSV (survey)
  July:
    JULY_2019_METADATA.xlsx (metadata)
    JULY_2019.CSV (survey)
  October:
    OCTOBER_2019_METADATA.xlsx (metadata)
    OCTOBER_2019.CSV (survey)

Year 2022:
  January:
    JANUARY_2022_METADATA.xlsx (metadata)
    JANUARY_2022.csv (survey)
  February:
    FEBRUARY_2022.csv (survey)
    FEBRUARY_2022_METADATA.xlsx (metadata)
  March:
    MARCH_2022_METADATA.xlsx (metadata)
    MARCH_202

## Load Dataset Function

In [3]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load a dataset from the inventory.

    year: str, e.g., "2018"
    month: str, e.g., "January"
    filetype: "survey" or "metadata"
    sheet_number: 0(sheet 1) or 1(sheet 2)
    """
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )
    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])
    
    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)
    
    if sheet_number is not None:
        return pd.read_excel(file_path, sheet_name=sheet_number)
    
    return pd.read_excel(file_path)

Sample: January 2018 Survey

In [4]:
import pandas as pd

# Load the survey sheet of January 2018 metadata
jan_2018_survey = load_dataset("2018", "January","survey")

# View the first few rows
jan_2018_survey.head()

Unnamed: 0,PUFREG,PUFPRV,PUFPRRCD,PUFHHNUM,PUFURB2K10,PUFPWGTPRV,PUFSVYMO,PUFSVYYR,PUFPSU,PUFRPL,...,PUFC33_WEEKS,PUFC34_WYNOT,PUFC35_LTLOOKW,PUFC36_AVAIL,PUFC37_WILLING,PUFC38_PREVJOB,PUFC40_POCC,PUFC41_WQTR,PUFC43_QKB,PUFNEWEMPSTAT
0,14,1,100,1,2,124.9425,1,2018,140,32,...,,6.0,,,,1.0,52.0,2.0,,3.0
1,14,1,100,1,2,131.2126,1,2018,140,32,...,,,,,,,,1.0,1.0,1.0
2,14,1,100,1,2,142.0464,1,2018,140,32,...,,,,,,,,1.0,1.0,1.0
3,14,1,100,1,2,138.2958,1,2018,140,32,...,,,,,,,,,,
4,14,1,100,2,2,195.4152,1,2018,140,32,...,,,,,,,,1.0,41.0,1.0


## Metadata Sheet 1

<H5> Sample: January 2018 Metadata Sheet 1 (Raw) </H5>

In [5]:
# Load the first sheet of January 2018 metadata
january_2018_metadata_sheet1 = load_dataset("2018", "January", "metadata", 0)

# View the first few rows
print("=== January 2018 Metadata Sheet 1 (Raw) ===")
january_2018_metadata_sheet1.head()


=== January 2018 Metadata Sheet 1 (Raw) ===


Unnamed: 0,QUEST,Questionnaire,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,_IDS0,(Id Items),,
1,,,,,PUFREG,Region
2,,,,,PUFPRV,Province
3,,,,,PUFPRRCD,Province Recode
4,,,,,PUFHHNUM,Household Unique Sequential Number


#### Reshaping Metadata Sheet 1

In [6]:
import pandas as pd

def extract_variables(df):
    """
    Extract variable names and descriptions from metadata Sheet 1 (variable dictionary).
    Automatically reads the 4th and 5th columns (E and F in Excel) where variables and descriptions reside.
    
    Returns a clean DataFrame with columns ['Variable', 'Description'].
    """
    
    # Select the 4th and 5th columns (index 4 and 5)
    df_vars = df.iloc[:, 4:6].copy()
    
    # Rename columns
    df_vars.columns = ['Variable', 'Description']
    
    # Drop rows where 'Variable' is empty or NaN
    df_vars = df_vars[df_vars['Variable'].notna() & (df_vars['Variable'].astype(str).str.strip() != '')]
    
    # Strip whitespace from values
    df_vars['Variable'] = df_vars['Variable'].astype(str).str.strip()
    df_vars['Description'] = df_vars['Description'].astype(str).str.strip()
    
    # Reset index
    df_vars = df_vars.reset_index(drop=True)
    
    return df_vars


### Metadata Sheet 1 Reshaped Saving Function

In [7]:
import os
import pandas as pd

def batch_process_sheet1_metadata(inventory, base_output_path):
    """
    Loops through the entire inventory, loads Sheet 1 of the metadata,
    reshapes it, and saves it into a structured folder hierarchy.
    
    Provides a text-based summary report for assurance.
    """
    
    # Counters for the summary report
    success_count = 0
    failure_count = 0
    skipped_count = 0
    errors_log = []

    # 1. Define and Create the Main Parent Folder
    main_folder_name = "Metadata Sheet 1 CSV's"
    main_folder_path = os.path.join(base_output_path, main_folder_name)
    os.makedirs(main_folder_path, exist_ok=True)
    
    print("--- STARTING BATCH PROCESS ---")
    print(f"Target Directory: {main_folder_path}")
    print("-" * 50)

    # 2. Iterate through Years in the Inventory
    for year, months_data in inventory.items():
        
        # Create the Year Subfolder
        year_folder_path = os.path.join(main_folder_path, year)
        os.makedirs(year_folder_path, exist_ok=True)
        
        # 3. Iterate through Months in that Year
        for month, files_list in months_data.items():
            
            if month == "Unmatched":
                continue
            
            # Check for metadata file existence
            has_metadata = any(f.get('filetype') == 'metadata' for f in files_list)
            
            if has_metadata:
                try:
                    # A. Load the Data (Sheet 0 = Sheet 1)
                    raw_df = load_dataset(year, month, "metadata", 0)
                    
                    # B. Reshape the Data
                    clean_df = extract_variables(raw_df)
                    
                    # C. Save to CSV
                    filename = f"Sheet1_{month}_{year}.csv"
                    full_save_path = os.path.join(year_folder_path, filename)
                    
                    clean_df.to_csv(full_save_path, index=False)
                    
                    # Print confirmation for this specific file
                    print(f"[OK] Saved: {year}/{filename}")
                    success_count += 1
                    
                except Exception as e:
                    print(f"[ERROR] Failed {month} {year}: {e}")
                    errors_log.append(f"{month} {year}: {str(e)}")
                    failure_count += 1
            else:
                skipped_count += 1

    # 4. Final Assurance Report
    print("\n" + "="*40)
    print("      PROCESSING SUMMARY REPORT")
    print("="*40)
    print(f"Total Successfully Saved: {success_count}")
    print(f"Total Failed:             {failure_count}")
    print(f"Total Skipped (No File):  {skipped_count}")
    print("-" * 40)
    
    if failure_count == 0:
        print("STATUS: COMPLETE SUCCESS")
        print(f"All files are now located in: {main_folder_path}")
        print("Google Drive is syncing these files now.")
    else:
        print("STATUS: COMPLETED WITH ERRORS")
        print("Check the errors log above.")
        if errors_log:
            print("\nError Details:")
            for err in errors_log:
                print(f" - {err}")
    print("="*40)

In [8]:
# Run the processor
batch_process_sheet1_metadata(inventory, base_path)

--- STARTING BATCH PROCESS ---
Target Directory: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Metadata Sheet 1 CSV's
--------------------------------------------------
[OK] Saved: 2018/Sheet1_January_2018.csv
[OK] Saved: 2018/Sheet1_October_2018.csv
[OK] Saved: 2018/Sheet1_April_2018.csv
[OK] Saved: 2018/Sheet1_July_2018.csv
[OK] Saved: 2019/Sheet1_April_2019.csv
[OK] Saved: 2019/Sheet1_July_2019.csv
[OK] Saved: 2019/Sheet1_October_2019.csv
[OK] Saved: 2019/Sheet1_January_2019.csv
[OK] Saved: 2022/Sheet1_December_2022.csv
[OK] Saved: 2022/Sheet1_February_2022.csv
[OK] Saved: 2022/Sheet1_August_2022.csv
[OK] Saved: 2022/Sheet1_March_2022.csv
[OK] Saved: 2022/Sheet1_September_2022.csv
[OK] Saved: 2022/Sheet1_October_2022.csv
[OK] Saved: 2022/Sheet1_January_2022.csv
[OK] Saved: 2022/Sheet1_May_2022.csv
[OK] Saved: 2022/Sheet1_November_2022.csv
[OK] Saved: 2022/Sheet1_July_2022.csv
[OK] Saved: 2022/Sheet1_June_2022.csv
[OK] 

#### Verifying if the variable and description counts of Reshaped Metadata Sheet 1 and Original matches

In [9]:
def batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path):
    """
    Iterates through all years and months in the inventory and compares
    total variables and descriptions in raw vs reshaped Sheet 1 metadata.
    Prints mismatches immediately, and returns a DataFrame with all results.
    """

    results = []

    for year, months_data in inventory.items():
        for month, files_list in months_data.items():
            if month == "Unmatched":
                continue  # Skip unmatched files

            # --- Load raw Sheet 1 ---
            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=0)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 1 ({e})")
                results.append({
                    'Year': year,
                    'Month': month,
                    'Raw Variable Count': 'ERROR',
                    'Reshaped Variable Count': 'ERROR',
                    'Raw Description Count': 'ERROR',
                    'Reshaped Description Count': 'ERROR',
                    'Status': f'FAIL (Raw load error: {e})'
                })
                continue

            # --- Load reshaped CSV Sheet 1 ---
            reshaped_file_path = os.path.join(
                base_path, "Metadata Sheet 1 CSV's", year, f"Sheet1_{month}_{year}.csv"
            )
            if not os.path.exists(reshaped_file_path):
                print(f"[ERROR] {month} {year}: Reshaped Sheet 1 CSV missing!")
                results.append({
                    'Year': year,
                    'Month': month,
                    'Raw Variable Count': 'ERROR',
                    'Reshaped Variable Count': 'ERROR',
                    'Raw Description Count': 'ERROR',
                    'Reshaped Description Count': 'ERROR',
                    'Status': 'FAIL (Reshaped CSV missing)'
                })
                continue

            reshaped_df = pd.read_csv(reshaped_file_path)

            # --- Count non-empty variables & descriptions ---
            raw_vars = raw_df.iloc[:, 4].dropna().astype(str).str.strip()
            raw_vars = raw_vars[raw_vars != '']
            raw_descs = raw_df.iloc[:, 5].dropna().astype(str).str.strip()
            raw_descs = raw_descs[raw_descs != '']

            reshaped_vars = reshaped_df['Variable'].astype(str).str.strip()
            reshaped_vars = reshaped_vars[reshaped_vars != '']
            reshaped_descs = reshaped_df['Description'].astype(str).str.strip()
            reshaped_descs = reshaped_descs[reshaped_descs != '']

            # --- PASS / FAIL ---
            status = "PASS" if (len(raw_vars) == len(reshaped_vars) and len(raw_descs) == len(reshaped_descs)) else "FAIL"

            if status == "FAIL":
                print(f"[MISMATCH] {month} {year} - Variables: {len(raw_vars)} vs {len(reshaped_vars)}, "
                      f"Descriptions: {len(raw_descs)} vs {len(reshaped_descs)}")

            results.append({
                'Year': year,
                'Month': month,
                'Raw Variable Count': len(raw_vars),
                'Reshaped Variable Count': len(reshaped_vars),
                'Raw Description Count': len(raw_descs),
                'Reshaped Description Count': len(reshaped_descs),
                'Status': status
            })

    df = pd.DataFrame(results).sort_values(['Year', 'Month']).reset_index(drop=True)

    # ---------- NEW SUCCESS MESSAGE ----------
    total = len(df)
    passed = (df['Status'] == 'PASS').sum()
    failed = total - passed

    if failed == 0:
        print("\nSUCCESS: All variables and descriptions have been reshaped correctly!\n")
    else:
        print(f"\nCompleted with issues: {passed} PASS, {failed} FAIL.\n")

    return df


In [10]:
verification_df = batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path)

print("=== Sheet 1 Metadata Variables and Descriptions (Raw vs Reshaped) ===")
verification_df


SUCCESS: All variables and descriptions have been reshaped correctly!

=== Sheet 1 Metadata Variables and Descriptions (Raw vs Reshaped) ===


Unnamed: 0,Year,Month,Raw Variable Count,Reshaped Variable Count,Raw Description Count,Reshaped Description Count,Status
0,2018,April,50,50,50,50,PASS
1,2018,January,50,50,50,50,PASS
2,2018,July,51,51,51,51,PASS
3,2018,October,51,51,51,51,PASS
4,2019,April,49,49,49,49,PASS
5,2019,January,49,49,49,49,PASS
6,2019,July,49,49,49,49,PASS
7,2019,October,49,49,49,49,PASS
8,2022,April,52,52,52,52,PASS
9,2022,August,42,42,42,42,PASS


In [11]:
def batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path):
    """
    Iterates through all years and months in the inventory and compares
    total variables and descriptions in raw vs reshaped Sheet 1 metadata.
    Prints mismatches immediately, and returns a DataFrame with all results.
    """

    results = []

    for year, months_data in inventory.items():
        for month, files_list in months_data.items():
            if month == "Unmatched":
                continue  # Skip unmatched files

            # --- Load raw Sheet 1 ---
            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=0)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 1 ({e})")
                results.append({
                    'Year': year,
                    'Month': month,
                    'Raw Variable Count': 'ERROR',
                    'Reshaped Variable Count': 'ERROR',
                    'Raw Description Count': 'ERROR',
                    'Reshaped Description Count': 'ERROR',
                    'Status': f'FAIL (Raw load error: {e})'
                })
                continue

            # --- Load reshaped CSV Sheet 1 ---
            reshaped_file_path = os.path.join(
                base_path, "Metadata Sheet 1 CSV's", year, f"Sheet1_{month}_{year}.csv"
            )
            if not os.path.exists(reshaped_file_path):
                print(f"[ERROR] {month} {year}: Reshaped Sheet 1 CSV missing!")
                results.append({
                    'Year': year,
                    'Month': month,
                    'Raw Variable Count': 'ERROR',
                    'Reshaped Variable Count': 'ERROR',
                    'Raw Description Count': 'ERROR',
                    'Reshaped Description Count': 'ERROR',
                    'Status': 'FAIL (Reshaped CSV missing)'
                })
                continue

            reshaped_df = pd.read_csv(reshaped_file_path)

            # --- Count non-empty variables and descriptions ---
            raw_vars = raw_df.iloc[:, 4].dropna().astype(str).str.strip()
            raw_vars = raw_vars[raw_vars != '']
            raw_descs = raw_df.iloc[:, 5].dropna().astype(str).str.strip()
            raw_descs = raw_descs[raw_descs != '']

            reshaped_vars = reshaped_df['Variable'].astype(str).str.strip()
            reshaped_vars = reshaped_vars[reshaped_vars != '']
            reshaped_descs = reshaped_df['Description'].astype(str).str.strip()
            reshaped_descs = reshaped_descs[reshaped_descs != '']

            # --- Check if both counts match ---
            status = "PASS" if (len(raw_vars) == len(reshaped_vars) and len(raw_descs) == len(reshaped_descs)) else "FAIL"

            if status == "FAIL":
                # Immediate print for any mismatch
                print(f"[MISMATCH] {month} {year} - Variables: {len(raw_vars)} vs {len(reshaped_vars)}, "
                      f"Descriptions: {len(raw_descs)} vs {len(reshaped_descs)}")

            results.append({
                'Year': year,
                'Month': month,
                'Raw Variable Count': len(raw_vars),
                'Reshaped Variable Count': len(reshaped_vars),
                'Raw Description Count': len(raw_descs),
                'Reshaped Description Count': len(reshaped_descs),
                'Status': status
            })

    return pd.DataFrame(results).sort_values(['Year', 'Month']).reset_index(drop=True)

In [12]:
# Run the Sheet 1 verifier
verification_df = batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path)

# Print a header and show the first few rows
print("=== Sheet 1 Metadata Variables and Descriptions (Raw vs Reshaped) ===")
verification_df.head()

=== Sheet 1 Metadata Variables and Descriptions (Raw vs Reshaped) ===


Unnamed: 0,Year,Month,Raw Variable Count,Reshaped Variable Count,Raw Description Count,Reshaped Description Count,Status
0,2018,April,50,50,50,50,PASS
1,2018,January,50,50,50,50,PASS
2,2018,July,51,51,51,51,PASS
3,2018,October,51,51,51,51,PASS
4,2019,April,49,49,49,49,PASS


Checking January 2018 Metadata Reshaped Sheet 1

In [13]:
# Load metadata Sheet 1
January_metadata = load_dataset("2018", "January", "metadata", 0)

# Call your function
variables_df = extract_variables(January_metadata)

# View results
variables_df.head()

Unnamed: 0,Variable,Description
0,PUFREG,Region
1,PUFPRV,Province
2,PUFPRRCD,Province Recode
3,PUFHHNUM,Household Unique Sequential Number
4,PUFURB2K10,2010Urban-RuralFIES


Checking August 2024 Metadata Reshaped Sheet 1

In [14]:
# Load metadata Sheet 1
August_2024_metadata = load_dataset("2024", "August", "metadata", 0)

# Call your function
variables_df = extract_variables(August_2024_metadata)

# View results
variables_df.head()

Unnamed: 0,Variable,Description
0,PUFHHNUM,Household Unique Sequential Number
1,PUFPWGTPRV,Final Weight Based on Projection
2,PUFSVYMO,Survey Month
3,PUFSVYYR,Survey Year
4,PUFPSU,Psu Number


## Metadata Sheet 2 Function

<H5> Sample: January 2018 Metadata Sheet 2 (Raw)</H5>

In [15]:
# Load the second sheet of January 2018 metadata
january_2018_metadata_sheet2 = load_dataset("2018", "January", "metadata", 1)

# View the first few rows
print("=== January 2018 Metadata Sheet 2 (Raw) ===")
january_2018_metadata_sheet2.head()

=== January 2018 Metadata Sheet 2 (Raw) ===


Unnamed: 0,PUFREG_VS1,Region,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,National Capital Region,13,,
1,,,Cordillera Administrative Region,14,,
2,,,Region I - Ilocos Region,1,,
3,,,Region II - Cagayan Valley,2,,
4,,,Region III - Central Luzon,3,,


### Reshaping Metadata Sheet 2

In [16]:
import os
import pandas as pd


def reshape_sheet2_robust(df):
    """
    Convert metadata Sheet 2 (the values dictionary) into a clean, long-format table.

    This function reads the sheet exactly as it appears in Excel, without:
    - Assuming any header row
    - Auto-filling missing values
    - Inferencing min/max values
    - Guessing variable names

    Sheet 2 typically has this layout:
        Column A = Variable name (only appears once per block)
        Column B = Variable description (blank except at the start of a block)
        Column C = Label for each value (required)
        Column D = Minimum value (optional)
        Column E = Maximum value (optional)
        Column F+ = Additional text or category notes (optional)

    The function processes rows in order and:
        - Carries forward the most recent non-empty variable name (Column A)
        - Carries forward the most recent non-empty description (Column B)
        - Creates one output row per value label (Column C)
        - Leaves missing min/max/additional values as 0
        - Reads extra info (Column F onward) if present

    Returns:
        A clean pandas DataFrame with columns:
            Variable
            Description
            Label
            min_value
            max_value
            additional_value
    """

    reshaped = []

    # Ensure all blanks are handled consistently
    df = df.fillna('').astype(str)

    # Initialize with the first variable and description
    current_var = df.iloc[0, 0].strip() or 'UNKNOWN_VAR'
    current_desc = df.iloc[0, 1].strip() or ''

    # Iterate row-by-row
    for idx, row in df.iterrows():
        # ---- Column A: Variable name ----
        var_candidate = row.iloc[0].strip()
        if var_candidate:
            current_var = var_candidate

        # ---- Column B: Description ----
        desc_candidate = row.iloc[1].strip()
        if desc_candidate:
            current_desc = desc_candidate

        # ---- PRE-READ Columns D, E, F (Values) ----
        raw_min = row.iloc[3].strip()
        raw_max = row.iloc[4].strip()
        
        # Look for extra values (Column F+)
        extra = '0'
        if len(row) > 5:
            for j in range(5, len(row)):
                extra_candidate = row.iloc[j].strip()
                if extra_candidate:
                    extra = extra_candidate
                    break

        # ---- Column C: Label ----
        label = row.iloc[2].strip()

        # FIX: Don't just continue. Check if values exist.
        if not label:
            # If label is missing BUT we have min, max, or extra -> It's a valid row
            if raw_min or raw_max or extra != '0':
                label = '0'  # Assign default label
            else:
                continue     # Skip only if truly empty

        # ---- Finalize Min/Max ----
        min_value = raw_min if raw_min else '0'
        max_value = raw_max if raw_max else '0'

        # ---- Append clean record ----
        reshaped.append({
            "Variable": current_var,
            "Description": current_desc,
            "Label": label,
            "min_value": min_value,
            "max_value": max_value,
            "additional_value": extra
        })

    return pd.DataFrame(reshaped)


# ============================================================
#   load_dataset()
# ============================================================
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load any dataset (survey or metadata) from the file inventory.

    • For SURVEY CSV: normal pandas.read_csv()
    • For METADATA Excel: read with no header, reshape Sheet 2 automatically
    """
    # Retrieve file information from inventory
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )

    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])

    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)

    # Metadata Excel — always read with no header
    df = pd.read_excel(file_path, sheet_name=sheet_number, header=None)

    # Automatic reshaping ONLY for metadata Sheet 2
    if sheet_number == 1:
        df = reshape_sheet2_robust(df)

    return df


### Metadata Sheet 2 Reshaped Saving Function

In [17]:
import os
import pandas as pd

def batch_process_sheet2_metadata(inventory, base_output_path):
    """
    Loops through the inventory to process 'Sheet 2' (Value Codes).
    """
    
    # Counters for the summary report
    success_count = 0
    failure_count = 0
    skipped_count = 0
    errors_log = []

    # 1. Define Main Folder Name
    main_folder_name = "Metadata Sheet 2 CSV's"
    main_folder_path = os.path.join(base_output_path, main_folder_name)
    os.makedirs(main_folder_path, exist_ok=True)
    
    print("--- STARTING BATCH PROCESS (SHEET 2) ---")
    print(f"Target Directory: {main_folder_path}")
    print("-" * 50)

    # 2. Iterate through Inventory
    for year, months_data in inventory.items():
        
        # Create Year Subfolder
        year_folder_path = os.path.join(main_folder_path, year)
        os.makedirs(year_folder_path, exist_ok=True)
        
        for month, files_list in months_data.items():
            # Skip unmatched files
            if month == "Unmatched":
                continue
            
            # Check if metadata exists for this month
            has_metadata = any(f.get('filetype') == 'metadata' for f in files_list)
            
            if has_metadata:
                try:
                    # A. Load & Reshape
                    # Your load_dataset function handles the cleaning internally
                    clean_df = load_dataset(year, month, "metadata", 1)
                    
                    # B. Generate Filename
                    filename = f"Sheet2_{month}_{year}.csv"
                    full_save_path = os.path.join(year_folder_path, filename)
                    
                    # C. Save
                    clean_df.to_csv(full_save_path, index=False)
                    
                    print(f"[OK] Saved: {year}/{filename}")
                    success_count += 1
                    
                except Exception as e:
                    print(f"[ERROR] Failed {month} {year}: {e}")
                    errors_log.append(f"{month} {year}: {str(e)}")
                    failure_count += 1
            else:
                skipped_count += 1

    # 3. Final Report
    print("\n" + "="*40)
    print("      SHEET 2 PROCESSING SUMMARY")
    print("="*40)
    print(f"Total Saved:    {success_count}")
    print(f"Total Failed:   {failure_count}")
    print(f"Total Skipped:  {skipped_count}")
    print("-" * 40)
    
    if failure_count == 0:
        print("STATUS: COMPLETE SUCCESS")
        print(f"Files are syncing to: {main_folder_path}")
    else:
        print("STATUS: COMPLETED WITH ERRORS")
        for err in errors_log:
            print(f" - {err}")
    print("="*40)

In [18]:
# Run the processor
# (Requires 'inventory' and 'load_dataset' to be defined in your environment)
batch_process_sheet2_metadata(inventory, base_path)

--- STARTING BATCH PROCESS (SHEET 2) ---
Target Directory: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Metadata Sheet 2 CSV's
--------------------------------------------------
[OK] Saved: 2018/Sheet2_January_2018.csv
[OK] Saved: 2018/Sheet2_October_2018.csv
[OK] Saved: 2018/Sheet2_April_2018.csv
[OK] Saved: 2018/Sheet2_July_2018.csv
[OK] Saved: 2019/Sheet2_April_2019.csv
[OK] Saved: 2019/Sheet2_July_2019.csv
[OK] Saved: 2019/Sheet2_October_2019.csv
[OK] Saved: 2019/Sheet2_January_2019.csv
[OK] Saved: 2022/Sheet2_December_2022.csv
[OK] Saved: 2022/Sheet2_February_2022.csv
[OK] Saved: 2022/Sheet2_August_2022.csv
[OK] Saved: 2022/Sheet2_March_2022.csv
[OK] Saved: 2022/Sheet2_September_2022.csv
[OK] Saved: 2022/Sheet2_October_2022.csv
[OK] Saved: 2022/Sheet2_January_2022.csv
[OK] Saved: 2022/Sheet2_May_2022.csv
[OK] Saved: 2022/Sheet2_November_2022.csv
[OK] Saved: 2022/Sheet2_July_2022.csv
[OK] Saved: 2022/Sheet2_June_2022

#### Verifying if the variable counts of Reshaped Metadata Sheet 2 and Original matches

In [19]:
import os
import pandas as pd

def batch_verify_sheet2_variable_and_label_count(inventory, base_path):
    """
    Batch verify Sheet 2 metadata (values dictionary) across years/months.
    Compares:
      • Unique variable count (raw vs reshaped)
      • Label count per variable (raw vs reshaped)
    Prints mismatches immediately and returns a summary DataFrame.
    """
    all_results = []

    for year, months_data in inventory.items():
        for month, files_list in months_data.items():
            if month == "Unmatched":
                continue

            # --- Load raw Sheet 2 ---
            # NOTE: Ensure 'load_dataset' is defined in your previous cells
            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=1)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 2 ({e})")
                continue

            # --- Load reshaped Sheet 2 CSV ---
            reshaped_path = os.path.join(
                base_path, 
                "Metadata Sheet 2 CSV's", 
                year, 
                f"Sheet2_{month}_{year}.csv"
            )
            
            if not os.path.exists(reshaped_path):
                print(f"[ERROR] {month} {year}: Reshaped Sheet 2 CSV missing!")
                continue

            reshaped_df = pd.read_csv(reshaped_path, dtype=str).fillna("")

            # --- Count unique variables ---
            raw_vars = raw_df.iloc[:, 0].astype(str).str.strip()
            raw_vars = raw_vars[raw_vars != '']  # ignore empty
            raw_unique_vars = pd.Index(raw_vars).unique()

            resh_vars = reshaped_df['Variable'].astype(str).str.strip()
            resh_unique_vars = pd.Index(resh_vars).unique()

            # Check variable count mismatch
            variable_mismatch = len(raw_unique_vars) != len(resh_unique_vars)
            if variable_mismatch:
                print(f"[VARIABLE COUNT MISMATCH] {month} {year}: Raw={len(raw_unique_vars)}, Reshaped={len(resh_unique_vars)}")

            # --- Count labels per variable ---
            label_mismatches = []

            for var in raw_unique_vars:
                # Raw: select rows matching variable
                raw_rows = raw_df[raw_df.iloc[:, 0].astype(str).str.strip() == var]

                # Count non-empty label cells safely (cols 2 to 6 usually contain labels/values)
                raw_label_count = raw_rows.iloc[:, 2:6].astype(str).apply(
                    lambda x: x.str.strip().ne('').any(), axis=1
                ).sum()

                # Reshaped: count rows per variable
                resh_label_count = reshaped_df[reshaped_df['Variable'].astype(str).str.strip() == var].shape[0]

                if raw_label_count != resh_label_count:
                    label_mismatches.append({
                        "Variable": var,
                        "Raw_Label_Count": raw_label_count,
                        "Reshaped_Label_Count": resh_label_count
                    })

            # --- Print immediate label mismatches ---
            for m in label_mismatches:
                print(f"[LABEL COUNT MISMATCH] {month} {year} - Variable: {m['Variable']} | Raw={m['Raw_Label_Count']} vs Reshaped={m['Reshaped_Label_Count']}")

            # --- Record summary ---
            all_results.append({
                "Year": year,
                "Month": month,
                "Raw_Variable_Count": len(raw_unique_vars),
                "Reshaped_Variable_Count": len(resh_unique_vars),
                "Variable_Count_Status": "PASS" if not variable_mismatch else "FAIL",
                "Label_Count_Mismatches": len(label_mismatches)
            })

    # --- Final Summary Report ---
    df_summary = pd.DataFrame(all_results).sort_values(['Year', 'Month']).reset_index(drop=True)
    
    var_fails = (df_summary["Variable_Count_Status"] == "FAIL").sum()
    label_fails = df_summary["Label_Count_Mismatches"].sum()

    if var_fails == 0 and label_fails == 0:
        print("\nSUCCESS: All Sheet 2 variables and labels have been reshaped correctly across the batch!\n")
    else:
        print(f"\nCompleted with issues: {var_fails} variable count mismatches, {label_fails} label mismatches.\n")

    return df_summary

def verify_sheet2_content(original_df, reshaped_df):
    """
    Compare original Sheet 2 with reshaped version.
    Checks: Variables, Descriptions, Labels, Min/Max/Additional values.
    Ignores row order.
    """
    # Normalize to string
    original = original_df.fillna("").astype(str)
    reshaped = reshaped_df.fillna("").astype(str)

    # --- Extract original as dict ---
    def build_original_dict(df):
        data = {}
        current_var = ""
        current_desc = ""
        for _, row in df.iterrows():
            colA = row.iloc[0].strip()
            colB = row.iloc[1].strip()
            colC = row.iloc[2].strip()
            
            if colA: current_var = colA
            if colB: current_desc = colB
            
            if not colC: continue # Skip if label is empty
            
            minv = row.iloc[3].strip() if len(row) > 3 else ""
            maxv = row.iloc[4].strip() if len(row) > 4 else ""
            extra = ""
            
            # Find extra value if it exists beyond standard columns
            if len(row) > 5:
                for j in range(5, len(row)):
                    if row.iloc[j].strip():
                        extra = row.iloc[j].strip()
                        break
                        
            if current_var not in data:
                data[current_var] = []
            
            data[current_var].append({
                "Description": current_desc,
                "Label": colC,
                "min_value": minv,
                "max_value": maxv,
                "additional_value": extra
            })
        return data

    orig_dict = build_original_dict(original)

    # --- Extract reshaped as dict ---
    resh_dict = {
        var: group.drop(columns="Variable").to_dict("records")
        for var, group in reshaped.groupby("Variable")
    }

    # --- Verification ---
    errors = []
    orig_vars = set(orig_dict.keys())
    resh_vars = set(resh_dict.keys())

    missing_vars = orig_vars - resh_vars
    extra_vars = resh_vars - orig_vars
    
    if missing_vars: errors.append(f"Missing variables in reshaped: {missing_vars}")
    if extra_vars: errors.append(f"Extra variables in reshaped: {extra_vars}")

    # Detailed label/content comparison
    for var in orig_vars & resh_vars:
        orig_records = orig_dict[var]
        resh_records = resh_dict[var]
        
        orig_set = {(d["Label"], d["min_value"], d["max_value"], d["additional_value"]) for d in orig_records}
        resh_set = {(d["Label"], d["min_value"], d["max_value"], d["additional_value"]) for d in resh_records}
        
        missing_rec = orig_set - resh_set
        extra_rec = resh_set - orig_set
        
        if missing_rec: errors.append(f"[{var}] Missing records: {missing_rec}")
        if extra_rec: errors.append(f"[{var}] Extra records: {extra_rec}")

    if not errors:
        return "SUCCESS"
    else:
        return "MISMATCH FOUND:\n" + "\n".join(errors)

In [20]:
# ==========================================
# AUTOMATION START
# ==========================================
import pandas as pd
import os

# 1. Run the Batch Verifier (Counts & Structure)
# This quickly checks if the number of variables and labels match.
print("--- Starting Batch Structure Verification ---")
batch_summary_df = batch_verify_sheet2_variable_and_label_count(inventory, base_path)

# Print a quick summary of the batch check
if not batch_summary_df.empty:
    fails = batch_summary_df[batch_summary_df['Variable_Count_Status'] == 'FAIL']
    if not fails.empty:
        print(f"\n[WARNING] Found {len(fails)} structural failures in the following months:")
        print(fails[['Year', 'Month', 'Variable_Count_Status']])
    else:
        print("\n[PASS] Structural batch check passed for all files.")

# 2. Run Deep Content Verification (Every Month/Year)
# This checks the actual text (labels, descriptions, values) for every file.
print("\n--- Starting Deep Content Verification (All Files) ---")

deep_verification_results = []

for year, months_data in inventory.items():
    for month, files_list in months_data.items():
        if month == "Unmatched":
            continue

        print(f"Verifying: {month} {year}...", end=" ")

        try:
            # --- Load Raw Original ---
            # Ensure load_dataset is defined in your environment
            original_df = load_dataset(year, month, "metadata", sheet_number=1)
            
            # --- Load Reshaped CSV ---
            reshaped_path = os.path.join(
                base_path, 
                "Metadata Sheet 2 CSV's", 
                year, 
                f"Sheet2_{month}_{year}.csv"
            )
            
            if not os.path.exists(reshaped_path):
                print("SKIPPED (Reshaped file missing)")
                deep_verification_results.append({"Year": year, "Month": month, "Status": "Missing File"})
                continue

            reshaped_df = pd.read_csv(reshaped_path, dtype=str).fillna("")
            
            # --- Run Verification ---
            # Using the verify_sheet2_content function from the first code block
            result_message = verify_sheet2_content(original_df, reshaped_df)
            
            if result_message == "SUCCESS":
                print("OK")
                deep_verification_results.append({"Year": year, "Month": month, "Status": "PASS"})
            else:
                print("MISMATCH FOUND")
                print(f"   -> {result_message}")
                deep_verification_results.append({"Year": year, "Month": month, "Status": "FAIL", "Error": result_message})
                
        except Exception as e:
            print(f"ERROR ({e})")
            deep_verification_results.append({"Year": year, "Month": month, "Status": "ERROR", "Error": str(e)})

# --- Final Report ---
print("\n" + "="*40)
print("FINAL VERIFICATION REPORT")
print("="*40)
results_df = pd.DataFrame(deep_verification_results)

if not results_df.empty:
    pass_count = len(results_df[results_df['Status'] == 'PASS'])
    fail_count = len(results_df[results_df['Status'] != 'PASS'])
    print(f"Total Files Checked: {len(results_df)}")
    print(f"Passed: {pass_count}")
    print(f"Issues: {fail_count}")

    if fail_count > 0:
        print("\nFiles with Issues:")
        print(results_df[results_df['Status'] != 'PASS'][['Year', 'Month', 'Status']])
else:
    print("No files were processed.")

--- Starting Batch Structure Verification ---

SUCCESS: All Sheet 2 variables and labels have been reshaped correctly across the batch!


[PASS] Structural batch check passed for all files.

--- Starting Deep Content Verification (All Files) ---
Verifying: January 2018... OK
Verifying: October 2018... OK
Verifying: April 2018... OK
Verifying: July 2018... OK
Verifying: April 2019... OK
Verifying: July 2019... OK
Verifying: October 2019... OK
Verifying: January 2019... OK
Verifying: December 2022... OK
Verifying: February 2022... OK
Verifying: August 2022... OK
Verifying: March 2022... OK
Verifying: September 2022... OK
Verifying: October 2022... OK
Verifying: January 2022... OK
Verifying: May 2022... OK
Verifying: November 2022... OK
Verifying: July 2022... OK
Verifying: June 2022... OK
Verifying: April 2022... OK
Verifying: December 2023... OK
Verifying: June 2023... OK
Verifying: February 2023... OK
Verifying: July 2023... OK
Verifying: November 2023... OK
Verifying: April 2023... OK

--------

## Sheet 1 Decoder

In [21]:
def load_dataset(year, month, filetype="survey"):
    """
    Locates and loads a dataset file (CSV or Excel) from the global inventory
    based on the year, month, and requested type.
    """
    # Relies on the global 'inventory' dictionary existing in your notebook
    if year not in inventory or month not in inventory[year]:
        raise ValueError(f"Error: No records found in inventory for {month} {year}.")


    files = inventory[year][month]


    # Locate the specific file type
    found_file = next((f for f in files if f['filetype'] == filetype), None)


    if not found_file:
        raise FileNotFoundError(f"Error: No {filetype} file found for {month} {year}.")


    # Construct the full file path using the global base_path
    file_path = os.path.join(base_path, year, found_file['filename'])


    # Load appropriate file format based on type
    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)
    else:
        return pd.read_excel(file_path)




def load_clean_sheet1(year, month):
    """
    Loads the processed variable definitions (Sheet 1) from the
    'Metadata Sheet 1 CSV's' folder in Google Drive.
    """
    folder_name = "Metadata Sheet 1 CSV's"
    filename = f"Sheet1_{month}_{year}.csv"
    file_path = os.path.join(base_path, folder_name, year, filename)


    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Error: Processed metadata file not found at {file_path}")


    return pd.read_csv(file_path)


def apply_metadata_headers(survey_df, metadata_sheet1_df, year="Unknown", month="Survey"):
    """
    Renames the columns of the raw survey dataset to human-readable labels
    using the provided metadata definitions. Prints a formal status report.
    """
    # 1. Standardization
    metadata_sheet1_df['Variable'] = metadata_sheet1_df['Variable'].astype(str).str.strip()
    metadata_sheet1_df['Description'] = metadata_sheet1_df['Description'].astype(str).str.strip()


    # 2. Map Generation
    header_map = dict(zip(metadata_sheet1_df['Variable'], metadata_sheet1_df['Description']))


    # 3. Analysis
    original_cols = set(survey_df.columns)
    mapped_cols = set(header_map.keys())


    translated_cols = original_cols.intersection(mapped_cols)
    untranslated_cols = original_cols - mapped_cols


    total_columns = len(original_cols)
    translated_count = len(translated_cols)
    untranslated_count = len(untranslated_cols)


    # 4. Execution
    renamed_df = survey_df.rename(columns=header_map)


    # 5. Reporting
    print("\n" + "="*60)
    print(f"METADATA TRANSLATION REPORT: {month.upper()} {year}")
    print("="*60)
    print(f"Total Columns Detected:       {total_columns}")
    print(f"Successfully Decoded:         {translated_count}")
    print(f"Remaining as Raw Codes:       {untranslated_count}")
    print("-" * 60)


    if untranslated_count == 0:
        print("Status: SUCCESS (100% Metadata Coverage)")
        print("All column headers have been successfully translated to descriptions.")
    else:
        print("Status: PARTIAL SUCCESS")
        print("The following columns retained their original codes because")
        print("no matching definition was found in the metadata library:")
        # Sort the list for easier reading
        print(f"\nList of Untranslated Codes: {sorted(list(untranslated_cols))}")


    print("="*60 + "\n")


    return renamed_df


## Automation for Sheet 1 Decoding

In [22]:
def run_batch_header_translation(inventory, base_path):
    """
    Iterates through the inventory, applies header translation to all survey CSVs,
    and saves the results to a temporary output folder.
    """
    output_folder_name = "Header Encoded Surveys"
    output_base_path = os.path.join(base_path, output_folder_name)
    os.makedirs(output_base_path, exist_ok=True)


    print("================================================")
    print("STARTING BATCH HEADER TRANSLATION")
    print(f"Output Directory: {output_base_path}")
    print("================================================\n")


    success_count = 0
    skip_count = 0
    error_count = 0


    # Loop through the existing 'inventory' dictionary
    for year in sorted(inventory.keys()):
       
        # Create Year subfolder in output directory
        year_output_path = os.path.join(output_base_path, year)
        os.makedirs(year_output_path, exist_ok=True)
       
        for month in inventory[year].keys():
            if month == "Unmatched": continue
           
            print(f"Processing: {month.upper()} {year}...")
           
            try:
                # 1. Check if a raw survey CSV exists for this month
                files_list = inventory[year][month]
                survey_file_data = next((f for f in files_list if f['filetype'] == 'survey'), None)
               
                if not survey_file_data:
                    print("   [SKIP] No raw survey CSV found.")
                    skip_count += 1
                    continue


                # 2. Load Data
                # We use the load functions defined above
                raw_survey = load_dataset(year, month, "survey")
               
                # This will raise FileNotFoundError if the clean sheet 1 doesn't exist
                clean_metadata = load_clean_sheet1(year, month)
               
                # 3. Translate
                # We pass year/month explicitly so the report title is correct
                decoded_df = apply_metadata_headers(raw_survey, clean_metadata, year, month)
               
                # 4. Save to "Temporary" Folder using ORIGINAL FILENAME
                # We extract the actual filename (e.g. "JANUARY_2018.CSV") from the inventory data
                original_filename = survey_file_data['filename']
                save_path = os.path.join(year_output_path, original_filename)
               
                decoded_df.to_csv(save_path, index=False)
                print(f"   [OK] Saved File: {original_filename}")
                success_count += 1
               
            except FileNotFoundError:
                print(f"   [SKIP] Missing Metadata Sheet 1 CSV for {month} {year}.")
                skip_count += 1
            except Exception as e:
                print(f"   [ERROR] Failed to process: {e}")
                error_count += 1
           
            print("-" * 40)


    print("\n================================================")
    print("BATCH PROCESS COMPLETE")
    print(f"   Successful: {success_count}")
    print(f"   Skipped:    {skip_count}")
    print(f"   Errors:     {error_count}")
    print("================================================")


In [23]:
if __name__ == "__main__":
    if 'inventory' in locals() and 'base_path' in locals():
        run_batch_header_translation(inventory, base_path)
    else:
        print("Skipping execution: 'inventory' or 'base_path' not found in scope.")

STARTING BATCH HEADER TRANSLATION
Output Directory: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Header Encoded Surveys

Processing: JANUARY 2018...

METADATA TRANSLATION REPORT: JANUARY 2018
Total Columns Detected:       50
Successfully Decoded:         50
Remaining as Raw Codes:       0
------------------------------------------------------------
Status: SUCCESS (100% Metadata Coverage)
All column headers have been successfully translated to descriptions.

   [OK] Saved File: JANUARY_2018.CSV
----------------------------------------
Processing: OCTOBER 2018...

METADATA TRANSLATION REPORT: OCTOBER 2018
Total Columns Detected:       51
Successfully Decoded:         51
Remaining as Raw Codes:       0
------------------------------------------------------------
Status: SUCCESS (100% Metadata Coverage)
All column headers have been successfully translated to descriptions.

   [OK] Saved File: OCTOBER_2018.CSV
--------------

In [24]:
def verify_header_decoding_integrity(inventory, base_path):
    """
    Checks if all raw survey columns have been successfully decoded
    using metadata Sheet 1.
    
    Returns a DataFrame with:
    Year | Month | Raw Headers Count | Decoded Headers Count | Integrity Status
    """

    results = []

    for year, months_data in inventory.items():
        for month, files_list in months_data.items():

            if month == "Unmatched":
                continue

            try:
                # ---- Load raw survey ----
                raw_df = load_dataset(year, month, "survey")
                raw_headers = list(raw_df.columns)
                raw_count = len(raw_headers)

                # ---- Load decoded metadata Sheet 1 ----
                meta_df = load_clean_sheet1(year, month)
                meta_df['Variable'] = meta_df['Variable'].astype(str).str.strip()
                meta_df['Description'] = meta_df['Description'].astype(str).str.strip()

                # Build mapping dict
                header_map = dict(zip(meta_df['Variable'], meta_df['Description']))

                # ---- Count decoded columns ----
                decoded_count = sum(col in header_map for col in raw_headers)

                # ---- Determine integrity ----
                status = "PASS" if raw_count == decoded_count else "FAIL"

                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Headers Count": raw_count,
                    "Decoded Headers Count": decoded_count,
                    "Integrity Status": status
                })

            except Exception as e:
                # Any error → FAIL
                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Headers Count": "ERROR",
                    "Decoded Headers Count": "ERROR",
                    "Integrity Status": f"FAIL ({e})"
                })
                continue

    result_df = pd.DataFrame(results)

    print("\n===== HEADER DECODING INTEGRITY CHECK COMPLETE =====")
    
    total_failures = (result_df["Integrity Status"] != "PASS").sum()

    if total_failures == 0:
        print("SUCCESS: All survey column headers have been fully decoded.")
    else:
        print(f"Completed with {total_failures} months failing integrity checks.")

    print("====================================================\n")

    return result_df.sort_values(["Year", "Month"]).reset_index(drop=True)


#### Checking if all column headers were decoded successfully

In [25]:
integrity_df = verify_header_decoding_integrity(inventory, base_path)
integrity_df


===== HEADER DECODING INTEGRITY CHECK COMPLETE =====
SUCCESS: All survey column headers have been fully decoded.



Unnamed: 0,Year,Month,Raw Headers Count,Decoded Headers Count,Integrity Status
0,2018,April,50,50,PASS
1,2018,January,50,50,PASS
2,2018,July,51,51,PASS
3,2018,October,51,51,PASS
4,2019,April,49,49,PASS
5,2019,January,49,49,PASS
6,2019,July,49,49,PASS
7,2019,October,49,49,PASS
8,2022,April,52,52,PASS
9,2022,August,42,42,PASS


## Sheet 2 Decoder

In [26]:
def load_clean_sheet2(base_path, year, month):
    """Loads the Clean Sheet 2 Metadata."""
    path = os.path.join(base_path, "Metadata Sheet 2 CSV's", year, f"Sheet2_{month}_{year}.csv")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Metadata not found at: {path}")
    return pd.read_csv(path, dtype=str)


def find_target_column(survey_columns, meta_desc):
    """
    Smart Matcher: Handles 'Highest Grade' vs 'C07-Highest Grade Completed'.
    """
    if pd.isna(meta_desc): return None
    meta_desc = str(meta_desc).strip()
   
    # 1. Exact Match
    if meta_desc in survey_columns: return meta_desc
   
    # 2. Metadata has prefix (Meta="C06-Status" -> Survey="Status")
    clean_meta = re.sub(r'^C\d+[\s\-_]+', '', meta_desc, flags=re.IGNORECASE).strip()
    if clean_meta in survey_columns: return clean_meta
       
    # 3. Survey has prefix (Meta="Status" -> Survey="C06-Status")
    for col in survey_columns:
        if col.endswith(meta_desc):
            prefix = col[:-len(meta_desc)].strip()
            if re.search(r'^C\d+[\s\-_]*$', prefix, re.IGNORECASE) or prefix == "":
                return col
    return None


def decode_survey_safe(survey_df, meta_df):
    """
    Decodes the entire survey using the Smart Matcher and Safe Logic.
    """
    unique_vars = meta_df['Variable'].unique()
    decoded_count = 0
    survey_cols = list(survey_df.columns)
   
    for var_code in unique_vars:
        subset = meta_df[meta_df['Variable'] == var_code].copy()
       
        if subset['Description'].isnull().all(): continue
        raw_desc = subset['Description'].dropna().iloc[0].strip()
       
        target_col = find_target_column(survey_cols, raw_desc)
        if not target_col: continue
           
        mask_zeros = subset['Label'].astype(str).isin(['0', '0.0', '0.00', 'nan', 'NaN'])
        if mask_zeros.all(): continue
           
        lookup = {}
        for _, row in subset.iterrows():
            try:
                label = row['Label']
                if str(label) in ['0', '0.0', 'nan']: continue
               
                min_v = float(row['min_value'])
                max_v = float(row['max_value'])
               
                if max_v > min_v and max_v != 0:
                    for c in range(int(min_v), int(max_v) + 1): lookup[c] = label
                else:
                    lookup[int(min_v)] = label
            except: continue
           
        if not lookup: continue


        def safe_map(val):
            try: return lookup.get(int(float(val)), val)
            except: return val
           
        survey_df[target_col] = survey_df[target_col].apply(safe_map)
        decoded_count += 1


    return survey_df, decoded_count


## Automation for Sheet 2 Decoding

In [27]:
def run_batch_decoding(base_path):
    """
    Scans the folder, decodes all files, and saves to Fully Decoded.
    """
    # --- FOLDER CONFIGURATION ---
    input_folder_name = "Header Encoded Surveys"
    output_folder_name = "Fully Decoded Surveys"
   
    input_root = os.path.join(base_path, input_folder_name)
    output_root = os.path.join(base_path, output_folder_name)
    os.makedirs(output_root, exist_ok=True)
   
    print("================================================")
    print("STARTING BATCH VALUE DECODING")
    print(f"Source: {input_root}")
    print(f"Dest:   {output_root}")
    print("================================================\n")
   
    month_pattern = re.compile(r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)", re.IGNORECASE)
   
    if not os.path.exists(input_root):
        print(f"Error: Input folder not found: {input_root}")
        return


    year_folders = [f for f in os.listdir(input_root) if f.isdigit() and os.path.isdir(os.path.join(input_root, f))]
   
    success = 0
    errors = 0
   
    for year in sorted(year_folders):
        year_in = os.path.join(input_root, year)
        year_out = os.path.join(output_root, year)
        os.makedirs(year_out, exist_ok=True)
       
        files = [f for f in os.listdir(year_in) if f.lower().endswith(".csv")]
       
        for filename in files:
            match = month_pattern.search(filename)
            if not match: continue
            month = match.group(1).capitalize()
           
            print(f"Processing: {month.upper()} {year}...")
           
            try:
                # 1. Load Survey
                survey_path = os.path.join(year_in, filename)
                df_survey = pd.read_csv(survey_path, low_memory=False)
               
                # 2. Load Metadata
                df_meta = load_clean_sheet2(base_path, year, month)
               
                # 3. Decode (CALLS THE FUNCTION ABOVE)
                df_final, count = decode_survey_safe(df_survey, df_meta)
               
                # 4. Save
                save_path = os.path.join(year_out, filename)
                df_final.to_csv(save_path, index=False)
               
                print(f"   [OK] Decoded {count} columns.")
                print(f"   [SAVED] {filename}")
                success += 1
               
            except FileNotFoundError as e:
                print(f"   [SKIP] Metadata missing: {e}")
            except Exception as e:
                print(f"   [ERROR] {e}")
                errors += 1
           
            print("-" * 40)


    print(f"\nCOMPLETED. Success: {success} | Errors: {errors}")


In [28]:
# ==========================================
# EXECUTION
# ==========================================
if __name__ == "__main__":
    run_batch_decoding(base_path)

STARTING BATCH VALUE DECODING
Source: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Header Encoded Surveys
Dest:   /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Fully Decoded Surveys

Processing: JULY 2018...
   [OK] Decoded 37 columns.
   [SAVED] JULY_2018.CSV
----------------------------------------
Processing: OCTOBER 2018...
   [OK] Decoded 39 columns.
   [SAVED] OCTOBER_2018.CSV
----------------------------------------
Processing: APRIL 2018...
   [OK] Decoded 37 columns.
   [SAVED] APRIL_2018.CSV
----------------------------------------
Processing: JANUARY 2018...
   [OK] Decoded 39 columns.
   [SAVED] JANUARY_2018.CSV
----------------------------------------
Processing: JULY 2019...
   [OK] Decoded 41 columns.
   [SAVED] JULY_2019.CSV
----------------------------------------
Processing: JANUARY 2019...
   [OK] Decoded 38 columns.
   [SAVED] JANUARY_20

In [29]:
import os
import pandas as pd
import re

def verify_decoded_record_integrity(base_path):
    """
    Checks if all raw survey records (rows) match the fully decoded records.
    
    Compares:
    - Raw Total Records (Header Encoded Surveys)
    - Decoded Total Records (Fully Decoded Surveys)
    
    Returns: DataFrame summary
    """

    raw_root = os.path.join(base_path, "Header Encoded Surveys")
    decoded_root = os.path.join(base_path, "Fully Decoded Surveys")

    if not os.path.exists(raw_root):
        raise FileNotFoundError(f"Header Encoded Surveys folder missing: {raw_root}")
    if not os.path.exists(decoded_root):
        raise FileNotFoundError(f"Fully Decoded Surveys folder missing: {decoded_root}")

    # Detect months inside filenames
    month_pattern = re.compile(
        r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
        re.IGNORECASE
    )

    results = []

    # Loop through year folders
    year_folders = [y for y in os.listdir(raw_root) if y.isdigit()]

    for year in sorted(year_folders):
        year_raw_folder = os.path.join(raw_root, year)
        year_dec_folder = os.path.join(decoded_root, year)

        if not os.path.exists(year_dec_folder):
            # If missing decoded folder, mark all as FAIL
            files = [f for f in os.listdir(year_raw_folder) if f.lower().endswith(".csv")]
            for f in files:
                match = month_pattern.search(f)
                if not match: continue
                month = match.group(1).capitalize()

                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Total Records": "N/A",
                    "Decoded Total Records": "Missing",
                    "Integrity Status": "FAIL"
                })
            continue

        raw_files = [f for f in os.listdir(year_raw_folder) if f.lower().endswith(".csv")]

        for filename in raw_files:

            match = month_pattern.search(filename)
            if not match:
                continue

            month = match.group(1).capitalize()

            raw_path = os.path.join(year_raw_folder, filename)
            decoded_path = os.path.join(year_dec_folder, filename)

            try:
                # Load raw records
                raw_df = pd.read_csv(raw_path, low_memory=False)
                raw_count = len(raw_df)

            except Exception as e:
                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Total Records": f"ERROR: {e}",
                    "Decoded Total Records": "N/A",
                    "Integrity Status": "FAIL"
                })
                continue

            # Load decoded records
            if not os.path.exists(decoded_path):
                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Total Records": raw_count,
                    "Decoded Total Records": "Missing",
                    "Integrity Status": "FAIL"
                })
                continue

            try:
                decoded_df = pd.read_csv(decoded_path, low_memory=False)
                dec_count = len(decoded_df)
            except Exception as e:
                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Total Records": raw_count,
                    "Decoded Total Records": f"ERROR: {e}",
                    "Integrity Status": "FAIL"
                })
                continue

            # Determine PASS/FAIL
            status = "PASS" if raw_count == dec_count else "FAIL"

            results.append({
                "Year": year,
                "Month": month,
                "Raw Total Records": raw_count,
                "Decoded Total Records": dec_count,
                "Integrity Status": status
            })

    summary_df = pd.DataFrame(results)

    print("\n===== RECORD DECODING INTEGRITY CHECK COMPLETE =====")
    fails = (summary_df["Integrity Status"] != "PASS").sum()

    if fails == 0:
        print("SUCCESS: All decoded surveys match the raw row counts.")
    else:
        print(f"WARNING: {fails} months failed record integrity checks.")

    print("====================================================\n")

    return summary_df.sort_values(["Year", "Month"]).reset_index(drop=True)


#### Checking if all records were decoded successfully

In [30]:
record_integrity_df = verify_decoded_record_integrity(base_path)
record_integrity_df


===== RECORD DECODING INTEGRITY CHECK COMPLETE =====
SUCCESS: All decoded surveys match the raw row counts.



Unnamed: 0,Year,Month,Raw Total Records,Decoded Total Records,Integrity Status
0,2018,April,179815,179815,PASS
1,2018,January,180262,180262,PASS
2,2018,July,182956,182956,PASS
3,2018,October,179204,179204,PASS
4,2019,April,172284,172284,PASS
5,2019,January,181233,181233,PASS
6,2019,July,175438,175438,PASS
7,2019,October,178067,178067,PASS
8,2022,April,184237,184237,PASS
9,2022,August,45054,45054,PASS


### Coverage Scanner in Metadata and Survey 

To check whether columns with values not found in metadata stayed unchanged:

In [31]:
import os
import pandas as pd
import re
from IPython.display import display, HTML

def check_value_decoding_integrity_smart(base_path):
    """
    Verifies if variables were decoded correctly.
    
    IMPROVEMENT:
    - Distinguishes between "Failed Decoding" vs "Quantitative Variables" (e.g. Household Size).
    - If a variable is in metadata but the labels are numbers (or 0), it marks it as OK.
    """
    input_folder = os.path.join(base_path, "Fully Decoded Surveys")
    meta_root = os.path.join(base_path, "Metadata Sheet 2 CSV's")

    month_pattern = re.compile(
        r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
        re.IGNORECASE
    )

    all_results = []

    # Ensure input folder exists
    if not os.path.exists(input_folder):
        print(f"Folder not found: {input_folder}")
        return

    for year in sorted(os.listdir(input_folder)):
        year_path = os.path.join(input_folder, year)
        if not os.path.isdir(year_path): continue

        for file in sorted(os.listdir(year_path)):
            if not file.lower().endswith(".csv"): continue

            match = month_pattern.search(file)
            if not match: continue

            month = match.group(1).capitalize()
            survey_path = os.path.join(year_path, file)

            # 1. Load Survey
            # Read as object (string) initially to check for numeric-ness accurately
            df_survey = pd.read_csv(survey_path, low_memory=False)

            # 2. Load Metadata
            meta_path = os.path.join(meta_root, year, f"Sheet2_{month}_{year}.csv")
            if not os.path.exists(meta_path):
                print(f"[SKIP] Metadata missing for {month} {year}")
                continue

            df_meta = pd.read_csv(meta_path, dtype=str)
            
            # Create a clean lookup for Description -> Variable Logic
            # We need to know WHICH metadata rows correspond to WHICH survey column
            # Clean descriptions to match survey headers
            df_meta['Description_Clean'] = df_meta['Description'].fillna('').astype(str).str.strip()
            
            # Get set of descriptions present in metadata
            meta_descriptions = set(df_meta["Description_Clean"].unique())

            sheet_results = []
            decoded_count = 0
            unchanged_count = 0 # Correctly unchanged
            failed_count = 0    # Should have decoded but didn't

            # 3. Check Columns
            for col in df_survey.columns:
                # A. Check if Data is Numeric
                # We drop NA and check if the remaining values look like numbers
                col_values = df_survey[col].dropna().astype(str)
                if col_values.empty:
                    is_numeric_data = False # Empty columns are ambiguous
                else:
                    # Check if all values are digits (allowing for .0 decimals)
                    is_numeric_data = col_values.str.replace(r'\.0$', '', regex=True).str.isnumeric().all()

                # B. Check if in Metadata
                # We check if the column header exists in the Metadata Descriptions
                exists_in_metadata = col in meta_descriptions

                status = ""
                
                if not exists_in_metadata:
                    status = "OK (No Metadata)"
                    unchanged_count += 1
                
                elif not is_numeric_data:
                    # It's in metadata AND it's text (e.g. "Male"). Success.
                    status = "OK (Decoded)"
                    decoded_count += 1
                    
                elif is_numeric_data and exists_in_metadata:
                    # --- SMART CHECK: Is it SUPPOSED to be numeric? ---
                    # Get the labels for this specific variable
                    subset = df_meta[df_meta['Description_Clean'] == col]
                    
                    # Check labels: Are they '0', empty, or purely numeric strings?
                    labels = subset['Label'].astype(str).replace(['0', '0.0', 'nan', 'None'], '')
                    
                    # Filter out empty labels
                    real_labels = labels[labels != '']
                    
                    if real_labels.empty:
                        # All labels are '0' -> Quantitative (e.g., Hours)
                        status = "OK (Quantitative - No Labels)"
                        unchanged_count += 1
                    elif real_labels.str.isnumeric().all():
                        # All labels are numbers (e.g., "2018", "1") -> Quantitative (e.g., Year, HH Size)
                        status = "OK (Quantitative - Numeric Labels)"
                        unchanged_count += 1
                    else:
                        # Labels contain Text (e.g., "Single"), but Data is Numeric (1) -> FAIL
                        status = "FAILED (Should be Text)"
                        failed_count += 1

                sheet_results.append({
                    "Column": col,
                    "In_Metadata": "Yes" if exists_in_metadata else "No",
                    "Data_Type": "Numeric" if is_numeric_data else "Text",
                    "Status": status
                })

            # ========== REPORT ==========
            print("\n" + "="*70)
            print(f"VERIFICATION: {month.upper()} {year}")
            print("="*70)
            
            # Filter for failures to show them clearly
            failures = [res for res in sheet_results if "FAILED" in res['Status']]
            
            print(f"Total Columns:      {len(df_survey.columns)}")
            print(f"Successful Decodes: {decoded_count}")
            print(f"Correctly Numeric:  {unchanged_count}")
            print(f"Failures:           {failed_count}")
            
            if failures:
                print(f"\nWARNING: {len(failures)} columns failed to decode:")
                df_fail = pd.DataFrame(failures)
                display(HTML(df_fail.to_html(index=False)))
            else:
                print("\nPASSED: All columns accounted for.")

            all_results.extend(sheet_results)

    return pd.DataFrame(all_results)

# ===================== RUN =====================
# Run this in your notebook
df_integrity_check = check_value_decoding_integrity_smart(base_path)


VERIFICATION: APRIL 2018
Total Columns:      50
Successful Decodes: 40
Correctly Numeric:  10
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: JANUARY 2018
Total Columns:      50
Successful Decodes: 41
Correctly Numeric:  9
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: JULY 2018
Total Columns:      51
Successful Decodes: 40
Correctly Numeric:  11
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: OCTOBER 2018
Total Columns:      51
Successful Decodes: 41
Correctly Numeric:  10
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: APRIL 2019
Total Columns:      49
Successful Decodes: 41
Correctly Numeric:  8
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: JANUARY 2019
Total Columns:      49
Successful Decodes: 41
Correctly Numeric:  8
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: JULY 2019
Total Columns:      49
Successful Decodes: 41
Correctly

Column,In_Metadata,Data_Type,Status
Survey Month,Yes,Numeric,FAILED (Should be Text)



VERIFICATION: JANUARY 2022
Total Columns:      52
Successful Decodes: 44
Correctly Numeric:  8
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: JULY 2022
Total Columns:      52
Successful Decodes: 45
Correctly Numeric:  7
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: JUNE 2022
Total Columns:      42
Successful Decodes: 33
Correctly Numeric:  9
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: MARCH 2022
Total Columns:      41
Successful Decodes: 31
Correctly Numeric:  9
Failures:           1



Column,In_Metadata,Data_Type,Status
Survey Month,Yes,Numeric,FAILED (Should be Text)



VERIFICATION: MAY 2022
Total Columns:      42
Successful Decodes: 33
Correctly Numeric:  9
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: NOVEMBER 2022
Total Columns:      42
Successful Decodes: 34
Correctly Numeric:  8
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: OCTOBER 2022
Total Columns:      52
Successful Decodes: 44
Correctly Numeric:  8
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: SEPTEMBER 2022
Total Columns:      42
Successful Decodes: 33
Correctly Numeric:  9
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: APRIL 2023
Total Columns:      52
Successful Decodes: 44
Correctly Numeric:  8
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: AUGUST 2023
Total Columns:      41
Successful Decodes: 32
Correctly Numeric:  9
Failures:           0

PASSED: All columns accounted for.

VERIFICATION: DECEMBER 2023
Total Columns:      41
Successful Decodes: 32
Corre

### Identical Variable Detector

In [32]:
import os
import pandas as pd
from rapidfuzz import fuzz
from collections import defaultdict
import re

# ===============================================================
# PATHS
# ===============================================================
decoded_path = os.path.join(base_path, "Fully Decoded Surveys")
metadata_path = os.path.join(base_path, "Metadata Sheet 2 CSV's")

# Month ordering for clean display
month_order = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}

# ===============================================================
# STEP 1 — GET ALL VARIABLES + WHERE THEY APPEAR (MONTH COUNT)
# ===============================================================
all_columns = []
variable_months = defaultdict(set)

for year in os.listdir(decoded_path):
    year_folder = os.path.join(decoded_path, year)
    if not os.path.isdir(year_folder):
        continue

    for file in os.listdir(year_folder):
        if not file.endswith(".CSV"):
            continue

        month = file.split("_")[0].capitalize()
        month_year = f"{month} {year}"
        file_path = os.path.join(year_folder, file)

        try:
            df = pd.read_csv(file_path, low_memory=False)
            for col in df.columns:
                col_clean = col.strip()
                all_columns.append(col_clean)
                variable_months[col_clean].add(month_year)
        except Exception as e:
            print(f"[ERROR] {file} -> {e}")

all_columns = sorted(set(all_columns))

# ===============================================================
# STEP 2 — EXCLUSIVE CLUSTERING (Fixes Duplication Issue)
# ===============================================================
similarity_threshold = 85
filtered_groups = {} # Key = Main Variable, Value = List of Similar Variables
processed_vars = set() # Keeps track of variables already assigned to a group

for i, var1 in enumerate(all_columns):
    # If var1 is already part of another group, skip it. 
    # This prevents it from creating a duplicate subset group later.
    if var1 in processed_vars:
        continue
    
    current_group = []

    # Check against all subsequent variables
    for j in range(i + 1, len(all_columns)):
        var2 = all_columns[j]
        
        # If var2 is already taken, skip it
        if var2 in processed_vars:
            continue

        # Compare
        similarity = fuzz.token_sort_ratio(var1.lower(), var2.lower()) 
        
        if similarity >= similarity_threshold:
            current_group.append(var2)
            processed_vars.add(var2) # Mark var2 as taken
    
    # Only save if we found matches
    if current_group:
        filtered_groups[var1] = current_group
        processed_vars.add(var1) # Mark the leader as taken

# ===============================================================
# HELPER — LOAD PER-MONTH LABELS
# ===============================================================
def load_per_month_labels(variable):
    results = defaultdict(set)
    for year in os.listdir(metadata_path):
        year_folder = os.path.join(metadata_path, year)
        if not os.path.isdir(year_folder):
            continue
        for file in os.listdir(year_folder):
            if not file.startswith("Sheet2_") or not file.endswith(".csv"):
                continue
            month = file.split("_")[1].capitalize()
            month_year = f"{month} {year}"
            file_path = os.path.join(year_folder, file)
            try:
                df = pd.read_csv(file_path, dtype=str).fillna("")
            except Exception:
                continue
            if "Description" not in df.columns or "Label" not in df.columns:
                continue
            # Exact match on description
            match = df[df["Description"].astype(str).str.strip().str.lower() == variable.lower()]
            if not match.empty:
                # Keep original formatting here for display purposes
                labels_raw = [str(x).strip() for x in match["Label"].tolist() if str(x).strip() != ""]
                results[month_year].update(labels_raw)
    return results

# ===============================================================
# HELPER - NORMALIZE TEXT (Aggressive)
# ===============================================================
def normalize(text):
    """
    1. Converts to lowercase.
    2. Replaces non-breaking spaces (\xa0) with standard spaces.
    3. Collapses multiple spaces into one.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.replace('\xa0', ' ') # Handle hidden non-breaking spaces
    text = re.sub(r'\s+', ' ', text) # Collapse multiple spaces
    return text.strip().lower()

# ===============================================================
# STEP 3 — PRINT GROUPED ANALYSIS SUMMARY
# ===============================================================
print("\n============================================================")
print("      GROUPS WITH DETAILED LABEL + MONTH DIFFERENCE CHECK      ")
print("============================================================\n")

group_number = 1

for key, group in filtered_groups.items():
    full_group = sorted(set([key] + group))

    print(f"\n----- Group {group_number} -----")

    # Print variable list
    print("Variables:")
    for var in full_group:
        month_count = len(variable_months[var])
        print(f"- {var} ({month_count} months)")

    # Load labels
    group_labels = {}
    overall_label_union = {}
    
    # "Pretty Print" Map: normalized_string -> original_string
    pretty_print_map = {}

    for var in full_group:
        per_month = load_per_month_labels(var)
        group_labels[var] = per_month

        overall = set()
        for labels in per_month.values():
            overall.update(labels)
            # Populate the pretty map
            for label in labels:
                pretty_print_map[normalize(label)] = label
        
        overall_label_union[var] = overall

        print(f"\nVariable: {var}")
        if not per_month:
            print("Labels: (No labels found in metadata)")

        months_to_show = sorted(
            [m for m in per_month.keys() if m in variable_months[var]],
            key=lambda x: (int(x.split()[-1]), month_order.get(x.split()[0], 99))
        )

        if not months_to_show:
            for m in sorted(variable_months[var], key=lambda x: (int(x.split()[-1]), month_order.get(x.split()[0], 99))):
                print(f"  {m}: (empty)")
        else:
            for month in months_to_show:
                labels = per_month.get(month, set())
                # Sort by the original text
                sorted_labels = sorted(list(labels))
                
                if len(sorted_labels) > 10:
                     label_str = ", ".join(sorted_labels[:10]) + f", ... (+{len(sorted_labels)-10} more)"
                else:
                     label_str = ", ".join(sorted_labels)
                
                label_str = label_str if labels else "(empty)"
                print(f"  {month}: {label_str}")

    # --------------------------------------------------------------
    # LOGIC: NORMALIZE + TEMPORAL CHECK + PRETTY PRINTING
    # --------------------------------------------------------------
    
    # 1. Build Global Fingerprints
    fingerprints = {
        var: frozenset([normalize(s) for s in overall_label_union.get(var, set())]) 
        for var in full_group
    }
    
    reference_var = full_group[0]
    reference_set = fingerprints[reference_var]

    # 2. Global Vocabulary Check
    same_global_vocab = all(fingerprints[v] == reference_set for v in full_group)

    # 3. Temporal Consistency Check
    temporal_mismatches = []
    is_temporally_consistent = True

    for var in full_group:
        var_global_set = fingerprints[var] 
        
        for month in variable_months[var]:
            month_labels_raw = group_labels[var].get(month, set())
            month_labels_normalized = set([normalize(s) for s in month_labels_raw])

            missing_in_month = var_global_set - month_labels_normalized
            
            if missing_in_month:
                is_temporally_consistent = False
                missing_readable = sorted([pretty_print_map.get(x, x) for x in missing_in_month])
                
                if len(missing_readable) > 5:
                    missing_str = ", ".join(missing_readable[:5]) + "..."
                else:
                    missing_str = ", ".join(missing_readable)
                temporal_mismatches.append(f"- {var} in {month} is missing: {missing_str}")

    # Final Decision
    identical = same_global_vocab and is_temporally_consistent

    print("\nIdentical coding scheme?: ", end="")
    print("YES" if identical else "NO")

    # --------------------------------------------------------------
    # Difference Reporting
    # --------------------------------------------------------------
    if not identical:
        print("Differences found:")

        # Report Temporal Inconsistencies
        if temporal_mismatches:
             print(">> TEMPORAL INCONSISTENCIES (Labels missing in specific months):")
             for mismatch in temporal_mismatches[:10]:
                 print(mismatch)
             if len(temporal_mismatches) > 10:
                 print(f"... and {len(temporal_mismatches) - 10} more months.")

        # Report Global Differences
        for var in full_group:
            if var == reference_var:
                continue
            cur_set = fingerprints[var]
            extra_overall = cur_set - reference_set
            missing_overall = reference_set - cur_set
            
            if extra_overall:
                readable_extra = [pretty_print_map.get(x, x) for x in extra_overall]
                print(f"- {var} has EXTRA overall labels: {', '.join(sorted(readable_extra))}")
            
            if missing_overall:
                readable_missing = [pretty_print_map.get(x, x) for x in missing_overall]
                print(f"- {var} is MISSING overall labels: {', '.join(sorted(readable_missing))}")

    group_number += 1


      GROUPS WITH DETAILED LABEL + MONTH DIFFERENCE CHECK      


----- Group 1 -----
Variables:
- 2010Urban-RuralFIES (8 months)
- 2015Urban-RuralFIES (13 months)

Variable: 2010Urban-RuralFIES
  January 2018: Rural, Urban
  April 2018: Rural, Urban
  July 2018: Rural, Urban
  October 2018: Rural, Urban
  January 2019: Rural, Urban
  April 2019: Rural, Urban
  July 2019: Rural, Urban
  October 2019: Rural, Urban

Variable: 2015Urban-RuralFIES
  July 2022: Rural, Urban
  August 2022: Rural, Urban
  September 2022: Rural, Urban
  October 2022: Rural, Urban
  November 2022: Rural, Urban
  December 2022: Rural, Urban
  January 2023: Rural, Urban
  February 2023: Rural, Urban
  March 2023: Rural, Urban
  April 2023: Rural, Urban
  May 2023: Rural, Urban
  June 2023: Rural, Urban
  July 2023: Rural, Urban

Identical coding scheme?: YES

----- Group 2 -----
Variables:
- C08-Overseas Filipino Indicator (17 months)
- C10-Overseas Filipino Indicator (17 months)

Variable: C08-Overseas Filipino


#### Special Case: "Available for Work" variable

The variables "C27-Available for Work" and "C36-Available for Work" has labels in a binary format.

    "C27-Available for Work" - yes/no
    "C36-Available for Work" - not available/yes available

However, they have different text labels. In this case, we can standardize the label to "Yes" and "No".



In [33]:
import os
import pandas as pd
from collections import defaultdict
import re

# ===============================================================
# CONFIGURATION AND INPUTS (Assumed to be defined in environment)
# ===============================================================
# base_path = r"G:\..."  # Assumed to be defined globally
# month_order = { ... } # Assumed to be defined globally
# variable_months = { ... } # Assumed to be defined globally after Step 1 scan

# The specific variables identified for comparison
VARIABLES_TO_COMPARE = [
    "C27-Available for Work",
    "C36-Available for Work"
]

# Assuming the metadata_path is derived from base_path and the folder name:
METADATA_FOLDER = "Metadata Sheet 2 CSV's"
try:
    metadata_path = os.path.join(base_path, METADATA_FOLDER)
except NameError:
    print("ERROR: 'base_path' is not defined. Please ensure the base path is set globally.")
    exit()

# NOTE: The helper functions load_per_month_labels() and normalize() 
# must be defined in your environment before running this block.

# ===============================================================
# CORE ANALYSIS LOGIC
# ===============================================================
group_labels = {}
overall_label_union = {}
pretty_print_map = {}

# 1. Load Labels and Build Global Fingerprints
for var in VARIABLES_TO_COMPARE:
    try:
        # Calls the function from your Part 5/original code
        per_month = load_per_month_labels(var)
    except Exception as e:
        print(f"Error loading labels for {var}: {e}")
        continue
        
    group_labels[var] = per_month

    overall = set()
    for labels in per_month.values():
        overall.update(labels)
        # Populate the pretty map
        for label in labels:
            pretty_print_map[normalize(label)] = label
    
    overall_label_union[var] = overall

# 2. Print Summary in Desired Format
print(f"Variables:")
for var in VARIABLES_TO_COMPARE:
    # Use variable_months loaded from your Step 1 scan
    month_count = len(variable_months.get(var, set()))
    print(f"- {var} ({month_count} months)")

for var in VARIABLES_TO_COMPARE:
    print(f"Variable: {var}")
    per_month = group_labels.get(var, {})
    
    # Sort months correctly using month_order
    months_to_show = sorted(
        [m for m in per_month.keys() if m in variable_months.get(var, set())],
        key=lambda x: (int(x.split()[-1]), month_order.get(x.split()[0], 99))
    )
    
    for month in months_to_show:
        labels = per_month.get(month, set())
        # Sort labels alphabetically for consistent output
        sorted_labels = sorted(list(labels))
        label_str = ", ".join(sorted_labels)
        print(f"  {month}: {label_str}")

# 3. Consistency Checks

# 3.1. Build Global Fingerprints (Normalized sets)
fingerprints = {
    var: frozenset([normalize(s) for s in overall_label_union.get(var, set())]) 
    for var in VARIABLES_TO_COMPARE
}

# Assume the first variable is the reference
reference_var = VARIABLES_TO_COMPARE[0] 
reference_set = fingerprints.get(reference_var, set())

# 3.2. Global Vocabulary Check (Only compares C27 vs C36)
if len(VARIABLES_TO_COMPARE) == 2:
    other_var = VARIABLES_TO_COMPARE[1]
    other_set = fingerprints.get(other_var, set())
    
    same_global_vocab = reference_set == other_set
    # Since we are only checking two variables, temporal check logic is simplified
    is_temporally_consistent = True # Assuming per-month labels are subsets of global union (usually true if data pipeline worked)
else:
    # Generalized check for N variables
    same_global_vocab = all(fingerprints[v] == reference_set for v in VARIABLES_TO_COMPARE)
    is_temporally_consistent = True # Skip full temporal check for brevity, rely on global check for this comparison

# Final Decision
identical = same_global_vocab and is_temporally_consistent

print("\nIdentical coding scheme?: ", end="")
print("YES" if identical else "NO")

# 4. Difference Reporting (If NO)
if not identical and len(VARIABLES_TO_COMPARE) == 2:
    print("Differences found:")
    
    # We compare C36 against C27 (reference_var)
    var = other_var
    cur_set = other_set
    
    extra_overall = cur_set - reference_set
    missing_overall = reference_set - cur_set
    
    # Convert back to original format for display
    if extra_overall:
        readable_extra = [pretty_print_map.get(x, x) for x in extra_overall]
        print(f"- {var} has EXTRA overall labels: {', '.join(sorted(readable_extra))}")
    
    if missing_overall:
        readable_missing = [pretty_print_map.get(x, x) for x in missing_overall]
        print(f"- {var} is MISSING overall labels: {', '.join(sorted(readable_missing))}")


Variables:
- C27-Available for Work (17 months)
- C36-Available for Work (17 months)
Variable: C27-Available for Work
  August 2022: no, yes
  September 2022: no, yes
  November 2022: no, yes
  December 2022: no, yes
  February 2023: no, yes
  March 2023: no, yes
  May 2023: no, yes
  June 2023: no, yes
  August 2023: no, yes
  September 2023: no, yes
  November 2023: no, yes
  December 2023: no, yes
  February 2024: no, yes
  March 2024: no, yes
  May 2024: no, yes
  June 2024: no, yes
  August 2024: no, yes
Variable: C36-Available for Work
  January 2018: not available, yes available
  April 2018: not available, yes available
  July 2018: not available, yes available
  October 2018: not available, yes available
  January 2019: not available, yes available
  April 2019: not available, yes available
  July 2019: not available, yes available
  October 2019: not available, yes available
  July 2022: not available, yes available
  October 2022: not available, yes available
  January 2023:

### Renaming Variables Code + Automated saving into a new folder

In [34]:
import os
import pandas as pd
from collections import defaultdict
import re

# ===============================================================
# USER-DEFINED INPUT: THE RENAMING INSTRUCTION MAP
# ===============================================================
RENAMING_MAP = {
    # --- ADDED: Available for Work Consolidation ---
    "Available for Work": [
        "C27-Available for Work", 
        "C36-Available for Work"
    ],
    "Urban-RuralFIES": [
        "2010Urban-RuralFIES", 
        "2015Urban-RuralFIES"
    ],
    "Location of Work (Province, Municipality)": [
        "C11 - Location of Work (Province, Municipality)", 
        "C11-Location of Work (Province, Municipality)", 
        "C12A - Location of Work (Province, Municipality)"
    ],
    "Normal Working Hours per Day": [
        "C17-Normal Working Hours per Day", 
        "C18-Normal Working Hours per Day"
    ],
    "Want More Hours of Work": [
        "C19-Want More Hours of Work", 
        "C20-Want More Hours of Work"
    ],
    "Look for Additional Work": [
        "C20-Look for Additional Work", 
        "C21-Look for Additional Work"
    ],
    "Other Job Indicator": [
        "C22-Other Job Indicator", 
        "C26-Other Job Indicator"
    ],
    "Total Hours Worked for all Jobs": [
        "C23-Total Hours Worked for all Jobs", 
        "C28-Total Hours Worked for all Jobs"
    ],
    "Looked for Work or Tried to Establish Business During the Past Week": [
        "C25-Looked for Work or Tried to Establish Business during the past week", 
        "C30-Looked for Work or Tried to Establish Business during the past week"
    ],
    "First Time to Look for Work": [
        "C25B - First time to look for work", 
        "C31-First Time to Look for Work"
    ],
    "Previous Job Indicator": [
        "C28-Previous Job Indicator", 
        "C38-Previous Job Indicator"
    ],
    "Previous Occupation": [
        "C31-Previous Occupation", 
        "C40-Previous Occupation"
    ],
    "Kind of Business (Past Quarter)": [
        "C33-Kind of Business (past quarter)", 
        "C43-Kind of Business (past quarter)"
    ],
    "Province": [
        "Province", 
        "province"
    ],
    "Province Recode": [
        "Province Recode", 
        "province_recode"
    ]
}

# ===============================================================
# VALUE HARMONIZATION DICTIONARY
# Used to recode the cell values for 'Available for Work'
# The keys are case-insensitive.
# ===============================================================
AVAILABLE_FOR_WORK_HARMONIZATION = {
    "no": "No",
    "yes": "Yes",
    "not available": "No",
    "yes available": "Yes"
}


# ===============================================================
# PATHS AND CONFIGURATION (Steps 2 & 3)
# ===============================================================
# ASSUMPTION: 'base_path' is defined globally.

SOURCE_FOLDER = "Fully Decoded Surveys"
DESTINATION_FOLDER = "Renamed Fully Decoded Surveys"

try:
    decoded_path = os.path.join(base_path, SOURCE_FOLDER)
    renamed_path = os.path.join(base_path, DESTINATION_FOLDER)
except NameError:
    # Handle case where base_path isn't defined, although it's assumed
    print("Error: 'base_path' is not defined. Ensure it is set before running.")
    exit()

os.makedirs(renamed_path, exist_ok=True)
print(f"Output folder created/verified: {renamed_path}\n")

# --- 1. Renaming Map Generation ---
REVERSE_RENAMING_MAP = {}
for new_name, old_names in RENAMING_MAP.items():
    for old_name in old_names:
        REVERSE_RENAMING_MAP[old_name.strip()] = new_name.strip()
        
# --- 2. Renaming Function (Integrated Harmonization) ---

def rename_and_save_survey(source_filepath, dest_filepath, renaming_map, harmonization_map):
    """
    Loads a single survey file, performs value harmonization on 'Available for Work',
    renames specified columns, and saves the result.
    """
    try:
        df = pd.read_csv(source_filepath, low_memory=False) 
        columns_to_rename = {}
        
        # Determine the target column names for harmonization
        available_for_work_cols = [
            col for col in df.columns 
            if col.strip() in ["C27-Available for Work", "C36-Available for Work"]
        ]
        
        # --- DATA HARMONIZATION STEP ---
        if available_for_work_cols:
            harmonization_lookup = {k.lower(): v for k, v in harmonization_map.items()}
            
            for col in available_for_work_cols:
                # Apply the recoding based on the harmonization map
                # .str.lower().map() ensures case-insensitivity during lookup
                df[col] = df[col].astype(str).str.strip().str.lower().map(harmonization_lookup).fillna(df[col])
                # Note: The .fillna(df[col]) keeps values that were not in the harmonization list (like NaNs)
        
        # --- COLUMN RENAMING STEP ---
        for col in df.columns:
            if col.strip() in renaming_map:
                columns_to_rename[col] = renaming_map[col]
        
        if columns_to_rename:
            df = df.rename(columns=columns_to_rename)
        
        # --- SAVING ---
        # The file is saved with unified headers AND harmonized values
        df.to_csv(dest_filepath, index=False)
        
        return len(columns_to_rename)
        
    except Exception as e:
        print(f"[ERROR] Processing {source_filepath}: {e}")
        return -1


# --- 3. Batch Processing Automation (Step 3 Implementation) ---

def run_batch_renaming(source_root, dest_root, renaming_map, harmonization_map):
    """
    Iterates through all files in the source folder, applies renaming/harmonization, 
    and saves them to the destination folder.
    """
    total_files_processed = 0
    total_columns_unified = 0
    
    print(f"--- STARTING BATCH RENAMING AND SAVING ---")
    print(f"Source: {SOURCE_FOLDER}")
    print(f"Destination: {DESTINATION_FOLDER}")
    print(f"Total variables to unify: {len(REVERSE_RENAMING_MAP)}")
    print("-" * 50)
    
    for year in sorted(os.listdir(source_root)):
        year_source_folder = os.path.join(source_root, year)
        
        if not os.path.isdir(year_source_folder): continue
        
        year_dest_folder = os.path.join(dest_root, year)
        os.makedirs(year_dest_folder, exist_ok=True)
        
        for filename in os.listdir(year_source_folder):
            if filename.lower().endswith(".csv"):
                
                source_filepath = os.path.join(year_source_folder, filename)
                dest_filepath = os.path.join(year_dest_folder, filename)
                
                # Call the integrated function
                renamed_count = rename_and_save_survey(
                    source_filepath, dest_filepath, renaming_map=renaming_map, harmonization_map=harmonization_map
                )
                
                if renamed_count >= 0:
                    total_files_processed += 1
                    total_columns_unified += renamed_count
                    if renamed_count > 0:
                        print(f"[OK] {year}/{filename}: Unified {renamed_count} column(s).")
                    else:
                        print(f"[OK] {year}/{filename}: Saved (No unification needed in this file).")
                
                else:
                    print(f"[FAIL] {year}/{filename}: Check error log above.")

    print("-" * 50)
    print("BATCH RENAMING COMPLETE.")
    print(f"Total Files Processed: {total_files_processed}")
    print(f"Total Columns Unified Across All Files: {total_columns_unified}")
    print(f"Consolidated data is ready for FMI analysis in the '{DESTINATION_FOLDER}' folder.")


# --- EXECUTION BLOCK ---
if __name__ == "__main__":
    # Ensure base_path is defined if running outside of a notebook cell
    # Example: base_path = r"G:\.shortcut-targets-by-id\..." 
    
    # Run the batch processor
    run_batch_renaming(decoded_path, renamed_path, REVERSE_RENAMING_MAP, AVAILABLE_FOR_WORK_HARMONIZATION)

Output folder created/verified: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Renamed Fully Decoded Surveys

--- STARTING BATCH RENAMING AND SAVING ---
Source: Fully Decoded Surveys
Destination: Renamed Fully Decoded Surveys
Total variables to unify: 31
--------------------------------------------------
[OK] 2018/JULY_2018.CSV: Unified 14 column(s).
[OK] 2018/OCTOBER_2018.CSV: Unified 14 column(s).
[OK] 2018/APRIL_2018.CSV: Unified 14 column(s).
[OK] 2018/JANUARY_2018.CSV: Unified 14 column(s).
[OK] 2019/JULY_2019.CSV: Unified 12 column(s).
[OK] 2019/JANUARY_2019.CSV: Unified 12 column(s).
[OK] 2019/APRIL_2019.CSV: Unified 12 column(s).
[OK] 2019/OCTOBER_2019.CSV: Unified 12 column(s).
[OK] 2022/DECEMBER_2022.CSV: Unified 13 column(s).
[OK] 2022/FEBRUARY_2022.csv: Unified 12 column(s).
[OK] 2022/SEPTEMBER_2022.CSV: Unified 13 column(s).
[OK] 2022/NOVEMBER_2022.CSV: Unified 13 column(s).
[OK] 2022/JULY_2022.CSV: Unified 12

### Duplication check code


In [35]:
import os
import pandas as pd
from collections import defaultdict
import re


# Source folder from the renaming step
SOURCE_FOLDER = "Renamed Fully Decoded Surveys" 
renamed_path = os.path.join(base_path, SOURCE_FOLDER)

# ===============================================================
# HEADER DUPLICATION CHECK
# ===============================================================

def check_duplicate_headers(source_root):
    """
    Iterates through all CSV files in the source_root and checks 
    each DataFrame for duplicate column headers resulting from the renaming.
    """
    if not os.path.exists(source_root):
        print(f"[ERROR] Source folder not found: {source_root}")
        return

    print("--- STARTING DUPLICATE HEADER CHECK ---")
    print("-" * 50)
    
    total_files_checked = 0
    files_with_duplicates = 0
    
    for year in sorted(os.listdir(source_root)):
        year_source_folder = os.path.join(source_root, year)
        
        if not os.path.isdir(year_source_folder): 
            continue
        
        for filename in os.listdir(year_source_folder):
            if filename.lower().endswith(".csv"):
                
                source_filepath = os.path.join(year_source_folder, filename)
                
                try:
                    df = pd.read_csv(source_filepath, low_memory=False)
                    total_files_checked += 1
                    
                    # 1. Get all column names
                    columns = df.columns.tolist()
                    
                    # 2. Find duplicated column names
                    seen = set()
                    duplicates = set()
                    
                    for col in columns:
                        if col in seen:
                            duplicates.add(col)
                        seen.add(col)
                        
                    if duplicates:
                        files_with_duplicates += 1
                        print(f"[DUPLICATE FOUND] {year}/{filename}")
                        print(f"    Duplicated Headers: {sorted(list(duplicates))}")
                    # else:
                        # Optionally print success for every file:
                        # print(f"[OK] {year}/{filename}: No duplicate headers.")
                        
                except Exception as e:
                    print(f"[ERROR] Failed to read {year}/{filename}: {e}")

    print("-" * 50)
    print("DUPLICATION CHECK COMPLETE.")
    print(f"Total files checked: {total_files_checked}")
    if files_with_duplicates > 0:
        print(f"Total files with duplicates: {files_with_duplicates} (REQUIRES CONSOLIDATION)")
    else:
        print("No duplicate headers found across all files. (Ready for FMI)")


# --- EXECUTION BLOCK ---
if __name__ == "__main__":
    check_duplicate_headers(renamed_path)


--- STARTING DUPLICATE HEADER CHECK ---
--------------------------------------------------
--------------------------------------------------
DUPLICATION CHECK COMPLETE.
Total files checked: 40
No duplicate headers found across all files. (Ready for FMI)


#### Checking for duplicates and the labels in the "Available for Work" variable

In [36]:
import os
import pandas as pd
from collections import defaultdict
import re

# ===============================================================
# PATHS AND CONFIGURATION
# ===============================================================
# ASSUMPTION: 'base_path' is defined globally.

SOURCE_FOLDER = "Renamed Fully Decoded Surveys" 
renamed_path = os.path.join(base_path, SOURCE_FOLDER)

# The specific consolidated variable we are checking
TARGET_VARIABLE = "Available for Work"

# ===============================================================
# CHECK FUNCTIONS
# ===============================================================

def validate_available_for_work(source_root):
    """
    Checks all files for the consolidation integrity of the 'Available for Work'
    variable, verifies its label set, and checks for any duplicated headers.
    """
    if not os.path.exists(source_root):
        print(f"[ERROR] Source folder not found: {source_root}")
        return

    # Tracking variables
    months_with_variable_present = set() 
    global_label_set = set()
    
    total_files_checked = 0
    files_with_duplicates = 0
    
    # --- MODIFICATION 1: Update the starting print statement ---
    print(f"--- STARTING DUPLICATE HEADER CHECK FOR \"{TARGET_VARIABLE}\" VARIABLE ---")
    print("-" * 50)

    # --- 1. Iterate through files and check integrity ---
    for year in sorted(os.listdir(source_root)):
        year_source_folder = os.path.join(source_root, year)
        if not os.path.isdir(year_source_folder): 
            continue
        
        for filename in os.listdir(year_source_folder):
            if filename.lower().endswith(".csv"):
                
                source_filepath = os.path.join(year_source_folder, filename)
                month_part = filename.split('_')[0].capitalize()
                month_year = f"{month_part} {year}"
                
                try:
                    df = pd.read_csv(source_filepath, low_memory=False)
                    total_files_checked += 1
                    
                    # --- A. Duplicated Headers Check ---
                    if len(df.columns) != len(set(df.columns)):
                        files_with_duplicates += 1
                    
                    # --- B. Variable Consolidation and Label Check ---
                    
                    # Check for the presence of the target variable
                    if TARGET_VARIABLE in df.columns:
                        
                        # Check for single consolidation: must only appear once
                        if list(df.columns).count(TARGET_VARIABLE) == 1:
                            
                            # Add the month/year ONLY if the variable is present and correctly consolidated
                            months_with_variable_present.add(month_year)
                            
                            # Extract unique labels (excluding NaNs and potential blanks)
                            labels = df[TARGET_VARIABLE].dropna().unique()
                            
                            # Ensure labels are treated as strings and stripped for safety
                            cleaned_labels = {str(l).strip() for l in labels if str(l).strip() != ''}
                            
                            global_label_set.update(cleaned_labels)
                            
                        elif list(df.columns).count(TARGET_VARIABLE) > 1:
                            files_with_duplicates += 1 
                    
                except Exception as e:
                    print(f"[ERROR] Failed to read {year}/{filename}: {e}")

    # --- 2. Print Final Summary ---
    print("-" * 50)

    # Sort labels to match expected output format
    sorted_labels = sorted(list(global_label_set))
    
    # --- MODIFICATION 2: Removed month count and parentheses ---
    print(f"{TARGET_VARIABLE}")
    print(f"Labels: {', '.join(sorted_labels)}")
    print("\n" + "=" * 50)
    
    print(f"Total files checked: {total_files_checked}")
    if files_with_duplicates > 0:
        print(f"WARNING: {files_with_duplicates} files contained duplicate headers (REQUIRES CONSOLIDATION STEP).")
    else:
        print("No duplicate headers found across all files.")


# --- EXECUTION BLOCK ---
if __name__ == "__main__":
    try:
        # Assumes base_path is defined globally
        validate_available_for_work(renamed_path)
    except NameError:
        print("ERROR: 'base_path' variable not found. Please ensure it is defined before execution.")

--- STARTING DUPLICATE HEADER CHECK FOR "Available for Work" VARIABLE ---
--------------------------------------------------
--------------------------------------------------
Available for Work
Labels: No, Yes

Total files checked: 40
No duplicate headers found across all files.


### Code Automation for FMI

In [37]:
import os
import pandas as pd

# --- Missingness detector ---
TEXT_MISSING = {"", " ", "NA", "N/A", "NaN", "nan", ".", "-", "_"}
NUMERIC_SENTINELS = {9, 99, 999, 9999, -9, -99, -999, -9999}

def build_missing_mask(series: pd.Series,
                       include_numeric_sentinels: bool = True) -> pd.Series:
    """
    Detect missing values for FMI with decode-aware logic:
    - NaN
    - Empty strings / whitespace-only
    - Explicit blanks introduced by decoding (survey value missing OR metadata missing)
    - Common text tokens (NA, N/A, ., -, _)
    - Optional numeric sentinels (9, 99, 999, ... and negatives)
    """
    s = series.astype(str).str.strip()
    mask = series.isna() | (s == "") | s.isin(TEXT_MISSING)

    if include_numeric_sentinels:
        s_num = pd.to_numeric(series, errors="coerce")
        mask |= s_num.isin(NUMERIC_SENTINELS)

    return mask


def fmi_scan_csv(file_path: str, year: str, month: str,
                 include_numeric_sentinels: bool = True) -> pd.DataFrame:
    """
    Compute FMI per column for a single survey CSV.
    Duplicate headers are consolidated: only one FMI per unique column name.
    """
    df = pd.read_csv(file_path, low_memory=False)
    rows = []

    # --- Consolidate duplicates ---
    unique_columns = []
    seen = set()
    for col in df.columns:
        if col not in seen:
            unique_columns.append(col)
            seen.add(col)
        else:
            # Skip duplicate header
            continue

    # --- Scan only unique columns ---
    for col in unique_columns:
        miss_mask = build_missing_mask(df[col], include_numeric_sentinels=include_numeric_sentinels)
        missing = int(miss_mask.sum())
        total = int(len(df[col]))
        fmi = (missing / total) if total > 0 else 0.0

        # Flag severity
        if fmi < 0.05:
            flag, rec = "Low", "Keep"
        elif fmi < 0.20:
            flag, rec = "Moderate", "Consider imputation"
        elif fmi < 0.40:
            flag, rec = "High", "Strongly consider imputation"
        else:
            flag, rec = "Critical", "Candidate to drop (validate with business logic)"

        rows.append({
            "Year": year,
            "Month": month,
            "Column": col,
            "Missing": missing,
            "Total": total,
            "FMI": round(fmi, 6),
            "Flag": flag,
            "Recommendation": rec
        })

    return pd.DataFrame(rows)



### Automation tester for FMI

In [38]:
trial_path = os.path.join(base_path, "Renamed Fully Decoded Surveys", "2018", "January_2018.csv")
test_report = fmi_scan_csv(trial_path, "2018", "January", include_numeric_sentinels=True)
print(f"Scanned {len(test_report)} variables")
test_report.head(50)

Scanned 50 variables


Unnamed: 0,Year,Month,Column,Missing,Total,FMI,Flag,Recommendation
0,2018,January,Region,0,180262,0.0,Low,Keep
1,2018,January,Province,592,180262,0.003284,Low,Keep
2,2018,January,Province Recode,0,180262,0.0,Low,Keep
3,2018,January,Household Unique Sequential Number,17,180262,9.4e-05,Low,Keep
4,2018,January,Urban-RuralFIES,0,180262,0.0,Low,Keep
5,2018,January,Final Weight Based on Projection (provincial p...,0,180262,0.0,Low,Keep
6,2018,January,Survey Month,0,180262,0.0,Low,Keep
7,2018,January,Survey Year,0,180262,0.0,Low,Keep
8,2018,January,Psu Number,1246,180262,0.006912,Low,Keep
9,2018,January,Replicate,637,180262,0.003534,Low,Keep


In [39]:
# Quick check if it works on other datasets
apr_path = os.path.join(base_path, "Renamed Fully Decoded Surveys", "2018", "APRIL_2018.CSV")
apr_2018_report = fmi_scan_csv(apr_path, "2018", "April", base_path)
apr_2018_report.head(50)

Unnamed: 0,Year,Month,Column,Missing,Total,FMI,Flag,Recommendation
0,2018,April,Region,0,179815,0.0,Low,Keep
1,2018,April,Province,633,179815,0.00352,Low,Keep
2,2018,April,Province Recode,0,179815,0.0,Low,Keep
3,2018,April,household_seq_number,14,179815,7.8e-05,Low,Keep
4,2018,April,Urban-RuralFIES,0,179815,0.0,Low,Keep
5,2018,April,Final Weight Based on Projection (provincial p...,0,179815,0.0,Low,Keep
6,2018,April,Survey Month,0,179815,0.0,Low,Keep
7,2018,April,Survey Year,0,179815,0.0,Low,Keep
8,2018,April,Psu Number,614,179815,0.003415,Low,Keep
9,2018,April,Replicate,2028,179815,0.011278,Low,Keep


### Batch/Automation Runner - Redirect in Drive

In [40]:
decoded_path = os.path.join(base_path, "Renamed Fully Decoded Surveys")
output_root = os.path.join(base_path, "FMI Reports")

print("===============================================")
print("STARTING BATCH FMI REPORTS")
print(f"Source: {decoded_path}")
print(f"Dest:   {output_root}")
print("===============================================\n")

success_count = 0
error_count = 0

for year in os.listdir(decoded_path):
    year_folder = os.path.join(decoded_path, year)
    if not os.path.isdir(year_folder):
        continue

    output_year_folder = os.path.join(output_root, year)
    os.makedirs(output_year_folder, exist_ok=True)

    for file in os.listdir(year_folder):
        if not file.endswith(".CSV"):
            continue

        month = file.split("_")[0].capitalize()
        file_path = os.path.join(year_folder, file)

        print(f"Processing: {month} {year}...")

        try:
            report = fmi_scan_csv(file_path, year, month)
            out_file = os.path.join(output_year_folder, f"FMI_{month}_{year}.csv")
            report.to_csv(out_file, index=False)

            print(f"   [OK] Scanned {len(report)} variables.")
            print(f"   [SAVED] FMI_{month}_{year}.csv")
            print("----------------------------------------")
            success_count += 1
        except Exception as e:
            print(f"   [ERROR] {file} → {e}")
            print("----------------------------------------")
            error_count += 1

print(f"\nCOMPLETED. Success: {success_count} | Errors: {error_count}")

STARTING BATCH FMI REPORTS
Source: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/Renamed Fully Decoded Surveys
Dest:   /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/FMI Reports

Processing: December 2022...
   [OK] Scanned 42 variables.
   [SAVED] FMI_December_2022.csv
----------------------------------------
Processing: September 2022...
   [OK] Scanned 42 variables.
   [SAVED] FMI_September_2022.csv
----------------------------------------
Processing: November 2022...
   [OK] Scanned 42 variables.
   [SAVED] FMI_November_2022.csv
----------------------------------------
Processing: July 2022...
   [OK] Scanned 52 variables.
   [SAVED] FMI_July_2022.csv
----------------------------------------
Processing: August 2022...
   [OK] Scanned 42 variables.
   [SAVED] FMI_August_2022.csv
----------------------------------------
Processing: October 2022...
   [OK] S

### Weighted Average of FMI across datasets, per variable

In [41]:

reports_root = os.path.join(base_path, "FMI Reports")

# --- Load all monthly FMI reports ---
all_reports = []
for year in os.listdir(reports_root):
    year_folder = os.path.join(reports_root, year)
    if not os.path.isdir(year_folder):
        continue
    for file in os.listdir(year_folder):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(year_folder, file)
        df = pd.read_csv(file_path)
        all_reports.append(df)

# Combine all months into one DataFrame
combined = pd.concat(all_reports, ignore_index=True)

# --- Aggregate per variable across all years/months ---
FMI_summary = (
    combined.groupby("Column")
    .agg(
        TotalMissing=("Missing", "sum"),
        TotalRows=("Total", "sum"),
        AvgFMI=("FMI", "mean"),
        MonthsObserved=("Year", "count")
    )
    .reset_index()
)

# Compute overall FMI (weighted by total rows)
FMI_summary["OverallFMI"] = FMI_summary["TotalMissing"] / FMI_summary["TotalRows"]

# Flag severity based on OverallFMI
def flag_and_rec(fmi):
    if fmi < 0.05:
        return "Low", "Keep"
    elif fmi < 0.20:
        return "Moderate", "Consider imputation"
    elif fmi < 0.40:
        return "High", "Strongly consider imputation"
    else:
        return "Critical", "Candidate to drop (validate with business logic)"

FMI_summary[["Flag", "Recommendation"]] = FMI_summary["OverallFMI"].apply(
    lambda x: pd.Series(flag_and_rec(x))
)

FMI_summary.head(50)


Unnamed: 0,Column,TotalMissing,TotalRows,AvgFMI,MonthsObserved,OverallFMI,Flag,Recommendation
0,Available for Work,4712321,4881364,0.967968,34,0.96537,Critical,Candidate to drop (validate with business logic)
1,C03-Relationship to Household Head,0,4881364,0.0,34,0.0,Low,Keep
2,C04-Sex,0,4881364,0.0,34,0.0,Low,Keep
3,C05-Age as of Last Birthday,82751,4881364,0.019262,34,0.016952,Low,Keep
4,C05B - Ethnicity,0,707981,0.0,1,0.0,Low,Keep
5,C06-Marital Status,358820,4881364,0.069921,34,0.073508,Moderate,Consider imputation
6,C07-Highest Grade Completed,361914,4881364,0.071961,34,0.074142,Moderate,Consider imputation
7,C08-Currently Attending School,2281910,4116745,0.567432,17,0.5543,Critical,Candidate to drop (validate with business logic)
8,C08-Overseas Filipino Indicator,203920,764619,0.266685,17,0.266695,High,Strongly consider imputation
9,C09-Graduate of technical/vocational course,1162929,4116745,0.285236,17,0.282487,High,Strongly consider imputation


### Weighted Avg of FMI - Redirect to Drive

In [42]:

reports_root = os.path.join(base_path, "FMI Reports")

print("===============================================")
print("STARTING OVERALL FMI SUMMARY (2018–2024)")
print(f"Source: {reports_root}")
print(f"Dest:   {reports_root}")
print("===============================================\n")

success_count = 0
error_count = 0

# --- Load all monthly FMI reports ---
all_reports = []
for year in os.listdir(reports_root):
    year_folder = os.path.join(reports_root, year)
    if not os.path.isdir(year_folder):
        continue

    for file in os.listdir(year_folder):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(year_folder, file)
        try:
            df = pd.read_csv(file_path)
            all_reports.append(df)
            print(f"Processing: {file}...")
            print(f"   [OK] Loaded {len(df)} rows.")
            print("----------------------------------------")
            success_count += 1
        except Exception as e:
            print(f"   [ERROR] {file} → {e}")
            print("----------------------------------------")
            error_count += 1

# --- Combine all reports ---
combined = pd.concat(all_reports, ignore_index=True)

# --- Aggregate per variable across all years/months ---
FMI_summary = (
    combined.groupby("Column")
    .agg(
        TotalMissing=("Missing", "sum"),
        TotalRows=("Total", "sum"),
        AvgFMI=("FMI", "mean"),
        MonthsObserved=("Year", "count")
    )
    .reset_index()
)

# Compute overall FMI (weighted by total rows)
FMI_summary["OverallFMI"] = FMI_summary["TotalMissing"] / FMI_summary["TotalRows"]

# Flag severity based on OverallFMI
def flag_and_rec(fmi):
    if fmi < 0.05:
        return "Low", "Keep"
    elif fmi < 0.20:
        return "Moderate", "Consider imputation"
    elif fmi < 0.40:
        return "High", "Strongly consider imputation"
    else:
        return "Critical", "Candidate to drop (validate with business logic)"

FMI_summary[["Flag", "Recommendation"]] = FMI_summary["OverallFMI"].apply(
    lambda x: pd.Series(flag_and_rec(x))
)

# --- Save overall summary to Drive ---
out_file = os.path.join(reports_root, "FMI_Summary_2018_2024.csv")
FMI_summary.to_csv(out_file, index=False)

print(f"\nCOMPLETED. Success: {success_count} | Errors: {error_count}")
print(f"[SAVED] FMI_Summary_2018_2024.csv")


STARTING OVERALL FMI SUMMARY (2018–2024)
Source: /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/FMI Reports
Dest:   /Users/neilkeannedelavega/Library/CloudStorage/GoogleDrive-shaniakeith23@gmail.com/My Drive/Labor Force Survey/FMI Reports

Processing: FMI_July_2022.csv...
   [OK] Loaded 52 rows.
----------------------------------------
Processing: FMI_December_2022.csv...
   [OK] Loaded 42 rows.
----------------------------------------
Processing: FMI_September_2022.csv...
   [OK] Loaded 42 rows.
----------------------------------------
Processing: FMI_August_2022.csv...
   [OK] Loaded 42 rows.
----------------------------------------
Processing: FMI_October_2022.csv...
   [OK] Loaded 52 rows.
----------------------------------------
Processing: FMI_November_2022.csv...
   [OK] Loaded 42 rows.
----------------------------------------
Processing: FMI_June_2024.csv...
   [OK] Loaded 40 rows.
---------------------------------