## Installing Libraries Needed

In [1]:
!python -m pip install pandas



In [2]:
!python -m pip install openpyxl




## Dataset Inventory Loader

In [1]:
import os
import re

base_path = r"G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey"

# Month ordering
month_order = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}

# Patterns
month_pattern = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    re.IGNORECASE
)
year_pattern = re.compile(r"(20\d{2})")

# Detect year folders from drive
year_folders = [
    f for f in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, f)) and f.isdigit()
]

print("Detected year folders:", sorted(year_folders))

inventory = {}

for year in sorted(year_folders):
    year_path = os.path.join(base_path, year)

    # Accept both CSV and XLSX
    data_files = [
        f for f in os.listdir(year_path)
        if f.lower().endswith(".csv") or f.lower().endswith(".xlsx")
    ]

    inventory[year] = {}

    for file in data_files:
        upper = file.upper()

        # Detect type
        if upper.endswith(".XLSX"):
            filetype = "metadata"  # XLSX = metadata
        else:
            filetype = "survey"    # CSV = survey

        # Detect month
        month_match = month_pattern.search(upper)
        month = (
            month_match.group(1).capitalize()
            if month_match
            else "Unmatched"
        )

        # Detect year inside filename
        year_match = year_pattern.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        # Store into inventory
        if month not in inventory[year]:
            inventory[year][month] = []

        inventory[year][month].append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })

# Print clean summary
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for yr in sorted(inventory.keys()):
    print(f"Year {yr}:")

    sorted_months = sorted(
        inventory[yr].keys(),
        key=lambda m: month_order.get(m, 99)
    )

    for month in sorted_months:
        print(f"  {month}:")
        for item in inventory[yr][month]:
            print(f"    {item['filename']} ({item['filetype']})")

    print()


Detected year folders: ['2018', '2019', '2022', '2023', '2024']

=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018_METADATA.xlsx (metadata)
    JANUARY_2018.CSV (survey)
  April:
    APRIL_2018_METADATA.xlsx (metadata)
    APRIL_2018.CSV (survey)
  July:
    JULY_2018.CSV (survey)
    JULY_2018_METADATA.xlsx (metadata)
  October:
    OCTOBER_2018_METADATA.xlsx (metadata)
    OCTOBER_2018.CSV (survey)

Year 2019:
  January:
    JANUARY_2019_METADATA.xlsx (metadata)
    JANUARY_2019.CSV (survey)
  April:
    APRIL_2019.CSV (survey)
    APRIL_2019_METADATA.xlsx (metadata)
  July:
    JULY_2019.CSV (survey)
    JULY_2019_METADATA.xlsx (metadata)
  October:
    OCTOBER_2019.CSV (survey)
    OCTOBER_2019_METADATA.xlsx (metadata)

Year 2022:
  January:
    JANUARY_2022_METADATA.xlsx (metadata)
    JANUARY_2022.csv (survey)
  February:
    FEBRUARY_2022_METADATA.xlsx (metadata)
    FEBRUARY_2022.csv (survey)
  March:
    MARCH_2022.csv (survey)
    MARCH_2022_METADATA.x

## Load Dataset Function

In [2]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load a dataset from the inventory.

    year: str, e.g., "2018"
    month: str, e.g., "January"
    filetype: "survey" or "metadata"
    sheet_number: 0(sheet 1) or 1(sheet 2)
    """
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )
    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])
    
    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)
    
    if sheet_number is not None:
        return pd.read_excel(file_path, sheet_name=sheet_number)
    
    return pd.read_excel(file_path)

Sample: January 2018 Survey

In [3]:
import pandas as pd

# Load the survey sheet of January 2018 metadata
jan_2018_survey = load_dataset("2018", "January","survey")

# View the first few rows
jan_2018_survey.head()

Unnamed: 0,PUFREG,PUFPRV,PUFPRRCD,PUFHHNUM,PUFURB2K10,PUFPWGTPRV,PUFSVYMO,PUFSVYYR,PUFPSU,PUFRPL,...,PUFC33_WEEKS,PUFC34_WYNOT,PUFC35_LTLOOKW,PUFC36_AVAIL,PUFC37_WILLING,PUFC38_PREVJOB,PUFC40_POCC,PUFC41_WQTR,PUFC43_QKB,PUFNEWEMPSTAT
0,14,1,100,1,2,124.9425,1,2018,140,32,...,,6.0,,,,1.0,52.0,2.0,,3.0
1,14,1,100,1,2,131.2126,1,2018,140,32,...,,,,,,,,1.0,1.0,1.0
2,14,1,100,1,2,142.0464,1,2018,140,32,...,,,,,,,,1.0,1.0,1.0
3,14,1,100,1,2,138.2958,1,2018,140,32,...,,,,,,,,,,
4,14,1,100,2,2,195.4152,1,2018,140,32,...,,,,,,,,1.0,41.0,1.0


## Metadata Sheet 1

<H5> Sample: January 2018 Metadata Sheet 1 (Raw) </H5>

In [4]:
# Load the first sheet of January 2018 metadata
january_2018_metadata_sheet1 = load_dataset("2018", "January", "metadata", 0)

# View the first few rows
print("=== January 2018 Metadata Sheet 1 (Raw) ===")
january_2018_metadata_sheet1.head()


=== January 2018 Metadata Sheet 1 (Raw) ===


Unnamed: 0,QUEST,Questionnaire,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,_IDS0,(Id Items),,
1,,,,,PUFREG,Region
2,,,,,PUFPRV,Province
3,,,,,PUFPRRCD,Province Recode
4,,,,,PUFHHNUM,Household Unique Sequential Number


#### Reshaping Metadata Sheet 1

In [5]:
import pandas as pd

def extract_variables(df):
    """
    Extract variable names and descriptions from metadata Sheet 1 (variable dictionary).
    Automatically reads the 4th and 5th columns (E and F in Excel) where variables and descriptions reside.
    
    Returns a clean DataFrame with columns ['Variable', 'Description'].
    """
    
    # Select the 4th and 5th columns (index 4 and 5)
    df_vars = df.iloc[:, 4:6].copy()
    
    # Rename columns
    df_vars.columns = ['Variable', 'Description']
    
    # Drop rows where 'Variable' is empty or NaN
    df_vars = df_vars[df_vars['Variable'].notna() & (df_vars['Variable'].astype(str).str.strip() != '')]
    
    # Strip whitespace from values
    df_vars['Variable'] = df_vars['Variable'].astype(str).str.strip()
    df_vars['Description'] = df_vars['Description'].astype(str).str.strip()
    
    # Reset index
    df_vars = df_vars.reset_index(drop=True)
    
    return df_vars


### Metadata Sheet 1 Reshaped Saving Function

In [6]:
import os
import pandas as pd

def batch_process_sheet1_metadata(inventory, base_output_path):
    """
    Loops through the entire inventory, loads Sheet 1 of the metadata,
    reshapes it, and saves it into a structured folder hierarchy.
    
    Provides a text-based summary report for assurance.
    """
    
    # Counters for the summary report
    success_count = 0
    failure_count = 0
    skipped_count = 0
    errors_log = []

    # 1. Define and Create the Main Parent Folder
    main_folder_name = "Metadata Sheet 1 CSV's"
    main_folder_path = os.path.join(base_output_path, main_folder_name)
    os.makedirs(main_folder_path, exist_ok=True)
    
    print("--- STARTING BATCH PROCESS ---")
    print(f"Target Directory: {main_folder_path}")
    print("-" * 50)

    # 2. Iterate through Years in the Inventory
    for year, months_data in inventory.items():
        
        # Create the Year Subfolder
        year_folder_path = os.path.join(main_folder_path, year)
        os.makedirs(year_folder_path, exist_ok=True)
        
        # 3. Iterate through Months in that Year
        for month, files_list in months_data.items():
            
            if month == "Unmatched":
                continue
            
            # Check for metadata file existence
            has_metadata = any(f.get('filetype') == 'metadata' for f in files_list)
            
            if has_metadata:
                try:
                    # A. Load the Data (Sheet 0 = Sheet 1)
                    raw_df = load_dataset(year, month, "metadata", 0)
                    
                    # B. Reshape the Data
                    clean_df = extract_variables(raw_df)
                    
                    # C. Save to CSV
                    filename = f"Sheet1_{month}_{year}.csv"
                    full_save_path = os.path.join(year_folder_path, filename)
                    
                    clean_df.to_csv(full_save_path, index=False)
                    
                    # Print confirmation for this specific file
                    print(f"[OK] Saved: {year}/{filename}")
                    success_count += 1
                    
                except Exception as e:
                    print(f"[ERROR] Failed {month} {year}: {e}")
                    errors_log.append(f"{month} {year}: {str(e)}")
                    failure_count += 1
            else:
                skipped_count += 1

    # 4. Final Assurance Report
    print("\n" + "="*40)
    print("      PROCESSING SUMMARY REPORT")
    print("="*40)
    print(f"Total Successfully Saved: {success_count}")
    print(f"Total Failed:             {failure_count}")
    print(f"Total Skipped (No File):  {skipped_count}")
    print("-" * 40)
    
    if failure_count == 0:
        print("STATUS: COMPLETE SUCCESS")
        print(f"All files are now located in: {main_folder_path}")
        print("Google Drive is syncing these files now.")
    else:
        print("STATUS: COMPLETED WITH ERRORS")
        print("Check the errors log above.")
        if errors_log:
            print("\nError Details:")
            for err in errors_log:
                print(f" - {err}")
    print("="*40)

In [7]:
# Run the processor
batch_process_sheet1_metadata(inventory, base_path)

--- STARTING BATCH PROCESS ---
Target Directory: G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\Metadata Sheet 1 CSV's
--------------------------------------------------
[OK] Saved: 2018/Sheet1_January_2018.csv
[OK] Saved: 2018/Sheet1_April_2018.csv
[OK] Saved: 2018/Sheet1_July_2018.csv
[OK] Saved: 2018/Sheet1_October_2018.csv
[OK] Saved: 2019/Sheet1_January_2019.csv
[OK] Saved: 2019/Sheet1_April_2019.csv
[OK] Saved: 2019/Sheet1_July_2019.csv
[OK] Saved: 2019/Sheet1_October_2019.csv
[OK] Saved: 2022/Sheet1_August_2022.csv
[OK] Saved: 2022/Sheet1_September_2022.csv
[OK] Saved: 2022/Sheet1_January_2022.csv
[OK] Saved: 2022/Sheet1_June_2022.csv
[OK] Saved: 2022/Sheet1_October_2022.csv
[OK] Saved: 2022/Sheet1_December_2022.csv
[OK] Saved: 2022/Sheet1_March_2022.csv
[OK] Saved: 2022/Sheet1_April_2022.csv
[OK] Saved: 2022/Sheet1_May_2022.csv
[OK] Saved: 2022/Sheet1_July_2022.csv
[OK] Saved: 2022/Sheet1_November_2022.csv
[OK] Saved: 2022/Sheet1_February_2022.c

#### Verifying if the variable and description counts of Reshaped Metadata Sheet 1 and Original matches

In [8]:
def batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path):
    """
    Iterates through all years and months in the inventory and compares
    total variables and descriptions in raw vs reshaped Sheet 1 metadata.
    Prints mismatches immediately, and returns a DataFrame with all results.
    """

    results = []

    for year, months_data in inventory.items():
        for month, files_list in months_data.items():
            if month == "Unmatched":
                continue  # Skip unmatched files

            # --- Load raw Sheet 1 ---
            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=0)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 1 ({e})")
                results.append({
                    'Year': year,
                    'Month': month,
                    'Raw Variable Count': 'ERROR',
                    'Reshaped Variable Count': 'ERROR',
                    'Raw Description Count': 'ERROR',
                    'Reshaped Description Count': 'ERROR',
                    'Status': f'FAIL (Raw load error: {e})'
                })
                continue

            # --- Load reshaped CSV Sheet 1 ---
            reshaped_file_path = os.path.join(
                base_path, "Metadata Sheet 1 CSV's", year, f"Sheet1_{month}_{year}.csv"
            )
            if not os.path.exists(reshaped_file_path):
                print(f"[ERROR] {month} {year}: Reshaped Sheet 1 CSV missing!")
                results.append({
                    'Year': year,
                    'Month': month,
                    'Raw Variable Count': 'ERROR',
                    'Reshaped Variable Count': 'ERROR',
                    'Raw Description Count': 'ERROR',
                    'Reshaped Description Count': 'ERROR',
                    'Status': 'FAIL (Reshaped CSV missing)'
                })
                continue

            reshaped_df = pd.read_csv(reshaped_file_path)

            # --- Count non-empty variables and descriptions ---
            raw_vars = raw_df.iloc[:, 4].dropna().astype(str).str.strip()
            raw_vars = raw_vars[raw_vars != '']
            raw_descs = raw_df.iloc[:, 5].dropna().astype(str).str.strip()
            raw_descs = raw_descs[raw_descs != '']

            reshaped_vars = reshaped_df['Variable'].astype(str).str.strip()
            reshaped_vars = reshaped_vars[reshaped_vars != '']
            reshaped_descs = reshaped_df['Description'].astype(str).str.strip()
            reshaped_descs = reshaped_descs[reshaped_descs != '']

            # --- Check if both counts match ---
            status = "PASS" if (len(raw_vars) == len(reshaped_vars) and len(raw_descs) == len(reshaped_descs)) else "FAIL"

            if status == "FAIL":
                # Immediate print for any mismatch
                print(f"[MISMATCH] {month} {year} - Variables: {len(raw_vars)} vs {len(reshaped_vars)}, "
                      f"Descriptions: {len(raw_descs)} vs {len(reshaped_descs)}")

            results.append({
                'Year': year,
                'Month': month,
                'Raw Variable Count': len(raw_vars),
                'Reshaped Variable Count': len(reshaped_vars),
                'Raw Description Count': len(raw_descs),
                'Reshaped Description Count': len(reshaped_descs),
                'Status': status
            })

    return pd.DataFrame(results).sort_values(['Year', 'Month']).reset_index(drop=True)

In [9]:
# Run the Sheet 1 verifier
verification_df = batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path)

# Print a header and show the first few rows
print("=== Sheet 1 Metadata Variables and Descriptions (Raw vs Reshaped) ===")
verification_df.head()

=== Sheet 1 Metadata Variables and Descriptions (Raw vs Reshaped) ===


Unnamed: 0,Year,Month,Raw Variable Count,Reshaped Variable Count,Raw Description Count,Reshaped Description Count,Status
0,2018,April,50,50,50,50,PASS
1,2018,January,50,50,50,50,PASS
2,2018,July,51,51,51,51,PASS
3,2018,October,51,51,51,51,PASS
4,2019,April,49,49,49,49,PASS


Checking January 2018 Metadata Reshaped Sheet 1

In [10]:
# Load metadata Sheet 1
January_metadata = load_dataset("2018", "January", "metadata", 0)

# Call your function
variables_df = extract_variables(January_metadata)

# View results
variables_df.head()

Unnamed: 0,Variable,Description
0,PUFREG,Region
1,PUFPRV,Province
2,PUFPRRCD,Province Recode
3,PUFHHNUM,Household Unique Sequential Number
4,PUFURB2K10,2010Urban-RuralFIES


Checking August 2024 Metadata Reshaped Sheet 1

In [11]:
# Load metadata Sheet 1
August_2024_metadata = load_dataset("2024", "August", "metadata", 0)

# Call your function
variables_df = extract_variables(August_2024_metadata)

# View results
variables_df.head()

Unnamed: 0,Variable,Description
0,PUFHHNUM,Household Unique Sequential Number
1,PUFPWGTPRV,Final Weight Based on Projection
2,PUFSVYMO,Survey Month
3,PUFSVYYR,Survey Year
4,PUFPSU,Psu Number


## Metadata Sheet 2 Function

<H5> Sample: January 2018 Metadata Sheet 2 (Raw)</H5>

In [12]:
# Load the second sheet of January 2018 metadata
january_2018_metadata_sheet2 = load_dataset("2018", "January", "metadata", 1)

# View the first few rows
print("=== January 2018 Metadata Sheet 2 (Raw) ===")
january_2018_metadata_sheet2.head()

=== January 2018 Metadata Sheet 2 (Raw) ===


Unnamed: 0,PUFREG_VS1,Region,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,National Capital Region,13,,
1,,,Cordillera Administrative Region,14,,
2,,,Region I - Ilocos Region,1,,
3,,,Region II - Cagayan Valley,2,,
4,,,Region III - Central Luzon,3,,


### Reshaping Metadata Sheet 2

In [13]:
import os
import pandas as pd


def reshape_sheet2_robust(df):
    """
    Convert metadata Sheet 2 (the values dictionary) into a clean, long-format table.

    This function reads the sheet exactly as it appears in Excel, without:
    - Assuming any header row
    - Auto-filling missing values
    - Inferencing min/max values
    - Guessing variable names

    Sheet 2 typically has this layout:
        Column A = Variable name (only appears once per block)
        Column B = Variable description (blank except at the start of a block)
        Column C = Label for each value (required)
        Column D = Minimum value (optional)
        Column E = Maximum value (optional)
        Column F+ = Additional text or category notes (optional)

    The function processes rows in order and:
        - Carries forward the most recent non-empty variable name (Column A)
        - Carries forward the most recent non-empty description (Column B)
        - Creates one output row per value label (Column C)
        - Leaves missing min/max/additional values as 0
        - Reads extra info (Column F onward) if present

    Returns:
        A clean pandas DataFrame with columns:
            Variable
            Description
            Label
            min_value
            max_value
            additional_value
    """

    reshaped = []

    # Ensure all blanks are handled consistently
    df = df.fillna('').astype(str)

    # Initialize with the first variable and description
    current_var = df.iloc[0, 0].strip() or 'UNKNOWN_VAR'
    current_desc = df.iloc[0, 1].strip() or ''

    # Iterate row-by-row
    for idx, row in df.iterrows():
        # ---- Column A: Variable name ----
        var_candidate = row.iloc[0].strip()
        if var_candidate:
            current_var = var_candidate

        # ---- Column B: Description ----
        desc_candidate = row.iloc[1].strip()
        if desc_candidate:
            current_desc = desc_candidate

        # ---- PRE-READ Columns D, E, F (Values) ----
        raw_min = row.iloc[3].strip()
        raw_max = row.iloc[4].strip()
        
        # Look for extra values (Column F+)
        extra = '0'
        if len(row) > 5:
            for j in range(5, len(row)):
                extra_candidate = row.iloc[j].strip()
                if extra_candidate:
                    extra = extra_candidate
                    break

        # ---- Column C: Label ----
        label = row.iloc[2].strip()

        # FIX: Don't just continue. Check if values exist.
        if not label:
            # If label is missing BUT we have min, max, or extra -> It's a valid row
            if raw_min or raw_max or extra != '0':
                label = '0'  # Assign default label
            else:
                continue     # Skip only if truly empty

        # ---- Finalize Min/Max ----
        min_value = raw_min if raw_min else '0'
        max_value = raw_max if raw_max else '0'

        # ---- Append clean record ----
        reshaped.append({
            "Variable": current_var,
            "Description": current_desc,
            "Label": label,
            "min_value": min_value,
            "max_value": max_value,
            "additional_value": extra
        })

    return pd.DataFrame(reshaped)


# ============================================================
#   load_dataset()
# ============================================================
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load any dataset (survey or metadata) from the file inventory.

    • For SURVEY CSV: normal pandas.read_csv()
    • For METADATA Excel: read with no header, reshape Sheet 2 automatically
    """
    # Retrieve file information from inventory
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )

    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])

    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)

    # Metadata Excel — always read with no header
    df = pd.read_excel(file_path, sheet_name=sheet_number, header=None)

    # Automatic reshaping ONLY for metadata Sheet 2
    if sheet_number == 1:
        df = reshape_sheet2_robust(df)

    return df


### Metadata Sheet 2 Reshaped Saving Function

In [14]:
import os
import pandas as pd

def batch_process_sheet2_metadata(inventory, base_output_path):
    """
    Loops through the inventory to process 'Sheet 2' (Value Codes).
    """
    
    # Counters for the summary report
    success_count = 0
    failure_count = 0
    skipped_count = 0
    errors_log = []

    # 1. Define Main Folder Name
    main_folder_name = "Metadata Sheet 2 CSV's"
    main_folder_path = os.path.join(base_output_path, main_folder_name)
    os.makedirs(main_folder_path, exist_ok=True)
    
    print("--- STARTING BATCH PROCESS (SHEET 2) ---")
    print(f"Target Directory: {main_folder_path}")
    print("-" * 50)

    # 2. Iterate through Inventory
    for year, months_data in inventory.items():
        
        # Create Year Subfolder
        year_folder_path = os.path.join(main_folder_path, year)
        os.makedirs(year_folder_path, exist_ok=True)
        
        for month, files_list in months_data.items():
            # Skip unmatched files
            if month == "Unmatched":
                continue
            
            # Check if metadata exists for this month
            has_metadata = any(f.get('filetype') == 'metadata' for f in files_list)
            
            if has_metadata:
                try:
                    # A. Load & Reshape
                    # Your load_dataset function handles the cleaning internally
                    clean_df = load_dataset(year, month, "metadata", 1)
                    
                    # B. Generate Filename
                    filename = f"Sheet2_{month}_{year}.csv"
                    full_save_path = os.path.join(year_folder_path, filename)
                    
                    # C. Save
                    clean_df.to_csv(full_save_path, index=False)
                    
                    print(f"[OK] Saved: {year}/{filename}")
                    success_count += 1
                    
                except Exception as e:
                    print(f"[ERROR] Failed {month} {year}: {e}")
                    errors_log.append(f"{month} {year}: {str(e)}")
                    failure_count += 1
            else:
                skipped_count += 1

    # 3. Final Report
    print("\n" + "="*40)
    print("      SHEET 2 PROCESSING SUMMARY")
    print("="*40)
    print(f"Total Saved:    {success_count}")
    print(f"Total Failed:   {failure_count}")
    print(f"Total Skipped:  {skipped_count}")
    print("-" * 40)
    
    if failure_count == 0:
        print("STATUS: COMPLETE SUCCESS")
        print(f"Files are syncing to: {main_folder_path}")
    else:
        print("STATUS: COMPLETED WITH ERRORS")
        for err in errors_log:
            print(f" - {err}")
    print("="*40)

In [15]:
# Run the processor
# (Requires 'inventory' and 'load_dataset' to be defined in your environment)
batch_process_sheet2_metadata(inventory, base_path)

--- STARTING BATCH PROCESS (SHEET 2) ---
Target Directory: G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\Metadata Sheet 2 CSV's
--------------------------------------------------
[OK] Saved: 2018/Sheet2_January_2018.csv
[OK] Saved: 2018/Sheet2_April_2018.csv
[OK] Saved: 2018/Sheet2_July_2018.csv
[OK] Saved: 2018/Sheet2_October_2018.csv
[OK] Saved: 2019/Sheet2_January_2019.csv
[OK] Saved: 2019/Sheet2_April_2019.csv
[OK] Saved: 2019/Sheet2_July_2019.csv
[OK] Saved: 2019/Sheet2_October_2019.csv
[OK] Saved: 2022/Sheet2_August_2022.csv
[OK] Saved: 2022/Sheet2_September_2022.csv
[OK] Saved: 2022/Sheet2_January_2022.csv
[OK] Saved: 2022/Sheet2_June_2022.csv
[OK] Saved: 2022/Sheet2_October_2022.csv
[OK] Saved: 2022/Sheet2_December_2022.csv
[OK] Saved: 2022/Sheet2_March_2022.csv
[OK] Saved: 2022/Sheet2_April_2022.csv
[OK] Saved: 2022/Sheet2_May_2022.csv
[OK] Saved: 2022/Sheet2_July_2022.csv
[OK] Saved: 2022/Sheet2_November_2022.csv
[OK] Saved: 2022/Sheet2_Febru

#### Verifying if the variable counts of Reshaped Metadata Sheet 2 and Original matches

In [16]:
import os
import pandas as pd

def batch_verify_sheet2_variable_and_label_count(inventory, base_path):
    """
    Batch verify Sheet 2 metadata (values dictionary) across years/months.
    Compares:
      • Unique variable count (raw vs reshaped)
      • Label count per variable (raw vs reshaped)
    Prints mismatches immediately and returns a summary DataFrame.
    """
    all_results = []

    for year, months_data in inventory.items():
        for month, files_list in months_data.items():
            if month == "Unmatched":
                continue

            # --- Load raw Sheet 2 ---
            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=1)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 2 ({e})")
                continue

            # --- Load reshaped Sheet 2 CSV ---
            reshaped_path = os.path.join(base_path, "Metadata Sheet 2 CSV's", year, f"Sheet2_{month}_{year}.csv")
            if not os.path.exists(reshaped_path):
                print(f"[ERROR] {month} {year}: Reshaped Sheet 2 CSV missing!")
                continue

            reshaped_df = pd.read_csv(reshaped_path, dtype=str).fillna("")

            # --- Count unique variables ---
            raw_vars = raw_df.iloc[:, 0].astype(str).str.strip()
            raw_vars = raw_vars[raw_vars != '']  # ignore empty
            raw_unique_vars = pd.Index(raw_vars).unique()

            resh_vars = reshaped_df['Variable'].astype(str).str.strip()
            resh_unique_vars = pd.Index(resh_vars).unique()

            # Check variable count mismatch
            variable_mismatch = len(raw_unique_vars) != len(resh_unique_vars)
            if variable_mismatch:
                print(f"[VARIABLE COUNT MISMATCH] {month} {year}: Raw={len(raw_unique_vars)}, Reshaped={len(resh_unique_vars)}")

            # --- Count labels per variable ---
            label_mismatches = []

            for var in raw_unique_vars:
                # Raw: select rows matching variable
                raw_rows = raw_df[raw_df.iloc[:, 0].astype(str).str.strip() == var]

                # Count non-empty label cells safely
                raw_label_count = raw_rows.iloc[:, 2:6].astype(str).apply(lambda x: x.str.strip().ne('').any(), axis=1).sum()

                # Reshaped: count rows per variable
                resh_label_count = reshaped_df[reshaped_df['Variable'].astype(str).str.strip() == var].shape[0]

                if raw_label_count != resh_label_count:
                    label_mismatches.append({
                        "Variable": var,
                        "Raw_Label_Count": raw_label_count,
                        "Reshaped_Label_Count": resh_label_count
                    })

            # --- Print immediate label mismatches ---
            for m in label_mismatches:
                print(f"[LABEL COUNT MISMATCH] {month} {year} - Variable: {m['Variable']} | Raw={m['Raw_Label_Count']} vs Reshaped={m['Reshaped_Label_Count']}")

            # --- Record summary ---
            all_results.append({
                "Year": year,
                "Month": month,
                "Raw_Variable_Count": len(raw_unique_vars),
                "Reshaped_Variable_Count": len(resh_unique_vars),
                "Variable_Count_Status": "PASS" if not variable_mismatch else "FAIL",
                "Label_Count_Mismatches": len(label_mismatches)
            })

    # Return as DataFrame
    return pd.DataFrame(all_results).sort_values(['Year', 'Month']).reset_index(drop=True)


In [17]:
summary_df = batch_verify_sheet2_variable_and_label_count(inventory, base_path)
print("=== Sheet 2 Variable & Label Count Verification ===")
summary_df

=== Sheet 2 Variable & Label Count Verification ===


Unnamed: 0,Year,Month,Raw_Variable_Count,Reshaped_Variable_Count,Variable_Count_Status,Label_Count_Mismatches
0,2018,April,43,43,PASS,0
1,2018,January,46,46,PASS,0
2,2018,July,44,44,PASS,0
3,2018,October,52,52,PASS,0
4,2019,April,45,45,PASS,0
5,2019,January,44,44,PASS,0
6,2019,July,56,56,PASS,0
7,2019,October,56,56,PASS,0
8,2022,April,57,57,PASS,0
9,2022,August,46,46,PASS,0


#### A verifier to check if all variables, descriptions, labels, and value fields in the original Sheet 2 exist and match in the reshaped Sheet 2, regardless of row order

In [18]:
def verify_sheet2(original_df, reshaped_df):
    """
    Compare original Sheet 2 with reshaped version.
    Checks:
      • Same variables
      • Same descriptions
      • Same labels
      • Same min/max/additional values
      • Ignores row order
    """

    # Normalize to string
    original = original_df.fillna("").astype(str)
    reshaped = reshaped_df.fillna("").astype(str)

    # --- Extract original as dict ---
    def build_original_dict(df):
        data = {}
        current_var = ""
        current_desc = ""
        for _, row in df.iterrows():
            colA = row.iloc[0].strip()
            colB = row.iloc[1].strip()
            colC = row.iloc[2].strip()
            if colA:
                current_var = colA
            if colB:
                current_desc = colB
            if not colC:
                continue
            minv = row.iloc[3].strip() if len(row) > 3 else ""
            maxv = row.iloc[4].strip() if len(row) > 4 else ""
            extra = ""
            if len(row) > 5:
                for j in range(5, len(row)):
                    if row.iloc[j].strip():
                        extra = row.iloc[j].strip()
                        break
            if current_var not in data:
                data[current_var] = []
            data[current_var].append({
                "Description": current_desc,
                "Label": colC,
                "min_value": minv,
                "max_value": maxv,
                "additional_value": extra
            })
        return data

    orig_dict = build_original_dict(original)

    # --- Extract reshaped as dict (warning-free) ---
    resh_dict = {
        var: group.drop(columns="Variable").to_dict("records")
        for var, group in reshaped.groupby("Variable")
    }

    # --- Verification ---
    errors = []

    orig_vars = set(orig_dict.keys())
    resh_vars = set(resh_dict.keys())

    missing_vars = orig_vars - resh_vars
    extra_vars = resh_vars - orig_vars
    if missing_vars:
        errors.append(f"Missing variables in reshaped: {missing_vars}")
    if extra_vars:
        errors.append(f"Extra variables in reshaped: {extra_vars}")

    # Detailed label/content comparison
    for var in orig_vars & resh_vars:
        orig_records = orig_dict[var]
        resh_records = resh_dict[var]
        orig_set = {
            (d["Label"], d["min_value"], d["max_value"], d["additional_value"])
            for d in orig_records
        }
        resh_set = {
            (d["Label"], d["min_value"], d["max_value"], d["additional_value"])
            for d in resh_records
        }
        missing_rec = orig_set - resh_set
        extra_rec = resh_set - orig_set
        if missing_rec:
            errors.append(f"[{var}] Missing records: {missing_rec}")
        if extra_rec:
            errors.append(f"[{var}] Extra records: {extra_rec}")

    if not errors:
        return "SUCCESS: Reshaped metadata matches the original Sheet 2."
    else:
        return "MISMATCH FOUND:\n" + "\n".join(errors)


Checking for January 2018 Original vs Reshaped Metadata Sheet 2

In [19]:
# ===========================
# Caller: content-based verification with month/year in messages
# ===========================

# Load original metadata (Sheet 2)
original_df = load_dataset("2018", "January", "metadata", 1)

# Load reshaped metadata (automatically reshaped)
reshaped_df = load_dataset("2018", "January", "metadata", 1)

# Store month/year for messages
month = "January"
year = "2018"

# Call the verifier
result_message = verify_sheet2(original_df, reshaped_df)

# Customize message to include month/year
if "SUCCESS" in result_message:
    print(f"SUCCESS: Reshaped Sheet 2 metadata for {month} {year} matches the original.\n")
    print("Sample content verification (first 5 variables):\n")
    
    # Pick first 5 unique variables from reshaped
    sample_vars = reshaped_df["Variable"].unique()[:5]
    for var in sample_vars:
        sample_rows = reshaped_df[reshaped_df["Variable"] == var]
        print(f"Variable: {var}")
        print(sample_rows[["Label", "min_value", "max_value", "additional_value"]])
        print("-" * 50)
else:
    print(f"MISMATCH FOUND in {month} {year} metadata:\n")
    # Show first few rows from both original and reshaped for inspection
    orig_preview = original_df.fillna("").astype(str).head(10)
    resh_preview = reshaped_df.fillna("").astype(str).head(10)
    print("Original preview:")
    print(orig_preview)
    print("\nReshaped preview:")
    print(resh_preview)


SUCCESS: Reshaped Sheet 2 metadata for January 2018 matches the original.

Sample content verification (first 5 variables):

Variable: PUFREG_VS1
                                   Label min_value max_value additional_value
0                National Capital Region        13         0                0
1       Cordillera Administrative Region        14         0                0
2               Region I - Ilocos Region         1         0                0
3             Region II - Cagayan Valley         2         0                0
4             Region III - Central Luzon         3         0                0
5                Region IVA - CALABARZON         4         0                0
6                  Region IVB - MIMAROPA        17         0                0
7                        Region V- Bicol         5         0                0
8            Region VI - Western Visayas         6         0                0
9           Region VII - Central Visayas         7         0              

Checking for August 2024 Original vs Reshaped Metadata Sheet 2

In [20]:
# ===========================
# Caller: content-based verification with month/year in messages
# ===========================

# Load original metadata (Sheet 2)
original_df = load_dataset("2024", "August", "metadata", 1)

# Load reshaped metadata (automatically reshaped)
reshaped_df = load_dataset("2024", "August", "metadata", 1)

# Store month/year for messages
month = "August"
year = "2024"

# Call the verifier
result_message = verify_sheet2(original_df, reshaped_df)

# Customize message to include month/year
if "SUCCESS" in result_message:
    print(f"SUCCESS: Reshaped Sheet 2 metadata for {month} {year} matches the original.\n")
    print("Sample content verification (first 5 variables):\n")
    
    # Pick first 5 unique variables from reshaped
    sample_vars = reshaped_df["Variable"].unique()[:5]
    for var in sample_vars:
        sample_rows = reshaped_df[reshaped_df["Variable"] == var]
        print(f"Variable: {var}")
        print(sample_rows[["Label", "min_value", "max_value", "additional_value"]])
        print("-" * 50)
else:
    print(f"MISMATCH FOUND in {month} {year} metadata:\n")
    # Show first few rows from both original and reshaped for inspection
    orig_preview = original_df.fillna("").astype(str).head(10)
    resh_preview = reshaped_df.fillna("").astype(str).head(10)
    print("Original preview:")
    print(orig_preview)
    print("\nReshaped preview:")
    print(resh_preview)


SUCCESS: Reshaped Sheet 2 metadata for August 2024 matches the original.

Sample content verification (first 5 variables):

Variable: PUFSVYMO_VS1
        Label min_value max_value additional_value
0     January       1.0         0                0
1    February       2.0         0                0
2       March       3.0         0                0
3       April       4.0         0                0
4         May       5.0         0                0
5        June       6.0         0                0
6        July       7.0         0                0
7      August       8.0         0                0
8   September       9.0         0                0
9     October      10.0         0                0
10   November      11.0         0                0
11   December      12.0         0                0
--------------------------------------------------
Variable: PUFHHSIZE_VS1
   Label min_value max_value additional_value
12     1       1.0         0                0
13     2       2.0     

Checking January 2018 Metadata Reshaped Sheet 2

In [21]:
clean_jan_2018 = load_dataset("2018", "January", "metadata", 1)
clean_jan_2018.head()

Unnamed: 0,Variable,Description,Label,min_value,max_value,additional_value
0,PUFREG_VS1,Region,National Capital Region,13,0,0
1,PUFREG_VS1,Region,Cordillera Administrative Region,14,0,0
2,PUFREG_VS1,Region,Region I - Ilocos Region,1,0,0
3,PUFREG_VS1,Region,Region II - Cagayan Valley,2,0,0
4,PUFREG_VS1,Region,Region III - Central Luzon,3,0,0


Checking August 2024 Metadata Reshaped Sheet 2

In [22]:
clean_aug_2024 = load_dataset("2024", "August", "metadata", 1)
clean_aug_2024.head()

Unnamed: 0,Variable,Description,Label,min_value,max_value,additional_value
0,PUFSVYMO_VS1,Survey Month,January,1.0,0,0
1,PUFSVYMO_VS1,Survey Month,February,2.0,0,0
2,PUFSVYMO_VS1,Survey Month,March,3.0,0,0
3,PUFSVYMO_VS1,Survey Month,April,4.0,0,0
4,PUFSVYMO_VS1,Survey Month,May,5.0,0,0


Checking for Variables with Additional Value (Extra Column)

In [23]:
# Load metadata Sheet 2 for your desired month/year
df_metadata = load_dataset("2018", "January", "metadata", 1)

# Filter rows for PUFC10_CONWR_VS1
pu_fc10 = df_metadata[df_metadata["Variable"] == "PUFC10_CONWR_VS1"]

# Display the result
pu_fc10


Unnamed: 0,Variable,Description,Label,min_value,max_value,additional_value
219,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Overseas Contract Workers,1,0,0
220,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Workers other than OCW,2,0,0
221,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,"Employees in Philippine Embassy, Consulates & ...",3,0,0
222,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Students abroad/Tourists,4,0,0
223,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Others,5,0,0
224,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Less than 15 Years Old,0,0,Not Applicable


In [24]:
def load_clean_sheet2(year, month):
    """
    Loads the pre-processed clean Sheet 2 metadata (values dictionary) CSV 
    saved during the batch process.
    """
    main_folder_name = "Metadata Sheet 2 CSV's"
    filename = f"Sheet2_{month}_{year}.csv"
    file_path = os.path.join(base_path, main_folder_name, year, filename)
    
    # Check for the expected path
    if not os.path.exists(file_path):
        # Fallback check in case the file was saved directly in the year folder
        file_path_fallback = os.path.join(base_path, year, filename)
        if os.path.exists(file_path_fallback):
            file_path = file_path_fallback
        else:
            # Raise a clear error if the metadata file is missing
            raise FileNotFoundError(
                f"Clean Sheet 2 metadata not found for {month} {year}. "
                f"Expected path: {file_path}"
            )

    # Read the CSV, ensuring code columns remain as strings for accurate mapping
    return pd.read_csv(
        file_path, 
        dtype={
            'min_value': str, 
            'max_value': str, 
            'additional_value': str
        }
    )

# --- CORE MINI-SPRINT FUNCTION ---
def decode_single_variable(survey_df, clean_metadata_df, variable_name):
    """
    Decodes a single variable (column) in the survey DataFrame using 
    the clean long-format Sheet 2 metadata.
    
    The function adds a new column named {VARIABLE_NAME}_LABEL.
    """
    
    # 1. Prepare the Lookup Table (Mapping Dictionary)
    lookup_table = clean_metadata_df[
        clean_metadata_df['Variable'].str.contains(variable_name, case=False, na=False)
    ].copy()
    
    lookup_table['min_value'] = lookup_table['min_value'].astype(str).str.strip()
    
    string_mapping_dict = pd.Series(
        lookup_table['Label'].values, 
        index=lookup_table['min_value']
    ).to_dict()
    
    # 2. Prepare Columns and New Column Name
    new_col_name = f"{variable_name}_LABEL"
    original_col = survey_df[variable_name].copy()
    
    # 3. Perform the Mapping (Handles mixed types: strings and numerics)
    mapped_series = original_col.astype(str).str.strip().replace(string_mapping_dict)
    
    numeric_mapping_dict = {}
    for k, v in string_mapping_dict.items():
        try:
            if "." in k:
                 numeric_mapping_dict[float(k)] = v
            else:
                 numeric_mapping_dict[int(k)] = v
        except ValueError:
            pass
    
    numeric_mapped_series = original_col.map(numeric_mapping_dict)
    
    survey_df[new_col_name] = numeric_mapped_series.fillna(mapped_series)
    
    return survey_df

In [25]:
YEAR = "2018"
MONTH = "January"
VARIABLE_REGION = "PUFREG"
VARIABLE_MONTH = "PUFSVYMO" 

# Load the raw survey data (e.g., JANUARY_2018.CSV)
jan_2018_survey_raw = load_dataset(YEAR, MONTH, "survey")

# Load the clean Sheet 2 metadata (our lookup table)
jan_2018_metadata_clean = load_clean_sheet2(YEAR, MONTH)

# Display the head of the raw data to confirm initial state and variable existence
# Showing only the coded columns initially
jan_2018_survey_raw[[
    VARIABLE_REGION, 
    VARIABLE_MONTH, 
    'PUFPRV', 
    'PUFPRRCD'
]].head(10)

Unnamed: 0,PUFREG,PUFSVYMO,PUFPRV,PUFPRRCD
0,14,1,1,100
1,14,1,1,100
2,14,1,1,100
3,14,1,1,100
4,14,1,1,100
5,14,1,1,100
6,14,1,1,100
7,14,1,1,100
8,14,1,1,100
9,14,1,1,100


In [26]:
# Create a copy to perform the decoding on
jan_2018_decoded = jan_2018_survey_raw.copy()

# Decode the REGION variable
jan_2018_decoded = decode_single_variable(
    jan_2018_decoded, jan_2018_metadata_clean, VARIABLE_REGION
)

# Decode the MONTH variable
jan_2018_decoded = decode_single_variable(
    jan_2018_decoded, jan_2018_metadata_clean, VARIABLE_MONTH
)

# Display a focused output: only the new Label columns and retained columns
jan_2018_decoded[[
    f"{VARIABLE_REGION}_LABEL",     # New decoded label for Region
    f"{VARIABLE_MONTH}_LABEL",      # New decoded label for Month
    'PUFPRV',                       # Unrelated column 1 (retained)
    'PUFPRRCD'                      # Unrelated column 2 (retained)
]].head(10)

Unnamed: 0,PUFREG_LABEL,PUFSVYMO_LABEL,PUFPRV,PUFPRRCD
0,Cordillera Administrative Region,January,1,100
1,Cordillera Administrative Region,January,1,100
2,Cordillera Administrative Region,January,1,100
3,Cordillera Administrative Region,January,1,100
4,Cordillera Administrative Region,January,1,100
5,Cordillera Administrative Region,January,1,100
6,Cordillera Administrative Region,January,1,100
7,Cordillera Administrative Region,January,1,100
8,Cordillera Administrative Region,January,1,100
9,Cordillera Administrative Region,January,1,100


In [27]:
TEST_VARIABLES = [VARIABLE_REGION, VARIABLE_MONTH]

# --- Verification Report Function ---
def generate_verification_report(raw_df, decoded_df, var_name):
    """Generates a clear report confirming decoding integrity."""
    
    new_col = f"{var_name}_LABEL"
    
    # 1. Total Record Count Check (Must be TRUE for integrity)
    raw_total = raw_df[var_name].value_counts(dropna=False).sum()
    decoded_total = decoded_df[new_col].value_counts(dropna=False).sum()
    
    integrity_check = (raw_total == decoded_total)
    
    # 2. Value Counts Check (To ensure codes were replaced by labels)
    
    # Create the report structure
    report = {
        'Variable': var_name,
        'Raw Total Records': [raw_total],
        'Decoded Total Records': [decoded_total],
        'Integrity Status': ['PASS' if integrity_check else 'FAIL']
    }
    
    report_df = pd.DataFrame(report).set_index('Variable')
    
    print(f"\n--- Decoded Value Counts for {var_name} (Top 5) ---")
    display(decoded_df[new_col].value_counts(dropna=False).head(5))
    
    print(f"\n--- INTEGRITY CHECK SUMMARY for {var_name} ---")
    return report_df

# --- Run Verification for Both Variables ---
verification_reports = []

for var in TEST_VARIABLES:
    report = generate_verification_report(jan_2018_survey_raw, jan_2018_decoded, var)
    verification_reports.append(report)

# Combine and display the final summary tables
pd.concat(verification_reports)


--- Decoded Value Counts for PUFREG (Top 5) ---


PUFREG_LABEL
National Capital Region          25614
Region III - Central Luzon       13931
Region VI - Western Visayas      11973
Region VIII - Eastern Visayas    11379
Region X - Northern Mindanao     10551
Name: count, dtype: int64


--- INTEGRITY CHECK SUMMARY for PUFREG ---

--- Decoded Value Counts for PUFSVYMO (Top 5) ---


PUFSVYMO_LABEL
January    180262
Name: count, dtype: int64


--- INTEGRITY CHECK SUMMARY for PUFSVYMO ---


Unnamed: 0_level_0,Raw Total Records,Decoded Total Records,Integrity Status
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PUFREG,180262,180262,PASS
PUFSVYMO,180262,180262,PASS
