## Installing Libraries Needed

In [None]:
!python -m pip install pandas




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!python -m pip install openpyxl





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Dataset Inventory Loader

In [3]:
import os
import re

import pandas as pd

# Root folder that holds one sub-folder per survey year (folder names are all digits).
base_path = r"G:\My Drive\Labor Force Survey"

# Month ordering: calendar position of each month, used to print months chronologically.
month_order = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}

# Patterns
# Full English month name anywhere in the filename (abbreviations such as "jan" do not match).
month_pattern = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    re.IGNORECASE
)
# First 20xx year that appears in the filename.
year_pattern = re.compile(r"(20\d{2})")

# Detect year folders from drive.
# os.listdir returns entries in arbitrary order, so sort once here to keep the
# inventory, the printed summary and load_dataset's first-match lookup reproducible.
year_folders = sorted(
    f for f in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, f)) and f.isdigit()
)

print("Detected year folders:", year_folders)

# inventory[year][month] -> list of {"filename", "filetype", "file_year"} records
inventory = {}

for year in year_folders:
    year_path = os.path.join(base_path, year)

    # Accept both CSV and XLSX, in a stable (sorted) order
    data_files = sorted(
        f for f in os.listdir(year_path)
        if f.lower().endswith((".csv", ".xlsx"))
    )

    inventory[year] = {}

    for file in data_files:
        upper = file.upper()

        # Detect type from the extension alone: XLSX = metadata, CSV = survey
        filetype = "metadata" if upper.endswith(".XLSX") else "survey"

        # Detect month; files without a full month name are grouped under "Unmatched"
        month_match = month_pattern.search(upper)
        month = month_match.group(1).capitalize() if month_match else "Unmatched"

        # Detect year inside filename (may differ from the folder name; not validated here)
        year_match = year_pattern.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        # Store into inventory
        inventory[year].setdefault(month, []).append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })

# Print clean summary
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for yr in sorted(inventory.keys()):
    print(f"Year {yr}:")

    # Unknown keys such as "Unmatched" get rank 99 and therefore print last
    sorted_months = sorted(
        inventory[yr].keys(),
        key=lambda m: month_order.get(m, 99)
    )

    for month in sorted_months:
        print(f"  {month}:")
        for item in inventory[yr][month]:
            print(f"    {item['filename']} ({item['filetype']})")

    print()


Detected year folders: ['2018', '2019', '2022', '2023', '2024']

=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018_METADATA.xlsx (metadata)
    JANUARY_2018.CSV (survey)
  April:
    APRIL_2018.CSV (survey)
    APRIL_2018_METADATA.xlsx (metadata)
  July:
    JULY_2018.CSV (survey)
    JULY_2018_METADATA.xlsx (metadata)
  October:
    OCTOBER_2018.CSV (survey)
    OCTOBER_2018_METADATA.xlsx (metadata)
  Unmatched:
    clean_jan_2018_metadata.csv (survey)
    clean_jan_2018_metadata_sheet2.csv (survey)

Year 2019:
  January:
    JANUARY_2019.CSV (survey)
    JANUARY_2019_METADATA.xlsx (metadata)
  April:
    APRIL_2019.CSV (survey)
    APRIL_2019_METADATA.xlsx (metadata)
  July:
    JULY_2019.CSV (survey)
    JULY_2019_METADATA.xlsx (metadata)
  October:
    OCTOBER_2019.CSV (survey)
    OCTOBER_2019_METADATA.xlsx (metadata)

Year 2022:
  January:
    JANUARY_2022.csv (survey)
    JANUARY_2022_METADATA.xlsx (metadata)
  February:
    FEBRUARY_2022.csv (survey)
   

## Load Dataset Function

In [4]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load one file listed in the global ``inventory`` as a pandas DataFrame.

    Parameters
    ----------
    year : str
        Year key of the inventory, e.g. "2018".
    month : str
        Month key of the inventory, e.g. "January".
    filetype : str, default "survey"
        "survey" files are read with ``pd.read_csv``; anything else that is
        found in the inventory (i.e. "metadata") is read with ``pd.read_excel``.
    sheet_number : int, optional
        Zero-based sheet position for metadata workbooks: 0 is sheet 1,
        1 is sheet 2. When omitted the first sheet is read. Ignored for
        survey files.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the inventory holds no ``filetype`` entry for ``month`` / ``year``,
        including the case where the year or the month itself is unknown.
    """
    # Chain .get() so an unknown year or month is reported through the same
    # ValueError as a missing file instead of surfacing as a bare KeyError.
    candidates = inventory.get(year, {}).get(month, [])

    # The first inventory entry of the requested type wins.
    file_info = next((f for f in candidates if f["filetype"] == filetype), None)
    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])

    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)

    # sheet_name=0 is what read_excel uses when no sheet is given.
    sheet = 0 if sheet_number is None else sheet_number
    return pd.read_excel(file_path, sheet_name=sheet)

## Metadata Sheet 1 Function 

In [5]:
def extract_variables(df):
    """
    Build a two-column variable dictionary from a raw metadata sheet.

    Rows whose 'Unnamed: 4' cell is missing are dropped. From the remaining
    rows, 'Unnamed: 4' is exposed as 'Variable' and 'Unnamed: 5' as
    'Description'. The original row index is preserved and the input frame
    is left untouched.
    """
    source_to_target = {'Unnamed: 4': 'Variable', 'Unnamed: 5': 'Description'}

    # Boolean mask of rows that actually carry a variable name
    has_variable = df['Unnamed: 4'].notna()

    # Select rows and columns in one step, then relabel the columns
    selected = df.loc[has_variable, list(source_to_target)]
    return selected.rename(columns=source_to_target)


### Testing the Function

In [6]:
import pandas as pd

# Load the January 2018 metadata workbook, sheet_number=0 (sheet 1)
January_metadata = load_dataset("2018", "January", "metadata", 0)

# Reduce the raw sheet to a Variable / Description table
variables_1 = extract_variables(January_metadata)

# Preview the first rows of the result
variables_1.head()


Unnamed: 0,Variable,Description
1,PUFREG,Region
2,PUFPRV,Province
3,PUFPRRCD,Province Recode
4,PUFHHNUM,Household Unique Sequential Number
6,PUFURB2K10,2010Urban-RuralFIES


In [7]:
import pandas as pd

# Load the August 2024 metadata workbook, sheet_number=0 (sheet 1)
August_metadata = load_dataset("2024", "August", "metadata", 0)

# Reduce the raw sheet to a Variable / Description table
variables_2 = extract_variables(August_metadata)

# Preview the first rows of the result
variables_2.head()


Unnamed: 0,Variable,Description
1,PUFHHNUM,Household Unique Sequential Number
3,PUFPWGTPRV,Final Weight Based on Projection
4,PUFSVYMO,Survey Month
5,PUFSVYYR,Survey Year
6,PUFPSU,Psu Number


## Metadata Sheet 2 Function

In [8]:
import pandas as pd
import os

def _parse_code(raw):
    """Turn numeric text such as "1" or "1.0" into an int; return any other text unchanged."""
    try:
        return int(float(raw))
    except (ValueError, TypeError, OverflowError):
        # ValueError: empty string, free text (e.g. "See Note") or "nan".
        # OverflowError: "inf" / "Infinity", which float() accepts but int() rejects.
        return raw


def reshape_sheet2(df):
    """
    Flatten a value-set sheet into one row per category.

    Columns are read by position: 0 = variable, 1 = label, 2 = category,
    3 = minimum code, 4 = maximum code, 5 = additional code. Columns that the
    sheet does not have are treated as empty.

    The variable and label are carried forward: a non-empty cell replaces the
    current value, an empty cell reuses the previous one. The initial values
    come from the first two column headers unless those are pandas'
    auto-generated 'Unnamed: N' labels. Rows without a category are not
    emitted, but they still update the carried-forward variable and label.

    Code handling:
      * numeric text becomes an int via ``int(float(text))`` (so "1.9" -> 1);
        any other text is kept as is;
      * an empty maximum falls back to the minimum;
      * an empty additional value falls back to 0.

    Returns a DataFrame with the columns Variable, Label, Category,
    min_value, max_value and additional_value (also when no rows qualify).
    """
    output_columns = [
        "Variable", "Label", "Category",
        "min_value", "max_value", "additional_value",
    ]

    # 1. Clean the dataframe: missing cells become '' and everything is text
    df = df.fillna('').astype(str)

    def header_text(position):
        # Header labels may be non-strings (e.g. numbers), so compare and strip their str() form.
        if position >= len(df.columns):
            return None
        text = str(df.columns[position])
        if not text or 'Unnamed' in text:
            return None
        return text.strip()

    def cell(row, position):
        # Sheets with fewer than six columns simply have no value at that position.
        return row.iloc[position].strip() if position < len(row) else ''

    # Header extraction: seeds the forward fill
    current_var = header_text(0)
    current_label = header_text(1)

    reshaped = []
    for _, row in df.iterrows():
        var_col = cell(row, 0)
        label_col = cell(row, 1)
        category_col = cell(row, 2)
        raw_min = cell(row, 3)
        raw_max = cell(row, 4)
        raw_add = cell(row, 5)

        # Forward fill logic
        if var_col:
            current_var = var_col
        if label_col:
            current_label = label_col

        if not category_col:
            continue

        min_value = _parse_code(raw_min)
        # Empty maximum -> single-code category, so reuse the minimum
        max_value = _parse_code(raw_max) if raw_max else min_value
        # Empty additional value -> default to 0
        additional_value = _parse_code(raw_add) if raw_add else 0

        reshaped.append({
            "Variable": current_var,
            "Label": current_label,
            "Category": category_col,
            "min_value": min_value,
            "max_value": max_value,
            "additional_value": additional_value
        })

    # Passing columns keeps the schema stable even when no row qualified
    return pd.DataFrame(reshaped, columns=output_columns)

### Testing the Function

In [9]:
import pandas as pd

# Load the January 2018 metadata workbook, sheet_number=1 (sheet 2).
# Note: this rebinds January_metadata, which earlier held sheet 1.
January_metadata = load_dataset("2018", "January", "metadata", 1)

# Flatten the sheet into one row per category
variables_3 = reshape_sheet2(January_metadata)

# Preview the first rows of the result
variables_3.head()

Unnamed: 0,Variable,Label,Category,min_value,max_value,additional_value
0,PUFREG_VS1,Region,National Capital Region,13,13,0
1,PUFREG_VS1,Region,Cordillera Administrative Region,14,14,0
2,PUFREG_VS1,Region,Region I - Ilocos Region,1,1,0
3,PUFREG_VS1,Region,Region II - Cagayan Valley,2,2,0
4,PUFREG_VS1,Region,Region III - Central Luzon,3,3,0


In [10]:
import pandas as pd

# Load the August 2024 metadata workbook, sheet_number=1 (sheet 2).
# Note: this rebinds August_metadata, which earlier held sheet 1.
August_metadata = load_dataset("2024", "August", "metadata", 1)

# Flatten the sheet into one row per category
variables_4 = reshape_sheet2(August_metadata)

# Preview the first rows of the result
variables_4.head()


Unnamed: 0,Variable,Label,Category,min_value,max_value,additional_value
0,PUFSVYMO_VS1,Survey Month,January,1,1,0
1,PUFSVYMO_VS1,Survey Month,February,2,2,0
2,PUFSVYMO_VS1,Survey Month,March,3,3,0
3,PUFSVYMO_VS1,Survey Month,April,4,4,0
4,PUFSVYMO_VS1,Survey Month,May,5,5,0


### Verifier Function for Sheet 2

In [11]:
import pandas as pd
import numpy as np

def verify_sheet2(reshaped_df):
    """
    Run completeness and range checks on a reshaped value-set DataFrame and
    print a readable verification report.

    Expects the columns 'Variable', 'Label', 'Category', 'min_value' and
    'max_value'.

    Checks performed:
      * category_completeness   - 'Category' is neither missing nor ''.
      * value_code_completeness - 'min_value' is neither missing nor blank
        text (a blank minimum is stored as '' rather than NaN, so both forms
        are counted).
      * range_integrity         - among rows where both codes are numeric,
        'min_value' must not exceed 'max_value'.

    Returns:
        dict: One entry per check with 'status' ("PASS"/"FAIL"), 'accuracy'
        (percentage rounded to 2 decimals), 'errors_count' and 'error_sample'
        (up to three offending rows as dicts). For an empty DataFrame the
        dict is ``{"error": "Cannot verify empty DataFrame."}``.
    """
    verification_checks = {}
    total_records = len(reshaped_df)

    if total_records == 0:
        print("\n=================================")
        print("ERROR: Cannot verify empty DataFrame.")
        print("=================================")
        return {"error": "Cannot verify empty DataFrame."}

    def summarize(error_rows, base_count, sample_columns):
        # Shared result layout for every check; an empty base counts as fully accurate.
        error_count = len(error_rows)
        accuracy = 100.0 * (1.0 - (error_count / base_count)) if base_count > 0 else 100.0
        return {
            'status': "PASS" if error_count == 0 else "FAIL",
            'accuracy': round(accuracy, 2),
            'errors_count': error_count,
            'error_sample': error_rows[sample_columns].head(3).to_dict('records')
        }

    # 1. --- CALCULATIONS ---

    # Check for Missing Categories
    missing_categories = reshaped_df[reshaped_df['Category'].isna() | (reshaped_df['Category'] == '')]
    verification_checks['category_completeness'] = summarize(
        missing_categories, total_records, ['Variable', 'Label', 'Category']
    )

    # Check for Missing Minimum Code Value.
    # isna() alone is not enough: an empty minimum arrives as '' (text), not NaN.
    min_values = reshaped_df['min_value']
    min_is_missing = min_values.isna() | min_values.astype(str).str.strip().eq('')
    missing_min = reshaped_df[min_is_missing]
    verification_checks['value_code_completeness'] = summarize(
        missing_min, total_records, ['Category', 'min_value', 'max_value']
    )

    # Check for Structural Integrity (Min > Max), restricted to rows where both codes are numeric
    numeric_rows = reshaped_df.copy()
    numeric_rows['min_num'] = pd.to_numeric(numeric_rows['min_value'], errors='coerce', downcast='integer')
    numeric_rows['max_num'] = pd.to_numeric(numeric_rows['max_value'], errors='coerce', downcast='integer')
    numeric_rows = numeric_rows[numeric_rows['min_num'].notna() & numeric_rows['max_num'].notna()]
    invalid_ranges = numeric_rows[numeric_rows['min_num'] > numeric_rows['max_num']]
    verification_checks['range_integrity'] = summarize(
        invalid_ranges, len(numeric_rows), ['Category', 'min_num', 'max_num']
    )

    # 2. --- PRINTING (Integrated for plug-and-play experience) ---

    print("\n=================================================")
    print("      DATA TRANSFORMATION VERIFICATION REPORT")
    print("=================================================")

    # Loop through the final checks and print them in a readable format
    for check_name, data in verification_checks.items():
        title = check_name.replace('_', ' ').title()
        status_marker = ">>" if data['status'] == 'PASS' else "XX"

        print(f"\n--- {status_marker} {title} ---")
        print(f"  Status: {data['status']}")
        print(f"  Accuracy: {data['accuracy']}%")
        print(f"  Errors Found: {data['errors_count']}")

        if data['errors_count'] > 0:
            print("\n  Sample Error Rows:")
            for sample in data['error_sample']:
                formatted_sample = ', '.join([f'{k}: {v}' for k, v in sample.items()])
                print(f"    - {formatted_sample}")

    print("\n=================================================")

    # Returns the structured data for downstream processes if needed
    return verification_checks

### Testing the Function

In [None]:
# Verify the reshaped January 2018 sheet-2 table; prints the report and keeps the result dict
verification_report_1 = verify_sheet2(variables_3)


      DATA TRANSFORMATION VERIFICATION REPORT

--- >> Category Completeness ---
  Status: PASS
  Accuracy: 100.0%
  Errors Found: 0

--- >> Value Code Completeness ---
  Status: PASS
  Accuracy: 100.0%
  Errors Found: 0

--- >> Range Integrity ---
  Status: PASS
  Accuracy: 100.0%
  Errors Found: 0



In [13]:
# Verify the reshaped August 2024 sheet-2 table; prints the report and keeps the result dict
verification_report_2 = verify_sheet2(variables_4)


      DATA TRANSFORMATION VERIFICATION REPORT

--- >> Category Completeness ---
  Status: PASS
  Accuracy: 100.0%
  Errors Found: 0

--- >> Value Code Completeness ---
  Status: PASS
  Accuracy: 100.0%
  Errors Found: 0

--- >> Range Integrity ---
  Status: PASS
  Accuracy: 100.0%
  Errors Found: 0

