## Installing Libraries Needed

In [None]:
!python -m pip install pandas

In [None]:
!python -m pip install openpyxl


## Dataset Inventory Loader

In [1]:
import os
import re

base_path = r"G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey"

# Month ordering
month_order = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}

# Patterns
month_pattern = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    re.IGNORECASE
)
year_pattern = re.compile(r"(20\d{2})")

# Detect year folders from drive
year_folders = [
    f for f in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, f)) and f.isdigit()
]

print("Detected year folders:", sorted(year_folders))

inventory = {}

for year in sorted(year_folders):
    year_path = os.path.join(base_path, year)

    # Accept both CSV and XLSX
    data_files = [
        f for f in os.listdir(year_path)
        if f.lower().endswith(".csv") or f.lower().endswith(".xlsx")
    ]

    inventory[year] = {}

    for file in data_files:
        upper = file.upper()

        # Detect type
        if upper.endswith(".XLSX"):
            filetype = "metadata"  # XLSX = metadata
        else:
            filetype = "survey"    # CSV = survey

        # Detect month
        month_match = month_pattern.search(upper)
        month = (
            month_match.group(1).capitalize()
            if month_match
            else "Unmatched"
        )

        # Detect year inside filename
        year_match = year_pattern.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        # Store into inventory
        if month not in inventory[year]:
            inventory[year][month] = []

        inventory[year][month].append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })

# Print clean summary
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for yr in sorted(inventory.keys()):
    print(f"Year {yr}:")

    sorted_months = sorted(
        inventory[yr].keys(),
        key=lambda m: month_order.get(m, 99)
    )

    for month in sorted_months:
        print(f"  {month}:")
        for item in inventory[yr][month]:
            print(f"    {item['filename']} ({item['filetype']})")

    print()


Detected year folders: ['2018', '2019', '2022', '2023', '2024']

=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018_METADATA.xlsx (metadata)
    JANUARY_2018.CSV (survey)
    ~$JANUARY_2018_METADATA.xlsx (metadata)
  April:
    APRIL_2018_METADATA.xlsx (metadata)
    APRIL_2018.CSV (survey)
  July:
    JULY_2018.CSV (survey)
    JULY_2018_METADATA.xlsx (metadata)
  October:
    OCTOBER_2018_METADATA.xlsx (metadata)
    OCTOBER_2018.CSV (survey)
  Unmatched:
    clean_jan_2018_metadata.csv (survey)
    clean_jan_2018_metadata_sheet2.csv (survey)

Year 2019:
  January:
    JANUARY_2019_METADATA.xlsx (metadata)
    JANUARY_2019.CSV (survey)
  April:
    APRIL_2019.CSV (survey)
    APRIL_2019_METADATA.xlsx (metadata)
  July:
    JULY_2019.CSV (survey)
    JULY_2019_METADATA.xlsx (metadata)
  October:
    OCTOBER_2019.CSV (survey)
    OCTOBER_2019_METADATA.xlsx (metadata)

Year 2022:
  January:
    JANUARY_2022_METADATA.xlsx (metadata)
    JANUARY_2022.csv (survey)
  

## Load Dataset Function

In [2]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load a dataset from the inventory.

    year: str, e.g., "2018"
    month: str, e.g., "January"
    filetype: "survey" or "metadata"
    sheet_number: 0(sheet 1) or 1(sheet 2)
    """
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )
    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])
    
    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)
    
    if sheet_number is not None:
        return pd.read_excel(file_path, sheet_name=sheet_number)
    
    return pd.read_excel(file_path)

Sample: January 2018 Survey

In [3]:
import pandas as pd

# Load the survey sheet of January 2018 metadata
jan_2018_survey = load_dataset("2018", "January","survey")

# View the first few rows
jan_2018_survey.head()

Unnamed: 0,PUFREG,PUFPRV,PUFPRRCD,PUFHHNUM,PUFURB2K10,PUFPWGTPRV,PUFSVYMO,PUFSVYYR,PUFPSU,PUFRPL,...,PUFC33_WEEKS,PUFC34_WYNOT,PUFC35_LTLOOKW,PUFC36_AVAIL,PUFC37_WILLING,PUFC38_PREVJOB,PUFC40_POCC,PUFC41_WQTR,PUFC43_QKB,PUFNEWEMPSTAT
0,14,1,100,1,2,124.9425,1,2018,140,32,...,,6.0,,,,1.0,52.0,2.0,,3.0
1,14,1,100,1,2,131.2126,1,2018,140,32,...,,,,,,,,1.0,1.0,1.0
2,14,1,100,1,2,142.0464,1,2018,140,32,...,,,,,,,,1.0,1.0,1.0
3,14,1,100,1,2,138.2958,1,2018,140,32,...,,,,,,,,,,
4,14,1,100,2,2,195.4152,1,2018,140,32,...,,,,,,,,1.0,41.0,1.0


## Metadata Sheet 1

<H5> Sample: January 2018 Metadata Sheet 1 (Raw) </H5>

In [4]:
# Load the first sheet of January 2018 metadata
january_2018_metadata_sheet1 = load_dataset("2018", "January", "metadata", 0)

# View the first few rows
print("=== January 2018 Metadata Sheet 1 ===")
january_2018_metadata_sheet1.head()


=== January 2018 Metadata Sheet 1 ===


Unnamed: 0,QUEST,Questionnaire,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,_IDS0,(Id Items),,
1,,,,,PUFREG,Region
2,,,,,PUFPRV,Province
3,,,,,PUFPRRCD,Province Recode
4,,,,,PUFHHNUM,Household Unique Sequential Number


#### Reshaping Metadata Sheet 1

In [5]:
import pandas as pd

def extract_variables(df):
    """
    Extract variable names and descriptions from metadata Sheet 1 (variable dictionary).
    Automatically reads the 4th and 5th columns (E and F in Excel) where variables and descriptions reside.
    
    Returns a clean DataFrame with columns ['Variable', 'Description'].
    """
    
    # Select the 4th and 5th columns (index 4 and 5)
    df_vars = df.iloc[:, 4:6].copy()
    
    # Rename columns
    df_vars.columns = ['Variable', 'Description']
    
    # Drop rows where 'Variable' is empty or NaN
    df_vars = df_vars[df_vars['Variable'].notna() & (df_vars['Variable'].astype(str).str.strip() != '')]
    
    # Strip whitespace from values
    df_vars['Variable'] = df_vars['Variable'].astype(str).str.strip()
    df_vars['Description'] = df_vars['Description'].astype(str).str.strip()
    
    # Reset index
    df_vars = df_vars.reset_index(drop=True)
    
    return df_vars


Checking January 2018 Metadata Reshaped Sheet 1

In [6]:
# Load metadata Sheet 1
January_metadata = load_dataset("2018", "January", "metadata", 0)

# Call your function
variables_df = extract_variables(January_metadata)

# View results
variables_df.head(20)

Unnamed: 0,Variable,Description
0,PUFREG,Region
1,PUFPRV,Province
2,PUFPRRCD,Province Recode
3,PUFHHNUM,Household Unique Sequential Number
4,PUFURB2K10,2010Urban-RuralFIES
5,PUFPWGTPRV,Final Weight Based on Projection (provincial p...
6,PUFSVYMO,Survey Month
7,PUFSVYYR,Survey Year
8,PUFPSU,Psu Number
9,PUFRPL,Replicate


Checking August 2024 Metadata Reshaped Sheet 1

In [7]:
# Load metadata Sheet 1
August_2024_metadata = load_dataset("2024", "August", "metadata", 0)

# Call your function
variables_df = extract_variables(August_2024_metadata)

# View results
variables_df.head(20)

Unnamed: 0,Variable,Description
0,PUFHHNUM,Household Unique Sequential Number
1,PUFPWGTPRV,Final Weight Based on Projection
2,PUFSVYMO,Survey Month
3,PUFSVYYR,Survey Year
4,PUFPSU,Psu Number
5,PUFRPL,Replicate
6,PUFHHSIZE,Household Size
7,PUFC01_LNO,C101-Line Number
8,PUFC03_REL,C03-Relationship to Household Head
9,PUFC04_SEX,C04-Sex


#### Verifying if the content of Reshaped Metadata Sheet 1 and Original matches

In [8]:
def verify_sheet1(original_df, reshaped_df):
    """
    Verify reshaped Sheet 1 metadata (variable dictionary).
    Only compares columns 4 and 5 (Variable and Description).
    Ignores other columns and empty rows.
    """

    # Select columns 4 and 5 in original (E & F)
    orig_vars = original_df.iloc[:, 4:6].copy()
    orig_vars.columns = ['Variable', 'Description']

    # Drop empty rows
    orig_vars = orig_vars[orig_vars['Variable'].notna() & (orig_vars['Variable'].astype(str).str.strip() != '')]

    # Strip whitespace
    orig_vars['Variable'] = orig_vars['Variable'].astype(str).str.strip()
    orig_vars['Description'] = orig_vars['Description'].astype(str).str.strip()

    # Strip whitespace in reshaped
    reshaped_df['Variable'] = reshaped_df['Variable'].astype(str).str.strip()
    reshaped_df['Description'] = reshaped_df['Description'].astype(str).str.strip()

    # Convert to sets of tuples for comparison (ignoring order)
    orig_set = set([tuple(x) for x in orig_vars[['Variable', 'Description']].values])
    resh_set = set([tuple(x) for x in reshaped_df[['Variable', 'Description']].values])

    missing = orig_set - resh_set
    extra = resh_set - orig_set

    if not missing and not extra:
        return "SUCCESS: Reshaped Sheet 1 metadata matches the original."
    else:
        msg = "MISMATCH FOUND in Sheet 1 metadata.\n"
        if missing:
            msg += f"Missing records in reshaped: {missing}\n"
        if extra:
            msg += f"Extra records in reshaped: {extra}\n"
        return msg


Checking January 2018 Original vs Reshaped Metadata Sheet 1

In [9]:
# Load original metadata Sheet 1
original_df = load_dataset("2018", "January", "metadata", 0)

# Load reshaped metadata
reshaped_df = extract_variables(original_df)

# Store month/year for messages
month = "January"
year = "2018"

# Verify
result_message = verify_sheet1(original_df, reshaped_df)

# Print message with month/year
print(f"{month} {year}: {result_message}")


January 2018: SUCCESS: Reshaped Sheet 1 metadata matches the original.


In [10]:
# Load original metadata Sheet 1
original_df = load_dataset("2024", "August", "metadata", 0)

# Load reshaped metadata
reshaped_df = extract_variables(original_df)

# Store month/year for messages
month = "August"
year = "2024"

# Verify
result_message = verify_sheet1(original_df, reshaped_df)

# Print message with month/year
print(f"{month} {year}: {result_message}")


August 2024: SUCCESS: Reshaped Sheet 1 metadata matches the original.


## Metadata Sheet 2 Function

<H5> Sample: January 2018 Metadata Sheet 2 (Raw)</H5>

In [11]:
# Load the second sheet of January 2018 metadata
january_2018_metadata_sheet2 = load_dataset("2018", "January", "metadata", 1)

# View the first few rows
print("=== January 2018 Metadata Sheet 2 ===")
january_2018_metadata_sheet2.head()

=== January 2018 Metadata Sheet 2 ===


Unnamed: 0,PUFREG_VS1,Region,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,National Capital Region,13,,
1,,,Cordillera Administrative Region,14,,
2,,,Region I - Ilocos Region,1,,
3,,,Region II - Cagayan Valley,2,,
4,,,Region III - Central Luzon,3,,


### Reshaping Metadata Sheet 2

In [12]:
import os
import pandas as pd


def reshape_sheet2_robust(df):
    """
    Convert metadata Sheet 2 (the values dictionary) into a clean, long-format table.

    This function reads the sheet exactly as it appears in Excel, without:
    - Assuming any header row
    - Auto-filling missing values
    - Inferencing min/max values
    - Guessing variable names

    Sheet 2 typically has this layout:
        Column A = Variable name (only appears once per block)
        Column B = Variable description (blank except at the start of a block)
        Column C = Label for each value (required)
        Column D = Minimum value (optional)
        Column E = Maximum value (optional)
        Column F+ = Additional text or category notes (optional)

    The function processes rows in order and:
        - Carries forward the most recent non-empty variable name (Column A)
        - Carries forward the most recent non-empty description (Column B)
        - Creates one output row per value label (Column C)
        - Leaves missing min/max/additional values as 0
        - Reads extra info (Column F onward) if present

    Returns:
        A clean pandas DataFrame with columns:
            Variable
            Description
            Label
            min_value
            max_value
            additional_value
    """

    reshaped = []

    # Ensure all blanks are handled consistently
    df = df.fillna('').astype(str)

    # Initialize with the first variable and description
    current_var = df.iloc[0, 0].strip() or 'UNKNOWN_VAR'
    current_desc = df.iloc[0, 1].strip() or ''

    # Iterate row-by-row
    for idx, row in df.iterrows():
        # ---- Column A: Variable name ----
        var_candidate = row.iloc[0].strip()
        if var_candidate:
            current_var = var_candidate

        # ---- Column B: Description ----
        desc_candidate = row.iloc[1].strip()
        if desc_candidate:
            current_desc = desc_candidate

        # ---- Column C: Label (required) ----
        label = row.iloc[2].strip()
        if not label:
            continue   # skip rows without labels

        # ---- Column D/E: min/max values ----
        raw_min = row.iloc[3].strip()
        raw_max = row.iloc[4].strip()

        min_value = raw_min if raw_min else '0'
        max_value = raw_max if raw_max else '0'

        # ---- Column F+: Additional text ----
        extra = '0'  # default to 0
        if len(row) > 5:
            for j in range(5, len(row)):
                extra_candidate = row.iloc[j].strip()
                if extra_candidate:
                    extra = extra_candidate
                    break

        # ---- Append clean record ----
        reshaped.append({
            "Variable": current_var,
            "Description": current_desc,
            "Label": label,
            "min_value": min_value,
            "max_value": max_value,
            "additional_value": extra
        })

    return pd.DataFrame(reshaped)


# ============================================================
#   load_dataset()
# ============================================================
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load any dataset (survey or metadata) from the file inventory.

    • For SURVEY CSV: normal pandas.read_csv()
    • For METADATA Excel: read with no header, reshape Sheet 2 automatically
    """
    # Retrieve file information from inventory
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )

    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])

    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)

    # Metadata Excel — always read with no header
    df = pd.read_excel(file_path, sheet_name=sheet_number, header=None)

    # Automatic reshaping ONLY for metadata Sheet 2
    if sheet_number == 1:
        df = reshape_sheet2_robust(df)

    return df


Checking January 2018 Metadata Reshaped Sheet 2

In [13]:
clean_jan_2018 = load_dataset("2018", "January", "metadata", 1)
clean_jan_2018.head(20)


Unnamed: 0,Variable,Description,Label,min_value,max_value,additional_value
0,PUFREG_VS1,Region,National Capital Region,13,0,0
1,PUFREG_VS1,Region,Cordillera Administrative Region,14,0,0
2,PUFREG_VS1,Region,Region I - Ilocos Region,1,0,0
3,PUFREG_VS1,Region,Region II - Cagayan Valley,2,0,0
4,PUFREG_VS1,Region,Region III - Central Luzon,3,0,0
5,PUFREG_VS1,Region,Region IVA - CALABARZON,4,0,0
6,PUFREG_VS1,Region,Region IVB - MIMAROPA,17,0,0
7,PUFREG_VS1,Region,Region V- Bicol,5,0,0
8,PUFREG_VS1,Region,Region VI - Western Visayas,6,0,0
9,PUFREG_VS1,Region,Region VII - Central Visayas,7,0,0


Checking August 2024 Metadata Reshaped Sheet 2

In [14]:
clean_aug_2024 = load_dataset("2024", "August", "metadata", 1)
clean_aug_2024.head(20)

Unnamed: 0,Variable,Description,Label,min_value,max_value,additional_value
0,PUFSVYMO_VS1,Survey Month,January,1.0,0,0
1,PUFSVYMO_VS1,Survey Month,February,2.0,0,0
2,PUFSVYMO_VS1,Survey Month,March,3.0,0,0
3,PUFSVYMO_VS1,Survey Month,April,4.0,0,0
4,PUFSVYMO_VS1,Survey Month,May,5.0,0,0
5,PUFSVYMO_VS1,Survey Month,June,6.0,0,0
6,PUFSVYMO_VS1,Survey Month,July,7.0,0,0
7,PUFSVYMO_VS1,Survey Month,August,8.0,0,0
8,PUFSVYMO_VS1,Survey Month,September,9.0,0,0
9,PUFSVYMO_VS1,Survey Month,October,10.0,0,0


Checking for Variables with Additional Value (Extra Column)

In [15]:
# Load metadata Sheet 2 for your desired month/year
df_metadata = load_dataset("2018", "January", "metadata", 1)

# Filter rows for PUFC10_CONWR_VS1
pu_fc10 = df_metadata[df_metadata["Variable"] == "PUFC10_CONWR_VS1"]

# Display the result
pu_fc10


Unnamed: 0,Variable,Description,Label,min_value,max_value,additional_value
218,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Overseas Contract Workers,1,0,0
219,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Workers other than OCW,2,0,0
220,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,"Employees in Philippine Embassy, Consulates & ...",3,0,0
221,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Students abroad/Tourists,4,0,0
222,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Others,5,0,0
223,PUFC10_CONWR_VS1,C10-Overseas Filipino Indicator,Less than 15 Years Old,0,0,Not Applicable


#### Verifying if the content of Reshaped Metadata Sheet 2 and Original matches

In [16]:
def verify_sheet2(original_df, reshaped_df):
    """
    Compare original Sheet 2 with reshaped version.
    Checks:
      • Same variables
      • Same descriptions
      • Same labels
      • Same min/max/additional values
      • Ignores row order
    """

    # Normalize to string
    original = original_df.fillna("").astype(str)
    reshaped = reshaped_df.fillna("").astype(str)

    # --- Extract original as dict ---
    def build_original_dict(df):
        data = {}
        current_var = ""
        current_desc = ""
        for _, row in df.iterrows():
            colA = row.iloc[0].strip()
            colB = row.iloc[1].strip()
            colC = row.iloc[2].strip()
            if colA:
                current_var = colA
            if colB:
                current_desc = colB
            if not colC:
                continue
            minv = row.iloc[3].strip() if len(row) > 3 else ""
            maxv = row.iloc[4].strip() if len(row) > 4 else ""
            extra = ""
            if len(row) > 5:
                for j in range(5, len(row)):
                    if row.iloc[j].strip():
                        extra = row.iloc[j].strip()
                        break
            if current_var not in data:
                data[current_var] = []
            data[current_var].append({
                "Description": current_desc,
                "Label": colC,
                "min_value": minv,
                "max_value": maxv,
                "additional_value": extra
            })
        return data

    orig_dict = build_original_dict(original)

    # --- Extract reshaped as dict (warning-free) ---
    resh_dict = {
        var: group.drop(columns="Variable").to_dict("records")
        for var, group in reshaped.groupby("Variable")
    }

    # --- Verification ---
    errors = []

    orig_vars = set(orig_dict.keys())
    resh_vars = set(resh_dict.keys())

    missing_vars = orig_vars - resh_vars
    extra_vars = resh_vars - orig_vars
    if missing_vars:
        errors.append(f"Missing variables in reshaped: {missing_vars}")
    if extra_vars:
        errors.append(f"Extra variables in reshaped: {extra_vars}")

    # Detailed label/content comparison
    for var in orig_vars & resh_vars:
        orig_records = orig_dict[var]
        resh_records = resh_dict[var]
        orig_set = {
            (d["Label"], d["min_value"], d["max_value"], d["additional_value"])
            for d in orig_records
        }
        resh_set = {
            (d["Label"], d["min_value"], d["max_value"], d["additional_value"])
            for d in resh_records
        }
        missing_rec = orig_set - resh_set
        extra_rec = resh_set - orig_set
        if missing_rec:
            errors.append(f"[{var}] Missing records: {missing_rec}")
        if extra_rec:
            errors.append(f"[{var}] Extra records: {extra_rec}")

    if not errors:
        return "SUCCESS: Reshaped metadata matches the original Sheet 2."
    else:
        return "MISMATCH FOUND:\n" + "\n".join(errors)


Checking for January 2018 Original vs Reshaped Metadata Sheet 2

In [17]:
# ===========================
# Caller: content-based verification with month/year in messages
# ===========================

# Load original metadata (Sheet 2)
original_df = load_dataset("2018", "January", "metadata", 1)

# Load reshaped metadata (automatically reshaped)
reshaped_df = load_dataset("2018", "January", "metadata", 1)

# Store month/year for messages
month = "January"
year = "2018"

# Call the verifier
result_message = verify_sheet2(original_df, reshaped_df)

# Customize message to include month/year
if "SUCCESS" in result_message:
    print(f"SUCCESS: Reshaped Sheet 2 metadata for {month} {year} matches the original.\n")
    print("Sample content verification (first 5 variables):\n")
    
    # Pick first 5 unique variables from reshaped
    sample_vars = reshaped_df["Variable"].unique()[:5]
    for var in sample_vars:
        sample_rows = reshaped_df[reshaped_df["Variable"] == var]
        print(f"Variable: {var}")
        print(sample_rows[["Label", "min_value", "max_value", "additional_value"]])
        print("-" * 50)
else:
    print(f"MISMATCH FOUND in {month} {year} metadata:\n")
    # Show first few rows from both original and reshaped for inspection
    orig_preview = original_df.fillna("").astype(str).head(10)
    resh_preview = reshaped_df.fillna("").astype(str).head(10)
    print("Original preview:")
    print(orig_preview)
    print("\nReshaped preview:")
    print(resh_preview)


SUCCESS: Reshaped Sheet 2 metadata for January 2018 matches the original.

Sample content verification (first 5 variables):

Variable: PUFREG_VS1
                                   Label min_value max_value additional_value
0                National Capital Region        13         0                0
1       Cordillera Administrative Region        14         0                0
2               Region I - Ilocos Region         1         0                0
3             Region II - Cagayan Valley         2         0                0
4             Region III - Central Luzon         3         0                0
5                Region IVA - CALABARZON         4         0                0
6                  Region IVB - MIMAROPA        17         0                0
7                        Region V- Bicol         5         0                0
8            Region VI - Western Visayas         6         0                0
9           Region VII - Central Visayas         7         0              

Checking for August 2024 Original vs Reshaping Metadata Sheet 2

In [18]:
# ===========================
# Caller: content-based verification with month/year in messages
# ===========================

# Load original metadata (Sheet 2)
original_df = load_dataset("2024", "August", "metadata", 1)

# Load reshaped metadata (automatically reshaped)
reshaped_df = load_dataset("2024", "August", "metadata", 1)

# Store month/year for messages
month = "August"
year = "2024"

# Call the verifier
result_message = verify_sheet2(original_df, reshaped_df)

# Customize message to include month/year
if "SUCCESS" in result_message:
    print(f"SUCCESS: Reshaped Sheet 2 metadata for {month} {year} matches the original.\n")
    print("Sample content verification (first 5 variables):\n")
    
    # Pick first 5 unique variables from reshaped
    sample_vars = reshaped_df["Variable"].unique()[:5]
    for var in sample_vars:
        sample_rows = reshaped_df[reshaped_df["Variable"] == var]
        print(f"Variable: {var}")
        print(sample_rows[["Label", "min_value", "max_value", "additional_value"]])
        print("-" * 50)
else:
    print(f"MISMATCH FOUND in {month} {year} metadata:\n")
    # Show first few rows from both original and reshaped for inspection
    orig_preview = original_df.fillna("").astype(str).head(10)
    resh_preview = reshaped_df.fillna("").astype(str).head(10)
    print("Original preview:")
    print(orig_preview)
    print("\nReshaped preview:")
    print(resh_preview)


SUCCESS: Reshaped Sheet 2 metadata for August 2024 matches the original.

Sample content verification (first 5 variables):

Variable: PUFSVYMO_VS1
        Label min_value max_value additional_value
0     January       1.0         0                0
1    February       2.0         0                0
2       March       3.0         0                0
3       April       4.0         0                0
4         May       5.0         0                0
5        June       6.0         0                0
6        July       7.0         0                0
7      August       8.0         0                0
8   September       9.0         0                0
9     October      10.0         0                0
10   November      11.0         0                0
11   December      12.0         0                0
--------------------------------------------------
Variable: PUFHHSIZE_VS1
   Label min_value max_value additional_value
12     1       1.0         0                0
13     2       2.0     