In [1]:
!python -m pip install pandas



In [2]:
!python -m pip install openpyxl




In [3]:
import os
import re
import pandas as pd

# Root folder that holds the numerically named year sub-folders scanned below
base_path = r"G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey"

# Calendar position of every month name (January=1 ... December=12)
month_order = {
    name: position
    for position, name in enumerate(
        [
            "January", "February", "March", "April",
            "May", "June", "July", "August",
            "September", "October", "November", "December",
        ],
        start=1,
    )
}

# Filename patterns: a fully spelled month name (any case) and a 20xx year
month_pattern = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    re.IGNORECASE
)
year_pattern = re.compile(r"(20\d{2})")

# A year folder is any directory directly under base_path whose name is all digits
year_folders = [
    entry
    for entry in os.listdir(base_path)
    if entry.isdigit() and os.path.isdir(os.path.join(base_path, entry))
]

print("Detected year folders:", sorted(year_folders))

# inventory[year][month] -> list of {"filename", "filetype", "file_year"} records
inventory = {}

for year in sorted(year_folders):
    year_path = os.path.join(base_path, year)
    inventory[year] = {}

    for file in os.listdir(year_path):
        # Only CSV and XLSX files are inventoried (extension check is case-insensitive)
        if not file.lower().endswith((".csv", ".xlsx")):
            continue

        upper = file.upper()

        # XLSX = metadata, CSV = survey
        filetype = "metadata" if upper.endswith(".XLSX") else "survey"

        # Month comes from a fully spelled month name in the filename;
        # anything else is grouped under "Unmatched"
        month_hit = month_pattern.search(upper)
        month = month_hit.group(1).capitalize() if month_hit else "Unmatched"

        # Year as written inside the filename (may differ from the folder name)
        year_hit = year_pattern.search(upper)
        file_year = year_hit.group(1) if year_hit else "UNKNOWN"

        inventory[year].setdefault(month, []).append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })

# Report the inventory: years ascending, months in calendar order
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for yr in sorted(inventory):
    print(f"Year {yr}:")

    # Month names missing from month_order (e.g. "Unmatched") rank 99 and sort last
    ranked_months = sorted(
        inventory[yr].items(),
        key=lambda pair: month_order.get(pair[0], 99),
    )

    for month, items in ranked_months:
        print(f"  {month}:")
        for item in items:
            print(f"    {item['filename']} ({item['filetype']})")

    print()


Detected year folders: ['2018', '2019', '2022', '2023', '2024']

=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018.CSV (survey)
    JANUARY_2018_METADATA.xlsx (metadata)
  April:
    APRIL_2018.CSV (survey)
    APRIL_2018_METADATA.xlsx (metadata)
  July:
    JULY_2018.CSV (survey)
    JULY_2018_METADATA.xlsx (metadata)
  October:
    OCTOBER_2018.CSV (survey)
    OCTOBER_2018_METADATA.xlsx (metadata)
  Unmatched:
    clean_jan_2018_metadata.csv (survey)
    clean_jan_2018_metadata_sheet2.csv (survey)

Year 2019:
  January:
    JANUARY_2019.CSV (survey)
    JANUARY_2019_METADATA.xlsx (metadata)
  April:
    APRIL_2019_METADATA.xlsx (metadata)
    APRIL_2019.CSV (survey)
  July:
    JULY_2019_METADATA.xlsx (metadata)
    JULY_2019.CSV (survey)
  October:
    OCTOBER_2019_METADATA.xlsx (metadata)
    OCTOBER_2019.CSV (survey)

Year 2022:
  January:
    JANUARY_2022.csv (survey)
    JANUARY_2022_METADATA.xlsx (metadata)
  February:
    FEBRUARY_2022.csv (survey)
   

In [4]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Read one inventoried file into a DataFrame.

    year: folder name as a string, e.g. "2018"
    month: capitalized month name, e.g. "January"
    filetype: "survey" (read as CSV) or anything else, e.g. "metadata" (read as Excel)
    sheet_number: zero-based worksheet index for Excel files (0 = sheet 1,
        1 = sheet 2); when omitted the first worksheet is read

    Raises ValueError when the inventory has no file of the requested type for
    that year/month (a year or month absent from the inventory raises KeyError).
    """
    candidates = [
        entry for entry in inventory[year][month]
        if entry["filetype"] == filetype
    ]
    if not candidates:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    # The first matching inventory entry wins
    file_path = os.path.join(base_path, year, candidates[0]["filename"])

    if filetype != "survey":
        # Index 0 is what read_excel uses when no sheet is given
        target_sheet = 0 if sheet_number is None else sheet_number
        return pd.read_excel(file_path, sheet_name=target_sheet)

    return pd.read_csv(file_path, low_memory=False)

In [5]:
import pandas as pd

# Sheet index 1 is the second worksheet of the January 2018 metadata workbook
second_sheet = load_dataset(year="2018", month="January", filetype="metadata", sheet_number=1)

# Show the leading rows
second_sheet.head()


Unnamed: 0,PUFREG_VS1,Region,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,National Capital Region,13,,
1,,,Cordillera Administrative Region,14,,
2,,,Region I - Ilocos Region,1,,
3,,,Region II - Cagayan Valley,2,,
4,,,Region III - Central Luzon,3,,


In [6]:
# Sheet index 0 is the FIRST worksheet of the January 2018 metadata workbook
# (the result is still stored under the name `second_sheet`)
second_sheet = load_dataset(year="2018", month="January", filetype="metadata", sheet_number=0)

# Show the leading rows
second_sheet.head()


Unnamed: 0,QUEST,Questionnaire,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,_IDS0,(Id Items),,
1,,,,,PUFREG,Region
2,,,,,PUFPRV,Province
3,,,,,PUFPRRCD,Province Recode
4,,,,,PUFHHNUM,Household Unique Sequential Number


In [7]:
def extract_variables(df):
    """
    Extract variable names and descriptions from a metadata DataFrame.

    The variable name is taken from the column labelled 'Unnamed: 4' and the
    description from 'Unnamed: 5'. When those labels are absent (for example
    when the sheet was read with ``header=None`` and the columns are numbered),
    the columns at positions 4 and 5 are used instead.

    Rows without a variable name are dropped; the original row index is kept.

    Returns a new DataFrame with columns ['Variable', 'Description'].
    Raises KeyError when neither the labelled nor the positional columns exist.
    """
    name_col, desc_col = 'Unnamed: 4', 'Unnamed: 5'

    if name_col in df.columns and desc_col in df.columns:
        pairs = df.loc[:, [name_col, desc_col]]
    elif df.shape[1] >= 6:
        # Same two columns, addressed by position instead of by label
        pairs = df.iloc[:, [4, 5]]
    else:
        raise KeyError(
            f"Expected columns {name_col!r} and {desc_col!r}, "
            f"or at least 6 columns; got {list(df.columns)}"
        )

    # Keep only rows where a variable name is present; copy so that renaming
    # the columns never touches the caller's frame
    pairs = pairs[pairs.iloc[:, 0].notna()].copy()
    pairs.columns = ['Variable', 'Description']

    return pairs


In [8]:
# First worksheet (index 0) of the January 2018 metadata workbook
January_metadata = load_dataset(year="2018", month="January", filetype="metadata", sheet_number=0)

# Reduce it to the Variable / Description pairs
variables_df = extract_variables(df=January_metadata)

# Show up to the first 50 pairs
variables_df.head(n=50)


Unnamed: 0,Variable,Description
1,PUFREG,Region
2,PUFPRV,Province
3,PUFPRRCD,Province Recode
4,PUFHHNUM,Household Unique Sequential Number
6,PUFURB2K10,2010Urban-RuralFIES
7,PUFPWGTPRV,Final Weight Based on Projection (provincial p...
8,PUFSVYMO,Survey Month
9,PUFSVYYR,Survey Year
10,PUFPSU,Psu Number
11,PUFRPL,Replicate


In [9]:
# Second worksheet (index 1) of the January 2018 metadata workbook
sheet2_jan2018 = load_dataset("2018", "January", "metadata", 1)

# Print the first 20 rows, then the column labels, then the (rows, columns) shape
print("Raw metadata sheet:")
for view in (sheet2_jan2018.head(20), sheet2_jan2018.columns, sheet2_jan2018.shape):
    print(view)


Raw metadata sheet:
        PUFREG_VS1               Region                            Unnamed: 2  \
0              NaN                  NaN               National Capital Region   
1              NaN                  NaN      Cordillera Administrative Region   
2              NaN                  NaN              Region I - Ilocos Region   
3              NaN                  NaN            Region II - Cagayan Valley   
4              NaN                  NaN            Region III - Central Luzon   
5              NaN                  NaN               Region IVA - CALABARZON   
6              NaN                  NaN                 Region IVB - MIMAROPA   
7              NaN                  NaN                       Region V- Bicol   
8              NaN                  NaN           Region VI - Western Visayas   
9              NaN                  NaN          Region VII - Central Visayas   
10             NaN                  NaN         Region VIII - Eastern Visayas   
11      

In [10]:
import os
import re
import pandas as pd

# ============================================================
#   Reshape function for metadata Sheet 2
# ============================================================
def process_metadata_sheet_binary1(df):
    """
    Reshape a raw value-label sheet (metadata Sheet 2) into one row per category.

    Layout is positional; columns beyond those present are treated as blank:
      0 -> variable name, 1 -> label/description, 2 -> category text,
      3..5 -> code columns.
    Variable and label are carried forward from the last row where they were
    non-blank; rows with a blank category produce no output row.

    min_val / max_val:
      * If the category text looks like a range (contains "-", "–", " to " or
        "and over") AND holds at least two numbers, the first two numbers are
        used, e.g. "15-24" or "15.0 to 24.0" -> (15, 24). A decimal token such
        as "47.0" counts as the single number 47.
      * Otherwise columns 3, 4, 5 are scanned left to right: the first numeric
        cell becomes min_val, the second becomes max_val.
      * If those columns hold no number and the category itself is a single
        number ("7" or "7.0"), that number becomes min_val.
      * Whatever remains unresolved stays 0.

    additional_value:
      * Taken from column 5 only, and only when it is non-numeric text;
        blank, "n/a", "na", "not applicable" and "none" become 0.

    Returns a DataFrame with columns
    ["Variable", "Label", "Category", "min_val", "max_val", "additional_value"].
    """
    # One number inside free text; an optional decimal part belongs to the same token
    number_token = re.compile(r"\d+(?:\.\d+)?")
    # A cell that is nothing but a number, e.g. "47", "-3", "47.0"
    numeric_cell = re.compile(r"[+-]?\d+(?:\.\d+)?")
    blank_markers = {"n/a", "na", "not applicable", "none"}

    # Normalize blanks to empty strings; keep everything as text for parsing
    df = df.fillna('').astype(str).reset_index(drop=True)

    def extract_ints(text):
        # Integer part of every number in the text ("47.0" -> 47, not 47 and 0)
        return [int(m.group().split(".")[0]) for m in number_token.finditer(text)]

    def looks_like_range(text):
        s = text.lower()
        return ("-" in s) or ("–" in s) or (" to " in s) or ("and over" in s)

    def is_numeric_text(s):
        return bool(numeric_cell.fullmatch(s.strip()))

    def to_int_safe(s):
        # Convert numeric strings (int/float-like) to int; 0 when not convertible
        try:
            return int(float(s.strip()))
        except (ValueError, OverflowError):
            return 0

    def clean_additional(val):
        s = val.strip()
        if s == "" or s.lower() in blank_markers:
            return 0
        return s

    records = []
    current_var = ""
    current_label = ""

    for row in df.itertuples(index=False, name=None):
        # Always work with exactly 6 cells; pad missing columns with ""
        cells = [row[i].strip() if i < len(row) else "" for i in range(6)]

        # Carry forward context
        if cells[0]:
            current_var = cells[0]
        if cells[1]:
            current_label = cells[1]

        category = cells[2]
        if not category:
            continue

        min_val = max_val = 0

        cat_ints = extract_ints(category)
        if looks_like_range(category) and len(cat_ints) >= 2:
            # Priority 1: explicit range written in the category text
            min_val, max_val = cat_ints[0], cat_ints[1]
        else:
            # Priority 2: numeric cells in columns 3..5, left to right
            numeric_tokens = [
                to_int_safe(c) for c in cells[3:6] if is_numeric_text(c)
            ]
            if numeric_tokens:
                min_val = numeric_tokens[0]
                if len(numeric_tokens) >= 2:
                    max_val = numeric_tokens[1]
            elif is_numeric_text(category):
                # Fallback: the category itself is a single number
                min_val = to_int_safe(category)

        # Additional value: strictly column 5, only if non-numeric text
        if is_numeric_text(cells[5]):
            additional_value = 0
        else:
            additional_value = clean_additional(cells[5])

        records.append({
            "Variable": current_var,
            "Label": current_label,
            "Category": category,
            "min_val": min_val,
            "max_val": max_val,
            "additional_value": additional_value
        })

    return pd.DataFrame.from_records(
        records,
        columns=["Variable","Label","Category","min_val","max_val","additional_value"]
    )


In [11]:
# ============================================================
#   Loader function (unchanged interface)
# ============================================================
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load a dataset from the inventory.

    year: str, e.g., "2018"
    month: str, e.g., "January"
    filetype: "survey" (read as CSV) or "metadata" (read as Excel)
    sheet_number: zero-based worksheet index, 0 (sheet 1) or 1 (sheet 2);
        when omitted the first worksheet is read

    Metadata sheets are read without a header row, so columns are numbered.
    Sheet index 1 is additionally reshaped by process_metadata_sheet_binary1.

    Raises ValueError when the inventory has no file of the requested type for
    that year/month (a year or month absent from the inventory raises KeyError).
    """
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )
    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])

    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)

    # sheet_name=None would make read_excel return a dict of ALL worksheets
    # instead of a DataFrame, so an omitted sheet_number means the first sheet
    sheet = 0 if sheet_number is None else sheet_number

    # Read metadata as raw data
    df = pd.read_excel(file_path, sheet_name=sheet, header=None)

    # Auto-reshape only for Sheet 2
    if sheet == 1:
        df = process_metadata_sheet_binary1(df)

    return df

# ============================================================
#   Tester for January 2018 (for reproducibility)
# ============================================================
sheet2_jan2018 = load_dataset(year="2018", month="January", filetype="metadata", sheet_number=1)

# The label has to be printed explicitly: a bare string expression that is not
# the last statement of a cell is evaluated and thrown away
print("January 2018 (Sheet 2) preview:")

# Last expression of the cell, so the notebook renders it as a table
sheet2_jan2018.tail(10)


Unnamed: 0,Variable,Label,Category,min_val,max_val,additional_value
376,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Public Administration and Defense; Compulsory ...,84,0,0
377,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Education,85,0,0
378,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Human Health and Social Work Activities,86,88,0
379,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),"Arts, Entertainment and Recreation",90,93,0
380,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Other Service Activities,94,96,0
381,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Activities of Households as Employers,97,98,0
382,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Activities of Extraterritorial Organizations a...,99,0,0
383,PUFNEWEMPSTAT_VS1,"New Employment Criteria (jul 05, 2005)",EMPLOYED,1,0,0
384,PUFNEWEMPSTAT_VS1,"New Employment Criteria (jul 05, 2005)",UNEMPLOYED,2,0,0
385,PUFNEWEMPSTAT_VS1,"New Employment Criteria (jul 05, 2005)",NOT IN THE LABOR FORCE,3,0,0


In [12]:
# ============================================================
#   Tester for April 2018 (for reproducibility)
# ============================================================
sheet2_apr2018 = load_dataset(year="2018", month="April", filetype="metadata", sheet_number=1)

# The label has to be printed explicitly: a bare string expression that is not
# the last statement of a cell is evaluated and thrown away
print("April 2018 (Sheet 2) preview:")

# Last expression of the cell, so the notebook renders it as a table
sheet2_apr2018.tail(10)

Unnamed: 0,Variable,Label,Category,min_val,max_val,additional_value
344,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Public Administration and Defense; Compulsory ...,84,0,0
345,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Education,85,0,0
346,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Human Health and Social Work Activities,86,88,0
347,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),"Arts, Entertainment and Recreation",90,93,0
348,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Other Service Activities,94,96,0
349,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Activities of Households as Employers,97,98,0
350,PUFC43_QKB_VS1,C43-Kind of Business (past quarter),Activities of Extraterritorial Organizations a...,99,0,0
351,PUFNEWEMPSTAT_VS1,"New Employment Criteria (jul 05, 2005)",EMPLOYED,1,0,0
352,PUFNEWEMPSTAT_VS1,"New Employment Criteria (jul 05, 2005)",UNEMPLOYED,2,0,0
353,PUFNEWEMPSTAT_VS1,"New Employment Criteria (jul 05, 2005)",NOT IN THE LABOR FORCE,3,0,0


##### Verify that the reshaped output retains the original values (reproducible across all metadata files, Sheet 2)

In [13]:
import pandas as pd
import os

def verify_accuracy(year, month, sheet_number=1, sample_errors=10):
    """
    Check that reshaping kept every category label of a metadata sheet.

    The raw sheet is read straight from the workbook (no header row) and its
    third column is compared with the "Category" column returned by
    load_dataset for the same sheet. Both sides are stripped and lowercased,
    blanks are dropped, and the comparison is made on the sets of UNIQUE
    labels. Row counts, the variable each category belongs to, and
    min/max values are not checked.

    year: str, e.g., "2018"
    month: str, e.g., "January"
    sheet_number: zero-based worksheet index; load_dataset must return a frame
        with a "Category" column for it
    sample_errors: maximum number of mismatched labels printed per direction

    Prints a report and returns {"recall", "precision", "f1"} as percentages.
    Raises ValueError when the inventory has no metadata file for that month.
    """

    # --- Load raw Excel sheet (no reshaping) ---
    file_info = next((f for f in inventory[year][month] if f["filetype"]=="metadata"), None)
    if not file_info:
        raise ValueError(f"No metadata file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])
    raw_df = pd.read_excel(file_path, sheet_name=sheet_number, header=None)

    # --- Load reshaped version ---
    reshaped_df = load_dataset(year=year, month=month, filetype="metadata", sheet_number=sheet_number)

    # Normalize categories: strip, lowercase, drop blanks/nan
    raw_categories = set(
        raw_df.iloc[:,2].astype(str).str.strip().str.lower().replace({"nan":""}).unique()
    )
    raw_categories.discard("")
    reshaped_categories = set(
        reshaped_df["Category"].astype(str).str.strip().str.lower().unique()
    )
    reshaped_categories.discard("")

    # Compare sets
    overlap = raw_categories & reshaped_categories
    only_in_raw = raw_categories - reshaped_categories
    only_in_reshaped = reshaped_categories - raw_categories

    # Metrics
    recall = len(overlap) / len(raw_categories) * 100 if raw_categories else 0
    precision = len(overlap) / len(reshaped_categories) * 100 if reshaped_categories else 0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0

    print(f"\n=== {month} {year} Metadata Sheet {sheet_number} ===")
    print(f"Raw categories: {len(raw_categories)} | Reshaped categories: {len(reshaped_categories)}")
    print(f"Overlap: {len(overlap)}")
    print(f"Recall (coverage of raw): {recall:.2f}%")
    print(f"Precision (accuracy of reshaped): {precision:.2f}%")
    print(f"F1-score: {f1:.2f}%")

    # Samples are sorted: set iteration order for strings changes between
    # kernel sessions, which would make this report differ from run to run
    if only_in_raw:
        print(f"\nMissing from reshaped ({len(only_in_raw)}):")
        for val in sorted(only_in_raw)[:sample_errors]:
            print("  -", val)
    if only_in_reshaped:
        print(f"\nExtra in reshaped ({len(only_in_reshaped)}):")
        for val in sorted(only_in_reshaped)[:sample_errors]:
            print("  -", val)

    return {"recall": recall, "precision": precision, "f1": f1}

# ============================================================
#   Reproducibility tests
# ============================================================

# Run the same check on two different 2018 workbooks (second worksheet of each)
acc_jan = verify_accuracy(year="2018", month="January", sheet_number=1)
acc_apr = verify_accuracy(year="2018", month="April", sheet_number=1)

print("\n=== Reproducibility check ===")
# Labels are padded to 12 characters so the two report lines align
for label, scores in (("January 2018", acc_jan), ("April 2018", acc_apr)):
    print(
        f"{label:<12} -> Recall: {scores['recall']:.2f}%, "
        f"Precision: {scores['precision']:.2f}%, F1: {scores['f1']:.2f}%"
    )



=== January 2018 Metadata Sheet 1 ===
Raw categories: 289 | Reshaped categories: 289
Overlap: 289
Recall (coverage of raw): 100.00%
Precision (accuracy of reshaped): 100.00%
F1-score: 100.00%

=== April 2018 Metadata Sheet 1 ===
Raw categories: 294 | Reshaped categories: 294
Overlap: 294
Recall (coverage of raw): 100.00%
Precision (accuracy of reshaped): 100.00%
F1-score: 100.00%

=== Reproducibility check ===
January 2018 -> Recall: 100.00%, Precision: 100.00%, F1: 100.00%
April 2018   -> Recall: 100.00%, Precision: 100.00%, F1: 100.00%
