## Installing Libraries Needed

In [1]:
!python -m pip install pandas




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!python -m pip install openpyxl





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Dataset Inventory Loader

In [10]:
import os
import re

base_path = r"G:\My Drive\Labor Force Survey"

# Month ordering
month_order = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}

# Patterns
month_pattern = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    re.IGNORECASE
)
year_pattern = re.compile(r"(20\d{2})")

# Detect year folders from drive
year_folders = [
    f for f in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, f)) and f.isdigit()
]

print("Detected year folders:", sorted(year_folders))

inventory = {}

for year in sorted(year_folders):
    year_path = os.path.join(base_path, year)

    # Accept both CSV and XLSX
    data_files = [
        f for f in os.listdir(year_path)
        if f.lower().endswith(".csv") or f.lower().endswith(".xlsx")
    ]

    inventory[year] = {}

    for file in data_files:
        upper = file.upper()

        # Detect type
        if upper.endswith(".XLSX"):
            filetype = "metadata"  # XLSX = metadata
        else:
            filetype = "survey"    # CSV = survey

        # Detect month
        month_match = month_pattern.search(upper)
        month = (
            month_match.group(1).capitalize()
            if month_match
            else "Unmatched"
        )

        # Detect year inside filename
        year_match = year_pattern.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        # Store into inventory
        if month not in inventory[year]:
            inventory[year][month] = []

        inventory[year][month].append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })

# Print clean summary
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for yr in sorted(inventory.keys()):
    print(f"Year {yr}:")

    sorted_months = sorted(
        inventory[yr].keys(),
        key=lambda m: month_order.get(m, 99)
    )

    for month in sorted_months:
        print(f"  {month}:")
        for item in inventory[yr][month]:
            print(f"    {item['filename']} ({item['filetype']})")

    print()


Detected year folders: ['2018', '2019', '2022', '2023', '2024']

=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018_METADATA.xlsx (metadata)
    JANUARY_2018.CSV (survey)
  April:
    APRIL_2018.CSV (survey)
    APRIL_2018_METADATA.xlsx (metadata)
  July:
    JULY_2018.CSV (survey)
    JULY_2018_METADATA.xlsx (metadata)
  October:
    OCTOBER_2018.CSV (survey)
    OCTOBER_2018_METADATA.xlsx (metadata)

Year 2019:
  January:
    JANUARY_2019.CSV (survey)
    JANUARY_2019_METADATA.xlsx (metadata)
  April:
    APRIL_2019.CSV (survey)
    APRIL_2019_METADATA.xlsx (metadata)
  July:
    JULY_2019.CSV (survey)
    JULY_2019_METADATA.xlsx (metadata)
  October:
    OCTOBER_2019.CSV (survey)
    OCTOBER_2019_METADATA.xlsx (metadata)

Year 2022:
  January:
    JANUARY_2022.csv (survey)
    JANUARY_2022_METADATA.xlsx (metadata)
  February:
    FEBRUARY_2022.csv (survey)
    FEBRUARY_2022_METADATA.xlsx (metadata)
  March:
    MARCH_2022.csv (survey)
    MARCH_2022_METADATA.x

## Load Dataset Function

In [11]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Load a dataset from the inventory.

    year: str, e.g., "2018"
    month: str, e.g., "January"
    filetype: "survey" or "metadata"
    sheet_number: 0(sheet 1) or 1(sheet 2)
    """
    file_info = next(
        (f for f in inventory[year][month] if f["filetype"] == filetype),
        None
    )
    if not file_info:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    file_path = os.path.join(base_path, year, file_info["filename"])
    
    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)
    
    if sheet_number is not None:
        return pd.read_excel(file_path, sheet_name=sheet_number)
    
    return pd.read_excel(file_path)

## Metadata Sheet 1 Function 

In [16]:
def extract_variables(df):
    """
    Extract variable names and descriptions from metadata DataFrame.
    Looks for values in columns 'Unnamed: 4' (variable) and 'Unnamed: 5' (description).
    Returns a clean DataFrame with ['Variable', 'Description'].
    """

    # Keep only rows where column 4 has a variable name
    filtered = df[df['Unnamed: 4'].notna()][['Unnamed: 4', 'Unnamed: 5']]

    # Rename columns
    filtered.columns = ['Variable', 'Description']

    return filtered


In [17]:
# Load metadata
January_metadata = load_dataset("2018", "January", "metadata", 0)

# Call your function
variables_df = extract_variables(January_metadata)

# View results
variables_df.head()


Unnamed: 0,Variable,Description
1,PUFREG,Region
2,PUFPRV,Province
3,PUFPRRCD,Province Recode
4,PUFHHNUM,Household Unique Sequential Number
6,PUFURB2K10,2010Urban-RuralFIES


## Metadata Sheet 2 Function

In [23]:
# Load metadata
January_metadata = load_dataset("2018", "January", "metadata", 1)

January_metadata.head()

Unnamed: 0,PUFREG_VS1,Region,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,National Capital Region,13,,
1,,,Cordillera Administrative Region,14,,
2,,,Region I - Ilocos Region,1,,
3,,,Region II - Cagayan Valley,2,,
4,,,Region III - Central Luzon,3,,


In [27]:
import pandas as pd
import re

def reshape_metadata_sheet2_general(df):
    """
    Reshape any metadata Sheet 2 into a long DataFrame with columns:
    ['Variable','Label','value_start','value_end','value','note']
    
    Rules:
      - Header row: col0 and col1 not-NA -> start new variable block.
      - For value rows:
        * col2 = label/text
        * col3 = value_start
        * col4 = value_end (optional, same as start if empty)
        * col5 = note (optional)
    """
    def is_num(x):
        if pd.isna(x): 
            return False
        s = str(x).strip()
        return re.fullmatch(r"[+-]?\d+(\.\d+)?", s) is not None

    rows = []
    cur_var = None
    cur_desc = None

    for _, r in df.iterrows():
        # new variable header
        if pd.notna(r.iloc[0]) and pd.notna(r.iloc[1]):
            cur_var = r.iloc[0]
            cur_desc = r.iloc[1]
            continue

        if cur_var is None:
            continue

        label = r.iloc[2] if len(r) > 2 else None
        val_start = r.iloc[3] if len(r) > 3 else None
        val_end = r.iloc[4] if len(r) > 4 else None
        note = r.iloc[5] if len(r) > 5 else None

        # skip fully empty rows
        if pd.isna(label) and pd.isna(val_start):
            continue

        # numeric conversion
        try:
            if is_num(val_start):
                val_start = int(float(val_start))
            if is_num(val_end):
                val_end = int(float(val_end))
            else:
                val_end = val_start
        except:
            pass

        value = val_start if val_end == val_start else f"{val_start}-{val_end}"

        rows.append({
            "Variable": cur_var,
            "Label": label,
            "value_start": val_start,
            "value_end": val_end,
            "value": value,
            "note": note
        })

    return pd.DataFrame(rows, columns=["Variable","Label","value_start","value_end","value","note"])


In [28]:
sheet2 = load_dataset("2018", "January", "metadata", sheet_number=1)
clean = reshape_metadata_sheet2_v2(sheet2)
clean.head(30)


Unnamed: 0,Variable,Description,value_start,value_end,value,label,note
0,PUFURB2K10_VS1,2010Urban-RuralFIES,1.0,1.0,1,Urban,
1,PUFURB2K10_VS1,2010Urban-RuralFIES,2.0,2.0,2,Rural,
2,PUFSVYMO_VS1,Survey Month,1.0,1.0,1,January,
3,PUFSVYMO_VS1,Survey Month,4.0,4.0,4,April,
4,PUFSVYMO_VS1,Survey Month,7.0,7.0,7,July,
5,PUFSVYMO_VS1,Survey Month,10.0,10.0,10,October,
6,PUFHHSIZE_VS1,Household Size,1.0,1.0,1-1,,
7,PUFHHSIZE_VS1,Household Size,2.0,2.0,2-2,,
8,PUFHHSIZE_VS1,Household Size,3.0,3.0,3-3,,
9,PUFHHSIZE_VS1,Household Size,4.0,4.0,4-4,,


In [31]:
# Load January 2018 metadata sheet 2
sheet2 = load_dataset("2018", "January", "metadata", sheet_number=1)

# Reshape using your function
clean = reshape_metadata_sheet2_v2(sheet2)

# Save to Downloads
output_path = r"C:\Users\juanp\Downloads\January_metadata_sheet2.csv"
clean.to_csv(output_path, index=False)

clean.head(30)


Unnamed: 0,Variable,Description,value_start,value_end,value,label,note
0,PUFURB2K10_VS1,2010Urban-RuralFIES,1.0,1.0,1,Urban,
1,PUFURB2K10_VS1,2010Urban-RuralFIES,2.0,2.0,2,Rural,
2,PUFSVYMO_VS1,Survey Month,1.0,1.0,1,January,
3,PUFSVYMO_VS1,Survey Month,4.0,4.0,4,April,
4,PUFSVYMO_VS1,Survey Month,7.0,7.0,7,July,
5,PUFSVYMO_VS1,Survey Month,10.0,10.0,10,October,
6,PUFHHSIZE_VS1,Household Size,1.0,1.0,1-1,,
7,PUFHHSIZE_VS1,Household Size,2.0,2.0,2-2,,
8,PUFHHSIZE_VS1,Household Size,3.0,3.0,3-3,,
9,PUFHHSIZE_VS1,Household Size,4.0,4.0,4-4,,
