### Identifying variables that are consistent across all yearly datasets

In [3]:
import os
import pandas as pd

# Base path where decoded surveys are stored
base_path = r"G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\Fully Decoded Surveys"

# Years to inspect
years = [str(y) for y in range(2018, 2025)]


def collect_csv_columns(folder):
    """Return the union of column names found in the CSV files of ``folder``.

    Parameters
    ----------
    folder : str
        Directory whose direct children are scanned (no recursion).

    Returns
    -------
    set
        Every column name seen in any CSV header; empty if the folder
        contains no CSV files.

    Notes
    -----
    The extension check is case-insensitive, so both ``.csv`` and ``.CSV``
    files are read. Only the header row is parsed (``nrows=0``) because the
    column names are all that is needed here.
    """
    columns = set()
    for file in os.listdir(folder):
        if file.lower().endswith(".csv"):
            header = pd.read_csv(os.path.join(folder, file), nrows=0)
            columns.update(header.columns.tolist())
    return columns


# Dictionary to store variables per year (only years that actually have data)
vars_per_year = {}

for year in years:
    year_folder = os.path.join(base_path, year)
    if not os.path.isdir(year_folder):
        # Report the gap instead of skipping silently, so the reader knows
        # which years the comparison below does NOT cover.
        print(f"[SKIP] {year}: folder not found.")
        continue

    cols_this_year = collect_csv_columns(year_folder)
    if not cols_this_year:
        # An empty set would wipe out the intersection for every other year.
        print(f"[SKIP] {year}: no CSV files found.")
        continue

    vars_per_year[year] = cols_this_year
    print(f"[OK] {year}: {len(cols_this_year)} variables detected.")

# Find variables consistent across all years that were found.
# set.intersection() needs at least one argument, so guard the empty case.
if vars_per_year:
    consistent_vars = set.intersection(*vars_per_year.values())
else:
    consistent_vars = set()

print("\n===============================================")
print("CONSISTENT VARIABLES ACROSS 2018–2024")
print("===============================================\n")
# Make explicit which years the intersection was really computed over.
print(f"Years compared: {', '.join(vars_per_year) or 'none'}\n")
for var in sorted(consistent_vars):
    print(var)

print(f"\nTotal consistent variables: {len(consistent_vars)}")


[OK] 2018: 54 variables detected.
[OK] 2019: 50 variables detected.
[OK] 2022: 78 variables detected.
[OK] 2023: 81 variables detected.
[OK] 2024: 79 variables detected.

CONSISTENT VARIABLES ACROSS 2018–2024

C03-Relationship to Household Head
C04-Sex
C05-Age as of Last Birthday
C06-Marital Status
C07-Highest Grade Completed
C08-Currently Attending School
C09-Graduate of technical/vocational course
C09a - Currently Attending Non-formal Training for Skills Development
C10-Overseas Filipino Indicator
C101-Line Number
C11-Work Indicator
C12-Job Indicator
C14-Primary Occupation
C16-Kind of Business (Primary Occupation)
C17-Nature of Employment (Primary Occupation)
C18-Normal Working Hours per Day
C19-Total Number of Hours Worked during the past week
C20-Want More Hours of Work
C21-Look for Additional Work
C22-First Time to Work
C23-Class of Worker (Primary Occupation)
C24-Basis of Payment (Primary Occupation)
C25-Basic Pay per Day (Primary Occupation)
C26-Other Job Indicator
C27-Number of

### Reload the saved FMI summary results

In [None]:
import os
import pandas as pd

# Root folder of the Labor Force Survey project, declared again for this notebook
base_path = r"G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey"

# The *new* FMI Reports folder and the overall summary file stored inside it
reports_root = os.path.join(base_path, "New FMI Reports")
summary_path = os.path.join(reports_root, "FMI_Summary_2018_2024.csv")

# Read the saved summary into a DataFrame
FMI_summary = pd.read_csv(filepath_or_buffer=summary_path)

# Preview the first five rows
FMI_summary.head(n=5)


Unnamed: 0,Column,TotalMissing,TotalRows,AvgFMI,MonthsObserved,OverallFMI,Flag,Recommendation
0,C03-Relationship to Household Head,0,4881364,0.0,34,0.0,Low,Keep
1,C04-Sex,0,4881364,0.0,34,0.0,Low,Keep
2,C05-Age as of Last Birthday,82751,4881364,0.019262,34,0.016952,Low,Keep
3,C05B - Ethnicity,0,707981,0.0,1,0.0,Low,Keep
4,C06-Marital Status,358820,4881364,0.069921,34,0.073508,Moderate,Consider imputation
5,C07-Highest Grade Completed,361914,4881364,0.071961,34,0.074142,Moderate,Consider imputation
6,C08-Currently Attending School,2281910,4116745,0.567432,17,0.5543,Critical,Candidate to drop (validate with business logic)
7,C08-Overseas Filipino Indicator,203920,764619,0.266685,17,0.266695,High,Strongly consider imputation
8,C09-Graduate of technical/vocational course,1162929,4116745,0.285236,17,0.282487,High,Strongly consider imputation
9,C09-Work Indicator,64311,764619,0.084135,17,0.084109,Moderate,Consider imputation
