## Section 1. Introduction ##

In this notebook, the dataset to be processed is the Labor Force Survey conducted April 2016 and retrieved through Philippine Statistics Authority database. 



In [2]:
import os
import random
import pickle
import numpy as np
import pandas as pd
import h5py


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

plt.rcParams['figure.figsize'] = (6.0, 6.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

plt.style.use('ggplot')

# autoreload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

<h1>Importing LFS PUF April 2016.CSV</h1>

In [4]:
try:
    lfs_data = pd.read_csv("LFS PUF April 2016.CSV")
except FileNotFoundError:
    print("Error: CSV file not found. Please make sure the file exists in the correct directory or provide the correct path.")
    exit()

<h1>Data Information, Pre-Processing, and Cleaning</h1>

Let's get an overview of our dataset.

In [6]:
lfs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180862 entries, 0 to 180861
Data columns (total 50 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   PUFREG           180862 non-null  int64  
 1   PUFPRV           180862 non-null  int64  
 2   PUFPRRCD         180862 non-null  int64  
 3   PUFHHNUM         180862 non-null  int64  
 4   PUFURB2K10       180862 non-null  int64  
 5   PUFPWGTFIN       180862 non-null  float64
 6   PUFSVYMO         180862 non-null  int64  
 7   PUFSVYYR         180862 non-null  int64  
 8   PUFPSU           180862 non-null  int64  
 9   PUFRPL           180862 non-null  int64  
 10  PUFHHSIZE        180862 non-null  int64  
 11  PUFC01_LNO       180862 non-null  int64  
 12  PUFC03_REL       180862 non-null  int64  
 13  PUFC04_SEX       180862 non-null  int64  
 14  PUFC05_AGE       180862 non-null  int64  
 15  PUFC06_MSTAT     180862 non-null  object 
 16  PUFC07_GRADE     180862 non-null  obje

---
Of interest to us, there are:
<ul><li>1 contains float values, </li>
<li>14 contain integer values, and </li>
<li><b>35 are object values</b>.</li></ul>


---
Let's check for duplicates:

In [9]:
lfs_data.duplicated().sum()

0

No duplicates here, and therefore no cleaning need follow in this regard.

The dataset seems to contain null values in the form of whitespaces. Let's count those:

In [12]:
has_null = lfs_data.apply(lambda col: col.str.isspace().sum() if col.dtype == 'object' else 0)

print("Number Empty Cells:")
print(has_null[has_null > 0])

Number Empty Cells:
PUFC06_MSTAT        18339
PUFC07_GRADE        18339
PUFC08_CURSCH      107137
PUFC09_GRADTECH     57782
PUFC10_CONWR        57782
PUFC11_WORK         21894
PUFC12_JOB          93306
PUFC14_PROCC       108360
PUFC16_PKB         108360
PUFC17_NATEM       109507
PUFC18_PNWHRS      109507
PUFC19_PHOURS      109507
PUFC20_PWMORE      109507
PUFC21_PLADDW      109507
PUFC22_PFWRK       109507
PUFC23_PCLASS      109507
PUFC24_PBASIS      138947
PUFC25_PBASIC      144274
PUFC26_OJOB        109507
PUFC27_NJOBS       174924
PUFC28_THOURS      109507
PUFC29_WWM48H      163629
PUFC30_LOOKW       132692
PUFC31_FLWRK       178569
PUFC32_JOBSM       178569
PUFC33_WEEKS       178569
PUFC34_WYNOT       134985
PUFC35_LTLOOKW     179269
PUFC36_AVAIL       174893
PUFC37_WILLING     174893
PUFC38_PREVJOB     132692
PUFC40_POCC        152982
PUFC41_WQTR         81627
PUFC43_QKB         107825
PUFNEWEMPSTAT       61337
dtype: int64


---
And standardize, replacing these whitespace values with -1:

In [14]:
lfs_data.replace(r"^\s+$", -1, regex=True, inplace=True)
nan_counts_per_column = lfs_data.isna().sum()
print(nan_counts_per_column[nan_counts_per_column > 0])

Series([], dtype: int64)


Now that these are -1, let's return to the data types, and find if our object columns from earlier are convertible to integers (or float):

In [16]:
int_convertible_columns = []

for col in lfs_data.columns:
    if lfs_data[col].dtypes == 'object':  
        try:
            float_vals = lfs_data[col].dropna().astype(float)
            if (float_vals % 1 == 0).all():
                int_convertible_columns.append(col)
        except ValueError:
            pass 

print("Safely convertable to int:")
print(int_convertible_columns)

Safely convertable to int:
['PUFC06_MSTAT', 'PUFC07_GRADE', 'PUFC08_CURSCH', 'PUFC09_GRADTECH', 'PUFC10_CONWR', 'PUFC11_WORK', 'PUFC12_JOB', 'PUFC14_PROCC', 'PUFC16_PKB', 'PUFC17_NATEM', 'PUFC18_PNWHRS', 'PUFC19_PHOURS', 'PUFC20_PWMORE', 'PUFC21_PLADDW', 'PUFC22_PFWRK', 'PUFC23_PCLASS', 'PUFC24_PBASIS', 'PUFC25_PBASIC', 'PUFC26_OJOB', 'PUFC27_NJOBS', 'PUFC28_THOURS', 'PUFC29_WWM48H', 'PUFC30_LOOKW', 'PUFC31_FLWRK', 'PUFC32_JOBSM', 'PUFC33_WEEKS', 'PUFC34_WYNOT', 'PUFC35_LTLOOKW', 'PUFC36_AVAIL', 'PUFC37_WILLING', 'PUFC38_PREVJOB', 'PUFC40_POCC', 'PUFC41_WQTR', 'PUFC43_QKB', 'PUFNEWEMPSTAT']


---
And convert to int

In [18]:
columns_to_convert = [
    'PUFC06_MSTAT', 'PUFC08_CURSCH', 'PUFC09_GRADTECH', 'PUFC10_CONWR', 'PUFC11_WORK', 
    'PUFC12_JOB', 'PUFC14_PROCC', 'PUFC16_PKB', 'PUFC17_NATEM', 'PUFC18_PNWHRS', 
    'PUFC19_PHOURS', 'PUFC20_PWMORE', 'PUFC21_PLADDW', 'PUFC22_PFWRK', 'PUFC23_PCLASS', 
    'PUFC24_PBASIS', 'PUFC25_PBASIC', 'PUFC26_OJOB', 'PUFC27_NJOBS', 'PUFC28_THOURS', 
    'PUFC29_WWM48H', 'PUFC30_LOOKW', 'PUFC31_FLWRK', 'PUFC32_JOBSM', 'PUFC33_WEEKS', 
    'PUFC34_WYNOT', 'PUFC35_LTLOOKW', 'PUFC36_AVAIL', 'PUFC37_WILLING', 'PUFC38_PREVJOB', 
    'PUFC40_POCC', 'PUFC41_WQTR', 'PUFC43_QKB', 'PUFNEWEMPSTAT'
]

for col in columns_to_convert:
    lfs_data[col] = lfs_data[col].astype(int) 

---
Let's also apply the unique() function to our dataset.

In [20]:
lfs_data.apply(lambda x: x.nunique())

PUFREG                17
PUFPRV                86
PUFPRRCD             116
PUFHHNUM           40880
PUFURB2K10             2
PUFPWGTFIN         35599
PUFSVYMO               1
PUFSVYYR               1
PUFPSU               975
PUFRPL                 4
PUFHHSIZE             20
PUFC01_LNO            23
PUFC03_REL            11
PUFC04_SEX             2
PUFC05_AGE           100
PUFC06_MSTAT           7
PUFC07_GRADE          68
PUFC08_CURSCH          3
PUFC09_GRADTECH        3
PUFC10_CONWR           6
PUFC11_WORK            3
PUFC12_JOB             3
PUFC14_PROCC          44
PUFC16_PKB            88
PUFC17_NATEM           4
PUFC18_PNWHRS         17
PUFC19_PHOURS        103
PUFC20_PWMORE          3
PUFC21_PLADDW          3
PUFC22_PFWRK           3
PUFC23_PCLASS          8
PUFC24_PBASIS          9
PUFC25_PBASIC       1152
PUFC26_OJOB            3
PUFC27_NJOBS           6
PUFC28_THOURS        111
PUFC29_WWM48H          6
PUFC30_LOOKW           3
PUFC31_FLWRK           3
PUFC32_JOBSM           7


---
Considering our dataset has 18,000 entries, features with particularly low numbers stand out as questions that have clear, defined choices. Reviewing the [questionnaire](https://psada.psa.gov.ph/catalog/67/download/537), we find that certain questions ask the participant to specify beyond prespecified choices.

This column possibly contains "010," which is obviously not an integer. We ensure this column is a string, and check for values not specified in the questionnaire.

In [22]:
lfs_data['PUFC07_GRADE'] = lfs_data['PUFC07_GRADE']
valid_codes = [
    0, 10,  # No Grade, Preschool
    210, 220, 230, 240, 250, 260, 280,  # Elementary
    310, 320, 330, 340, 350,  # High School
    410, 420,  # Post Secondary; If Graduate Specify
    810, 820, 830, 840,  # College; If Graduate Specify
    900,  # Post Baccalaureate
    np.nan
]
invalid_rows = lfs_data[~(lfs_data['PUFC07_GRADE'].isin(valid_codes))]

unique_invalid_values = invalid_rows['PUFC07_GRADE'].unique()
print(unique_invalid_values)

['350' '320' '250' -1 '622' '672' '240' '220' '614' '330' '010' '280'
 '632' '310' '000' '900' '820' '230' '589' '572' '210' '830' '810' '634'
 '686' '581' '681' '552' '534' '840' '658' '548' '648' '652' '662' '601'
 '642' '562' '260' '685' '631' '684' '340' '584' '621' '410' '420' '664'
 '676' '521' '638' '554' '646' '689' '522' '654' '644' '532' '531' '514'
 '558' '501' '586' '542' '576' '544' '585' '564']


Values 5XX 6XX are not detailed in the questionnaire. As it instructs the participant to specify whether they graduated from post secondary or college, we'll create a new data point to encapsulate these.

In [24]:
lfs_data.loc[~lfs_data['PUFC07_GRADE'].isin(valid_codes), 'PUFC07_GRADE'] = 700
print(lfs_data['PUFC07_GRADE'].unique())

[700]


## EDA

In [27]:
lfs_data_with_nan = lfs_data.copy()
lfs_data_with_nan.replace(-1, np.nan, inplace=True)
corr_matrix = lfs_data_with_nan.corr()

strong_correlations = []
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)): 
        corr_value = corr_matrix.iloc[i, j]
        if (0.5 < corr_value < 1) or (-1 < corr_value < -0.5):
            strong_correlations.append((
                corr_matrix.index[i], 
                corr_matrix.columns[j], 
                corr_value
            ))

strong_correlations.sort(key=lambda x: abs(x[2]), reverse=True)

print("Strong correlations (|corr| > 0.5 and |corr| < 1):")
for var1, var2, corr in strong_correlations:
    print(f"{var1} — {var2}: {corr:.3f}")


Strong correlations (|corr| > 0.5 and |corr| < 1):
PUFPRV — PUFPRRCD: 1.000
PUFREG — PUFHHNUM: 0.995
PUFC19_PHOURS — PUFC28_THOURS: 0.972
PUFC16_PKB — PUFC43_QKB: 0.969
PUFC11_WORK — PUFNEWEMPSTAT: 0.964
PUFC41_WQTR — PUFNEWEMPSTAT: 0.878
PUFC11_WORK — PUFC41_WQTR: 0.852
PUFC31_FLWRK — PUFC38_PREVJOB: -0.795
PUFC36_AVAIL — PUFC37_WILLING: 0.785
PUFC37_WILLING — PUFNEWEMPSTAT: 0.785
PUFC18_PNWHRS — PUFC19_PHOURS: 0.785
PUFC18_PNWHRS — PUFC28_THOURS: 0.769
PUFPWGTFIN — PUFPSU: 0.709
PUFC12_JOB — PUFNEWEMPSTAT: 0.704
PUFC05_AGE — PUFC06_MSTAT: 0.701
PUFC34_WYNOT — PUFNEWEMPSTAT: 0.631
PUFC30_LOOKW — PUFNEWEMPSTAT: 0.625
PUFC01_LNO — PUFC03_REL: 0.625
PUFC05_AGE — PUFC08_CURSCH: 0.590
PUFHHSIZE — PUFC01_LNO: 0.571
PUFC01_LNO — PUFC05_AGE: -0.567
PUFC20_PWMORE — PUFC21_PLADDW: 0.556
PUFC08_CURSCH — PUFC11_WORK: -0.514
PUFC08_CURSCH — PUFC34_WYNOT: -0.510
PUFC08_CURSCH — PUFNEWEMPSTAT: -0.509


# Regression
We will opt for a simple 80/20 train-test split. We will use

In [None]:
## Data Prep

In [None]:
# Dataset class for handling missing values
class LFSDataset(Dataset):
    def __init__(self, features, labels, missing_value=-1):
        # Convert to numpy arrays
        self.features = features.values.astype(np.float32)
        
        # Ensure labels are 0 or 1
        unique_labels = np.unique(labels)
        if len(unique_labels) <= 2 and 0 in unique_labels and 1 in unique_labels:
            self.labels = labels.values.astype(np.float32)
        else:
            # Map lowest value to 0, others to 1
            min_label = labels.min()
            self.labels = (labels.values != min_label).astype(np.float32)
            print(f"Normalized target values from {unique_labels} to [0, 1]")
        
        self.missing_value = missing_value
        
        # Create mask (1 = present, 0 = missing)
        self.mask = (self.features != missing_value).astype(np.float32)
        
        # Replace missing values with 0
        self.features = np.where(self.features == missing_value, 0, self.features)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'features': self.features[idx],
            'mask': self.mask[idx],
            'labels': self.labels[idx]
        }

# Logistic regression with missing value handling
class MaskedLogisticRegression(torch.nn.Module):
    def __init__(self, input_dim):
        super(MaskedLogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, 1)
        
    def forward(self, features, mask):
        # Zero out missing features
        masked_features = features * mask
        output = self.linear(masked_features)
        return torch.sigmoid(output)

def prepare_data(lfs_data, target_col='PUFC11_WORK', feature_cols=None, test_size=0.2, missing_value=-1):

    feature_cols = [
            'PUFC05_AGE', 'PUFC06_MSTAT', 'PUFC04_SEX', 
            'PUFC07_GRADE', 'PUFC08_CURSCH', 
            'PUFC38_PREVJOB', 'PUFC31_FLWRK',
            'PUFC30_LOOKW', 'PUFC34_WYNOT'
    ]
    
    # Identify rows where target is not missing
    mask = lfs_data[target_col] != missing_value
    filtered_data = lfs_data.loc[mask, feature_cols + [target_col]]

    X = filtered_data[feature_cols]
    y = filtered_data[target_col]
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=45)
    
    # Handle missing values for scaling
    # Create masks where values are missing
    train_missing_mask = X_train == missing_value
    test_missing_mask = X_test == missing_value
    
    # Temporarily fill missing values with 0 for scaling
    X_train_filled = X_train.copy()
    X_test_filled = X_test.copy()
    X_train_filled[train_missing_mask] = 0
    X_test_filled[test_missing_mask] = 0
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_filled),
        columns=X_train.columns,
        index=X_train.index
    )
    
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test_filled),
        columns=X_test.columns,
        index=X_test.index
    )
    
    # Restore missing value markers
    X_train_scaled[train_missing_mask] = missing_value
    X_test_scaled[test_missing_mask] = missing_value
    
    return {
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'feature_names': feature_cols,
        'scaler': scaler
    }

def train_model(data_dict, learning_rate=0.01, batch_size=64, num_epochs=10, 
                scheduler_step_size=3, scheduler_gamma=0.5, missing_value=-1):
    # Create datasets and loaders
    train_dataset = LFSDataset(data_dict['X_train'], data_dict['y_train'], missing_value)
    test_dataset = LFSDataset(data_dict['X_test'], data_dict['y_test'], missing_value)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Setup model
    input_dim = data_dict['X_train'].shape[1]
    model = MaskedLogisticRegression(input_dim)
    
    # Loss and optimizer
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)
    
    # Training history
    history = {
        'train_loss': [],
        'test_loss': [],
        'train_accuracy': [],
        'test_accuracy': []
    }
    
    # For confusion matrix
    all_y_test = []
    all_predictions = []
    
    # Train for each epoch
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for batch in train_loader:
            features = batch['features']
            mask = batch['mask']
            labels = batch['labels'].view(-1, 1)
            
            # Forward, backward, optimize
            outputs = model(features, mask)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            predicted = (outputs >= 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        
        scheduler.step()
        
        # Evaluate on test set
        model.eval()
        test_loss = 0
        test_correct = 0
        test_total = 0
        epoch_y_test = []
        epoch_predictions = []
        
        with torch.no_grad():
            for batch in test_loader:
                features = batch['features']
                mask = batch['mask']
                labels = batch['labels'].view(-1, 1)
                
                outputs = model(features, mask)
                loss = criterion(outputs, labels)
                
                test_loss += loss.item()
                
                predicted = (outputs >= 0.5).float()
                test_correct += (predicted == labels).sum().item()
                test_total += labels.size(0)
                
                # Store for confusion matrix
                epoch_y_test.extend(labels.view(-1).tolist())
                epoch_predictions.extend(predicted.view(-1).tolist())
        
        # Save metrics
        avg_train_loss = total_loss / len(train_loader)
        avg_test_loss = test_loss / len(test_loader)
        train_accuracy = correct / total
        test_accuracy = test_correct / test_total
        
        history['train_loss'].append(avg_train_loss)
        history['test_loss'].append(avg_test_loss)
        history['train_accuracy'].append(train_accuracy)
        history['test_accuracy'].append(test_accuracy)
        
        # Store predictions for final epoch
        if epoch == num_epochs - 1:
            all_y_test = epoch_y_test
            all_predictions = epoch_predictions
        
        # Print each epoch
        print(f'Epoch {epoch+1}/{num_epochs}: '
              f'Train Loss: {avg_train_loss:.4f}, '
              f'Test Loss: {avg_test_loss:.4f}, '
              f'Train Acc: {train_accuracy:.4f}, '
              f'Test Acc: {test_accuracy:.4f}, '
              f'LR: {scheduler.get_last_lr()[0]:.6f}')
    
    # Create confusion matrix
    cm = confusion_matrix(all_y_test, all_predictions)
    print("\nConfusion Matrix:")
    print(cm)
    
    # Classification report
    print("\nClassification Report:")
    print(classification_report(all_y_test, all_predictions))
    
    return {
        'model': model,
        'history': history,
        'feature_names': data_dict['feature_names'],
        'confusion_matrix': cm
    }

def analyze_model(result_dict):
    # Get model coefficients
    model = result_dict['model']
    feature_names = result_dict['feature_names']
    
    weights = model.linear.weight.data.numpy().flatten()
    bias = model.linear.bias.data.numpy()[0]
    
    # Coefficient table
    coefficients = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': weights
    })
    
    # Sort by importance
    coefficients = coefficients.reindex(coefficients['Coefficient'].abs().sort_values(ascending=False).index)
    
    print("\nLogistic Regression Coefficients:")
    print(f"Bias (Intercept): {bias:.4f}")
    print(coefficients)
    
    # Print confusion matrix
    print("\nConfusion Matrix:")
    cm = result_dict.get('confusion_matrix')
    if cm is not None:
        print(f"True Negative: {cm[0,0]}")
        print(f"False Positive: {cm[0,1]}")
        print(f"False Negative: {cm[1,0]}")
        print(f"True Positive: {cm[1,1]}")
        
        # Calculate metrics
        if cm.shape == (2, 2):
            accuracy = (cm[0,0] + cm[1,1]) / cm.sum()
            if cm[1,1] + cm[0,1] > 0:
                precision = cm[1,1] / (cm[1,1] + cm[0,1])
            else:
                precision = 0
            if cm[1,1] + cm[1,0] > 0:
                recall = cm[1,1] / (cm[1,1] + cm[1,0])
            else:
                recall = 0
            if precision + recall > 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0
                
            print(f"\nAccuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")

def predict_employment_status(lfs_data, result_dict, target_normalization=None, missing_value=-1):
    model = result_dict['model']
    feature_names = result_dict['feature_names']
    
    # Get features
    X = lfs_data[feature_names]
    
    # Create mask and handle missing values
    mask = (X != missing_value).astype(np.float32)
    X = X.replace(missing_value, 0).astype(np.float32)
    
    # Predict
    model.eval()
    with torch.no_grad():
        predictions = model(torch.tensor(X.values), torch.tensor(mask.values))
        binary_predictions = (predictions >= 0.5).float().numpy().flatten()
    
    # Map back to original values if needed
    if target_normalization is not None and 'original_values' in target_normalization:
        original_values = target_normalization['original_values']
        if len(original_values) == 2:
            value_map = {0: min(original_values), 1: max(original_values)}
            return pd.Series([value_map[int(p)] for p in binary_predictions], index=X.index)
    
    return pd.Series(binary_predictions, index=X.index)

def run_employment_prediction(lfs_data, target_col='PUFC11_WORK', missing_value=-1):
    print("Preparing data...")
    data_dict = prepare_data(lfs_data, target_col=target_col, missing_value=missing_value)
    
    print(f"Training on {len(data_dict['X_train'])} samples with {len(data_dict['feature_names'])} features")
    print(f"Features: {data_dict['feature_names']}")
    
    # Store original values for later mapping
    original_values = sorted(list(set(lfs_data[lfs_data[target_col] != missing_value][target_col])))
    target_normalization = {'original_values': original_values}
    print(f"Original target values: {original_values}")
    
    result_dict = train_model(
        data_dict,
        learning_rate=0.01,  # Increased for faster convergence
        batch_size=128,
        num_epochs=10,  # Reduced to 10 epochs
        scheduler_step_size=3,  # Adjusted for fewer epochs
        scheduler_gamma=0.5,
        missing_value=missing_value
    )
    
    result_dict['target_normalization'] = target_normalization
    
    print("\nModel training complete!")
    analyze_model(result_dict)
    
    return result_dict

# Usage:
# result_dict = run_employment_prediction(lfs_data, missing_value=-1)

In [30]:
result_dict = run_employment_prediction(lfs_data)

Preparing data...
Training on 127174 samples with 9 features
Features: ['PUFC05_AGE', 'PUFC06_MSTAT', 'PUFC04_SEX', 'PUFC07_GRADE', 'PUFC08_CURSCH', 'PUFC38_PREVJOB', 'PUFC31_FLWRK', 'PUFC30_LOOKW', 'PUFC34_WYNOT']
Original target values: [1, 2]
Normalized target values from [1 2] to [0, 1]
Normalized target values from [1 2] to [0, 1]
Epoch 1/10: Train Loss: 0.2028, Test Loss: 0.1370, Train Acc: 0.9497, Test Acc: 0.9706, LR: 0.010000
Epoch 2/10: Train Loss: 0.1280, Test Loss: 0.1171, Train Acc: 0.9731, Test Acc: 0.9747, LR: 0.010000
Epoch 3/10: Train Loss: 0.1181, Test Loss: 0.1121, Train Acc: 0.9746, Test Acc: 0.9746, LR: 0.005000
Epoch 4/10: Train Loss: 0.1158, Test Loss: 0.1111, Train Acc: 0.9750, Test Acc: 0.9755, LR: 0.005000
Epoch 5/10: Train Loss: 0.1152, Test Loss: 0.1105, Train Acc: 0.9751, Test Acc: 0.9759, LR: 0.005000
Epoch 6/10: Train Loss: 0.1148, Test Loss: 0.1103, Train Acc: 0.9754, Test Acc: 0.9754, LR: 0.002500
Epoch 7/10: Train Loss: 0.1148, Test Loss: 0.1100, Train