In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn's global plot styling for the whole notebook.
# NOTE(review): "talk" is passed positionally; in seaborn's API the first
# positional parameter is presumably the plotting context — confirm against
# the installed seaborn version.
sns.set("talk")

def std_err(x):
    """Return the standard error of the mean of ``x``.

    Computed as ``x.std()`` divided by the square root of ``len(x)``.

    Parameters
    ----------
    x : object exposing ``.std()`` and ``len()``
        For example a numpy array or a pandas Series. The degrees-of-freedom
        convention is whatever ``x.std()`` uses by default for that type.

    Returns
    -------
    The spread of ``x`` scaled by ``1 / sqrt(n)``.
    """
    spread = x.std()
    n_obs = len(x)
    return spread / np.sqrt(n_obs)

In [2]:
# Load the master dataset from the data directory (path is relative to the notebook).
# Later cells read the 'lnwage', 'female' and 'famwgt' columns from this frame, plus
# per-method '<method>_wage_pred' and '<method>_prop_score' prediction columns.
df = pd.read_csv("../data/master_dataset_gender-1990-2019_1-16.csv")

In [3]:
def weighted_r2(y_true, y_pred, weights):
    """Weighted coefficient of determination.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.
    weights : array-like
        Observation weights, passed to ``np.average``.

    Returns
    -------
    ``1 - SS_res / SS_tot``, where both sums of squares are weighted averages
    and ``SS_tot`` is taken around the weighted mean of ``y_true``.
    """
    center = np.average(y_true, weights=weights)
    # Weighted mean squared residual of the predictions.
    ss_res = np.average((y_true - y_pred) ** 2, weights=weights)
    # Weighted mean squared deviation of the outcome from its weighted mean.
    ss_tot = np.average((y_true - center) ** 2, weights=weights)
    return 1 - (ss_res / ss_tot)

#### The next cell computes the values reported in the Wage R^2 column of Table 1

In [4]:
# Methods whose wage predictions are evaluated; each one is expected to have a
# '<method>_wage_pred' column in df.
# NOTE(review): the saved output of this cell also reports
# 'wage_model_pretrain_seed_1', which is not in this list — the saved output may
# be stale; confirm which set of methods Table 1 needs.
wage_pred_methods = [
    'coarse_grained_regression', 'coarse_grained_lasso',
    'fine_grained_lasso', 'wage_model_random_init',
    'wage_model_current_job_only_pretrain_seed_1',
    'wage_model_participation_only_pretrain_seed_1', 'wage_model',
    'dragonnet', 'round-2-wage',
]

# Number of bootstrap resamples behind every standard error (loop-invariant).
n_bootstrap = 100

results = {}
rs = np.random.RandomState(42)  # fixed seed so the bootstrap is reproducible

# The outcome and the weights are shared by all methods, so extract them once.
# Resampling three arrays avoids copying every column of df on each draw.
lnwage = df['lnwage'].to_numpy()
famwgt = df['famwgt'].to_numpy()
n_obs = len(df)

for method in wage_pred_methods:
    print(f"Working on {method}")
    wage_pred = df[f'{method}_wage_pred'].to_numpy()

    # Point estimate: weighted R² on the full sample.
    r2_score = weighted_r2(lnwage, wage_pred, famwgt)

    # Bootstrap (rows resampled with replacement) to get the standard error.
    bootstrap_r2s = []
    for _ in range(n_bootstrap):
        idx = rs.choice(n_obs, size=n_obs, replace=True)
        bootstrap_r2s.append(weighted_r2(lnwage[idx], wage_pred[idx], famwgt[idx]))

    # Deliberately NOT named `std_err`: that would rebind the std_err() helper
    # defined in the first cell and break any later call to it.
    r2_std_err = np.std(bootstrap_r2s)

    results[method] = {
        'r2': r2_score,
        'std_err': r2_std_err
    }

# Print results in a formatted way
for method in wage_pred_methods:
    print(f"{method}:")
    print(f"  R²: {results[method]['r2']:.3f} ({results[method]['std_err']:.3f})")

Working on coarse_grained_regression
Working on coarse_grained_lasso
Working on fine_grained_lasso
Working on wage_model_random_init
Working on wage_model_current_job_only_pretrain_seed_1
Working on wage_model_participation_only_pretrain_seed_1
Working on wage_model_pretrain_seed_1
Working on wage_model
Working on dragonnet
Working on round-2-wage
coarse_grained_regression:
  R²: 0.428 (0.004)
coarse_grained_lasso:
  R²: 0.428 (0.003)
fine_grained_lasso:
  R²: 0.455 (0.003)
wage_model_random_init:
  R²: 0.462 (0.003)
wage_model_current_job_only_pretrain_seed_1:
  R²: 0.454 (0.004)
wage_model_participation_only_pretrain_seed_1:
  R²: 0.467 (0.003)
wage_model_pretrain_seed_1:
  R²: 0.498 (0.004)
wage_model:
  R²: 0.515 (0.004)
dragonnet:
  R²: 0.468 (0.004)
round-2-wage:
  R²: 0.503 (0.003)


In [5]:
# Methods whose propensity scores are evaluated; the evaluation cell looks up a
# '<method>_prop_score' column in df for each entry.
# NOTE(review): the saved output further down also reports
# 'propensity_model_pretrain_seed_1', which is not listed here — confirm
# whether the saved output is stale.
gender_pred_methods = [
    'coarse_grained_regression',
    'coarse_grained_lasso',
    'fine_grained_lasso',
    'propensity_model_random_init',
    'propensity_model_current_job_only_pretrain_seed_1',
    'propensity_model_participation_only_pretrain_seed_1',
    'propensity_model',
    'dragonnet',
    'round-1-prop',
]

#### The next cell computes the values reported in the Gender R^2 column of Table 1 (McFadden's pseudo-R^2)

In [6]:
def weighted_nll(y_true, y_pred_prob, weights):
    """Weighted average binary negative log-likelihood.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0/1).
    y_pred_prob : array-like or scalar
        Predicted probability of the positive class.
    weights : array-like
        Observation weights, passed to ``np.average``.

    Returns
    -------
    Weighted mean of ``-(y*log(p) + (1-y)*log(1-p))``, with ``p`` clipped to
    ``[1e-15, 1 - 1e-15]`` so that neither logarithm sees zero.
    """
    eps = 1e-15
    prob = np.clip(y_pred_prob, eps, 1 - eps)
    # Log-likelihood contributions of the positive and negative class.
    pos_term = y_true * np.log(prob)
    neg_term = (1 - y_true) * np.log(1 - prob)
    per_obs_nll = -(pos_term + neg_term)
    return np.average(per_obs_nll, weights=weights)

def weighted_pseudo_r2(y_true, y_pred_prob, weights):
    """McFadden's pseudo R² computed from weighted negative log-likelihoods.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0/1).
    y_pred_prob : array-like
        Predicted probability of the positive class.
    weights : array-like
        Observation weights.

    Returns
    -------
    ``1 - NLL_model / NLL_null``, where the null model predicts the weighted
    mean of ``y_true`` (clipped to ``[1e-15, 1 - 1e-15]``) for every observation.
    """
    eps = 1e-15
    # Null model: one constant probability, the weighted base rate.
    base_rate = np.clip(np.average(y_true, weights=weights), eps, 1 - eps)
    # weighted_nll broadcasts the scalar; its own clip leaves base_rate unchanged.
    nll_null = weighted_nll(y_true, base_rate, weights)

    nll_model = weighted_nll(y_true, y_pred_prob, weights)

    return 1 - (nll_model / nll_null)

# Number of bootstrap resamples behind every standard error (loop-invariant).
n_bootstrap = 100

results = {}
# Seeded generator (same seed as the wage bootstrap) so the standard errors are
# reproducible on Restart & Run All; the unseeded global np.random state gave
# different numbers on every run.
rs = np.random.RandomState(42)

# The label and the weights are shared by all methods, so extract them once.
# Resampling three arrays avoids copying every column of df on each draw.
female = df['female'].to_numpy()
famwgt = df['famwgt'].to_numpy()
n_obs = len(df)

for method in gender_pred_methods:
    print(f"Working on {method}")
    prop_score = df[f'{method}_prop_score'].to_numpy()

    # Point estimates: weighted NLL and pseudo R² on the full sample.
    nll_score = weighted_nll(female, prop_score, famwgt)
    pseudo_r2_score = weighted_pseudo_r2(female, prop_score, famwgt)

    # Bootstrap (rows resampled with replacement) to get standard errors.
    bootstrap_nlls = []
    bootstrap_r2s = []
    for _ in range(n_bootstrap):
        idx = rs.choice(n_obs, size=n_obs, replace=True)
        y_boot, p_boot, w_boot = female[idx], prop_score[idx], famwgt[idx]

        bootstrap_nlls.append(weighted_nll(y_boot, p_boot, w_boot))
        bootstrap_r2s.append(weighted_pseudo_r2(y_boot, p_boot, w_boot))

    nll_std_err = np.std(bootstrap_nlls)
    r2_std_err = np.std(bootstrap_r2s)

    results[method] = {
        'nll': nll_score,
        'nll_std_err': nll_std_err,
        'pseudo_r2': pseudo_r2_score,
        'pseudo_r2_std_err': r2_std_err
    }

# Print results in a formatted way
for method in gender_pred_methods:
    print(f"{method}:")
    print(f"  NLL: {results[method]['nll']:.3f} ({results[method]['nll_std_err']:.3f})")
    print(f"  Pseudo R²: {results[method]['pseudo_r2']:.3f} ({results[method]['pseudo_r2_std_err']:.3f})")

Working on coarse_grained_regression
Working on coarse_grained_lasso
Working on fine_grained_lasso
Working on propensity_model_random_init
Working on propensity_model_current_job_only_pretrain_seed_1
Working on propensity_model_participation_only_pretrain_seed_1
Working on propensity_model_pretrain_seed_1
Working on propensity_model
Working on dragonnet
Working on round-1-prop
coarse_grained_regression:
  NLL: 0.596 (0.001)
  Pseudo R²: 0.137 (0.002)
coarse_grained_lasso:
  NLL: 0.511 (0.002)
  Pseudo R²: 0.260 (0.003)
fine_grained_lasso:
  NLL: 0.474 (0.002)
  Pseudo R²: 0.314 (0.003)
propensity_model_random_init:
  NLL: 0.398 (0.003)
  Pseudo R²: 0.424 (0.004)
propensity_model_current_job_only_pretrain_seed_1:
  NLL: 0.479 (0.002)
  Pseudo R²: 0.307 (0.003)
propensity_model_participation_only_pretrain_seed_1:
  NLL: 0.477 (0.003)
  Pseudo R²: 0.309 (0.004)
propensity_model_pretrain_seed_1:
  NLL: 0.359 (0.003)
  Pseudo R²: 0.481 (0.005)
propensity_model:
  NLL: 0.338 (0.002)
  Pseudo