In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [20]:
# Master dataset; the relative path is resolved against the kernel's working directory.
master_csv = '../data/master_dataset_gender-1990-2019_1-16.csv'
df = pd.read_csv(master_csv)

In [21]:
# Column names selected as regressors from the master dataset (`df`).
covs = [
    'expf', 'expfsq', 'expp', 'exppsq',
    'edyrs', 'colldeg', 'advdeg',
    'northeast', 'northcentral', 'south',
    'black', 'hisp', 'otherrace',
    'cb', 'gov',
    'durables', 'nondurables', 'transport_ind', 'utilities', 'communications',
    'retailtrade', 'wholesaletrade', 'finance', 'socartother', 'hotelsrestaurants',
    'professional', 'medical', 'education', 'publicadmin',
    'business', 'financialop', 'computer', 'architect', 'scientist',
    'socialworker', 'postseceduc', 'legaleduc', 'artist', 'lawyerphysician',
    'healthcare', 'healthsupport', 'protective', 'foodcare', 'building',
    'sales', 'officeadmin', 'farmer', 'constructextractinstall', 'production',
    'transport',
]

# Column names selected as regressors from the Blau/Kahn frame (`bk`).
# Compared with `covs`, this list additionally contains 'msa', spells several
# columns with capitals ('Transport', 'Utilities', 'Communications',
# 'SocArtOther', 'Medical', 'Education') and uses 'Transport' where `covs`
# has 'transport_ind'.
bk_covs = [
    'expf', 'expfsq', 'expp', 'exppsq',
    'edyrs', 'colldeg', 'advdeg',
    'msa', 'northeast', 'northcentral', 'south',
    'black', 'hisp', 'otherrace',
    'cb', 'gov',
    'durables', 'nondurables', 'Transport', 'Utilities', 'Communications',
    'retailtrade', 'wholesaletrade', 'finance', 'SocArtOther', 'hotelsrestaurants',
    'Medical', 'Education', 'professional', 'publicadmin',
    'business', 'financialop', 'computer', 'architect', 'scientist',
    'socialworker', 'postseceduc', 'legaleduc', 'artist', 'lawyerphysician',
    'healthcare', 'healthsupport', 'protective', 'foodcare', 'building',
    'sales', 'officeadmin', 'farmer', 'constructextractinstall', 'production',
    'transport',
]

In [22]:
# Load psid_blau_kahn_2017.csv and restrict it to the rows used in the comparison.
bk_raw = pd.read_csv("../data/psid_blau_kahn_2017.csv")

# All sample restrictions combined into one boolean mask. Row selection is the
# same as applying the filters one after another.
in_sample = (
    (bk_raw['farmer'] != 1.)          # drop rows flagged farmer == 1
    & ~pd.isnull(bk_raw['smsa'])      # require a non-missing smsa value
    & (bk_raw['annwks'] >= 26)        # at least 26 in annwks
    & (bk_raw['ft'] == 1)
    & (bk_raw['wagesamp'] == 1.)
)

# Take an explicit copy: assigning new columns to a frame obtained by boolean
# filtering otherwise triggers pandas' SettingWithCopyWarning.
bk = bk_raw[in_sample].copy()

# Rename/derive columns so that `bk` exposes the names listed in `bk_covs`
# (plus the 'lnwage' outcome used by the regression cells).
bk['lnwage'] = bk['lnrealwg']
bk['expf'] = bk['yrsftexpfz']
bk['expfsq'] = bk['yrsftexpfz'] ** 2
bk['expp'] = bk['yrsptexpfz']
# NOTE(review): expfsq is computed by squaring, whereas exppsq is read from the
# precomputed column 'yrsptexpfzsq' - confirm the two conventions agree.
bk['exppsq'] = bk['yrsptexpfzsq']
bk['edyrs'] = bk['schupd']
bk['colldeg'] = bk['ba']
bk['advdeg'] = bk['adv']
bk['msa'] = bk['smsa']
bk['otherrace'] = bk['othrace']
bk['cb'] = bk['unjob']
# 1 where wtrgov equals 1, 0 for every other value (including missing).
bk['gov'] = np.where(bk['wtrgov'] == 1, 1, 0)

#### The next cell computes the data shown in Table S1.

In [23]:
def _weighted_gaps(frame, cov_names, group_col):
    """Return ``(total_gap, adjusted_gap)`` in 'lnwage' between two groups.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns in ``cov_names`` plus ``group_col``,
        'lnwage' and 'famwgt'.
    cov_names : list of str
        Regressor column names.
    group_col : str
        Indicator column; rows equal to 1 form the first group and rows
        equal to 0 the second group.

    Returns
    -------
    tuple of float
        ``total_gap`` is the 'famwgt'-weighted mean of 'lnwage' in group 1
        minus that in group 0. ``adjusted_gap`` is the 'famwgt'-weighted mean,
        over group-1 rows, of the difference between the predictions of a
        linear regression fitted on group 1 and one fitted on group 0.
    """
    X = frame[cov_names].values
    A = frame[group_col].values
    Y = frame['lnwage'].values
    weights = frame['famwgt'].values
    treated, untreated = A == 1, A == 0

    # Separate weighted least-squares fits for each group.
    treated_linreg = LinearRegression().fit(X[treated], Y[treated], sample_weight=weights[treated])
    untreated_linreg = LinearRegression().fit(X[untreated], Y[untreated], sample_weight=weights[untreated])

    # Both models are evaluated on the group-1 covariates only.
    adjusted_gap = np.average(
        treated_linreg.predict(X[treated]) - untreated_linreg.predict(X[treated]),
        weights=weights[treated],
    )
    total_gap = (
        np.average(Y[treated], weights=weights[treated])
        - np.average(Y[untreated], weights=weights[untreated])
    )
    return total_gap, adjusted_gap


possible_years = [1990, 1999, 2011]

for year in possible_years:
    # Our data: select the year once instead of re-filtering for every column.
    df_yr = df[df['year'] == year]
    total_gap_df, att_df = _weighted_gaps(df_yr, covs, 'treatment')

    # Blau/Kahn data: 'wave' is compared as a float; 'female' is the group indicator.
    bk_yr = bk[(bk['ft'] == 1) & (bk['wagesamp'] == 1.) & (bk['wave'] == float(year))]
    total_gap_bk, att_bk = _weighted_gaps(bk_yr, bk_covs, 'female')

    # The header shows year - 1 (presumably the earnings reference year - TODO confirm).
    print(f"========= {year-1} =========")
    print("## Our results ##")
    print(f"Total gap: {total_gap_df:.3f}")
    print(f"Adjusted gap: {att_df:.3f}")
    print(f"Total ratio: {np.exp(total_gap_df):.3f}")
    print(f"Adjusted ratio: {np.exp(att_df):.3f}")
    print(".....")
    print("## Blau/Kahn results ##")
    print(f"Total gap: {total_gap_bk:.3f}")
    print(f"Adjusted gap: {att_bk:.3f}")
    print(f"Total ratio: {np.exp(total_gap_bk):.3f}")
    print(f"Adjusted ratio: {np.exp(att_bk):.3f}")
    print("")

## Our results ##
Total gap: -0.301
Adjusted gap: -0.099
Total ratio: 0.740
Adjusted ratio: 0.906
.....
## Blau/Kahn results ##
Total gap: -0.301
Adjusted gap: -0.079
Total ratio: 0.740
Adjusted ratio: 0.924

## Our results ##
Total gap: -0.261
Adjusted gap: -0.087
Total ratio: 0.771
Adjusted ratio: 0.917
.....
## Blau/Kahn results ##
Total gap: -0.259
Adjusted gap: -0.090
Total ratio: 0.772
Adjusted ratio: 0.914

## Our results ##
Total gap: -0.226
Adjusted gap: -0.094
Total ratio: 0.797
Adjusted ratio: 0.911
.....
## Blau/Kahn results ##
Total gap: -0.231
Adjusted gap: -0.088
Total ratio: 0.793
Adjusted ratio: 0.916



Store the Table S1 data as a CSV file in the `../figs` folder.

In [24]:
possible_years = [1990, 1999, 2011]
# Single source of truth for the output location (used for writing and for the message).
output_csv = "../figs/blau_kahn_comparison.csv"
results = []

for year in possible_years:
    # Our data: select the year once, then pull the arrays from that subset.
    df_yr = df[df['year'] == year]
    X = df_yr[covs].values
    A = df_yr['treatment'].values
    Y = df_yr['lnwage'].values
    weights = df_yr['famwgt'].values
    treated, untreated = A == 1, A == 0

    # Our regression: one weighted linear fit per group.
    treated_linreg = LinearRegression().fit(X[treated], Y[treated], sample_weight=weights[treated])
    untreated_linreg = LinearRegression().fit(X[untreated], Y[untreated], sample_weight=weights[untreated])

    # Adjusted gap: weighted mean difference of the two models' predictions on group-1 rows.
    att_df = np.average(
        treated_linreg.predict(X[treated]) - untreated_linreg.predict(X[treated]),
        weights=weights[treated],
    )
    # Total gap: difference of the weighted group means of the outcome.
    total_gap_df = np.average(Y[treated], weights=weights[treated]) - np.average(Y[untreated], weights=weights[untreated])

    # BK data: 'wave' is compared as a float; 'female' is the group indicator.
    bk_yr = bk[(bk['ft'] == 1) & (bk['wagesamp'] == 1.0) & (bk['wave'] == float(year))]
    X_bk = bk_yr[bk_covs].values
    A_bk = bk_yr['female'].values
    Y_bk = bk_yr['lnwage'].values
    weights_bk = bk_yr['famwgt'].values
    treated_bk, untreated_bk = A_bk == 1, A_bk == 0

    # BK regression
    treated_linreg_bk = LinearRegression().fit(X_bk[treated_bk], Y_bk[treated_bk], sample_weight=weights_bk[treated_bk])
    untreated_linreg_bk = LinearRegression().fit(X_bk[untreated_bk], Y_bk[untreated_bk], sample_weight=weights_bk[untreated_bk])

    att_bk = np.average(
        treated_linreg_bk.predict(X_bk[treated_bk]) - untreated_linreg_bk.predict(X_bk[treated_bk]),
        weights=weights_bk[treated_bk],
    )
    total_gap_bk = np.average(Y_bk[treated_bk], weights=weights_bk[treated_bk]) - np.average(Y_bk[untreated_bk], weights=weights_bk[untreated_bk])

    # Store results; 'Year' is recorded as year - 1, ratios are exp(gap).
    results.append({
        'Year': year - 1,
        'Total Gap (Ours)': round(total_gap_df, 3),
        'Adjusted Gap (Ours)': round(att_df, 3),
        'Total Ratio (Ours)': round(np.exp(total_gap_df), 3),
        'Adjusted Ratio (Ours)': round(np.exp(att_df), 3),
        'Total Gap (BK)': round(total_gap_bk, 3),
        'Adjusted Gap (BK)': round(att_bk, 3),
        'Total Ratio (BK)': round(np.exp(total_gap_bk), 3),
        'Adjusted Ratio (BK)': round(np.exp(att_bk), 3)
    })

# Convert to DataFrame (one row per year)
results_df = pd.DataFrame(results)

# to_csv fails when the target directory is missing, so create it first.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_csv, index=False)

print(f"\n✅ Results saved to: {output_csv}")


✅ Results saved to: ../figs/blau_kahn_comparison.csv
