In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor, plot_tree
sns.set()

In [3]:
def str_to_array(s):
    """Parse the printed form of an integer array back into a numpy array.

    Leading/trailing square brackets are stripped, newlines are treated as
    ordinary whitespace, and every whitespace-separated token is cast to int.
    """
    flattened = s.strip('[]').replace('\n', ' ')
    return np.array(list(map(int, flattened.split())))

In [4]:
# Read the job crosswalk: every line is "<job key>\t<integer id>".
# Both lookup directions are kept.
id_to_job, job_to_id = {}, {}
with open('../crosswalks/job_dict.txt', 'r') as job_file:
    for job_key, job_token in (row.strip().split('\t') for row in job_file):
        job_token = int(job_token)
        id_to_job[job_token] = job_key
        job_to_id[job_key] = job_token

# Same layout for the year crosswalk.
id_to_year, year_to_id = {}, {}
with open('../crosswalks/year_dict.txt', 'r') as year_file:
    for year_key, year_token in (row.strip().split('\t') for row in year_file):
        year_token = int(year_token)
        id_to_year[year_token] = year_key
        year_to_id[year_key] = year_token

# Names of the coarse occupation indicator columns, in a fixed order
# (positions in this array are used as occupation-category indices later on).
bk_jobs = np.array([
    'business', 'financialop', 'computer', 'architect', 'scientist',
    'socialworker', 'postseceduc', 'legaleduc', 'artist', 'lawyerphysician',
    'healthcare', 'healthsupport', 'protective', 'foodcare', 'building',
    'sales', 'officeadmin', 'farmer', 'constructextractinstall', 'production',
    'transport',
])



In [5]:
# Crosswalk from occ1990dd codes (first CSV column) to readable names (second column).
occ1990dd = pd.read_csv('../data/occ1990dd_names.csv', header=None)
job_name_dict = dict(zip(occ1990dd.iloc[:, 0].values, occ1990dd.iloc[:, 1].values))


def job_name(x):
    """Return the readable name for a numeric code string; pass any other token through unchanged."""
    if x.isnumeric():
        return job_name_dict[int(x)]
    return x

In [6]:
# Load the master dataset. Later cells read (among others) the columns
# 'tokenized_jobs', 'female', 'lnwage', 'famwgt', the occupation indicator
# columns listed in bk_jobs, and the 'rep_r_learner_split_0_seed_1_dim_*' columns.
df = pd.read_csv('../data/master_dataset_gender-1990-2019_1-16.csv')

In [7]:
# Decode every row's tokenized job history: token ids -> crosswalk keys -> readable names.
job_ids = np.stack([str_to_array(tokens) for tokens in df['tokenized_jobs'].values])
job_binarized_ids = np.vectorize(lambda token_id: id_to_job[token_id])(job_ids)
converted_job_names = np.vectorize(job_name)(job_binarized_ids)

# Join each history into one " -> "-separated string, dropping padding tokens.
prev_job_strings = [
    " -> ".join(name for name in history if name != '<pad>')
    for history in converted_job_names
]
df['prev_job_strings'] = prev_job_strings
# The current job is the final entry of the history.
df['current_job_string'] = [path.rsplit(' -> ', 1)[-1] for path in prev_job_strings]

In [8]:
COV_NAMES = ['expf', 'expfsq', 'expp', 'exppsq', 'edyrs', 'colldeg', 'advdeg', 'northeast', 'northcentral', 'south', 'black', 'hisp', 'otherrace', 'cb', 'gov', 'durables', 'nondurables', 'transport_ind', 'utilities', 'communications', 'retailtrade', 'wholesaletrade', 'finance', 'socartother', 'hotelsrestaurants', 'professional', 'medical', 'education', 'publicadmin', 'business', 'financialop', 'computer', 'architect', 'scientist', 'socialworker', 'postseceduc', 'legaleduc', 'artist', 'lawyerphysician', 'healthcare', 'healthsupport', 'protective', 'foodcare', 'building', 'sales', 'officeadmin', 'farmer', 'constructextractinstall', 'production', 'transport']

# Position of the (empty) farmer indicator inside bk_jobs; it is reused for managers below.
FARMER_SLOT = 17
# bk_jobs[FARMER_SLOT] is renamed to 'manager' at the end of this cell. Map that label
# back to the real dataframe column so the cell can be re-run without a KeyError.
indicator_columns = np.where(bk_jobs == 'manager', 'farmer', bk_jobs)
# Take an explicit writable copy: the array is modified in place below, and under pandas
# Copy-on-Write `.values` can return a read-only view.
bk_job_indicators = df[indicator_columns].to_numpy(copy=True)
assert bk_job_indicators.sum(0)[FARMER_SLOT] == 0.  # No farmers.
# Manager is the omitted job, so rows that sum to zero consist of managers.
manager_indicators = np.where(bk_job_indicators.sum(1) == 0, 1, 0)
bk_job_indicators[:, FARMER_SLOT] = manager_indicators
bk_jobs[FARMER_SLOT] = 'manager'
# After filling in managers every row must belong to exactly one category.
assert all(bk_job_indicators.sum(-1) == 1)
# Find the column that is equal to 1 for each row.
bk_job_indicators = np.argmax(bk_job_indicators, axis=1)
df['bk_job_ind'] = bk_job_indicators

# Job history without its final (current) job.
all_but_last_jobs = [" -> ".join(x.split(" -> ")[:-1]) for x in df['prev_job_strings']]
df['all_but_last_jobs'] = all_but_last_jobs

In [9]:
def _relative_job_frequencies(frame, job_col, cluster_to_bk):
    """Tabulate, per cluster, how common each job in `job_col` is relative to its occupation category.

    `frame` needs the columns 'combined_cluster', 'bk_job_ind' and `job_col`;
    `cluster_to_bk` is a two-column frame mapping 'combined_cluster' to 'bk_job_ind'.
    Returns one row per (cluster, job) with 'cluster_prop', 'bk_prop' and
    'relative_prop' = cluster_prop / bk_prop, sorted by cluster and then by
    descending relative_prop.
    """
    cluster_freq = frame.groupby(['combined_cluster', job_col]).size().reset_index(name='cluster_freq')
    bk_freq = frame.groupby(['bk_job_ind', job_col]).size().reset_index(name='bk_freq')
    # Attach each cluster's occupation category so the two tables can be joined.
    cluster_freq = cluster_freq.merge(cluster_to_bk, on='combined_cluster', how='left')
    freq = pd.merge(cluster_freq, bk_freq, on=['bk_job_ind', job_col])
    freq['cluster_prop'] = freq['cluster_freq'] / freq.groupby('combined_cluster')['cluster_freq'].transform('sum')
    freq['bk_prop'] = freq['bk_freq'] / freq.groupby('bk_job_ind')['bk_freq'].transform('sum')
    freq['relative_prop'] = freq['cluster_prop'] / freq['bk_prop']
    return freq.sort_values(by=['combined_cluster', 'relative_prop'], ascending=[True, False])


def _value_or_default(series, key, default):
    """Return series.loc[key], or `default` when `key` is not in the index."""
    return series.loc[key] if key in series.index else default


def get_descriptive_stats(df, top_clusters, r2s=None):
    """Print a descriptive summary for each cluster in `top_clusters`.

    Parameters
    ----------
    df : DataFrame
        Must contain 'combined_cluster', 'bk_job_ind', 'female', 'famwgt',
        'lnwage', 'all_but_last_jobs' and 'current_job_string'. It is only read;
        no column is added to it.
    top_clusters : iterable
        Values of 'combined_cluster' to describe, in print order.
    r2s : sequence, optional
        If given, r2s[i] is printed as the R2 for the i-th cluster.

    Returns
    -------
    None. Everything is written to stdout.

    Raises
    ------
    KeyError
        If a requested cluster does not occur in df['combined_cluster'].
    """
    min_cluster_prop = 0.025  # a job must exceed this share of the cluster to be listed
    max_current_jobs, max_history_jobs = 2, 7

    # One row per (person, distinct earlier job). Only the needed columns are carried
    # along instead of copying the whole frame.
    history = df[['combined_cluster', 'bk_job_ind']].assign(
        all_but_last_jobs=df['all_but_last_jobs'].str.split(' -> ').apply(set)
    ).explode('all_but_last_jobs')

    males = df[df['female'] == 0]
    females = df[df['female'] == 1]
    female_share_by_bk_occ = df.groupby('bk_job_ind')['female'].mean()
    male_wage_by_bk_occ = males.groupby('bk_job_ind')['lnwage'].mean()
    male_wage_by_cluster = males.groupby('combined_cluster')['lnwage'].mean()

    # Occupation category of each cluster (first one encountered).
    cluster_to_bk = df.groupby('combined_cluster')['bk_job_ind'].first()
    cluster_to_bk_frame = cluster_to_bk.reset_index()
    history_freq = _relative_job_frequencies(history, 'all_but_last_jobs', cluster_to_bk_frame)
    current_freq = _relative_job_frequencies(df, 'current_job_string', cluster_to_bk_frame)

    # Weighted share of women per cluster, computed without writing a helper column into df.
    weighted_females = (df['female'] * df['famwgt']).groupby(df['combined_cluster']).sum()
    total_weight = df.groupby('combined_cluster')['famwgt'].sum()
    mean_female_by_cluster = weighted_females / total_weight
    num_female_by_cluster = females.groupby('combined_cluster').size()
    num_male_by_cluster = males.groupby('combined_cluster').size()

    bk_jobs_full_names = ['Business Operations Specialists', 'Financial Operations Specialists', 'Computer and Math Technicians', 'Architects and Engineers', 'Life, Physical & Social Science Technicians', 'Social Workers', 'Postsecondary Educators', 'Other Education, Legal, and Library Workers', 'Art Design, Entertainment, Sports & Media', 'Lawyers & Physicians', 'Nurse and Healthcare Practitioners & Technicians', 'Healthcare Support', 'Protective Service', 'Food Prep, Serving & Personal Care', 'Building & Grounds Cleaning & Maintenance', 'Sales Occupations', 'Office and Administrative Support', 'Managers', 'Construction, Extraction & Installation Occupations', 'Production Occupations', 'Transportat & Materials Moving']

    for row_ind, cluster_ind in enumerate(top_clusters):
        mean_female = mean_female_by_cluster.loc[cluster_ind]
        # A cluster may contain only one gender; report a count of 0 instead of failing.
        num_female = _value_or_default(num_female_by_cluster, cluster_ind, 0)
        num_male = _value_or_default(num_male_by_cluster, cluster_ind, 0)
        bk_job_ind = cluster_to_bk.loc[cluster_ind]

        history_rows = history_freq[
            (history_freq['combined_cluster'] == cluster_ind)
            & (history_freq['cluster_prop'] > min_cluster_prop)
        ]
        current_rows = current_freq[
            (current_freq['combined_cluster'] == cluster_ind)
            & (current_freq['cluster_prop'] > min_cluster_prop)
        ]
        # Only include jobs that start with a capital letter to avoid the placeholder jobs
        most_important_prev_jobs = [
            job for job in history_rows['all_but_last_jobs'].values
            if len(job) > 0 and job[0].isupper()
        ]
        most_important_current_jobs = [
            job for job in current_rows['current_job_string'].values
            if len(job) > 0 and job[0].isupper()
        ]

        mean_treated_in_bk_occ = female_share_by_bk_occ.loc[bk_job_ind]
        # Male ("untreated") wages are undefined when no men are present; print nan then.
        mean_bk_occ_wage = _value_or_default(male_wage_by_bk_occ, bk_job_ind, float('nan'))
        mean_cluster_wage = _value_or_default(male_wage_by_cluster, cluster_ind, float('nan'))

        print("Cluster: {}. Mean female: {:.3f}. Num female: {}, Num male: {}".format(cluster_ind, mean_female, num_female, num_male))
        if r2s is not None:
            print("R2: {:.4f}".format(r2s[row_ind]))
        print("Occupation category: {}. Mean {} in occupation category: {:.3f}".format(bk_jobs_full_names[bk_job_ind], 'female', mean_treated_in_bk_occ))
        print("Mean occupation category wage (untreated): {:.3f}. Mean cluster wage (untreated): {:.3f}".format(mean_bk_occ_wage, mean_cluster_wage))
        print("Most common current jobs:\n\t-{}".format("\n\t-".join(most_important_current_jobs[:max_current_jobs])))
        print("Most common jobs in history:\n\t-{}".format("\n\t-".join(most_important_prev_jobs[:max_history_jobs])))
        print("")


### First, create clusters based on coarse-grained occupations

In [10]:
#### Create unique cluster ID
# For every coarse occupation category: embed the 64-dim representations in 2-D with
# t-SNE, split them into n_clusters groups with k-means, and give each group an id that
# is unique across categories (category position * n_clusters + k-means label).
df['combined_cluster'] = -1  # Initialize with -1
bk_job_inds = np.arange(len(bk_jobs))
n_clusters = 15  # n_clusters per category, was 30
rep_cols = [f'rep_r_learner_split_0_seed_1_dim_{i}' for i in range(64)]
bar = tqdm(enumerate(bk_job_inds), total=len(bk_job_inds))
for ind, job in bar:
    mask = df['bk_job_ind'] == job
    reps = df.loc[mask, rep_cols]
    tsne = TSNE(n_components=2, random_state=42)
    reps_2d = tsne.fit_transform(reps)
    # n_init is set explicitly to 10, the default reported by the FutureWarning this cell
    # used to emit, so results do not change when the library default changes.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(reps_2d)
    df.loc[mask, 'combined_cluster'] = ind * n_clusters + cluster_labels
print(f"Number of unique clusters: {df['combined_cluster'].nunique()}")


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

Number of unique clusters: 315





In [11]:
### Regression tree
# DecisionTreeRegressor and plt are already imported in the first cell; only r2_score is new.
from sklearn.metrics import r2_score

# Fixed seed for every tree below: scikit-learn permutes features at each split, so
# without it ties between equally good splits are broken differently from run to run.
TREE_SEED = 42

tree_df = df.copy()

# Split into train/test (random 50/50 assignment per row)
rs = np.random.RandomState(42)
train_mask = rs.choice([True, False], size=len(tree_df), p=[0.5, 0.5])
test_mask = ~train_mask

# One indicator column per cluster id, named "cluster_<id>".
cluster_dummies = pd.get_dummies(tree_df['combined_cluster'], prefix='cluster')
train_dummies = cluster_dummies[train_mask]
test_dummies = cluster_dummies[test_mask]

lasso_train = tree_df[train_mask]['fine_grained_lasso_wage_pred']
lasso_test = tree_df[test_mask]['fine_grained_lasso_wage_pred']
y_train_r2 = tree_df[train_mask]['lnwage']
y_test_r2 = tree_df[test_mask]['lnwage']
# The trees are fitted to the residual of the lasso wage prediction.
y_train = y_train_r2 - lasso_train
y_test = y_test_r2 - lasso_test

# Sort clusters by importance using initial fit on training data
tree = DecisionTreeRegressor(max_depth=50, min_samples_leaf=100, random_state=TREE_SEED)
initial_importances = tree.fit(train_dummies, y_train).feature_importances_
sorted_features = pd.DataFrame({
    'cluster': cluster_dummies.columns,
    'importance': initial_importances
}).sort_values('importance', ascending=False)
# Only include the first 15 features
sorted_features = sorted_features.head(15)

# Incrementally add features and evaluate
feature_results = []
selected_cols = []
for i, cluster_col in enumerate(sorted_features['cluster'], 1):
    # Add next most important feature
    selected_cols.append(cluster_col)
    X_train = train_dummies[selected_cols]
    X_test = test_dummies[selected_cols]

    # Fit tree and evaluate
    tree = DecisionTreeRegressor(max_depth=50, min_samples_leaf=100, random_state=TREE_SEED)
    tree.fit(X_train, y_train)

    # The tree predicts the residual, so add the lasso prediction back before
    # scoring against the raw log wage.
    train_pred = tree.predict(X_train) + lasso_train
    test_pred = tree.predict(X_test) + lasso_test
    train_r2 = r2_score(y_train_r2, train_pred)
    test_r2 = r2_score(y_test_r2, test_pred)

    feature_results.append({
        'num_features': i,
        'feature_added': cluster_col,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'cluster_id': int(cluster_col.split('_')[-1]),
    })

# Convert results to dataframe
results_df = pd.DataFrame(feature_results)

#### The data used in the next cell makes Figure 2 and Table S2

In [12]:
# Rebuild the results table, then describe the clusters in the order they were
# added to the tree, printing each step's test R2 alongside.
results_df = pd.DataFrame(feature_results)

top_results = results_df.iloc[:15]
cluster_inds = top_results['cluster_id'].values
get_descriptive_stats(df, cluster_inds, r2s=top_results['test_r2'].values)


Cluster: 260. Mean female: 0.324. Num female: 416, Num male: 873
R2: 0.4558
Occupation category: Managers. Mean female in occupation category: 0.428
Mean occupation category wage (untreated): 3.495. Mean cluster wage (untreated): 3.243
Most common current jobs:
	-Managers and administrators; n.e.c
Most common jobs in history:
	-Managers and administrators; n.e.c
	-Salespersons; n.e.c.
	-Managers and specialists in marketing; advert.; PR

Cluster: 256. Mean female: 0.793. Num female: 723, Num male: 182
R2: 0.4563
Occupation category: Managers. Mean female in occupation category: 0.428
Mean occupation category wage (untreated): 3.495. Mean cluster wage (untreated): 3.309
Most common current jobs:
	-Managers and administrators; n.e.c
Most common jobs in history:
	-Secretaries and stenographers
	-Bookkeepers and accounting and auditing clerks
	-Administrative support jobs; n.e.c.
	-Retail salespersons and sales clerks
	-Managers and administrators; n.e.c

Cluster: 116. Mean female: 0.796. 

#### Do regression tree on fine-grained occupations

In [13]:
# Pick the three most frequent current jobs among those that are between 20% and 80% female.
job_gender = df.groupby('current_job_string')['female'].mean().mul(100)  # percent female per job
filtered_jobs = job_gender[job_gender.between(20, 80)].index.tolist()
job_counts = df['current_job_string'].value_counts()
top_3_jobs = job_counts.loc[filtered_jobs].nlargest(3).index

In [14]:
#### Create unique cluster ID
# Re-cluster within each of the three selected jobs only; every other row keeps the
# placeholder id -1.
df['combined_cluster'] = -1  # Initialize with -1
# top_10_jobs = df['current_job_string'].value_counts().nlargest(10).index
job_gender = df.groupby('current_job_string')['female'].mean() * 100  # Convert to percentage
filtered_jobs = job_gender[(job_gender >= 20) & (job_gender <= 80)].index.tolist()
top_3_jobs = df['current_job_string'].value_counts()[filtered_jobs].nlargest(3).index
n_clusters = 15
rep_cols = [f'rep_r_learner_split_0_seed_1_dim_{i}' for i in range(64)]
bar = tqdm(enumerate(top_3_jobs), total=len(top_3_jobs))
for ind, job in bar:
    mask = df['current_job_string'] == job
    reps = df.loc[mask, rep_cols]
    tsne = TSNE(n_components=2, random_state=42)
    reps_2d = tsne.fit_transform(reps)
    # n_init is set explicitly to 10, the default reported by the FutureWarning this cell
    # used to emit, so results do not change when the library default changes.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(reps_2d)
    # Offset by job position so ids are unique across the three jobs.
    df.loc[mask, 'combined_cluster'] = ind * n_clusters + cluster_labels
print(f"Number of unique clusters: {df['combined_cluster'].nunique()}")


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 3/3 [00:26<00:00,  8.67s/it]

Number of unique clusters: 46





In [15]:
def get_descriptive_stats_for_job_rank(job_rank):
    """Print descriptive statistics for the most important clusters of one job.

    Reads the notebook-level `df` and `top_3_jobs`. The rows whose current job is
    top_3_jobs[job_rank] are selected, a regression tree is fitted on cluster
    indicators to predict lnwage minus the lasso wage prediction, and up to three
    clusters with positive feature importance are passed to get_descriptive_stats.

    Parameters
    ----------
    job_rank : int
        Position in `top_3_jobs` of the job to analyse.

    Returns
    -------
    None. The summary is printed by get_descriptive_stats.
    """
    job_to_consider = top_3_jobs[job_rank]
    tree_df = df[df['current_job_string'] == job_to_consider]
    cluster_dummies = pd.get_dummies(tree_df['combined_cluster'], prefix='cluster')

    # Seeded so that ties between equally good splits are resolved identically on
    # every run (scikit-learn permutes the features at each split).
    tree = DecisionTreeRegressor(max_depth=50, min_samples_leaf=100, random_state=42)
    y = tree_df['lnwage'] - tree_df['fine_grained_lasso_wage_pred']
    tree.fit(cluster_dummies, y)

    importances = pd.DataFrame({
        'cluster': cluster_dummies.columns,
        'importance': tree.feature_importances_
    }).sort_values('importance', ascending=False)

    top_cluster_cols = importances[importances['importance'] > 0]['cluster'].values[:3]
    # Column names look like "cluster_<id>"; negative ids (the -1 placeholder) are skipped.
    cluster_inds = [int(col.split('_')[-1]) for col in top_cluster_cols]
    cluster_inds = [cluster_id for cluster_id in cluster_inds if cluster_id >= 0]
    get_descriptive_stats(df, cluster_inds)


### The outputs of the next three cells make up Table S3

In [16]:
# Describe the most important clusters for top_3_jobs[0].
get_descriptive_stats_for_job_rank(0)

Cluster: 7. Mean female: 0.205. Num female: 94, Num male: 304
Occupation category: Managers. Mean female in occupation category: 0.428
Mean occupation category wage (untreated): 3.495. Mean cluster wage (untreated): 3.663
Most common current jobs:
	-Managers and administrators; n.e.c
Most common jobs in history:
	-Economists; market and survey researchers
	-Managers and specialists in marketing; advert.; PR
	-Salespersons; n.e.c.
	-Retail salespersons and sales clerks
	-Managers and administrators; n.e.c
	-Financial managers

Cluster: 4. Mean female: 0.129. Num female: 64, Num male: 393
Occupation category: Managers. Mean female in occupation category: 0.428
Mean occupation category wage (untreated): 3.495. Mean cluster wage (untreated): 3.750
Most common current jobs:
	-Managers and administrators; n.e.c
Most common jobs in history:
	-Mechanical engineers
	-Engineers and other professionals; n.e.c.
	-Electrical engineers
	-Industrial engineers
	-Engineering technicians
	-Computer syst

In [17]:
# Describe the most important clusters for top_3_jobs[1].
get_descriptive_stats_for_job_rank(1)

Cluster: 19. Mean female: 0.586. Num female: 81, Num male: 48
Occupation category: Sales Occupations. Mean female in occupation category: 0.466
Mean occupation category wage (untreated): 3.231. Mean cluster wage (untreated): 2.644
Most common current jobs:
	-Retail salespersons and sales clerks
Most common jobs in history:
	-Laborers; freight; stock; and material handlers; n.e.c.
	-Cashiers
	-Retail salespersons and sales clerks
	-Managers and administrators; n.e.c

Cluster: 21. Mean female: 0.698. Num female: 108, Num male: 29
Occupation category: Sales Occupations. Mean female in occupation category: 0.466
Mean occupation category wage (untreated): 3.231. Mean cluster wage (untreated): 2.772
Most common current jobs:
	-Retail salespersons and sales clerks
Most common jobs in history:
	-Child care workers
	-Secretaries and stenographers
	-Customer service reps; invest.; adjusters; excl. insur.
	-Administrative support jobs; n.e.c.
	-Cashiers
	-Retail salespersons and sales clerks
	-Ma

In [18]:
# Describe the most important clusters for top_3_jobs[2].
get_descriptive_stats_for_job_rank(2)

Cluster: 42. Mean female: 0.801. Num female: 76, Num male: 26
Occupation category: Construction, Extraction & Installation Occupations. Mean female in occupation category: 0.087
Mean occupation category wage (untreated): 3.011. Mean cluster wage (untreated): 2.728
Most common current jobs:
	-Machine operators; n.e.c.
Most common jobs in history:
	-Child care workers
	-Health and nursing aides
	-Cashiers
	-Housekeepers; maids; butlers; and cleaners
	-Retail salespersons and sales clerks
	-Assemblers of electrical equipment
	-Machine operators; n.e.c.

Cluster: 35. Mean female: 0.222. Num female: 29, Num male: 100
Occupation category: Construction, Extraction & Installation Occupations. Mean female in occupation category: 0.087
Mean occupation category wage (untreated): 3.011. Mean cluster wage (untreated): 3.136
Most common current jobs:
	-Machine operators; n.e.c.
Most common jobs in history:
	-Miscellaneous textile machine operators
	-Material recording; sched.; prod.; plan.; expediti

In [81]:
# Pick the three most frequent current jobs among those that are between 20% and 80% female.
job_gender = df.groupby('current_job_string')['female'].mean().mul(100)  # percent female per job
filtered_jobs = job_gender[job_gender.between(20, 80)].index.tolist()
job_counts = df['current_job_string'].value_counts()
top_3_jobs = job_counts.loc[filtered_jobs].nlargest(3).index

In [68]:
#### Create unique cluster ID
# Re-cluster within each of the three selected jobs only; every other row keeps the
# placeholder id -1.
df['combined_cluster'] = -1  # Initialize with -1
# top_10_jobs = df['current_job_string'].value_counts().nlargest(10).index
job_gender = df.groupby('current_job_string')['female'].mean() * 100  # Convert to percentage
filtered_jobs = job_gender[(job_gender >= 20) & (job_gender <= 80)].index.tolist()
top_3_jobs = df['current_job_string'].value_counts()[filtered_jobs].nlargest(3).index
n_clusters = 15
rep_cols = [f'rep_r_learner_split_0_seed_1_dim_{i}' for i in range(64)]
bar = tqdm(enumerate(top_3_jobs), total=len(top_3_jobs))
for ind, job in bar:
    mask = df['current_job_string'] == job
    reps = df.loc[mask, rep_cols]
    tsne = TSNE(n_components=2, random_state=42)
    reps_2d = tsne.fit_transform(reps)
    # n_init is set explicitly to 10, the default reported by the FutureWarning this cell
    # used to emit, so results do not change when the library default changes.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(reps_2d)
    # Offset by job position so ids are unique across the three jobs.
    df.loc[mask, 'combined_cluster'] = ind * n_clusters + cluster_labels
print(f"Number of unique clusters: {df['combined_cluster'].nunique()}")


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 3/3 [00:27<00:00,  9.19s/it]

Number of unique clusters: 46





In [69]:
# # df['current_job_string'].value_counts().nlargest(10).index
# # Get the 3 most common job strings with more than 20% female
# current_job_counts = df['current_job_string'].value_counts()
# female_counts = df.groupby('current_job_string')['female'].sum()
# female_percentages = female_counts / current_job_counts
# top_female_jobs = female_percentages[female_percentages > 0.2].index
# print(top_female_jobs)


In [77]:
# Alter this to change the rank of the job being considered.
# It is used in a later cell as an index into top_3_jobs (top_3_jobs[job_rank]).
# NOTE(review): the trailing "[1, 4]" / "bk rank" remark looks left over from an earlier
# version that ranked by bk_job_ind (see the commented-out lines in that cell) — confirm.
job_rank = 2  # [1, 4] # This is bk rank

In [79]:
### Regression tree
# DecisionTreeRegressor, plot_tree and plt are already imported in the first cell.

# most_common_bk_job_inds = df['bk_job_ind'].value_counts().nlargest(10).index
# tree_df = df[df['bk_job_ind'] == most_common_bk_job_inds[job_rank]] # toggle the bracket here to change the job eg between 0 for most common job and 1 for second most common job
job_to_consider = top_3_jobs[job_rank]
tree_df = df[df['current_job_string'] == job_to_consider]
cluster_dummies = pd.get_dummies(tree_df['combined_cluster'], prefix='cluster')

# Initialize and fit the decision tree on the residual of the lasso wage prediction.
# Seeded so that ties between equally good splits are resolved identically on every run.
tree = DecisionTreeRegressor(max_depth=50, min_samples_leaf=100, random_state=42)  # Adjust parameters as needed
X = cluster_dummies
y = tree_df['lnwage'] - tree_df['fine_grained_lasso_wage_pred']

tree.fit(X, y)
importances = pd.DataFrame({
    'cluster': cluster_dummies.columns,
    'importance': tree.feature_importances_
})
importances = importances.sort_values('importance', ascending=False)

top_importances = importances[importances['importance'] > 0].head(10)
print("Top clusters by importance:")
print(top_importances)

# For the top clusters, print example career paths
for cluster_col, importance in zip(top_importances['cluster'], top_importances['importance']):
    # Column names look like "cluster_<id>"; take the last piece, as the other cells do.
    cluster_id = int(cluster_col.split('_')[-1])
    members = tree_df[tree_df['combined_cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} (Importance: {importance:.4f})")
    bk_job_category = members['bk_job_ind'].values[0]
    print("BK Job Category:", bk_jobs[bk_job_category])
    print("Sample career paths:")
    # Seeded so the same example paths are shown on every run.
    sample_paths = members['prev_job_strings'].sample(min(3, len(members)), random_state=42)
    print(sample_paths.values)

# Survey-weighted share of women among the rows for this job.
percent_female = np.average(tree_df['female'], weights=tree_df['famwgt'])

Top clusters by importance:
       cluster  importance
12  cluster_42    0.411775
5   cluster_35    0.238770
14  cluster_44    0.175862
1   cluster_31    0.073656
7   cluster_37    0.061810
6   cluster_36    0.018518
10  cluster_40    0.006526
11  cluster_41    0.005788
3   cluster_33    0.003943
0   cluster_30    0.003240

Cluster 42 (Importance: 0.4118)
BK Job Category: constructextractinstall
Sample career paths:
['student -> student -> unemployed -> Production checkers; graders; and sorters in manufacturing -> Production checkers; graders; and sorters in manufacturing -> Assemblers of electrical equipment -> Winding and twisting textile and apparel operatives -> homemaker -> Machine operators; n.e.c. -> Machine operators; n.e.c. -> Winding and twisting textile and apparel operatives -> Winding and twisting textile and apparel operatives -> Winding and twisting textile and apparel operatives -> Machine operators; n.e.c. -> Machine operators; n.e.c. -> Winding and twisting textile an

In [80]:
# Describe up to three clusters with positive importance from the tree fitted above;
# ids are parsed from the "cluster_<id>" column names and negative ids are skipped.
top_cluster_cols = importances.loc[importances['importance'] > 0, 'cluster'].values[:3]
cluster_inds = [
    cluster_id
    for cluster_id in (int(col.split('_')[-1]) for col in top_cluster_cols)
    if cluster_id >= 0
]
get_descriptive_stats(df, cluster_inds)


Cluster: 42. Mean female: 0.801. Num female: 76, Num male: 26
Occupation category: Construction, Extraction & Installation Occupations. Mean female in occupation category: 0.087
Mean occupation category wage (untreated): 3.011. Mean cluster wage (untreated): 2.728
Most common current jobs:
	-Machine operators; n.e.c.
Most common jobs in history:
	-Child care workers
	-Health and nursing aides
	-Cashiers
	-Housekeepers; maids; butlers; and cleaners
	-Retail salespersons and sales clerks
	-Assemblers of electrical equipment
	-Machine operators; n.e.c.

Cluster: 35. Mean female: 0.222. Num female: 29, Num male: 100
Occupation category: Construction, Extraction & Installation Occupations. Mean female in occupation category: 0.087
Mean occupation category wage (untreated): 3.011. Mean cluster wage (untreated): 3.136
Most common current jobs:
	-Machine operators; n.e.c.
Most common jobs in history:
	-Miscellaneous textile machine operators
	-Material recording; sched.; prod.; plan.; expediti