Before running this notebook, select the Jupyter kernel named `research`.

In [1]:
import ipykernel
import os
from io import StringIO
import copy

## TODO(Sort)
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import requests

import plotly.graph_objects as go
import statsmodels.formula.api as smf
from plotnine import *
from binsreg import *
import seaborn as sns

Load in the main data

In [3]:
def read_tsv(path):
    """
    Read a tab-separated file into a pandas DataFrame.

    `path` is handed to `pd.read_csv` unchanged, so it may be anything that
    function accepts; all other parsing options keep their pandas defaults.
    """
    frame = pd.read_csv(path, sep="\t")
    return frame


# Comma-separated national file; presumably the BLS OEWS May 2021 release,
# judging by the file and variable names.
bls_oews = pd.read_csv("../data/national_May2021_dl.csv")

# Task-level label set: this contains the model exposure information.
# Two columns are shortened so that the scoring code can address them as
# "human_exposure" and "gpt4_alt_exposure".
full_labels = read_tsv("../data/full_labelset.tsv").rename(
    columns={
        "gpt4_exposure_alt_rubric": "gpt4_alt_exposure",
        "human_exposure_agg": "human_exposure",
    }
)

# One row per (occupation, task) statement.
df_tasks = read_tsv("../data/full_onet_data.tsv")

In [4]:
# Preview the task table; pandas elides the middle rows of a frame this long.
df_tasks

Unnamed: 0.1,Unnamed: 0,O*NET-SOC Code,Task ID,Task,Task Type,Title,coreweight,equalweight
0,0,11-1011.00,8823.0,Direct or coordinate an organization's financi...,Core,Chief Executives,2,1
1,1,11-1011.00,8831.0,Appoint department heads or managers and assig...,Core,Chief Executives,2,1
2,2,11-1011.00,8825.0,Analyze operations to evaluate performance of ...,Core,Chief Executives,2,1
3,3,11-1011.00,8826.0,"Direct, plan, or implement policies, objective...",Core,Chief Executives,2,1
4,4,11-1011.00,8827.0,"Prepare budgets for approval, including those ...",Core,Chief Executives,2,1
...,...,...,...,...,...,...,...,...
19260,19260,53-7121.00,12807.0,Unload cars containing liquids by connecting h...,Supplemental,"Tank Car, Truck, and Ship Loaders",1,1
19261,19261,53-7121.00,12804.0,"Clean interiors of tank cars or tank trucks, u...",Supplemental,"Tank Car, Truck, and Ship Loaders",1,1
19262,19262,53-7121.00,12803.0,Lower gauge rods into tanks or read meters to ...,Supplemental,"Tank Car, Truck, and Ship Loaders",1,1
19263,19263,53-7121.00,12805.0,Operate conveyors and equipment to transfer gr...,Supplemental,"Tank Car, Truck, and Ship Loaders",1,1


In [5]:
# Write tasks to text file for the over time classification
# df_tasks["Task"].to_csv("../data/tasks.txt", index=False, header=False)


Next compute the occupational exposure scores

In [6]:
# Numeric scores for the exposure rubric labels (E0-E3). The three variants
# differ only in how much credit E2/E3 receive: none (alpha), half (beta) or
# full (gamma).
alpha_score_map = {"E0": 0.0, "E1": 1.0, "E2": 0.0, "E3": 0.0}
beta_score_map = {"E0": 0.0, "E1": 1.0, "E2": 0.5, "E3": 0.5}
gamma_score_map = {"E0": 0.0, "E1": 1.0, "E2": 1.0, "E3": 1.0}

# Numeric scores for the automation rubric labels (T0-T4).
# Graded version: evenly spaced from 0 to 1.
t_score_map = {"T0": 0.0, "T1": 0.25, "T2": 0.5, "T3": 0.75, "T4": 1.0}
# Binary version: T0/T1 score 0, T2 and above score 1.
t_score_map_acemoglu = {"T0": 0.0, "T1": 0.0, "T2": 1.0, "T3": 1.0, "T4": 1.0}

# Apply every exposure variant to every rater's labels. Dictionary indexing
# (rather than Series.map with a dict) is kept on purpose: an unexpected or
# missing label raises KeyError instead of silently becoming NaN.
exposure_score_maps = {
    "alpha": alpha_score_map,
    "beta": beta_score_map,
    "gamma": gamma_score_map,
}
for prefix in ["gpt4", "human"]:
    for score_name, score_map in exposure_score_maps.items():
        full_labels[f"{prefix}_{score_name}"] = full_labels[
            f"{prefix}_exposure"
        ].apply(lambda label, scores=score_map: scores[label])

# Only the beta variant is computed for the alternative GPT-4 rubric.
full_labels["gpt4_alt_beta"] = full_labels["gpt4_alt_exposure"].apply(
    lambda x: beta_score_map[x]
)

full_labels["automation"] = full_labels["gpt4_automation"].apply(
    lambda x: t_score_map[x]
)

# Fix: this column previously reused t_score_map, which made it an exact copy
# of "automation" and left t_score_map_acemoglu unused.
full_labels["automation_acemoglu"] = full_labels["gpt4_automation"].apply(
    lambda x: t_score_map_acemoglu[x]
)

Now compute weights for each occupation

In [7]:
# Task ratings are read straight from the O*NET site; a copy has also been
# saved in the data folder under "Task Ratings.txt".
df_taskratings = read_tsv(
    "https://www.onetcenter.org/dl_files/database/db_27_2_text/Task%20Ratings.txt"
)

# Keep only the two rating scales used as weights, then reshape so that each
# (occupation, task) pair is one row with one value column per scale.
is_weight_scale = df_taskratings["Scale ID"].isin(["RT", "IM"])
df_taskratings_relevant = df_taskratings.loc[
    is_weight_scale, ["O*NET-SOC Code", "Task ID", "Scale ID", "Data Value"]
]
dfr_taskratings = df_taskratings_relevant.pivot(
    index=["O*NET-SOC Code", "Task ID"], columns=["Scale ID"]
).reset_index()
# The pivoted value columns are ordered by scale ID (IM before RT), which is
# what the positional renaming below relies on.
dfr_taskratings.columns = ["O*NET-SOC Code", "Task ID", "importance", "relevance"]

# Turn the core / supplemental flag into numeric weights: under "coreweight" a
# core task is worth 2x a supplemental one; under "equalweight" every task
# counts once. Tasks with a missing type are treated like supplemental tasks.
df_tasks["coreweight"] = df_tasks["Task Type"].map(
    {"Core": 2, "Supplemental": 1, np.nan: 1}
)
df_tasks["equalweight"] = df_tasks["Task Type"].map(
    {"Core": 1, "Supplemental": 1, np.nan: 1}
)

# No `on=` is given, so pandas joins on every column name the two frames share.
task_tmp = df_tasks.merge(dfr_taskratings, how="left")

In [8]:
# Per-occupation totals of each raw weight; these are the denominators used
# to normalise the task-level weights below.
impWeightOcc = (
    task_tmp[["O*NET-SOC Code", "importance", "relevance", "coreweight", "equalweight"]]
    .groupby("O*NET-SOC Code")
    .sum()
    .reset_index()
    .rename(
        columns={
            "importance": "impTotal",
            "relevance": "relTotal",
            "coreweight": "coreweightTotal",
            "equalweight": "equalweightTotal",
        }
    )
)

# Attach the totals to every task row, then express each raw weight as the
# task's share of its occupation's total.
task_tmp_weighted = task_tmp.merge(
    impWeightOcc, how="left", on="O*NET-SOC Code"
).assign(
    importance_weight=lambda d: d["importance"] / d["impTotal"],
    relevance_weight=lambda d: d["relevance"] / d["relTotal"],
    core_weight=lambda d: d["coreweight"] / d["coreweightTotal"],
    equal_weight=lambda d: d["equalweight"] / d["equalweightTotal"],
)

In [9]:
## attach the normalised task weights to the label frame
## (no `on=` is given, so the join uses every column name the two frames share)
weight_merge_columns = [
    "O*NET-SOC Code",
    "Task ID",
    "importance_weight",
    "relevance_weight",
    "core_weight",
    "equal_weight",
]
full_labels_weight = full_labels.merge(
    task_tmp_weighted[weight_merge_columns], how="left"
)

## scoring and labeling: float-typed copy of the `gpt_3_relevant` column
full_labels_weight["gpt_relevant"] = full_labels_weight["gpt_3_relevant"].astype(float)

Now create occupation-level scores. Note that the weighting scheme can be changed, and that changing it alters the resulting scores.

In [10]:
def weighted_mean(df, groupfields, aggfields, weightfield):
    """
    Aggregate rating columns to group level using per-row weights.

    Every column in `aggfields` is multiplied row-wise by `weightfield`, and
    the products are summed within each combination of `groupfields`. No
    renormalisation is done here, so the result is a true weighted mean only
    when the weights already sum to one within every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data containing all of the columns named below.
    groupfields : list of str
        Columns that define the groups.
    aggfields : list of str
        Numeric columns to aggregate.
    weightfield : str
        Column holding the per-row weights.

    Returns
    -------
    pandas.DataFrame
        One row per group, with `groupfields` followed by `aggfields`.
    """
    weighted = df[aggfields].multiply(df[weightfield], axis="index")
    # Group on the key columns directly instead of joining them back on the
    # index: an index join duplicates rows whenever index labels repeat,
    # which would inflate the sums.
    group_keys = [df[field] for field in groupfields]
    return weighted.groupby(group_keys).sum().reset_index()

In [11]:
## Weighting scheme; must be one of
## {'equal_weight','core_weight','relevance_weight','importance_weight'}
weight_field = "equal_weight"

## Tasks are rolled up by occupation (code plus title)
group_fields = ["O*NET-SOC Code", "Title"]

## Task-level score columns that get aggregated to the occupation level:
## alpha/beta/gamma for each rater, then the automation and alt-rubric scores
rating_fields = [
    f"{rater}_{variant}"
    for rater in ("gpt4", "human")
    for variant in ("alpha", "beta", "gamma")
] + ["automation", "automation_acemoglu", "gpt4_alt_beta"]

## Run the weighting
occ_level = weighted_mean(
    full_labels_weight,
    groupfields=group_fields,
    aggfields=rating_fields,
    weightfield=weight_field,
)

## Keep the first seven characters of the O*NET code (e.g. "11-1011" out of "11-1011.00")
occ_level["OCC_CODE"] = occ_level["O*NET-SOC Code"].str[:7]

In [12]:
# Save the occupation-level scores to disk as a pickle file.
occ_level.to_pickle("../data/occupation_level.pkl")