In [1]:
## Packages and functions. Special thanks to Morgan Frank for ThreadedMap, loadBagrowColors, and the functions below!
## Accompanying ThreadedMap, loadBagrowColors, and myStats python files must be placed in the python path.
%matplotlib inline
import random

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.collections import LineCollection
import pandas as pd
#from loadBagrowColors import colors as myColors
import igraph as ig
#import threadMap
from scipy.stats import pearsonr
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, leaves_list
from scipy.spatial.distance import pdist

Matplotlib created a temporary cache directory at /var/folders/fh/fwc37qhn04d8sxp65hwv1kxm0000gn/T/matplotlib-dg6wai4e because the default path (/Users/gabesmithline/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'igraph'

In [3]:
# set the seeds
# NumPy's global generator covers the numpy-based code. python-igraph draws
# from Python's built-in `random` module by default, so that generator has to
# be seeded too for the community detection and graph layouts below to be
# repeatable under Restart & Run All.
np.random.seed(42)
random.seed(42)


def echo(X):
    """Return the first element (or first row) of X after copying it into a NumPy array."""
    as_array = np.array(X)
    return as_array[0]


def simpleHist(X, numBins=20):
    """Plot a histogram of X as a black line with circle markers.

    Each count is drawn at the left edge of its bin on the current axes.
    """
    frequencies, edges = np.histogram(X, bins=numBins)
    left_edges = edges[:-1]
    plt.plot(left_edges, frequencies, "k-o")


def clusterMat(M, how="average", **kwargs):
    """Show M as a heatmap with rows and columns reordered by hierarchical clustering.

    M is passed to scipy's ``linkage`` with method ``how``; the resulting leaf
    order is applied to both axes. Extra keyword arguments go to ``plt.imshow``.
    A colorbar is added to the current figure.
    """
    leaf_order = leaves_list(linkage(M, how))
    # Same permutation on rows and columns keeps the matrix aligned.
    reordered = M[np.ix_(leaf_order, leaf_order)]
    plt.imshow(reordered, aspect="auto", interpolation="nearest", **kwargs)
    plt.colorbar()


def RCA(X, binarized=False):
    """Revealed comparative advantage of each cell of X.

    Each entry's share of its row total is divided by the column's share of
    the grand total. With ``binarized=True`` the result is a 0/1 float array
    marking entries whose RCA is strictly greater than 1 (NaN counts as 0).
    Rows or columns that sum to zero produce NaN/inf via division by zero.
    """
    row_share = X / np.sum(X, axis=1, keepdims=True)
    column_share = np.sum(X, axis=0, keepdims=True) / np.sum(X)
    rca = row_share / column_share
    if binarized:
        return (rca > 1).astype(float)
    return rca


def relateFeatures(mat, verbose=True):
    """Pairwise row similarity of ``mat`` (Sørensen-style).

    Entry (i, j) of the result is ``2 * dot(row_i, row_j) / (sum(row_i) + sum(row_j))``.
    Pairs whose row sums are both zero yield NaN. When ``verbose`` is true,
    progress is printed in steps of 0.1 as rows are processed.
    """
    n_rows = mat.shape[0]
    row_sums = np.sum(mat, axis=1)
    similarity = np.zeros((n_rows, n_rows))
    next_report = 0.1
    for idx, row in enumerate(mat):
        similarity[idx, :] = 2 * np.dot(mat, row) / (row_sums + row_sums[idx])
        fraction_done = idx / n_rows
        if verbose and fraction_done >= next_report:
            print("%0.2f" % next_report)
            # Skip ahead so each threshold is reported at most once.
            while next_report <= fraction_done:
                next_report += 0.1
    return similarity


def get_rca(temp):
    """Revealed comparative advantage of a rows-by-columns matrix.

    Expected orientation (per the original author's note):
      rows: jobs (or cities)
      columns: skills
    """
    row_totals = np.sum(temp, axis=1, keepdims=True)
    column_totals = np.sum(temp, axis=0, keepdims=True)
    grand_total = temp.sum()
    return (temp / row_totals) / (column_totals / grand_total)


def findIndex(x, X):
    """Return the position of the first element of X equal to x.

    Raises:
        ValueError: if x does not occur in X (including when X is empty).
            Previously a miss returned the last index, which could not be
            told apart from a genuine match on the last element.
    """
    for i, item in enumerate(X):
        if item == x:
            return i
    raise ValueError("%r not found" % (x,))

In [4]:
# Human-readable labels for 20 palette slots, stored as (base, lighter) pairs
# and flattened in order. Used to name community colours in the text output.
_tab20_namePairs = (
    ("dark blue", "light blue"),
    ("orange", "light orange"),
    ("dark green", "light green"),
    ("red", "light red"),
    ("purple", "light purple"),
    ("brown", "light brown"),
    ("pink", "light pink"),
    ("grey", "light grey"),
    ("pale green", "light pale green"),
    ("cyan", "light cyan"),
)
tab20_colorNames = [name for pair in _tab20_namePairs for name in pair]

In [5]:
## file read-ins

## set your working directory here and drop the data folder in it!
## (keep the trailing slash: the data paths below are built as curdir + "data/...")
curdir = "/Your/Folder/Goes/Here/"

## data read-in
ratingsCsvPath = f"{curdir}data/task_ratings_file_7-12.csv"
dwaLabelsTsvPath = f"{curdir}data/DWA_Tasks_Labels.tsv"
jobTasksIn = pd.read_csv(ratingsCsvPath)
taskDWAS = pd.read_csv(dwaLabelsTsvPath, sep="\t")

In [6]:
# Keep one row per (occupation, task, DWA) link.
dwaLinkColumns = ["O*NET-SOC Code", "Task ID", "DWA ID", "DWA Title"]
taskDWAS = taskDWAS.loc[:, dwaLinkColumns].drop_duplicates()

# Attach DWA labels to every rated task.
jobTasksDWAs = jobTasksIn.merge(
    taskDWAS, how="inner", on=["Task ID", "O*NET-SOC Code"]
)

# Average every remaining column over the tasks that share a job and a DWA.
# NOTE(review): .mean() is applied to all non-key columns; if any of them is
# non-numeric, recent pandas versions may raise instead of dropping it — confirm.
nonTaskColumns = [
    column for column in jobTasksDWAs.columns if column not in ("Task", "Task ID")
]
jobDWAs = (
    jobTasksDWAs[nonTaskColumns]
    .groupby(["O*NET-SOC Code", "DWA Title", "DWA ID", "Title"])
    .mean()
    .reset_index()
)

# Working table: identifiers plus the two rating columns used downstream.
jobTaskColumns = [
    "O*NET-SOC Code",
    "Title",
    "DWA ID",
    "DWA Title",
    "mean_rating_human_alpha",
    "gpt4_rubric1_alpha",
]
jobTasks = jobDWAs.loc[:, jobTaskColumns].copy()
temp = jobTasks.copy()


def getTemp():
    """Return a fresh copy of the module-level ``jobTasks`` table."""
    return jobTasks.copy()

In [None]:
## setting parameters for the network plot. this is important!!
# this is where we set the rating we're trying to plot!
rating_desired = "mean_rating_human_alpha"

# Distinct job titles and DWA titles, in order of first appearance in temp;
# their positions define the row/column indices of the job x DWA matrix.
jobTitles = temp["Title"].unique()
DWAs = temp["DWA Title"].unique()


def getJobTitle(code):
    """Look up the job title for an O*NET-SOC code in ``jobTasks``.

    Returns the title of the first matching row, or the string
    "Missing job code!" when no matching row has a title.
    """
    titles = jobTasks.loc[jobTasks["O*NET-SOC Code"] == code, "Title"]
    if titles.count() == 0:
        return "Missing job code!"
    return titles.iloc[0]


numJobs, numDWA = len(jobTitles), len(DWAs)
print("#Jobs: %d, #DWA: %d" % (numJobs, numDWA))
# Rows are jobs, columns are DWAs; filled in by the loop further down.
jobDwaMat = np.zeros((numJobs, numDWA))
# One rating slot per DWA.
SML = np.zeros(numDWA)


def findIndex(x, X):
    """Return the position of the first element of X equal to x.

    Raises:
        ValueError: if x does not occur in X (including when X is empty).
            Previously a miss returned the last index, which could not be
            told apart from a genuine match on the last element.
    """
    for i, item in enumerate(X):
        if item == x:
            return i
    raise ValueError("%r not found" % (x,))


# Fill the job x DWA incidence matrix and record a rating per DWA.
# Title -> position dictionaries replace a linear scan of jobTitles / DWAs for
# every row; the positions are the same because both arrays hold unique values.
jobIndexOf = {title: idx for idx, title in enumerate(jobTitles)}
dwaIndexOf = {title: idx for idx, title in enumerate(DWAs)}

count = -1
progress = 0.1
total = float(temp["Title"].count())
for count, (job, dwa, rating) in enumerate(
    zip(temp["Title"], temp["DWA Title"], temp[rating_desired])
):
    jobIndex = jobIndexOf[job]
    dwaIndex = dwaIndexOf[dwa]
    jobDwaMat[jobIndex, dwaIndex] = 1
    # A DWA that appears under several jobs keeps the rating of the last row seen.
    SML[dwaIndex] = rating
    if count / total >= progress:
        print("%0.2f" % progress)
        # Advance past every threshold already crossed so each prints once.
        while progress <= count / total:
            progress += 0.1

# RCA of each job/DWA pair, raw and thresholded at RCA > 1.
jobDwaRca = RCA(jobDwaMat)
jobDwaRca2 = RCA(jobDwaMat, binarized=True)

# Similarity between DWAs (columns) and between jobs (rows) of the binarized
# matrix; NaNs from all-zero rows are replaced by 0.
dwaDwa = np.nan_to_num(relateFeatures(jobDwaRca2.T, verbose=False))
jobJob = np.nan_to_num(relateFeatures(jobDwaRca2, verbose=False))

# One clustered heatmap per similarity matrix, each in its own figure.
for similarityMatrix in (dwaDwa, jobJob):
    plt.figure()
    clusterMat(similarityMatrix)

In [8]:
## network build code


def buildGraph(mat, nodeLabels):
    """Build an undirected weighted igraph graph from a square similarity matrix.

    One vertex is created per row of ``mat`` (attribute ``title`` taken from
    ``nodeLabels``). An edge i-j (i < j) is added wherever ``mat[i, j] > 0``,
    with that value as its ``weight``. Each vertex also receives:
      * ``louvain community`` - membership from ``community_multilevel``
      * ``x`` / ``y``          - Fruchterman-Reingold layout coordinates

    Returns the graph.
    """
    numNodes = mat.shape[0]
    G = ig.Graph(directed=False)
    for node in range(numNodes):
        G.add_vertex(node, title=nodeLabels[node])

    # Strict upper triangle in row-major order: the same (i, j) sequence a
    # nested "for i / for j > i" loop would visit.
    rows, cols = np.triu_indices(numNodes, k=1)
    candidateWeights = mat[rows, cols]
    positive = candidateWeights > 0
    G.add_edges(list(zip(rows[positive].tolist(), cols[positive].tolist())))
    G.es["weight"] = list(candidateWeights[positive])

    membership = G.community_multilevel(weights="weight").membership
    G.vs["louvain community"] = np.array(membership)

    coords = np.array(G.layout_fruchterman_reingold(weights="weight", niter=1000).coords)
    G.vs["x"] = coords[:, 0]
    G.vs["y"] = coords[:, 1]
    return G


def plotNetwork(
    G,
    layout=None,
    nodeColors=None,
    linewidth=1,
    nodeSize=10,
    lineColor=[0, 0, 0, 0.3],
    nodeAlpha=1,
    cmap=plt.cm.tab10,
    fileName=None,
):
    """Draw graph G on the current matplotlib axes.

    Args:
        G: igraph graph; ``G.vs`` and ``G.es`` are read.
        layout: (numNodes, 2) array of node coordinates. When None, a
            Fruchterman-Reingold layout is computed from G.
        nodeColors: per-node colours or scalar values passed to ``plt.scatter``
            as ``c``. Defaults to black for every node.
        linewidth: edge line width.
        nodeSize: marker size(s) passed to ``plt.scatter`` as ``s``.
        lineColor: edge colour (RGBA). The default list is never mutated here.
        nodeAlpha: marker opacity.
        cmap: colormap handed to ``plt.scatter``.
        fileName: if given, the figure is saved there with a tight bounding box.
    """
    numNodes = len(G.vs)
    if nodeColors is None:
        nodeColors = [[0, 0, 0] for _ in range(numNodes)]
    if layout is None:
        layout = np.array(G.layout_fruchterman_reingold().coords)
    # One straight segment per edge, from source position to target position.
    lines = []
    for e in G.es:
        p1, p2 = layout[e.source, :], layout[e.target, :]
        lines.append([p1, p2])
    lines = np.array(lines)
    lc = LineCollection(lines, linewidths=linewidth, colors=lineColor)
    plt.gca().add_collection(lc)
    plt.scatter(
        layout[:, 0],
        layout[:, 1],
        s=nodeSize,
        c=nodeColors,
        zorder=3,  # keep nodes above the edge lines
        cmap=cmap,
        alpha=nodeAlpha,
    )
    plt.xticks([])
    plt.yticks([])
    # Must be the boolean False: the string "off" is truthy and would request
    # a visible grid rather than hide it.
    plt.grid(False)
    plt.axis("off")
    plt.tight_layout()
    if fileName is not None:
        plt.savefig(fileName, bbox_inches="tight")


def wrapNetPlot(G, filename=None, C=None):
    """Plot G in a new 8x8 figure with nodes sized by degree, then print summary stats.

    Args:
        G: igraph graph whose vertices carry ``x``, ``y`` and
            ``louvain community`` attributes and whose edges carry ``weight``.
        filename: optional path; forwarded to ``plotNetwork`` for saving.
        C: optional per-node colours. When None, nodes are coloured by their
            ``louvain community`` using the tab20 palette (wrapping around
            if there are more communities than palette entries).

    Prints the weighted modularity of the ``louvain community`` partition and
    the number of communities (these are reported even when C is supplied).
    """
    plt.figure(figsize=(8, 8))
    degrees = np.array(G.degree())
    d, D = np.min(degrees), np.max(degrees)
    layout = np.vstack((G.vs["x"], G.vs["y"])).T
    if C is None:
        C = np.array(G.vs["louvain community"])
        cc = plt.cm.tab20.colors
        C = [cc[c % len(cc)] for c in C]
    # Scale marker area linearly from 10 (min degree) to 35 (max degree).
    # If every node has the same degree the range is 0; use the base size
    # instead of dividing by zero, which would give NaN sizes.
    degreeRange = D - d
    if degreeRange > 0:
        nodeSize = 25 * (degrees - d) / degreeRange + 10
    else:
        nodeSize = np.full(len(degrees), 10.0)
    plotNetwork(
        G,
        layout,
        nodeColors=C,
        nodeSize=nodeSize,
        lineColor=[0.3, 0.3, 0.3, 0.1],
        linewidth=1,
        nodeAlpha=1,
        fileName=filename,
    )
    print(
        "network modularity: %0.3f, # node communities: %d"
        % (
            G.modularity(G.vs["louvain community"], weights="weight"),
            np.max(G.vs["louvain community"]) + 1,
        )
    )

In [None]:
# Build the job network from the job-job similarity matrix; buildGraph also
# stores the community membership and layout coordinates on the vertices.
jobNetwork = buildGraph(jobJob, jobTitles)

## plots the network of jobs (not colored by exposure)
# Nodes are coloured by community; the figure is also saved as a PDF under curdir.
wrapNetPlot(jobNetwork, f"{curdir}/gpt_jobNetwork.pdf")

In [None]:
## outputs the clusters
# Prints every community with its colour name and members, and writes one
# tab-separated line per job (the file keeps its .csv name).
# The colour index wraps around the name list, mirroring how the plot wraps
# around the palette, so more communities than names no longer raises IndexError.
# The with-block guarantees the file is closed even if an error occurs mid-loop.
with open(f"{curdir}/jobNetworkCommunities.csv", "w") as Fout:
    Fout.write("Community\tColor\tJob Title\n")
    for i in range(np.max(jobNetwork.vs["louvain community"]) + 1):
        colorName = tab20_colorNames[i % len(tab20_colorNames)]
        print((i, colorName))
        V = [v["title"] for v in jobNetwork.vs if v["louvain community"] == i]
        print(V)
        print("--------")
        for v in V:
            Fout.write("%d\t%s\t%s\n" % (i, colorName, v))

In [11]:
# One row per job: its title and the community ("Type") it was assigned to.
jobType = pd.DataFrame(
    {
        "Title": jobNetwork.vs["title"],
        "Type": jobNetwork.vs["louvain community"],
    }
)

# Attach the community to every job/DWA row; "Title" is the only shared column.
temp = pd.merge(getTemp(), jobType, on="Title")

# Per-community summary of the chosen rating.
# The aggregations are named by string rather than passed as numpy callables:
# pandas currently substitutes its own mean/median/std for np.mean/np.median/
# np.std (with a deprecation warning), so the strings pin today's results.
# The lambda stays a callable; with several functions in the list pandas
# labels it "<lambda_0>". It is a standard error built from np.std
# (ddof=0), whereas the "std" column is pandas' sample std (ddof=1).
T = (
    temp.groupby("Type")
    .agg(
        {
            rating_desired: [
                "mean",
                "median",
                "std",
                lambda x: np.std(x) / np.sqrt(len(x)),
            ]
        }
    )
    .reset_index()
)
# Flatten the two-level column index, e.g. (rating, "mean") -> "<rating>mean".
T.columns = T.columns.map("".join)
T = T.sort_values(by=f"{rating_desired}mean")

# map for cluster names
# NOTE(review): these labels are tied to specific community ids; ids not
# listed here map to NaN. Presumably chosen by inspecting one particular
# clustering run — re-check them whenever the communities change.
occtypes = {
    0: "Managers",
    1: "Clerks and Services",
    2: "Technologists",
    3: "Architects and Engineers",
    4: "Scientists and Researchers",
    5: "Medical Workers",
    6: "Legal Services",
    7: "Teachers",
    8: "Arts, Media, and Entertainment",
    9: "Operators",
    10: "Machinists",
}

# save clusters
temp["jobGroup"] = temp["Type"].map(occtypes)
T["jobArchetypeName"] = T.Type.map(occtypes)
temp.to_csv(f"{curdir}data/jobNetworkMembershipWithTypes.csv", index=False)

In [None]:
## cluster plot with means
# One point per job group (rows of T, already sorted by mean rating) with a
# horizontal 95% interval of 1.96 x that group's own standard error.
plt.figure()
tab20 = plt.cm.tab20.colors
# len(T) keeps the plotted rows in step with the y tick positions below.
for i in range(len(T)):
    plt.errorbar(
        T[f"{rating_desired}mean"].iloc[i],
        i,
        # Row i's standard error (previously row 0's value was reused for all).
        xerr=1.96 * T[f"{rating_desired}<lambda_0>"].iloc[i],
        fmt="o",
        markersize=4,
        # Wrap around the palette, as the network plot does.
        color=tab20[T["Type"].iloc[i] % len(tab20)],
    )

plt.yticks(np.arange(T["Type"].count()), T["jobArchetypeName"], fontsize=15)
plt.ylabel("Job Grouping", fontsize=15)
plt.xticks(fontsize=15)
plt.xlabel("Avg. LLM Exposure (E1)\n by Job Group", fontsize=15)
plt.savefig(f"{curdir}/data/vecfigs/gptByJobType.pdf", bbox_inches="tight")
T

In [None]:
# Render the summary table T as LaTeX source; being the cell's last
# expression, the resulting string is shown as the cell output.
T.to_latex()

In [None]:
# List the names of the attributes currently stored on the job network's vertices.
jobNetwork.vs.attributes()

In [None]:
### now to look at detailed work activities (DWAs)
## build network
# Same pipeline as for jobs, applied to the DWA-DWA similarity matrix:
# build the graph, then plot it coloured by community and save it as a PDF.
dwaNetwork = buildGraph(dwaDwa, DWAs)
wrapNetPlot(dwaNetwork, f"{curdir}/dwaNetwork.pdf")

In [20]:
# One row per DWA: its title and the community it was assigned to.
dwaType = pd.DataFrame(
    {
        "DWA Title": dwaNetwork.vs["title"],
        "DWA Type": dwaNetwork.vs["louvain community"],
    }
)
# Merge on whatever columns the two frames share, so re-running this cell
# (when temp already carries "DWA Type") does not create suffixed duplicates.
temp = pd.merge(temp, dwaType)

# Mean rating per job community and per job title. "mean" is given as a string
# because passing np.mean to agg is deprecated in recent pandas.
T = temp.groupby("Type").agg({rating_desired: "mean"}).reset_index()
TT = temp.groupby("Title").agg({rating_desired: "mean"}).reset_index()

# Index the means once instead of filtering the frames for every vertex.
meanByType = T.set_index("Type")[rating_desired]
meanByTitle = TT.set_index("Title")[rating_desired]
for v in jobNetwork.vs:
    # Community-level mean, then the job's own mean, stored on the vertex.
    v[f"type {rating_desired}"] = meanByType.loc[v["louvain community"]]
    v[f"{rating_desired}"] = meanByTitle.loc[v["title"]]

In [None]:
## network plot colored by exposure
plt.figure(figsize=(10, 8))
layout = np.vstack((jobNetwork.vs["x"], jobNetwork.vs["y"])).T
plotNetwork(
    jobNetwork,
    layout,
    # Scalar per node (its community's mean rating), mapped through the colormap.
    nodeColors=jobNetwork.vs[f"type {rating_desired}"],
    nodeSize=10,
    lineColor=[0.3, 0.3, 0.3, 0.1],
    linewidth=1,
    cmap=plt.cm.viridis,
)
C = plt.colorbar()
C.ax.tick_params(labelsize=15)
# C.set_label(f"Job Type {rating_desired}",fontsize=15)
# NOTE(review): the label is fixed text; update it if rating_desired is
# switched away from the human rating.
C.set_label("Human Ratings for LLM-Exposure", fontsize=15)
plt.tight_layout()
# Saved under its own name: the previous "gpt_jobnetwork.pdf" differed from the
# community-coloured plot's "gpt_jobNetwork.pdf" only by letter case, so on a
# case-insensitive filesystem this figure overwrote that one.
plt.savefig(f"{curdir}/gpt_jobNetwork_exposure.pdf")

In [None]:
## write clusters to file
# Prints every DWA community with its colour name and members, and writes one
# tab-separated line per DWA (the file keeps its .csv name).
# The colour index wraps around the name list, mirroring how the plot wraps
# around the palette, so more communities than names no longer raises IndexError.
# The with-block guarantees the file is closed even if an error occurs mid-loop.
with open(f"{curdir}/dwaNetworkCommunities.csv", "w") as Fout:
    Fout.write("Community\tColor\tDWA Title\n")
    for i in range(np.max(dwaNetwork.vs["louvain community"]) + 1):
        colorName = tab20_colorNames[i % len(tab20_colorNames)]
        print((i, colorName))
        V = [v["title"] for v in dwaNetwork.vs if v["louvain community"] == i]
        print(V)
        print("--------")
        for v in V:
            Fout.write("%d\t%s\t%s\n" % (i, colorName, v))