In [1]:
import pandas as pd
import numpy as np

from scipy.stats import mannwhitneyu

from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Apply seaborn's 'darkgrid' theme as the baseline plotting style.
# NOTE(review): the rcParams update in the following cell sets its own axes/grid
# colours and spines, so part of this theme is presumably overridden — confirm
# this call is still needed.
sns.set_style('darkgrid')

In [2]:
# Shared palette for every figure in this notebook; the needs-vs-access plots
# draw "needs" bars with colors[1] and "access" bars with colors[2].
colors = [
    '#003DA6',
    '#B53A6D',
    '#7AC4A5',
    '#F17F4C',
    '#15AFD0',
]

# Font properties handed to the legend; axis text passes fontname/fontsize directly.
basefont = mpl.font_manager.FontProperties(
    family='DejaVu Sans',
    style='normal',
    size=14,
)
mpl.rcParams['axes.titlesize'] = 14

# Matplotlib rcParams overrides, grouped by the element they affect.
plot_style = {
    # Axes background, outline and title size
    'axes.facecolor': 'w',
    'axes.edgecolor': 'black',
    'axes.titlesize': 14,
    # Only the bottom spine is drawn
    'axes.spines.bottom': True,
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
    # Horizontal grid lines only
    'axes.grid': True,
    'axes.grid.axis': 'y',
    'grid.color': '#aaaaaa',
    'grid.linestyle': '-',
    'grid.linewidth': '0.8',
    # Legend and error-bar appearance
    'legend.framealpha': 1.0,
    'errorbar.capsize': 8,
}

# Apply the overrides globally so every later figure picks them up.
plt.rcParams.update(plot_style)

In [3]:
# Load the cleaned survey data used by every later cell. The path is relative to
# the notebook's working directory. Later cells read the Sector, NeedChanges and
# AccessChanges columns plus one indicator column per subfield.
# NOTE(review): presumably one row per survey respondent — confirm.
df = pd.read_csv('../../Private/data_cleaned.csv')

# Replicate Figure 8 for each AI subfield

In [13]:
# Spot check: among academic respondents flagged for the CV subfield, what share
# gave the NeedChanges response coded 1 (missing answers excluded)?
acad_needs = df.loc[(df.Sector == 'Academia') & (df.CV == 1), 'NeedChanges'].dropna()

int(acad_needs.eq(1).sum()) / len(acad_needs)

0.018867924528301886

In [33]:
# Perform the needs-vs-access statistical testing for one subfield and produce its figure
def needs_v_access(subfield):
    """Test and plot reported changes in compute needs vs. access for one subfield.

    Rows of the module-level ``df`` whose ``subfield`` column equals 1 are split
    by ``Sector`` ('Academia' / 'Industry'). For each sector the function:

    * runs a two-sided Mann-Whitney U test between the ``NeedChanges`` and
      ``AccessChanges`` responses (missing answers dropped per question);
    * computes ``delta``, the relative difference between the mean needs
      response and the mean access response;
    * prints both numbers; and
    * draws a grouped bar chart of the response distribution.

    The two-panel figure is saved to
    ``../figures/supplementary/Fig8_by_subfield/<subfield>.jpg`` and then closed.

    Parameters
    ----------
    subfield : str
        Name of an indicator column in ``df``; rows with value 1 are used.

    Returns
    -------
    None
        Results are reported through stdout and the saved image. If any
        sector/question group has no responses, a message is printed and no
        figure is produced.

    Notes
    -----
    Relies on the module-level names ``df``, ``colors`` and ``basefont``.
    NOTE(review): the bar heights count responses coded 0-4, one per label in
    ``labs`` — confirm that this matches the coding used in the cleaned data.
    """
    sectors = ['Academia', 'Industry']

    # Set the labels for different levels in the responses
    labs = ['Much less', 'Somewhat less', 'About the same', 'Somewhat more', 'Much more']

    # Use only the data corresponding to this subfield
    sf = df[df[subfield] == 1]

    # Extract the relevant data, dropping missing answers separately per question
    needs, access = {}, {}
    for sector in sectors:
        sector_rows = sf[sf.Sector == sector]
        needs[sector] = sector_rows.NeedChanges.dropna()
        access[sector] = sector_rows.AccessChanges.dropna()

    # Neither the test nor the proportions are defined for an empty group, so
    # report the problem explicitly instead of failing part-way through.
    empty_groups = [
        f"{sector} {question}"
        for sector in sectors
        for question, responses in (('needs', needs[sector]), ('access', access[sector]))
        if len(responses) == 0
    ]
    if empty_groups:
        print(f"Skipping {subfield}: no responses for {', '.join(empty_groups)}")
        return

    # Perform the stats tests
    p_values, deltas = {}, {}
    for sector in sectors:
        # State the alternative explicitly: the default differs between SciPy
        # releases, which would otherwise change the reported p-values.
        p_values[sector] = mannwhitneyu(
            needs[sector], access[sector], alternative='two-sided'
        ).pvalue

        # What % higher are reported needs than reported access?
        # Compare the two group means directly. Subtracting the Series would
        # align them on the row index and silently restrict the numerator to
        # respondents who answered both questions, while the denominator and
        # the test above use everyone who answered each question.
        mean_access = access[sector].mean()
        deltas[sector] = (needs[sector].mean() - mean_access) / mean_access

    # Print the results
    print(f"—————{subfield.upper()}—————")
    for sector in sectors:
        print(
            f"For {sector.lower()}: p = {round(p_values[sector], 4)}; "
            f"delta = {round(deltas[sector] * 100, 2)}%"
        )

    # Create the figure: academia on the left, industry on the right
    fig, axes = plt.subplots(ncols=2, sharey=True, figsize=(16, 6))
    positions = np.arange(len(labs))

    for ax, sector in zip(axes, sectors):
        # Side-by-side bars: compute NEEDS in colors[1], compute ACCESS in colors[2]
        for offset, responses, color, label in (
            (-0.2, needs[sector], colors[1], 'Reported Change in\nCompute Needs'),
            (0.2, access[sector], colors[2], 'Reported Change in\nCompute Access'),
        ):
            shares = [(responses == level).mean() for level in range(len(labs))]
            ax.bar(positions + offset, shares, width=0.4, color=color, label=label)

        # Add significance results if appropriate, using the strictest threshold met
        p = p_values[sector]
        if p < 0.001:
            significance = 'p < 0.001'
        elif p < 0.01:
            significance = 'p < 0.01'
        elif p < 0.05:
            significance = 'p < 0.05'
        else:
            significance = None
        if significance is not None:
            ax.annotate(significance, xy=(0.1, 0.35), va='center', fontsize=14, fontname='DejaVu Sans')

        # Stylize the axes
        ax.set_xticks(positions)
        ax.set_xticklabels(labs, rotation=30, ha='right', fontsize=12, fontname='DejaVu Sans')
        ax.set_yticks([0.1, 0.2, 0.3, 0.4])
        ax.set_yticklabels(['0.1', '0.2', '0.3', '0.4'], fontsize=12, fontname='DejaVu Sans')
        ax.set_title(sector, fontsize=16, fontname='DejaVu Sans')

    # Shared y-label on the left panel, legend to the right of the right-hand panel,
    # then adjust spacing and add an overall title
    axes[0].set_ylabel('Proportion of Respondents', fontsize=14, fontname='DejaVu Sans')
    axes[1].legend(loc='center left', bbox_to_anchor=(1, 0.5), prop=basefont)
    fig.subplots_adjust(wspace=0.1)
    fig.suptitle(f'Needs vs. Access {subfield.upper()}', fontsize=16, fontname='DejaVu Sans', y=1.05, x=0.55)

    # Save and close this specific figure so figures do not accumulate when the
    # function is called in a loop.
    fig.savefig(f'../figures/supplementary/Fig8_by_subfield/{subfield}.jpg', dpi=300, bbox_inches='tight')
    plt.close(fig)


In [34]:
# For each subfield with at least 50 respondents, produce a graphic and summary statistics
# NOTE(review): columns 17-71 are presumed to be the per-subfield indicator columns —
# confirm against the layout of the cleaned CSV.
subfields = df.columns[17:72]
for subfield in subfields:
    # Count members with the same `== 1` test that needs_v_access uses to select
    # rows, so a missing value in the column cannot turn the total into NaN and
    # silently skip the subfield.
    if (df[subfield] == 1).sum() >= 50:
        needs_v_access(subfield)

—————CV—————
For academia: p = 0.0; delta = 19.38%
For industry: p = 0.7295; delta = -3.11%
—————ROBOTICS—————
For academia: p = 0.1244; delta = 7.64%
For industry: p = 0.5574; delta = -4.55%
—————NLP—————
For academia: p = 0.0; delta = 19.29%
For industry: p = 0.4092; delta = 5.65%
—————RL—————
For academia: p = 0.0189; delta = 10.18%
For industry: p = 0.343; delta = -7.02%
—————OTHER—————
For academia: p = 0.0006; delta = 14.56%
For industry: p = 0.4832; delta = -5.48%
—————IMAGERECOGNITION—————
For academia: p = 0.0001; delta = 21.38%
For industry: p = 0.6538; delta = -6.52%
—————IMAGESEGMENTATION—————
For academia: p = 0.0081; delta = 18.69%
For industry: p = 0.6827; delta = 2.7%
—————OBJECTTRACKINGANDORREIDENTIFICATION—————
For academia: p = 0.0244; delta = 16.67%
For industry: p = 0.7572; delta = -3.85%
—————INFORMATIONEXTRACTION—————
For academia: p = 0.003; delta = 26.67%
For industry: p = 0.5109; delta = 6.98%
—————LANGUAGEMODELING—————
For academia: p = 0.0002; delta = 23.58%