# Individual and gender inequality in computer science: A career study of cohorts from 1970 to 2000

## Part 3: Explanation

In this notebook, we partially explain the patterns in computer science by the Matthew Effect. First, we infer the parameters of reproductive feedback for all cohorts and career ages. Second, we plot these parameters for average cohorts and average career ages.

---

### 1. Imports

Many of the custom functions we need are stored in a utilities file.

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from matplotlib.ticker import MultipleLocator
from utils import *

### 2. Directories

Create directories if they don't exist:

In [None]:
# Create the output directory for the Figure 5 panels. `os.makedirs` creates
# missing intermediate directories as well (here '../results'), and
# `exist_ok=True` turns the call into a no-op when the directory is already
# there, so the cell can be re-run and does not fail if the path appears
# between an existence check and the creation.
directory = '../results/fig5'
os.makedirs(directory, exist_ok=True)

### 3. Load data

Load files from the 'data' directory:

In [None]:
# Read the counts table from a compressed CSV file (path relative to the
# notebook's working directory). The fitting cell below selects the columns
# 'cohort', 'career_age', 'author', 'num_pub', 'num_cit', 'cum_num_pub' and
# 'cum_num_cit' from this frame.
counts = pd.read_csv('../data/counts.csv.gz')

### 4. Produce figures

#### 4.1. Figures 5A and F

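Measurement of the strength of reproductive feedback as the exponent `beta`, and of the threshold above which feedback fully unfolds as the lower cutoff `xmin`, for every cohort (1970–2000) and career age (2–15), separately for productivity (A) and impact (F). For each cohort and career age, an author's count at that career age is fitted against their cumulative count at the previous career age; authors with a zero in either quantity are excluded. Only the fit for the last cohort and career age (2000, 15) is saved as a figure: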

In [None]:
start_years = range(1970, 2001)
career_ages = range(2, 16)

# Result grids: one row per cohort, one column per career age. They start out
# as NaN so that a cell only holds a number once its fit has succeeded; the
# plotting cell aggregates these grids with np.nanmin/np.nanmax/np.nanmean.
grid_shape = (len(start_years), len(career_ages))
a_cum_pub_xmin = np.full(grid_shape, np.nan)
a_cum_pub_beta = np.full(grid_shape, np.nan)
a_cum_pub_beta_std = np.full(grid_shape, np.nan)
a_cum_cit_xmin = np.full(grid_shape, np.nan)
a_cum_cit_beta = np.full(grid_shape, np.nan)
a_cum_cit_beta_std = np.full(grid_shape, np.nan)

# One entry per failed fit: (cumulative column, cohort, career age, error).
# Failures leave NaN in the grids; inspect this list to see why a cell is empty.
fit_failures = []

# The two analyses differ only in the columns they read, the labels they pass
# to the fit routine and the grids they fill. The citation analysis runs first,
# then the productivity analysis.
analyses = [
    {
        # citation analysis (Figure 5F)
        'cum_column': 'cum_num_cit',
        'column': 'num_cit',
        'stem': 'fig5f_pa_cum_cit',
        'xlabel': 'C(%.0f)',
        'ylabel': 'c(%.0f)',
        'letter': 'F',
        'color': 1,
        'grids': (a_cum_cit_xmin, a_cum_cit_beta, a_cum_cit_beta_std),
    },
    {
        # productivity analysis (Figure 5A)
        'cum_column': 'cum_num_pub',
        'column': 'num_pub',
        'stem': 'fig5a_pa_cum_pub',
        'xlabel': 'P(%.0f)',
        'ylabel': 'p(%.0f)',
        'letter': 'A',
        'color': 0,
        'grids': (a_cum_pub_xmin, a_cum_pub_beta, a_cum_pub_beta_std),
    },
]


def _fit_feedback(df_pa_cum, cum_column, column, **plot_kwargs):
    """Fit `column` against `cum_column` and return (xmin, beta, beta_std).

    Rows in which either of the two columns is zero are dropped before the
    fit. The fit is delegated to `fit_bivariate_dblp` (not defined in this
    notebook; presumably provided by `from utils import *`) with the fixed
    settings used throughout this notebook: OLS fit, binned reduction, 100
    bootstrap samples and an estimated lower cutoff.

    Parameters
    ----------
    df_pa_cum : pandas.DataFrame
        Merged frame containing at least `cum_column` and `column`.
    cum_column, column : str
        Names of the x (cumulative) and y columns.
    **plot_kwargs
        Passed through to `fit_bivariate_dblp` (pdf, xlabel, ylabel, title,
        letter, color).

    Returns
    -------
    tuple
        The first elements of `stats['xmin']`, `stats['beta']` and
        `stats['beta_std']` as returned by `fit_bivariate_dblp`.

    Raises
    ------
    Exception
        Whatever the column selection or `fit_bivariate_dblp` raises; the
        caller decides how to handle a failed fit.
    """
    df_pair = df_pa_cum[[cum_column, column]]
    # keep only rows where both values are non-zero
    df_pair = df_pair[(df_pair != 0).all(axis=1)]
    stats, _ = fit_bivariate_dblp(
        x=df_pair[cum_column],
        y=df_pair[column],
        fit='ols',
        reduction='bin',
        bootstrap=True,
        straps=100,
        estimate_lower_cutoff=True,
        marker=0,
        **plot_kwargs
    )
    return stats['xmin'][0], stats['beta'][0], stats['beta_std'][0]


last_i = len(start_years) - 1
last_j = len(career_ages) - 1

for i, start_year in enumerate(start_years):
    # The cohort filter does not depend on the career age: apply it once.
    counts_cohort = counts[counts['cohort'] == start_year]

    for j, career_age in enumerate(career_ages):
        # prepare data: cumulative counts at the previous career age joined
        # with the counts at the current career age, per author
        df_pa_cum = pd.merge(
            left=counts_cohort[counts_cohort['career_age'] == career_age-1][['author', 'cum_num_pub', 'cum_num_cit']],
            right=counts_cohort[counts_cohort['career_age'] == career_age][['author', 'num_pub', 'num_cit']],
            on='author'
        )

        # Only the fit for the last cohort and last career age is saved as a PDF.
        save_pdf = (i == last_i and j == last_j)

        for analysis in analyses:
            grids = analysis['grids']
            try:
                fitted = _fit_feedback(
                    df_pa_cum,
                    analysis['cum_column'],
                    analysis['column'],
                    pdf='../results/fig5/%s_%s_%s_alpha.pdf' % (analysis['stem'], start_year, career_age) if save_pdf else None,
                    xlabel=analysis['xlabel'] % (career_age-1),
                    ylabel=analysis['ylabel'] % career_age,
                    title='Cohort: %.0f' % start_year,
                    letter=analysis['letter'],
                    color=analysis['color']
                )
                for grid, value in zip(grids, fitted):
                    grid[i, j] = value
            except Exception as error:
                # Best effort: a failed fit leaves NaN in all three grids for
                # this cell. `Exception` (rather than a bare `except`) lets
                # KeyboardInterrupt/SystemExit stop this long-running loop.
                fit_failures.append((analysis['cum_column'], start_year, career_age, repr(error)))
                for grid in grids:
                    grid[i, j] = np.nan

#### 4.2. Figures 5B-E and G-J

Plots of the exponent `beta` (B, D, G, and I) and the lower cutoff `xmin` (C, E, H, and J) for an average cohort (B-C and G-H) and an average career age (D-E and I-J):

In [None]:
# Panel specifications as parallel lists, one entry per panel, in the order
# G, H, I, J (impact) followed by B, C, D, E (productivity). Within each group
# of four: exponent and lower cutoff across career ages, then across cohorts.
l_data = [a_cum_cit_beta, a_cum_cit_xmin] * 2 + [a_cum_pub_beta, a_cum_pub_xmin] * 2
l_across = (['career_ages'] * 2 + ['cohorts'] * 2) * 2
l_title = ['Impact'] * 4 + ['Productivity'] * 4
l_ylabel = ['Exponent', 'Lower Cutoff'] * 4
l_ylim = [(0.0584, 1.2041), (.2, 17.8)] * 4
l_letter = list('GHIJBCDE')
l_color = ['purple'] * 4 + ['green'] * 4
l_filename = [
    '../results/fig5/fig5g_pa_cum_cit_beta_career_ages',
    '../results/fig5/fig5h_pa_cum_cit_xmin_career_ages',
    '../results/fig5/fig5i_pa_cum_cit_beta_cohorts',
    '../results/fig5/fig5j_pa_cum_cit_xmin_cohorts',
    '../results/fig5/fig5b_pa_cum_pub_beta_career_ages',
    '../results/fig5/fig5c_pa_cum_pub_xmin_career_ages',
    '../results/fig5/fig5d_pa_cum_pub_beta_cohorts',
    '../results/fig5/fig5e_pa_cum_pub_xmin_cohorts',
]

# Shared styling for all panels.
linewidth = 2
fontsize = 18

panel_specs = zip(l_data, l_across, l_title, l_ylabel, l_ylim, l_letter, l_color, l_filename)
for i, (data, across, title, ylabel, ylim, letter, color, filename) in enumerate(panel_specs):
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111)

    if across == 'career_ages':
        # Collapse the cohort axis (rows): min-max band plus mean per career age.
        ages = range(2, 16)
        ax.fill_between(ages, np.nanmin(data, axis=0), np.nanmax(data, axis=0), linewidth=0, color=color, alpha=.25)
        ax.plot(ages, np.nanmean(data, axis=0), linewidth=linewidth, color=color)
        ax.set_xlabel('Career Age', fontsize=fontsize)
        ax.set_xlim([1.35, 15.65])
        ax.set_xticks([2, 5, 10, 15])
    if across == 'cohorts':
        # Collapse the career-age axis (columns): min-max band plus mean per cohort.
        # Cohorts are plotted by index 0..30 and relabelled with their start year.
        cohort_positions = range(0, 31)
        ax.fill_between(cohort_positions, np.nanmin(data, axis=1), np.nanmax(data, axis=1), linewidth=0, color=color, alpha=.25)
        ax.plot(cohort_positions, np.nanmean(data, axis=1), linewidth=linewidth, color=color)
        ax.set_xlabel('Cohort', fontsize=fontsize)
        ax.set_xticks([0, 10, 20, 30])
        ax.set_xticklabels([1970, 1980, 1990, 2000])
    # Fixed y-range so that panels of the same kind are directly comparable.
    ax.set_ylim(ylim)

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('both')
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.set_title(title, fontsize=fontsize)

    # Inward ticks on both axes; major ticks are twice as long as minor ticks.
    for axis_name in ('x', 'y'):
        for which, length_factor in (('major', 4), ('minor', 2)):
            ax.tick_params(axis=axis_name, which=which, direction='in', width=linewidth,
                           size=length_factor*linewidth, labelsize=fontsize)

    # Frame drawn with the same line width as the data.
    for side in ('left', 'right', 'bottom', 'top'):
        ax.spines[side].set_linewidth(linewidth)

    ax.xaxis.set_minor_locator(MultipleLocator(1))

    # Panel letter in the top-left corner of the figure.
    fig.text(0., 0.9, letter, fontsize=fontsize*2)
    fig.subplots_adjust(left=0.25, right=0.95, bottom=0.2, top=0.9)
    fig.savefig(filename+'.pdf')