# Guiding questions

How does graduate student gender diversity at Columbia compare to its peer institutions?

In [34]:
from collections import defaultdict
import math
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [35]:
# Load the pre-aggregated headcounts. The rest of this file indexes the result
# as TOTALS[year][institution][field].
# NOTE: pickle can execute arbitrary code while loading; only point this at a
# trusted, locally generated data file.
with open('../../../data/Detailed_Sex_RaceEthnicity_Graduates.p', 'rb') as data_file:
    TOTALS = pickle.load(data_file)

# Institutions and fields are taken from the most recent year in the data set.
years = sorted(TOTALS)
institutions = sorted(TOTALS[years[-1]])
fields = sorted(TOTALS[years[-1]]['Columbia University in the City of New York'])

print(f'Data set describes years {min(years)} - {max(years)} for the following institutions: {institutions}.')

Data set describes years 1994 - 2016 for the following institutions: ['Brown University', 'Columbia University in the City of New York', 'Cornell University', 'Dartmouth College', 'Duke University', 'Harvard University', 'Massachusetts Institute of Technology', 'Princeton University', 'Stanford University', 'University of Pennsylvania', 'Yale University'].


In [38]:
# Convenience function for making a grid of subplots (by default, one per institution)
def subplots(num=len(institutions), cols=2, figsize=8):
    """Create a figure whose subplot grid is large enough to hold ``num`` axes.

    Args:
        num: Number of axes needed. The default is the number of institutions,
            evaluated once when this function is defined.
        cols: Number of columns in the grid.
        figsize: Size allotted to each subplot along both dimensions; the
            figure is ``figsize * cols`` wide and ``figsize * rows`` tall.

    Returns:
        Whatever ``plt.subplots`` returns for that grid (figure and axes).
    """
    # Round up so a partially filled last row is still created.
    n_rows = math.ceil(num / cols)
    width, height = figsize * cols, figsize * n_rows
    return plt.subplots(n_rows, cols, figsize=(width, height))

# Returns female proportions for an institution in a field
def get_proportions(inst, field):
    """Return the female share of headcount for an institution and field, per year.

    Args:
        inst: Institution name, used as a key into ``TOTALS[year]``.
        field: Field name, looked up in that institution's entry.

    Returns:
        A list with one entry per element of ``years``: the female count
        divided by the field's total, or ``None`` when the field is absent
        for that year or its total headcount is zero.

    Raises:
        KeyError: If ``inst`` is missing for a year, or a present field has
            no 'Total for selected values' entry.
    """
    total_key = 'Total for selected values'
    proportions = []
    for year in years:
        field_stats = TOTALS[year][inst].get(field)
        if field_stats is None:
            proportions.append(None)
            continue

        # The female entry may be missing (treated as 0), a plain number, or a
        # nested breakdown whose overall figure sits under the total key.
        fem_count = field_stats.get('Female', 0)
        if isinstance(fem_count, dict):
            fem_count = fem_count[total_key]

        total = field_stats[total_key][total_key]
        # A zero headcount has no meaningful proportion; report it as a gap
        # instead of raising ZeroDivisionError.
        proportions.append(fem_count / total if total else None)
    return proportions

# Plots female proportions for every institution for a field, highlighting one, in a line graph
def plot_inst(ax, field, highlight='Columbia University in the City of New York'):
    """Plot each institution's female proportion over time for one field.

    Every institution in ``institutions`` gets a line; ``highlight`` is
    plotted last with a thick black line.

    Args:
        ax: Matplotlib axes to draw on.
        field: Field name passed through to ``get_proportions``.
        highlight: Institution to emphasise. Note that the title text always
            names Columbia, whatever this is set to.
    """
    # Local import keeps this function self-contained; matplotlib is already
    # a dependency of this file.
    from matplotlib.ticker import PercentFormatter

    for inst in institutions:
        if inst != highlight:
            ax.plot(years, get_proportions(inst, field), label=inst)
    # Plotted after all peers, with distinct styling.
    ax.plot(years, get_proportions(highlight, field), label=highlight, linewidth=3, color='black')

    ax.set_ylabel('Percent female')
    ax.legend()
    ax.set_ylim(top=1, bottom=0)
    ax.set_title(f'{field}: Comparing Columbia\'s peers')

    # Show y ticks as whole percentages. A formatter (rather than fixed label
    # strings) keeps the labels correct if the tick positions change later.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))

# Returns female and male headcounts for a field in an institution
def _sex_total(stats, sex):
    """Return the headcount stored under ``sex`` in a field's stats, or 0 if absent."""
    count = stats.get(sex, 0)
    # The entry is either a plain number or a nested breakdown whose overall
    # figure sits under 'Total for selected values'.
    if isinstance(count, dict):
        count = count['Total for selected values']
    return count


def get_counts(field, inst):
    """Return per-year female and male headcounts for a field at an institution.

    Args:
        field: Field name, looked up in the institution's entry for each year.
        inst: Institution name, used as a key into ``TOTALS[year]``.

    Returns:
        A ``(females, males)`` pair of lists, each with one entry per element
        of ``years``. Both entries are ``None`` for a year in which the field
        is absent; a missing sex entry within a present field counts as 0,
        matching how ``get_proportions`` treats a missing 'Female' entry.

    Raises:
        KeyError: If ``inst`` is missing for a year.
    """
    females, males = [], []
    for year in years:
        stats = TOTALS[year][inst].get(field)
        if stats is None:
            females.append(None)
            males.append(None)
            continue
        females.append(_sex_total(stats, 'Female'))
        males.append(_sex_total(stats, 'Male'))
    return females, males

# Plots a female and male headcounts line graph for a field in an institution
def plot_counts(ax, field, inst='Columbia University in the City of New York'):
    """Draw female and male headcounts over time as two labelled lines.

    Args:
        ax: Matplotlib axes to draw on.
        field: Field name passed through to ``get_counts``.
        inst: Institution whose counts are plotted. The title text always
            names Columbia.
    """
    # get_counts returns (females, males); pair each series with its label.
    for sex_label, counts in zip(('Female', 'Male'), get_counts(field, inst)):
        ax.plot(years, counts, label=sex_label)
    ax.legend()
    ax.set_title(f'{field}: Columbia\'s headcounts by sex')
    ax.set_ylabel('Number of people')

# Plots a female vs. male connected scatter plot
def plot_connected(ax, field, inst='Columbia University in the City of New York'):
    """Draw a connected scatter plot of male vs. female headcounts over time.

    Each year is one point (x = women, y = men), coloured by its position in
    ``years`` and joined by a light line. The first and last years are
    labelled, both axes share the same range starting at 0, and a dashed
    diagonal marks equal numbers of men and women.

    Args:
        ax: Matplotlib axes to draw on.
        field: Field name passed through to ``get_counts``.
        inst: Institution whose counts are plotted. The title text always
            names Columbia.
    """
    females, males = get_counts(field, inst)
    ax.plot(females, males, color='lightgray')
    ax.scatter(females, males, c=list(range(len(males))), cmap='viridis')

    # Label the first and last year, skipping an endpoint that has no data.
    for i in (0, -1):
        if females[i] is not None and males[i] is not None:
            ax.annotate(years[i], (females[i], males[i]))

    # Use one shared upper limit so the plot is square in data units.
    _, ymax = ax.get_ylim()
    _, xmax = ax.get_xlim()
    top = max(ymax, xmax)

    ax.set_ylim(bottom=0, top=top)
    ax.set_xlim(left=0, right=top)

    # Parity line spanning the full visible range (not a fixed 0-1000), so it
    # is not cut short when headcounts exceed 1000.
    ax.plot([0, top], [0, top], color='gray', linestyle='--')

    ax.set_title(f'{field}: Columbia, men vs. women')
    ax.set_xlabel('Number of women')
    ax.set_ylabel('Number of men')

# Plots three specified plots for a field in a row of a subplots array
def plot_triplet(axArr, index, field):
    """Fill one row of a subplot grid with the three plots for a field.

    Args:
        axArr: Two-dimensional array of axes, indexed as ``axArr[row, col]``,
            with at least three columns.
        index: Row of ``axArr`` to draw into.
        field: Field name passed to each plotting function.
    """
    # Use the ``index`` argument for the row, not a same-named global.
    plot_inst(axArr[index, 0], field)
    plot_counts(axArr[index, 1], field)
    plot_connected(axArr[index, 2], field)

# Plots triplets for every field
def plot_all():
    """Draw one three-plot row per entry in ``fields`` on a single figure."""
    panels_per_field = 3
    # subplots() takes the total number of axes, hence fields x panels.
    fig, grid = subplots(panels_per_field * len(fields), cols=panels_per_field)
    for row, name in enumerate(fields):
        plot_triplet(grid, row, name)
    fig.tight_layout()

In [None]:
# Fields to include in the saved comparison figure.
selected_fields = [
    'Civil engineering',
    'Computer sciences',
    'Electrical engineering',
    'Engineering science, mechanics, and physics',
    'Mechanical engineering',
    'Physics',
]

# One row of three plots per selected field; subplots() takes the total
# number of axes, hence the factor of 3.
fig, axArr = subplots(len(selected_fields)*3, cols=3)
for i, field in enumerate(selected_fields):
    plot_triplet(axArr, i, field)
fig.tight_layout()

# Write the finished figure to the image directory.
fig.savefig('../../../img/graduates_sex_detailed_comparison_triplet.png')