In [35]:
import pandas as pd
from matplotlib import pyplot as plt
from glob import glob
from scipy.stats import mannwhitneyu, ttest_ind, pearsonr, spearmanr
from tqdm import tqdm
import numpy as np 
import math
import seaborn as sns
from scipy.stats import ttest_ind


# Demographic group labels.
# NOTE(review): not referenced by any cell visible here — confirm it is still used.
demos = ['women', 'black', 'hispanic', 'native american']
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

### 1. Startup

In [36]:
def startup(start, end, data_dir='../../data/IPEDS'):
    """
    Load the IPEDS inputs used by the rest of the notebook.

    Parameters
    ----------
    start, end : int
        First and last year (both inclusive) of completions files to read.
    data_dir : str, optional
        Root folder holding ``university_info/hd2021.csv`` and
        ``completions/c<year>_a.csv``. Defaults to the notebook's original
        relative location.

    Returns
    -------
    unis : pandas.DataFrame
        University meta-data from ``hd2021.csv``, indexed by UNITID.
    completions : dict[int, pandas.DataFrame]
        One completions table per year, indexed by UNITID and restricted to
        rows with ``AWLEVEL == 5``.
        NOTE(review): presumably bachelor's degrees in the IPEDS codebook -- confirm.
    demo_cols : list[str]
        Count columns used downstream for the demographic shares; ``CTOTALT``
        is the one later used as the denominator.
    """

    # The meta-data file is read as cp1252 (not UTF-8), matching the original.
    unis = pd.read_csv('%s/university_info/hd2021.csv' % data_dir, encoding='cp1252')
    unis = unis.set_index('UNITID')

    completions = {}
    for year in range(start, end + 1):
        data = pd.read_csv('%s/completions/c%s_a.csv' % (data_dir, year))
        data = data.set_index('UNITID')
        # Keep a single award level so counts are comparable across universities.
        completions[year] = data[data['AWLEVEL'] == 5]

    demo_cols = ['CTOTALW', 'CAIANT', 'CBKAAT', 'CHISPT', 'CTOTALT']

    return unis, completions, demo_cols

# Load university meta-data and the 2011-2021 (inclusive) completions tables.
unis, completions, demo_cols = startup(2011, 2021)

### 2. CS & NonCS

In [37]:
def cs_noncs(data):
    """
    Split one year's completions into per-university CS and non-CS totals.

    A row counts as CS when ``11 <= CIPCODE < 12``; every other row is non-CS.
    All columns are summed per UNITID within each group, and only universities
    that appear in BOTH groups are kept so the two results line up row for row.

    Parameters
    ----------
    data : pandas.DataFrame
        Completions rows with a ``CIPCODE`` column and ``UNITID`` as the index
        (or a column) so that ``groupby('UNITID')`` resolves.

    Returns
    -------
    (cs_sum, noncs_sum) : tuple of pandas.DataFrame
        Summed rows for the shared universities, both indexed by UNITID in the
        same (ascending) order.
    """

    cip = data['CIPCODE']
    # Vectorised range test; NaN codes compare False and therefore land in non-CS,
    # exactly as with an element-wise Python comparison.
    is_cs = (cip >= 11) & (cip < 12)

    cs_sum = data[is_cs].groupby('UNITID').sum()
    noncs_sum = data[~is_cs].groupby('UNITID').sum()

    # Both indexes come out of groupby sorted and unique, so the intersection is
    # deterministic (no dependence on set iteration order).
    common_unis = cs_sum.index.intersection(noncs_sum.index)

    return cs_sum.loc[common_unis], noncs_sum.loc[common_unis]

def cs_noncs_allyears(completions):
    """
    Run ``cs_noncs`` on every year and stack the results.

    Parameters
    ----------
    completions : dict
        Maps a year to that year's completions table.

    Returns
    -------
    (all_cs, all_noncs) : tuple of pandas.DataFrame
        The per-year CS and non-CS tables concatenated in the dict's iteration
        order, each with an added ``year`` column identifying the source year.
    """

    cs_frames, noncs_frames = [], []

    for year, year_data in completions.items():
        cs_year, noncs_year = cs_noncs(year_data)

        # Tag each table with its year before stacking.
        cs_frames.append(cs_year.assign(year=year))
        noncs_frames.append(noncs_year.assign(year=year))

    return pd.concat(cs_frames), pd.concat(noncs_frames)

# Per-university CS and non-CS totals for every loaded year, stacked with a 'year' column.
cs, noncs = cs_noncs_allyears(completions)

### 3. Compute Demographic Percentages

In [38]:
def make_percents(cs, noncs, demo_cols):
    """
    Convert summed counts into shares of each row's ``CTOTALT``.

    Parameters
    ----------
    cs, noncs : pandas.DataFrame
        Count tables that contain a ``year`` column, a ``CTOTALT`` column and
        every column named in ``demo_cols``.
    demo_cols : list[str]
        Columns to divide by ``CTOTALT``.

    Returns
    -------
    (cs_pct, noncs_pct) : tuple of pandas.DataFrame
        One column per entry of ``demo_cols``; ``year`` is appended to the
        existing index as an extra level. Rows whose ``CTOTALT`` is 0 yield
        NaN (0/0) or inf rather than raising.
    """

    def _shares(counts):
        # Appending 'year' to the index keeps it out of the arithmetic while
        # still labelling every row.
        counts = counts.set_index(['year'], append=True)
        # Column-wise vectorised division; equivalent to dividing row by row
        # but without a Python-level loop over rows.
        return counts[demo_cols].div(counts['CTOTALT'], axis=0)

    return _shares(cs), _shares(noncs)

# Express each demographic count as a share of that university-year's CTOTALT.
cs_pct, noncs_pct = make_percents(cs, noncs, demo_cols)


### 4. Compute _"Trend"_ correlation measure

In [39]:
def keep_complete_series(pct, n_years):
    """
    Keep only universities that have a fully populated row for every year.

    Parameters
    ----------
    pct : pandas.DataFrame
        Share table indexed by UNITID, with ``year`` either as an index level
        or already as a column.
    n_years : int
        Number of rows (one per year) a university must have to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows of ``pct`` without any NaN, indexed by UNITID with ``year`` as a
        column, restricted to universities that have exactly ``n_years`` rows.
    """
    # Guarded so that re-running the cell on an already-processed table is a
    # no-op instead of failing on a second reset_index.
    if 'year' in pct.index.names:
        pct = pct.reset_index(level='year')

    # Rows with an undefined share (e.g. 0/0) do not count as a year of data.
    pct = pct.dropna()

    years_of_data = pct.groupby('UNITID')['year'].count()
    complete_unis = years_of_data[years_of_data == n_years].index
    return pct.loc[complete_unis]


# One completions table is loaded per year, so its length is the number of
# years a university needs for a complete series (11 for 2011-2021).
n_years = len(completions)

cs_pct    = keep_complete_series(cs_pct, n_years)
noncs_pct = keep_complete_series(noncs_pct, n_years)


In [40]:
def pearson_trends(pct, col, prefix):
    """
    Pearson correlation between ``year`` and ``pct[col]`` for each university.

    Returns a DataFrame indexed by UNITID with columns ``<prefix>_stat``
    (correlation coefficient) and ``<prefix>_pval`` (its p-value).
    """
    results = pct.groupby('UNITID').apply(lambda g: pearsonr(g['year'], g[col]))
    return results.apply(
        lambda r: pd.Series({'%s_stat' % prefix: r[0], '%s_pval' % prefix: r[1]})
    )


def assign_quadrant(trends):
    """
    Label each row by the signs of its CS and non-CS trend.

    1: both positive; 2: CS negative, non-CS positive; 3: both negative;
    4: everything else (CS positive with non-CS negative, and any row where a
    coefficient is exactly zero).
    """
    cs_up, cs_down = trends['cs_stat'] > 0, trends['cs_stat'] < 0
    noncs_up, noncs_down = trends['noncs_stat'] > 0, trends['noncs_stat'] < 0
    # np.select takes the first matching condition, mirroring an if/elif chain,
    # and also works when `trends` has no rows.
    return np.select(
        [cs_up & noncs_up, cs_down & noncs_up, cs_down & noncs_down],
        [1, 2, 3],
        default=4,
    )


# (output label, count column) pairs for which a trend file is written.
cols = [('black', 'CBKAAT'), ('women', 'CTOTALW'), ('hispanic', 'CHISPT')]

for label, col in cols:

    cs_trends    = pearson_trends(cs_pct, col, 'cs')
    noncs_trends = pearson_trends(noncs_pct, col, 'noncs')

    # Inner join on UNITID: only universities with both a CS and a non-CS trend.
    trends = pd.merge(cs_trends, noncs_trends, left_index=True, right_index=True)

    # Undefined correlations (NaN, e.g. from a constant series) are dropped
    # before the quadrant is assigned.
    trends = trends.dropna()
    trends['quadrant'] = assign_quadrant(trends)

    # The target directory must already exist; to_csv does not create it.
    trends.to_csv('../../out/trends_%s.csv' % label)

