# EDA for Data Incubator project proposal, fall 2020

* Draft 1: How does performance relate to compensation in publicly funded universities?

* Draft 2: How to best model enrollment at public universities?

<hr>

## The purpose of this Jupyter notebook is to accompany the project dashboard

### Here's [my project dashboard](http://barb-data-inc.herokuapp.com/), showing the most recently updated version of the dashboard/report.

Note: Here is a link to [Urban Institute's data explorer](https://educationdata.urban.org/data-explorer/colleges/) where I downloaded the data. The data explorer's sources include: College Scorecard and Integrated Postsecondary Education Data System.

Link to [my document](https://github.com/dagny099/does_good_payoff/blob/master/docs/getting-started.rst) showing selection criteria and variables.



In [1]:
# --------------------------------
# IMPORT MODULES
# --------------------------------
import pandas as pd
import numpy as np
from datetime import datetime as dt

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import chart_studio.plotly as py
import cufflinks as cf
import plotly

import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Create a variable, options, with a list of dicts w/ keys {'label': , 'value'}
def make_options(df):
    """Build dropdown options: one "all schools" entry plus one entry per state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `unitid` and `state_name`.

    Returns
    -------
    list of dict
        Each dict has the keys 'label' and 'value'. The first entry has an
        empty-string value and reports the number of distinct `unitid` values
        in the whole frame; the remaining entries follow the order in which
        states first appear in `df` and report the distinct `unitid` count
        for that state.
    """
    n_total = df.unitid.nunique()
    state_options = [{'label': 'All '+str(n_total)+" schools", 'value': ''}]

    # Count schools per state once, instead of re-running the groupby for every state.
    # observed=True restricts a categorical grouper to the states actually present.
    schools_per_state = df.groupby('state_name', observed=True)['unitid'].nunique()

    # Rows with a missing state cannot be labelled or looked up, so they are skipped here
    # (they still count towards the "All ..." total above).
    for state in df.state_name.dropna().unique():
        n_state = schools_per_state.loc[state]
        state_options.append({'label': state+" ("+str(n_state)+" schools)", 'value': state})
    return state_options


In [3]:
# Return Standard Error of Mean
def sem_btwn(x):
    """Standard error of the mean of `x`, rounded to 3 decimals.

    `x` must expose a no-argument `.count()` (e.g. a pandas Series, as passed
    by groupby aggregation); the result is np.std(x) divided by the square
    root of that count.

    NOTE(review): np.std is called without `ddof`, whose documented default is
    0 (population standard deviation) -- confirm that is intended rather than
    the sample (ddof=1) form.
    """
    n_obs = x.count()
    spread = np.std(x)
    standard_error = spread / np.sqrt(n_obs)
    return round(standard_error, 3)


In [4]:
# Create a summary of early yrs - late yrs:
def tbl_early_late(df, early_label='Avg 2001-02', late_label='Avg 2016-17', n_rows=2):
    """Compare the mean of the first rows of `df` with the mean of its last rows.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per period; rows are taken by position, so the frame is
        expected to already be ordered from earliest to latest.
    early_label, late_label : str
        Column headers for the two averages. The defaults reproduce the
        original hard-coded headers; pass other labels when `df` covers a
        different span of years.
    n_rows : int
        How many leading / trailing rows are averaged (default 2).

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of `df`, with three columns: the early average,
        the late average and 'DIFF' (early minus late), rounded to 3 decimals.

    Raises
    ------
    ValueError
        If `n_rows` is smaller than 1.
    """
    if n_rows < 1:
        raise ValueError("n_rows must be >= 1")

    # Compute each average once and reuse it for the DIFF column.
    early_avg = df.iloc[:n_rows].apply(np.mean)
    late_avg = df.iloc[-n_rows:].apply(np.mean)

    tbl = pd.DataFrame({early_label: early_avg,
                        late_label: late_avg,
                        'DIFF': early_avg - late_avg})
    return round(tbl, 3)


In [53]:
# Filter data for a balanced dataset w.r.t. measure-of-interest
def get_school_data(df, which_columns=['number_enrolled_total'], earliestYr=0, nYrs=0):
    """Keep only the schools that have a full run of data for the key measure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `unitid`, a datetime-typed `year` column and the column
        named by `which_columns[0]`.
    which_columns : list of str
        Only the first entry is used; it names the key measure whose non-null
        values are counted per school. (The default list is never mutated.)
    earliestYr : int
        Rows whose calendar year is earlier than this are dropped. The default
        0 keeps every row.
    nYrs : int
        Number of non-null values of the key measure a school must have to be
        kept. With the default 0 it is inferred as the most common per-school
        count within the (already year-filtered) data.

    Returns
    -------
    pandas.DataFrame
        The year-filtered rows belonging to the retained schools. A short
        summary of the selection is printed as a side effect.
    """
    key_measure = which_columns[0]

    # Filter years FIRST, so that the per-school counts (and the inferred nYrs)
    # describe the window that is actually returned.
    df = df[df.year.dt.year >= earliestYr]

    # Number of non-null values of the key measure per school.
    # observed=True keeps unused categories of a categorical `unitid` out of the counts.
    yrs_per_school = df.groupby('unitid', observed=True)[key_measure].count()

    # Use this to find the most common number of years for which data exists:
    if nYrs == 0 and not yrs_per_school.empty:
        nYrs = yrs_per_school.value_counts().index[0]

    # Make a list of schools w/ data in all years, only include those:
    unitids = yrs_per_school[yrs_per_school == nYrs].index.to_list()
    balanced = df[df['unitid'].isin(unitids)]

    print(f"These were the input parameters: {which_columns[0]}, {earliestYr}")
    print(f"Returning a data frame w: {balanced.unitid.nunique()} schools data in, from {balanced.year.min().year} thru {balanced.year.max().year} (that makes {nYrs} yrs of data for {which_columns[0]})")

    return balanced


In [28]:
# --------------------------------
# Acquire the Data:
# --------------------------------
# NOTE(review): absolute, machine-specific path -- the notebook only re-runs on a
# machine where this file exists; consider a path relative to the repository.
path2file="/Users/bhs/PYTHON-STUFF/DataIncubator/does_good_payoff/data/interim/analyzeMe_n175.csv"
df = pd.read_csv(path2file, na_values=np.nan,
                 dtype={'unitid':'category', 'inst_name': 'category', 'state_name': 'category',
                        'enrollement_rate': 'float64', 'female_pct': 'float64', 'married_pct': 'float64'})

# Set 'year' as datetime (one vectorized call instead of converting element by element)
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Keep these columns (subset from CSV) as potential features for model:
keepcols = ['admission_rate','enrollement_rate','number_applied','number_admitted','number_enrolled_total',
        'rev_tuition_fees_gross', 'rev_tuition_fees_net','rev_total_current','rev_fed_approps_grants','rev_state_local_approps_grants','rev_other',
       'exp_total_current','exp_instruc_total','exp_acad_supp_total','exp_student_serv_total','exp_res_pub_serv_total',
        'completers_150pct','completion_rate_150pct','female_pct','married_pct',
      'year','unitid','inst_name','state_name']

dropcols = [c for c in df.columns if c not in keepcols]

# Rebind instead of dropping in place, so re-running this cell starts from the freshly read CSV.
df = df.drop(columns=dropcols)


In [54]:
# ENROLLMENT SECTION
# Balanced subset for the enrollment-rate measure, from 2001 onwards.
df_enroll = get_school_data(df, ['enrollement_rate'], 2001)

# Keep the dropdown options in a variable: the plotting cells look their labels up
# in `state_options_enrollment`. The bare name on the last line still displays the list.
state_options_enrollment = make_options(df_enroll)
state_options_enrollment

These were the input parameters: enrollement_rate, 2001
Returning a data frame w: 165 schools data in, from 2001 thru 2017 (that makes 17 yrs of data for enrollement_rate)


[{'label': 'All 165 schools', 'value': ''},
 {'label': 'Arizona (3 schools)', 'value': 'Arizona'},
 {'label': 'California (27 schools)', 'value': 'California'},
 {'label': 'Florida (8 schools)', 'value': 'Florida'},
 {'label': 'Georgia (7 schools)', 'value': 'Georgia'},
 {'label': 'Indiana (8 schools)', 'value': 'Indiana'},
 {'label': 'Michigan (12 schools)', 'value': 'Michigan'},
 {'label': 'Minnesota (6 schools)', 'value': 'Minnesota'},
 {'label': 'New York (27 schools)', 'value': 'New York'},
 {'label': 'Ohio (7 schools)', 'value': 'Ohio'},
 {'label': 'Oregon (3 schools)', 'value': 'Oregon'},
 {'label': 'Pennsylvania (19 schools)', 'value': 'Pennsylvania'},
 {'label': 'Texas (16 schools)', 'value': 'Texas'},
 {'label': 'Virginia (11 schools)', 'value': 'Virginia'},
 {'label': 'Wisconsin (11 schools)', 'value': 'Wisconsin'}]

These were the input parameters: rev_total_current, 0
Returning a data frame w: 171 schools data in, from 1990 thru 2017 (that makes 28 yrs of data for rev_total_current)


[{'label': 'All 171 schools', 'value': ''},
 {'label': 'Arizona (3 schools)', 'value': 'Arizona'},
 {'label': 'California (26 schools)', 'value': 'California'},
 {'label': 'Florida (9 schools)', 'value': 'Florida'},
 {'label': 'Georgia (8 schools)', 'value': 'Georgia'},
 {'label': 'Indiana (8 schools)', 'value': 'Indiana'},
 {'label': 'Michigan (12 schools)', 'value': 'Michigan'},
 {'label': 'Minnesota (6 schools)', 'value': 'Minnesota'},
 {'label': 'New York (29 schools)', 'value': 'New York'},
 {'label': 'Ohio (7 schools)', 'value': 'Ohio'},
 {'label': 'Oregon (3 schools)', 'value': 'Oregon'},
 {'label': 'Pennsylvania (18 schools)', 'value': 'Pennsylvania'},
 {'label': 'Texas (18 schools)', 'value': 'Texas'},
 {'label': 'Virginia (13 schools)', 'value': 'Virginia'},
 {'label': 'Wisconsin (11 schools)', 'value': 'Wisconsin'}]

In [20]:
# NOTE(review): `dfFin1` is not assigned in any cell visible in this part of the notebook --
# confirm it is created elsewhere, otherwise this cell fails on a fresh kernel.
# Displays the whole frame; a slice such as .head() would keep the output small.
dfFin1

Unnamed: 0,year,unitid,inst_name,state_name,admission_rate,enrollement_rate,number_applied,number_admitted,number_enrolled_total,rev_tuition_fees_gross,...,rev_total_current,exp_instruc_total,exp_res_pub_serv_total,exp_acad_supp_total,exp_student_serv_total,exp_total_current,completers_150pct,completion_rate_150pct,female_pct,married_pct


In [None]:
# DEFINE PLOTTING COLORS & LABELS FOR CONSISTENTLY GRAPHING SERIES
# Keyed by DataFrame column name; each entry carries the trace color (hex) and the legend label.
# NOTE(review): the two rate labels describe the ratios as "# applications/# admissions" and
# "# admissions/# enrolled"; rates are usually the inverse of those -- confirm the wording.
seriez = {
    'number_applied': {'color': '#F44DDB', 'label': "Number of student applications"},
    'number_admitted': {'color': '#CF1214', 'label': "Number of students admitted"},
    'number_enrolled_total': {'color': '#0E3DEC', 'label': "Number enrolled"},
    'admission_rate': {'color': '#CF1214', 'label': "Admission rate (# applications/# admissions)"},
    # Closing parenthesis added: the label was previously left unbalanced.
    'enrollement_rate': {'color': '#0E3DEC', 'label': "Enrollment rate (# admissions/# enrolled)"},
}
        

In [None]:
# Background 1 Callback
# Notebook stand-in for the dashboard callback: `active_tab` picks which figure is built and
# `selected` is a state name used to cross-filter ('' means all schools).
# Expects `df`, `seriez`, `sem_btwn` and `state_options_enrollment` to already exist in the kernel.
active_tab='CumulativeAdmissions'
selected=''

# Set xlabel based on selection (useful for debugging)
xlab=[opt['label'] for opt in state_options_enrollment if opt['value']==selected]

# Cross-filter once, so each tab below needs a single groupby instead of an if/else pair.
df_sel = df if len(selected)==0 else df[df.state_name==selected]

# Render the tab content based on value of 'active_tab' (other input, 'selected' used to crossfilter)
if active_tab == "CumulativeAdmissions":
    which_columns = ['number_applied','number_admitted','number_enrolled_total']
    graph_title = 'Enrollment in College Fails to Keep Pace with Admissions'+": "+xlab[0]
    ylabel = 'Total Number of Students'
    markdown_comments = """Insert comments A, B, C"""
    # Yearly totals (and number of non-null schools) for each series.
    df_fig = df_sel.groupby(['year'])[which_columns].aggregate([('Sum','sum'), ('Nschools','count')])
    fig = go.Figure()
    for col in which_columns:
        fig.add_trace(
            go.Scatter(x=df_fig.index, y=df_fig[col]['Sum'],
                name = seriez[col]['label'], marker_color='rgba(152, 0, 0, .8)',
                line = dict(color = seriez[col]['color']), opacity = 0.8))

elif active_tab == "RateAdmissions":
    which_columns = ['admission_rate','enrollement_rate']
    graph_title = 'Admission and Enrollment Rates over time'+": "+xlab[0]
    ylabel = 'Rate'
    markdown_comments = """Insert comments D, E"""
    # Yearly mean, spread, count and standard error across schools for each rate.
    df_fig = df_sel.groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])
    fig = go.Figure()
    for col in which_columns:
        fig.add_trace(
            go.Scatter(x=df_fig.index, y=df_fig[col]['Avg'],
                name = seriez[col]['label'], marker_color='rgba(152, 0, 0, .8)',
                line = dict(color = seriez[col]['color']),
                error_y = dict(type='data', array=df_fig[col]['SEM'], visible=True),
                opacity = 0.8))

else:
    # Without this branch an unknown tab name only surfaces later as a NameError on `fig`.
    raise ValueError(f"Unknown active_tab: {active_tab!r}")

# Set options common to all traces with fig.update_traces
fig.update_traces(marker_line_width=2, marker_size=10)
fig.update_layout(title=graph_title,yaxis={'title': ylabel},xaxis={'title': xlab[0]},
                  yaxis_zeroline=True, xaxis_zeroline=True)
fig.show()


In [None]:
# Trend cell: fits an OLS trendline to the yearly average admission rate and enrollment rate,
# overlays both on one figure, and collects the two fit summaries side by side in `df_tab`.
# Expects `df`, `seriez`, `sem_btwn` and `state_options_enrollment` to already exist in the kernel.
selected=''
xlab=[opt['label'] for opt in state_options_enrollment if opt['value']==selected]
which_columns = ['admission_rate','enrollement_rate']
graph_title = 'ALL Admission and Enrollment Rates over time'
ylabel = 'Rate'
markdown_comments = """Insert comments F, G, H"""
# Data for figure:
# Per-year aggregates of both rates; `selected` == '' uses every school, otherwise one state.
if len(selected)==0:
    df_fig = df.groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])
else:
    df_fig = df[df.state_name==selected].groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])

# Admission rate: scatter of the yearly average plus an OLS trendline.
# NOTE(review): the key here is the one-element tuple ('admission_rate',), whereas the enrollment
# plot below uses the plain string -- presumably equivalent on these columns; confirm.
fig = px.scatter(df_fig, x=df_fig.index, y=df_fig['admission_rate',]['Avg'], trendline="ols")
fig.update_layout(title= dict(text=graph_title), font=dict(size=16))
# At this point `fig` only holds the admission-rate traces, so only they get this color.
fig.update_traces(marker=dict(color=seriez['admission_rate']['color']), line=dict(color=seriez['admission_rate']['color'], width=4, dash='dot'))

# Pull the fit results out of the figure, render the summary as HTML and read the first
# HTML table back into a DataFrame.
res_tmp = px.get_trendline_results(fig)
res_tmp = res_tmp.px_fit_results.iloc[0].summary().as_html()
trend_AR = pd.read_html(res_tmp, header=0, index_col=0)[0]
# Drop the 'Date:' and 'Time:' rows of the parsed summary table.
trend_AR.drop(['Date:','Time:'], axis=0, inplace=True)
trend_AR.rename(columns={'y': 'admission_rate'},inplace=True)

# Enrollment rate: same steps on a second figure.
fig2 = px.scatter(df_fig, x=df_fig.index, y=df_fig['enrollement_rate']['Avg'], trendline="ols")
fig2.update_traces(marker=dict(color=seriez['enrollement_rate']['color']), line=dict(color=seriez['enrollement_rate']['color'], width=4, dash='dot'))
res_tmp = px.get_trendline_results(fig2)
res_tmp = res_tmp.px_fit_results.iloc[0].summary().as_html()
trend_ER = pd.read_html(res_tmp, header=0, index_col=0)[0]
# Unlike the admission table, this rename also maps 'R-squared:' to 'R-squared', and the new
# column name is spelled 'enrollment_rate' (the data column is 'enrollement_rate').
trend_ER.rename(columns={'y': 'enrollment_rate', 'R-squared:':'R-squared'},inplace=True)
#                          trend_ER.columns[2], trend_ER.columns[1]+trend_ER.columns[2]},inplace=True)
trend_ER.drop(['Date:','Time:'], axis=0, inplace=True)
# trend_ER.drop(['Date:','Time:'], axis=1, inplace=True)

# Copy the first two traces of fig2 onto fig
# (presumably the scatter markers and the trendline -- confirm).
fig.add_trace(fig2.data[0])
fig.add_trace(fig2.data[1])

# These updates now apply to every trace on `fig`, including the two just added.
fig.update_traces(marker_line_width=2, marker_size=10)
fig.update_layout(title=graph_title,yaxis={'title': ylabel},xaxis={'title': xlab[0]}, 
                  yaxis_zeroline=True, xaxis_zeroline=True)
# fig.show()
# The figure is not shown; the cell's displayed output is the combined summary table.
df_tab = pd.concat([trend_AR, trend_ER], axis=1)
df_tab