# EDA for Data Incubator project proposal, fall 2020

*Draft 1: How does performance relate to compensation in publicly funded universities?

*Draft 2: How to best model enrollement at public universities?

<hr>

## Purpose of this Jupyter notebook is to work on visualizations for the project dashboard

### Here's [my project dashboard](http://barb-data-inc.herokuapp.com/) showing last updated version of dashboard/report.

Note: Here is a link to [Urban Institute's data explorer](https://educationdata.urban.org/data-explorer/colleges/) where I downloaded the data. The data explorer's sources include: College Scorecard and Integrated Postsecondary Education Data System.

Link to [my document](https://github.com/dagny099/does_good_payoff/blob/master/docs/getting-started.rst) showing selection criteria and variables.



In [2]:
# --------------------------------
# IMPORT MODULES & LOAD DATA
# --------------------------------
import pandas as pd
import numpy as np
from datetime import datetime as dt

# Import saved csv to dataframe:
path2project = "/Users/bhs/PYTHON-STUFF/DataIncubator/does_good_payoff"
# path2file=path2project+"/data/interim/analyzeMe_n175.csv"
path2file=path2project+"/data/interim/n501-analyzeMe.csv"
df = pd.read_csv(path2file, na_values=np.nan, parse_dates=['year'], low_memory=False,
                   dtype={'unitid':'category', 'inst_name': 'category', 'state_name': 'category',
                         'enrollement_rate': 'float64', 'female_pct': 'float64', 'married_pct': 'float64'})


In [None]:
# --------------------------------
# HELPER FUNCTIONS => Move to own py file
# --------------------------------

In [70]:
# Prepare list of dicts for STATE drop-down menu
def make_options(df):
    n=df.unitid.nunique()
    state_options=[{'label': 'All '+str(n)+" schools", 'value':''}]
    for state in df.state_name.unique():
        n=df.groupby(['state_name'])['unitid'].nunique().loc[state]
        state_options.append({'label': state+" ("+str(n)+" schools)", 'value': state })
    return state_options

# Standard error of mean distribution
def sem_btwn(x):
    return round(np.std(x)/np.sqrt(x.count()),3)

# Create a summary of early yrs - late yrs:
def tbl_early_late(df):
    tbl = pd.concat([pd.DataFrame(df[0:2].apply(np.mean),columns=['Avg 2001-02']),
            pd.DataFrame(df[-2:].apply(np.mean),columns=['Avg 2016-17'])],axis=1)
    return round(pd.concat([tbl, 
            pd.DataFrame(df[0:2].apply(np.mean) - df[-2:].apply(np.mean), columns=['DIFF'])],axis=1),3)


# Filter data for a balanced dataset w.r.t. measure-of-interest
def get_school_data(df, which_columns=['number_enrolled_total'], earliestYr=0, nYrs=0):
    # Use this to find the most common number of years for which data exists:
    if nYrs==0:
        nYrs = df.groupby(['unitid'])[which_columns[0]].count().value_counts().index[0]

    # Filter years, if desired
    df = df[df.year.dt.year>=earliestYr]

    # Make a temp df with number of years w/ data available for key measure
    tmpDf = df.groupby('unitid')[which_columns].count()

    # Make a list of schools w/ data in all years, only include those:
    unitids = tmpDf[tmpDf[which_columns[0]]==nYrs].index.to_list()
    filt = df.apply(lambda row: row['unitid'] in unitids, axis=1)

    # print(f"These were the input parameters: {which_columns[0]}, {earliestYr}")
    # print(f"Returning a data frame w: {df[filt].unitid.nunique()} schools data in, from {df[filt].year.min().year} thru {df[filt].year.max().year} (that makes {nYrs} yrs of data for {which_columns[0]})")
    
    # Return a dataframe with balanced data for measure of interest
    return df[filt]

In [None]:
# --------------------------------
# Prune Features, axis=1: (Drop some columns)
# --------------------------------

# Keep these columns (subset from CSV) as potential features for model:
keepcols = ['admission_rate','enrollement_rate','number_applied','number_admitted','number_enrolled_total',
        'rev_tuition_fees_gross', 'rev_tuition_fees_net','rev_total_current','rev_fed_approps_grants','rev_state_local_approps_grants','rev_other',
       'exp_total_current','exp_instruc_total','exp_acad_supp_total','exp_student_serv_total','exp_res_pub_serv_total',
        'completers_150pct','completion_rate_150pct','female_pct','married_pct',
      'year','unitid','inst_name','state_name']

dropcols = [c for c in df.columns if c not in keepcols]
df.drop(dropcols,axis=1,inplace=True)

In [156]:
# --------------------------------
# Prune Data, axis=0: (Check for availability of key measure in enough years to make a time series)
# --------------------------------

# DATA FOR ENROLLMENT SECTION
df_enroll  = get_school_data(df, ['enrollement_rate'], 2001)
state_options_enrollment = make_options(df_enroll)
df['unitid'] = df['unitid'].astype('object')  #for plotting later
df['inst_name'] = df['inst_name'].astype('object')   #for plotting later

# DATA FOR FINANCE SECTION
dfFin1 = get_school_data(df, ['rev_total_current'])
state_options_finance = make_options(dfFin1)

In [155]:
# Prep data for showing Expenses & Revenue breakdowns
dfFin1['unitid'] = dfFin1['unitid'].astype('object')  #for plotting later
dfFin1['inst_name'] = dfFin1['inst_name'].astype('object')   #for plotting later

dfFin1['exp_OTHER_STUFF'] = dfFin1['exp_total_current'] - \
    (dfFin1['exp_instruc_total']+dfFin1['exp_res_pub_serv_total']+dfFin1['exp_acad_supp_total']+dfFin1['exp_student_serv_total'])

dfFin1['rev_OTHER_STUFF'] = dfFin1['rev_total_current'] - \
    (dfFin1['rev_tuition_fees_net']+dfFin1['rev_fed_approps_grants']+dfFin1['rev_state_local_approps_grants']+dfFin1['rev_other'])

which_cols = ['exp_instruc_total', 'exp_res_pub_serv_total', 'exp_acad_supp_total', 'exp_student_serv_total', 'exp_OTHER_STUFF']
dfExp = pd.melt(dfFin1, id_vars =['year','unitid','inst_name','state_name'], value_vars= which_cols)

which_cols = ['rev_tuition_fees_net', 'rev_fed_approps_grants', 'rev_state_local_approps_grants', 'rev_other', 'rev_OTHER_STUFF']
dfRev = pd.melt(dfFin1, id_vars =['year','unitid','inst_name','state_name'], value_vars= which_cols)



# VISUALIZE DATA

## Get each graph looking right for inclusion in dashboard

In [5]:
#  Import stats modules
from sklearn.metrics import r2_score

# Import visualization modules
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import chart_studio.plotly as py
import cufflinks as cf
import plotly

import matplotlib.pyplot as plt
%matplotlib inline


In [4]:
seriez = {'number_applied': {'color': '#F44DDB', 'label': "Number of student applications", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'number_admitted': {'color': '#CF1214', 'label': "Number of students admitted", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'number_enrolled_total': {'color': '#0E3DEC', 'label': "Number enrolled", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'admission_rate': {'color': '#CF1214', 'label': "Admission rate (# applications/# admissions)", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'enrollement_rate': {'color': '#0E3DEC', 'label': "Enrollment rate (# admissions/# enrolled", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'rev_tuition_fees_gross': {'color': '#8E44AD', 'label': "Revenue: Tuition&Fees (gross)", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_tuition_fees_net': {'color': '#2E86C1 ', 'label': "Revenue: Tuition&Fees (net)", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_total_current': {'color': '#1E8449', 'label': "Revenue: Total", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_fed_approps_grants': {'color': '#F1948A', 'label': "Revenue: Fed grants & approp", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_state_local_approps_grants': {'color': '#C0392B', 'label': "Revenue: State grants & approp", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_other': {'color': '#00ACC1', 'label': "Revenue: Other", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_total_current': {'color': '#F4D03F', 'label': "Expenses: Total", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_instruc_total': {'color': '#FFA726', 'label': "Expenses: Instruction", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_acad_supp_total': {'color': '#B2EBF2', 'label': "Expenses: Acad Supp", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_student_serv_total': {'color': '##D7CCC8', 'label': "Expenses: Stud Serv", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_res_pub_serv_total': {'color': '#0E3DEC', 'label': "Expenses: Resch & Pub Serv", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'completers_150pct': {'color': '#0E3DEC', 'label': "Completers 150%", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'completion_rate_150pct': {'color': '#0E3DEC', 'label': "Completion Rate 150%", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'female_pct': {'color': '#0E3DEC', 'label': "% Female", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'married_pct': {'color': '#0E3DEC', 'label': "% Marries", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'state_name': {'color': '#0E3DEC', 'label': "State", 'timeDep': 'No', 'marker_color': 'rgba(250, 250, 250, .8)'},
        }


In [187]:
# --------------------------------
# CUMULATIVE ADMISSIONS
# --------------------------------
active_tab = "CumulativeAdmissions"
df = df_enroll
which_columns = ['number_applied','number_admitted','number_enrolled_total']
selected = ''

xlab=[opt['label'] for opt in state_options_enrollment if opt['value']==selected]
graph_title = 'Enrollment in College Fails to Keep Pace with Admissions'+": "+xlab[0]
ylabel = 'Total Number of Students'        

# Data for figure:
if len(selected)==0:
    df_fig = df.groupby(['year'])[which_columns].aggregate([('Sum','sum'), ('Nschools','count')])
else:
    df_fig = df[df.state_name==selected].groupby(['year'])[which_columns].aggregate([('Sum','sum'), ('Nschools','count')])
fig = go.Figure()
for col in which_columns:
    fig.add_trace(
        go.Scatter(x=df_fig.index, y=df_fig[col]['Sum'], 
            name = seriez[col]['label'], marker_color='rgba(152, 0, 0, .8)',
            line = dict(color = seriez[col]['color']), opacity = 0.8))
# Raw Data for table:
df_tab = df_fig.xs(key='Sum', axis=1, level=1)  
df_tab = tbl_early_late(df_tab).rename_axis('').reset_index()
# Set options common to all traces with fig.update_traces
fig.update_traces(mode='lines+markers', marker_line_width=2, marker_size=10)
fig.update_layout(title={'text': graph_title, 'font':dict(size=16), 'yref': 'paper', 'y': 1, 'yanchor': 'bottom', 'pad':dict(l=20, r=0, t=0, b=5)}, xaxis={'title': xlab[0]}, yaxis={'title': ylabel},
                               margin=dict(l=20, r=20, t=20, b=20),
            yaxis_zeroline=True, xaxis_zeroline=True)


In [188]:
# --------------------------------
# ENROLLMENT OVER TIME
# --------------------------------
active_tab = "RateAdmissions"
which_columns = ['admission_rate','enrollement_rate']
graph_title = 'Admission and Enrollment Rates over time'+": "+xlab[0]
ylabel = 'Rate'
markdown_comments = """This tab will likely be eliminated in favor of the version with trendlines."""
# Data for figure:
if len(selected)==0:
    df_fig = df.groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])
else:
    df_fig = df[df.state_name==selected].groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])
fig = go.Figure()
for col in which_columns:
    ys=df_fig[col]['Avg']
    xs=np.array(range(0,len(ys)))
    mod = np.polyfit(xs[:-1],ys[:-1],1)
    predict=np.poly1d(mod)
    r2 = round(r2_score(ys[:-1], predict(xs[:-1])),2)

    # Add line graph w/ data except last one:
    fig.add_trace(go.Scatter(x=df_fig.index[:-1], y=ys[:-1], name = seriez[col]['label'], 
            line_color = seriez[col]['color'], marker_color=seriez[col]['color'],
            error_y = dict(type='data', array=df_fig[col]['SEM'], visible=True),
            mode='lines+markers', opacity = 0.9))
    # Add best fit line
    fig.add_trace(go.Scatter(x=df_fig.index, y=predict(xs), mode='lines', name='Best Fit R^2 '+str(r2),
                      line_color = seriez[col]['color'], line_dash='dash', opacity = 0.8))
    # Add PREDICTED last data point w/ different marker
    fig.add_trace(go.Scatter(x=pd.Series(df_fig.index[-1]), y=pd.Series(predict(xs[-1])), showlegend=False, marker_symbol="x", marker_line_color='white', mode='markers', marker_color=seriez[col]['color']))
    # Add TRUE last data point w/ different marker
    fig.add_trace(go.Scatter(x=pd.Series(df_fig.index[-1]), y=pd.Series(ys[-1]), showlegend=False, mode='markers', marker_color=seriez[col]['color']))

# Raw Data for table:
df_tab = df_fig.xs(key='Avg', axis=1, level=1)        
df_tab = tbl_early_late(df_tab).rename_axis('').reset_index()
# Set options common to all traces with fig.update_traces
fig.update_traces(marker_line_width=2, marker_size=10)
fig.update_layout(title=graph_title, xaxis={'title': xlab[0]}, yaxis={'title': ylabel, 'range': [0,1]},
          yaxis_zeroline=True, xaxis_zeroline=True)

In [190]:
# --------------------------------
# CUMULATIVE REVENUE AND EXPENSES
# --------------------------------
which_columns = ['rev_total_current','exp_total_current','rev_tuition_fees_gross','rev_tuition_fees_net','exp_instruc_total']
xlab=[opt['label'] for opt in state_options_finance if opt['value']==selected]
graph_title = 'Net Revenue and Expenses over Time'+": "+xlab[0]
ylabel = '$$$'        
markdown_comments = """Trend towards more profitable institutions?"""
if len(selected)==0:
    df_fig = dfFin1.groupby(['year'])[which_columns].aggregate([('Sum','sum'), ('Nschools','count')])
else:
    df_fig = dfFin1[dfFin1.state_name==selected].groupby(['year'])[which_columns].aggregate([('Sum','sum'), ('Nschools','count')])
fig = go.Figure()
for col in which_columns:
    fig.add_trace(
        go.Scatter(x=df_fig.index, y=df_fig[col]['Sum'], name = seriez[col]['label'], 
                   marker_color=seriez[col]['marker_color'], line = dict(color = seriez[col]['color']), opacity = 0.8))
# FIGURE OUT WHAT IS APT FOR THIS TABLE
df_tab = df_fig.xs(key='Sum', axis=1, level=1)        
df_tab = tbl_early_late(df_tab).rename_axis('').reset_index()
# Set options common to all traces with fig.update_traces
fig.update_traces(mode='lines+markers', marker_line_width=2, marker_size=10)
fig.update_layout(xaxis={'title': xlab[0]}, yaxis={'title': ylabel},
    title={'text': graph_title, 'font':dict(size=16), 'yref': 'paper', 'y': 1, 'yanchor': 'bottom', 'pad':dict(l=20, r=0, t=0, b=5)}, 
    margin=dict(l=20, r=20, t=20, b=20),
    legend=dict(yanchor="top",y=0.99,xanchor="left",x=0.01), legend_title_text='',
    yaxis_zeroline=True, xaxis_zeroline=True)


In [191]:
# --------------------------------
# TUITION TRENDS
# --------------------------------

which_columns = ['rev_tuition_fees_gross','rev_tuition_fees_net']
xlab=[opt['label'] for opt in state_options_finance if opt['value']==selected]
graph_title = 'Deductions in Revenue from Tuition & Fees'+": "+xlab[0]
ylabel = '$$$'
markdown_comments = """I suspect this trend of growing deductions in tuition & fees stems from **INCREASE in financial aid packages**. Research TODO."""
# fig, df_tab = make_figure_section_1(dfFin1, which_columns, selected, 'Sum')        
if len(selected)==0:
    df_fig = dfFin1.groupby(['year'])[which_columns].aggregate([('Avg','mean'), ('Nschools','count')])
else:
    df_fig = dfFin1[dfFin1.state_name==selected].groupby(['year'])[which_columns].aggregate([('Avg','mean'), ('Nschools','count')])
fig = go.Figure()
for col in which_columns:
    fig.add_trace(
        go.Scatter(x=df_fig.index, y=df_fig[col]['Avg'], name = seriez[col]['label'], 
                   marker_color=seriez[col]['marker_color'], line = dict(color = seriez[col]['color']), opacity = 0.8))
# FIGURE OUT WHAT IS APT FOR THIS TABLE
df_tab = df_fig.xs(key='Avg', axis=1, level=1)        
df_tab = tbl_early_late(df_tab).rename_axis('').reset_index()
# Set options common to all traces with fig.update_traces
fig.update_traces(mode='lines+markers', marker_line_width=2, marker_size=10)
fig.update_layout(xaxis={'title': xlab[0]}, yaxis={'title': ylabel},
            title={'text': graph_title, 'font':dict(size=16), 'yref': 'paper', 'y': 1, 'yanchor': 'bottom', 'pad':dict(l=20, r=0, t=0, b=5)}, 
            margin=dict(l=20, r=20, t=20, b=20),
            legend=dict(yanchor="top",y=0.99,xanchor="left",x=0.01),
            # width=800, height=400,
            yaxis_zeroline=True, xaxis_zeroline=True)


In [194]:
# --------------------------------
# VISUALIZE RELATIONSHIPS
# --------------------------------
which_columns = ['admission_rate','enrollement_rate']
xlab[0] = 'Enrollement Rate'
ylabel = 'Admission Rate'
graph_title = 'Admission and Enrollment for Each School, All Years'
df_enroll['unitid'] = df_enroll['unitid'].astype('object')
fig = px.scatter(df_enroll, x='enrollement_rate', y='admission_rate', color='state_name', hover_data=['unitid', 'year'], labels={'state_name': ' '})
fig.update_traces(marker_line_width=1, marker_size=8, marker_line_color='white')
fig.update_layout(title=graph_title, xaxis={'title': xlab[0]}, yaxis={'title': ylabel},
            legend=dict(font = dict(size=8, color='black')),
            width=800, height=400)  


In [138]:
# --------------------------------
# Plot each time series - APPLICATIONS OVER TIME
# --------------------------------
which_columns = 'number_applied'
df_enroll.head()



Unnamed: 0,year,unitid,inst_name,state_name,admission_rate,enrollement_rate,number_applied,number_admitted,number_enrolled_total,rev_tuition_fees_gross,...,rev_total_current,exp_instruc_total,exp_res_pub_serv_total,exp_acad_supp_total,exp_student_serv_total,exp_total_current,completers_150pct,completion_rate_150pct,female_pct,married_pct
0,2001-01-01,104151,ARIZONA STATE UNIVERSITY-MAIN CAMPUS,Arizona,0.752696,0.395045,20861.0,15702.0,6203.0,213935008.0,...,757889000.0,281203328.0,114094864.0,90315456.0,32206918.0,724043000.0,2077.0,0.519,0.560438,0.145523
1,2002-01-01,104151,ARIZONA STATE UNIVERSITY-MAIN CAMPUS,Arizona,0.847425,0.41261,18155.0,15385.0,6348.0,221455008.0,...,760471000.0,288129696.0,138618320.0,91586424.0,32810496.0,754534000.0,2568.0,0.52,0.563042,0.148799
2,2003-01-01,104151,ARIZONA STATE UNIVERSITY-MAIN CAMPUS,Arizona,0.884003,0.407433,19785.0,17490.0,7126.0,275820000.0,...,838414000.0,307579232.0,141886336.0,110302928.0,33253956.0,836313000.0,2594.0,0.546,0.566987,0.143571
3,2004-01-01,104151,ARIZONA STATE UNIVERSITY AT THE TEMPE CAMPUS,Arizona,0.860984,0.399296,20789.0,17899.0,7147.0,331756000.0,...,939787000.0,364841728.0,162849104.0,126081992.0,40742620.0,941477000.0,3090.0,0.549,0.561103,0.135377
4,2005-01-01,104151,Arizona State University at the Tempe Campus,Arizona,0.910214,0.425135,19914.0,18126.0,7706.0,369411008.0,...,1043546000.0,374654016.0,187340384.0,140387840.0,45985048.0,1032345000.0,3249.0,0.564,0.5526,0.125603


In [103]:
# --------------------------------
# Plot Breakdown of revenues:
# --------------------------------


In [102]:
fig = px.bar(dfRev, x='year', y='value', color = 'variable',
        hover_data=['inst_name'],  #Doesn't work when column is type category
             labels={'y': 'TotalSpend'}, title='Types of Revenue Sources over time')
fig.show()

In [94]:
# --------------------------------
# Plot Breakdown of expenditures:
# --------------------------------
fig = px.bar(dfExp[dfExp.state_name=='California'], x='year', y='value', color = 'variable',
    #     hover_data=['unitid'],  #Not sure why but this doesn't work
             labels={'y': 'TotalSpend'}, title='Types of Expenditures over time')
fig.show()

In [95]:
# EXAMPLE - NEED TO CREATE S DIFF VERSION OF DATA FRAME
df = px.data.gapminder()
gdp = df['pop'] * df['gdpPercap']
fig = px.bar(df, x='year', y=gdp, color='continent', labels={'y':'gdp'},
             hover_data=['country'],
             title='Evolution of world GDP')
fig.show()


In [97]:
dfExp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31080 entries, 0 to 31079
Data columns (total 6 columns):
year          31080 non-null datetime64[ns]
unitid        31080 non-null category
inst_name     31080 non-null category
state_name    31080 non-null category
variable      31080 non-null object
value         29288 non-null float64
dtypes: category(3), datetime64[ns](1), float64(1), object(1)
memory usage: 917.3+ KB


In [163]:
which_columns = ['admission_rate','enrollement_rate']
# df_enroll['unitid'] = df_enroll['unitid'].astype('object')
# fig = px.scatter(df_enroll, x='admission_rate', y='enrollement_rate', color='state_name', hover_data=['unitid'])
# fig.show()
# Raw Data for table:
df_fig = df_enroll.groupby(['year'])['number_admitted','number_enrolled_total'].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])
df_tab = df_fig.xs(key='Avg', axis=1, level=1)        
df_tab = tbl_early_late(df_tab).rename_axis('').reset_index()
df_tab

Unnamed: 0,Unnamed: 1,Avg 2001-02,Avg 2016-17,DIFF
0,number_admitted,3815.322,7260.624,-3445.302
1,number_enrolled_total,1567.464,1968.906,-401.442


In [144]:
# EXAMPLE- THIS COULD BE NICE TO SHOW FOR SCHOOLS IN EACH STATE (TODO)
col = 'number_enrolled_total'
DF = df_enroll
DF['log_'+col] = np.log(df[col])
fig2 = px.line(DF, y="log_"+col, x="year", color="state_name", line_group="unitid",
#               line_shape="spline", render_mode="svg",
             color_discrete_sequence=px.colors.qualitative.G10,
             title='log_'+col, hover_name="inst_name").update_traces(mode='markers')

# fig2 = px.line(DF, y=col, x="year", color="state_name", line_group="unitid",
# #               line_shape="spline", render_mode="svg",
#              color_discrete_sequence=px.colors.qualitative.G10,
#              title=col, hover_name="inst_name").update_traces(mode='markers')

fig2.show()


divide by zero encountered in log



In [None]:
# --------------------------------
# FINANCES OVER TIME
# --------------------------------
active_tab == "CumulativeFin"
df = dfFin1
which_columns = ['rev_total_current','exp_total_current','rev_tuition_fees_gross','rev_tuition_fees_net','exp_instruc_total']
selected = ''

xlab=[opt['label'] for opt in state_options_finance if opt['value']==selected]
graph_title = 'Net Revenue and Expenses over Time'+": "+xlab[0]
ylabel = '$$$'        

if len(selected)==0:
    tmp = df.groupby(['year'])[which_columns].sum()
else:
    tmp = df[df.state_name==selected].groupby(['year'])[which_columns].sum()
   
#Data for figure
tmp.iplot(kind='scatter',width=4,title=graph_title,xTitle=xlab[0],yTitle=ylabel,theme='white',asFigure=True)

In [None]:
# --------------------------------
# FINANCE STUFF
# --------------------------------
active_tab = "TuitionTrends"
df = dfFin1
which_columns = ['rev_tuition_fees_gross','rev_tuition_fees_net']
selected = ''

xlab=[opt['label'] for opt in state_options_finance if opt['value']==selected]
graph_title = 'Deductions in Revenue from Tuition & Fees'+": "+xlab[0]
ylabel = '$$$'

if len(selected)==0:
    tmp = dfFin1.groupby(['year'])[which_columns].apply(np.mean)
else:
    tmp = dfFin1[df.state_name==selected].groupby(['year'])[which_columns].apply(np.mean)

# Data for figure
tmp.iplot(kind='scatter',width=4,title=graph_title,xTitle=xlab[0],yTitle=ylabel,theme='white',asFigure=True)

In [165]:
seriez = {'number_applied': {'color': '#F44DDB', 'label': "Number of student applications", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'number_admitted': {'color': '#CF1214', 'label': "Number of students admitted", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'number_enrolled_total': {'color': '#0E3DEC', 'label': "Number enrolled", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'admission_rate': {'color': '#CF1214', 'label': "Admission rate (# applications/# admissions)", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'enrollement_rate': {'color': '#0E3DEC', 'label': "Enrollment rate (# admissions/# enrolled", 'timeDep': 'Yes', 'marker_color': 'rgba(152, 0, 0, .8)'},
            'rev_tuition_fees_gross': {'color': '#8E44AD', 'label': "Revenue: Tuition&Fees (gross)", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_tuition_fees_net': {'color': '#2E86C1 ', 'label': "Revenue: Tuition&Fees (net)", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_total_current': {'color': '#1E8449', 'label': "Revenue: Total", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_fed_approps_grants': {'color': '#F1948A', 'label': "Revenue: Fed grants & approp", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_state_local_approps_grants': {'color': '#C0392B', 'label': "Revenue: State grants & approp", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'rev_other': {'color': '#00ACC1', 'label': "Revenue: Other", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_total_current': {'color': '#F4D03F', 'label': "Expenses: Total", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_instruc_total': {'color': '#FFA726', 'label': "Expenses: Instruction", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_acad_supp_total': {'color': '#B2EBF2', 'label': "Expenses: Acad Supp", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_student_serv_total': {'color': '##D7CCC8', 'label': "Expenses: Stud Serv", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'exp_res_pub_serv_total': {'color': '#0E3DEC', 'label': "Expenses: Resch & Pub Serv", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'completers_150pct': {'color': '#0E3DEC', 'label': "Completers 150%", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'completion_rate_150pct': {'color': '#0E3DEC', 'label': "Completion Rate 150%", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'female_pct': {'color': '#0E3DEC', 'label': "% Female", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'married_pct': {'color': '#0E3DEC', 'label': "% Marries", 'timeDep': 'Yes', 'marker_color': 'rgba(250, 250, 250, .8)'},
            'state_name': {'color': '#0E3DEC', 'label': "State", 'timeDep': 'No', 'marker_color': 'rgba(250, 250, 250, .8)'},
        }


In [None]:
# --------------------------------
# MODEL ENROLLMENT OVER TIME -- removed this tab, keep code here in case I want to refer to it
# --------------------------------
active_tab = "PlotlyExpress"
df = df_enroll
which_columns = ['admission_rate','enrollement_rate']
selected = ''

xlab=[opt['label'] for opt in state_options_enrollment if opt['value']==selected]
graph_title = 'Predict 2017 Enrollment using 2001-16'
ylabel = 'Rate'

# Data for figure:
if len(selected)==0:
    df_fig = df.groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])
else:
    df_fig = df[df.state_name==selected].groupby(['year'])[which_columns].aggregate([('Avg',np.mean), ('stdev',np.std), ('Nschools','count'), ('SEM', sem_btwn)])

fig = px.scatter(df_fig, x=df_fig.index, y=df_fig['admission_rate',]['Avg'], trendline="ols")
fig.update_layout(title= dict(text=graph_title)) # , font=dict(size=16)
fig.update_traces(marker=dict(color=seriez['admission_rate']['color']), line=dict(color=seriez['admission_rate']['color'], width=4, dash='dot'))
res_tmp = px.get_trendline_results(fig)
res_tmp = res_tmp.px_fit_results.iloc[0].summary().as_html()
trend_AR = pd.read_html(res_tmp, header=0, index_col=0)[0]
trend_AR.drop(['Date:','Time:'], axis=0, inplace=True)
trend_AR.rename(columns={'y': 'admission_rate'},inplace=True)

fig2 = px.scatter(df_fig, x=df_fig.index, y=df_fig['enrollement_rate']['Avg'], trendline="ols")
fig2.update_traces(marker=dict(color=seriez['enrollement_rate']['color']), line=dict(color=seriez['enrollement_rate']['color'], width=4, dash='dot'))
res_tmp = px.get_trendline_results(fig2)
res_tmp = res_tmp.px_fit_results.iloc[0].summary().as_html()
trend_ER = pd.read_html(res_tmp, header=0, index_col=0)[0]
trend_ER.drop(['Date:','Time:'], axis=0, inplace=True)
trend_ER.rename(columns={'y': 'enrollment_rate', 'R-squared:':'R-squared'},inplace=True)

fig.add_trace(fig2.data[0])
fig.add_trace(fig2.data[1])
fig.update_traces(marker_line_width=2, marker_size=10)
fig.update_layout(title=graph_title,yaxis={'title': ylabel},xaxis={'title': xlab[0]}, 
          yaxis_zeroline=True, xaxis_zeroline=True)
fig.show()
df_tab = pd.concat([trend_AR, trend_ER], axis=1)