In [57]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Make sure the output directory for exported figures exists.
# exist_ok=True replaces the exists()+mkdir() pair, which can fail if the
# directory appears between the check and the creation.
os.makedirs("images", exist_ok=True)

# Read in csv; later cells read this frame through the name `df`.
# (The former `df['FR_fitted'] = df['FR_fitted']` self-assignment was a no-op
# and has been dropped.)
df = pd.read_csv('fittedFailureRate.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706


In [58]:
# One row per institution (first occurrence kept) with its funding and tax
# status. drop_duplicates() is chained so it returns a new frame; calling it
# with inplace=True on a column slice of `df` is what raises
# SettingWithCopyWarning.
df2 = df[['Institution', 'Funding', 'Tax_type']].drop_duplicates(subset=['Institution'])

In [59]:
# Horizontal bar chart: count of institutions (rows of df2) per funding
# source, stacked/coloured by tax status.
fig = px.histogram(
    df2,
    y="Funding",
    color='Tax_type',
    color_discrete_sequence=["#21314D", "#D2492A", "#808DA0","#8b8e8d"],
    labels={"Funding": "Funding", "Tax_type": "Tax Status"},
    width=800,
    height=400,
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=1.5)))
fig.update_layout(font_size=14, title_font_size=20)
fig.show()
fig.write_image("images/fundingBar.png")

In [60]:
# Vertical variant of the funding chart, using the Dark2 palette and a title.
fig = px.histogram(
    df2,
    x="Funding",
    color='Tax_type',
    color_discrete_sequence=px.colors.qualitative.Dark2,
    labels={"Funding": "Funding", "Tax_type": "Tax Status"},
    title="Institutions by Funding and Tax Status",
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=1.5)))
fig.update_layout(font_size=14, title_font_size=20)
fig.show()
fig.write_image("images/fundingBarVert.png")

In [61]:
# Number of rows of `df` per region (df holds one row per scanned webpage),
# shown as a pie chart with label + percentage on each slice.
df_region = df.groupby('Region').size().reset_index(name='Count')
fig = px.pie(df_region, values='Count', names='Region', width=600, height=600)
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='label+percent',
    textfont_size=14,
    marker=dict(
        colors=px.colors.qualitative.Pastel1,
        line=dict(color='#000000', width=2),
    ),
)
# Slices are labelled directly, so the legend is switched off.
fig.update_layout(font_size=14, title_font_size=20, showlegend=False)
fig.show()
fig.write_image("images/regionsPie.png")

In [62]:
# Number of rows of `df` per institution type, as a pie chart with black
# slice text (the original author noted that the contrast is good).
df_type = df.groupby('Type').size().reset_index(name='Count')
fig = px.pie(df_type, values='Count', names='Type', width=600, height=600)
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='label+percent',
    textfont_size=14,
    textfont_color='#000000',
    marker=dict(
        colors=px.colors.qualitative.Pastel1,
        line=dict(color='#000000', width=2),
    ),
)
# Slices are labelled directly, so the legend is switched off.
fig.update_layout(font_size=14, title_font_size=20, showlegend=False)
fig.show()

fig.write_image("images/typesPie.png")

In [63]:
# Errors against element count per row of `df`, with an OLS trendline drawn
# in a contrasting colour.
fig = px.scatter(
    df,
    x='Elements',
    y='Errors',
    color_discrete_sequence=["#304871"],
    trendline="ols",
    trendline_color_override="#D2492A",
)
# Transparent paper, faintly tinted plot area, no legend.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.04)",
    showlegend=False,
)
fig.show()
fig.write_image("images/errorElementCorr.png")

In [64]:
# Quick-look histogram of the raw error counts (displayed only, not exported).
fig = px.histogram(
    df,
    x="Errors",
    labels={"Errors": "Errors"},
    width=600,
    height=600,
)
fig.show()

In [65]:
# Quick-look histogram of the FR column (displayed only, not exported).
fig = px.histogram(
    df,
    x="FR",
    labels={"FR": "Modified Failure Rate"},
    width=600,
    height=600,
)
fig.show()

In [66]:
# Quick-look histogram of the FR_fitted column (displayed only, not exported).
fig = px.histogram(
    df,
    x="FR_fitted",
    labels={"FR_fitted": "Fitted Modified Failure Rate"},
    width=600,
    height=600,
)
fig.show()

In [67]:
# Histogram of FR normalised as a probability density instead of raw counts.
fig = px.histogram(
    df,
    x="FR",
    labels={"FR": "Modified Failure Rate"},
    width=600,
    height=600,
    histnorm='probability density',
    title='Probability Density of Modified Failure Rate',
)
fig.show()

In [68]:
# Distribution plot of the Errors column as a single group, using bins of
# width 10.
df_fig = df[['Errors']]
fig = ff.create_distplot([df_fig['Errors']], ['Errors'], bin_size=10)

# Title, size and fonts; the single group needs no legend.
fig.update_layout(
    title_text='Errors',
    showlegend=False,
    width=800,
    height=600,
    font_size=14,
    title_font_size=20,
)
fig.show()
fig.write_image("images/ErrorsHist.png")

In [69]:
# Distribution plot of the FR column as a single group (bin width 0.005).
df_fig = df[['FR']]
fig = ff.create_distplot(
    [df_fig['FR']],
    ['FR'],
    bin_size=0.005,
    colors=['#21314D'],
)

# Size, fonts and background; the single group needs no legend.
fig.update_layout(
    showlegend=False,
    width=600,
    height=600,
    font_size=14,
    title_font_size=20,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
)
fig.show()
fig.write_image("images/MFRHist.png")

In [70]:
# Distribution plot of the FR_fitted column as a single group (bin width 0.05).
df_fig = df[['FR_fitted']]
fig = ff.create_distplot(
    [df_fig['FR_fitted']],
    ['FR_fitted'],
    bin_size=0.05,
    colors=['#21314D'],
)

# Size, fonts and background; the single group needs no legend.
fig.update_layout(
    showlegend=False,
    width=600,
    height=600,
    font_size=14,
    title_font_size=20,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
)
fig.show()
fig.write_image("images/FittedMFRHist.png")

In [71]:
# Within-institution variability of the raw Pa11y error counts and of the
# failure rate (FR), expressed as a coefficient of variation (CV = std / mean)
# and compared side by side as two box plots.
by_institution = df.groupby(['Institution'])

# Standard deviation within each institution (errors, then FR).
ser_std = by_institution['Errors'].std()
ser_norm_std = by_institution['FR'].std()
# Mean within each institution. rename() is chained so a new Series is
# returned instead of mutating one in place.
ser_mean = by_institution['Errors'].mean().rename("Mean")
ser_norm_mean = by_institution['FR'].mean().rename("Mean")
# CV within each institution.
ser_cv = ser_std.divide(ser_mean).rename("CV")
ser_norm_cv = ser_norm_std.divide(ser_norm_mean).rename("CV")

# Reference levels drawn as dashed lines next to each box.
# NOTE(review): these are hard-coded in the notebook; presumably they were
# computed elsewhere as the CV across all institutions - TODO derive from `df`.
CV_ALL_ERRORS = 1.5
CV_ALL_FR = 1.1

fig = go.Figure()
fig.add_trace(go.Box(y=ser_cv, line=dict(color="#5B9AC8"),
                     name='CV of Errors',
                     boxmean=True, showlegend=False))
# Hidden numeric x-axis laid over the categorical one, spanning [0, 2]; both
# dashed reference lines are positioned on it (x in [0, 1] and in [1, 2]).
# The former `xaxis3` layout entry was never referenced by any trace and has
# been removed.
fig.layout.xaxis2 = go.layout.XAxis(overlaying='x', range=[0, 2], showticklabels=False)
fig.add_scatter(x=[0, 1], y=[CV_ALL_ERRORS, CV_ALL_ERRORS], mode='lines', xaxis='x2',
                name="CV across all institutions",
                showlegend=False, line=dict(dash='dash', color="firebrick", width=2))
fig.add_trace(go.Box(y=ser_norm_cv, line=dict(color="#3674A1"),
                     name='CV of Modified Failure Rate',
                     boxmean=True, showlegend=False))
fig.add_scatter(x=[1, 2], y=[CV_ALL_FR, CV_ALL_FR], mode='lines', xaxis='x2',
                name="Normalized CV across all institutions",
                showlegend=False, line=dict(dash='dash', color="darkred", width=2))
fig.update_layout(font_size=16)
fig.show()
fig.write_image("images/CVbox.png")

In [72]:
# FR_fitted by institution type, with one box per funding source.
fig = px.box(
    df,
    x="Type",
    y="FR_fitted",
    color="Funding",
)
fig.show()

In [73]:
# FR_fitted by institution type, one colour per type; the x-axis title is
# blanked because the tick labels already name the types.
fig = px.box(
    df,
    x="Type",
    y="FR_fitted",
    color="Type",
    color_discrete_sequence=["#21314D", "#D2492A", "#808DA0","#8b8e8d"],
    labels={"FR_fitted": "Fitted Failure Rate", "Type": ""},
)
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    showlegend=False,
)
fig.show()

# TODO Add mean (https://plot.ly/python/box-plots/?_ga=2.154071416.1570564365.1673540517-1262171880.1665081304#box-plot-styling-mean--standard-deviation)
# TODO Add a line tracing the means
fig.write_image("images/TypeWhisker.png")

In [74]:
# Box plot of FR_fitted for each institution type, with mean and standard
# deviation overlaid (boxmean='sd'). Traces are added in the listed order.
type_colors = [
    ('University', '#21314D'),
    ('College', '#D2492A'),
    ('Community College', '#808DA0'),
    ('VoTech', '#8b8e8d'),
]

fig = go.Figure()
for inst_type, shade in type_colors:
    fig.add_trace(go.Box(
        y=df.loc[df['Type'] == inst_type, 'FR_fitted'],
        marker_color=shade,
        name=inst_type,
        boxmean='sd',
    ))

fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    showlegend=False,
)
fig.show()

# TODO Add a line tracing the means (e.g. a line trace on an overlaid x-axis)
fig.write_image("images/TypeWhiskerMean.png")

In [75]:
# FR_fitted split by institution type and funding source. Community colleges
# are selected as a single group with no funding split.
type_col = df['Type']
is_public = df['Funding'] == 'public'
is_private = df['Funding'] == 'private'

x1a = df.loc[(type_col == 'University') & is_public, ['FR_fitted']]
x1b = df.loc[(type_col == 'University') & is_private, ['FR_fitted']]
x2a = df.loc[(type_col == 'College') & is_public, ['FR_fitted']]
x2b = df.loc[(type_col == 'College') & is_private, ['FR_fitted']]
x3 = df.loc[(type_col == 'Community College'), ['FR_fitted']]
x4a = df.loc[(type_col == 'VoTech') & is_public, ['FR_fitted']]
x4b = df.loc[(type_col == 'VoTech') & is_private, ['FR_fitted']]

# (data, colour, label) per box; public/private pairs share a type colour.
box_specs = [
    (x1a, '#21314D', 'Public University'),
    (x1b, '#21314D', 'Private University'),
    (x2a, '#D2492A', 'Public College'),
    (x2b, '#D2492A', 'Private College'),
    (x3, '#808DA0', 'Community College'),
    (x4a, '#8b8e8d', 'Public VoTech'),
    (x4b, '#8b8e8d', 'Private VoTech'),
]

fig = go.Figure()
for subset, shade, label in box_specs:
    fig.add_trace(go.Box(
        y=subset['FR_fitted'],
        marker_color=shade,
        name=label,
        boxmean='sd',
    ))

fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    showlegend=False,
)
fig.show()

# TODO Add a line tracing the means
fig.write_image("images/FundTypeWhiskerMean.png")

In [76]:
# FR_fitted for three funding/tax groups, each box with mean and SD.
# NOTE(review): the 'Public Non-profit' group selects every public row without
# checking Tax_type - presumably all public institutions are non-profit; verify.
is_public = df['Funding'] == 'public'
is_private = df['Funding'] == 'private'
group_specs = [
    (is_public, '#21314D', 'Public Non-profit'),
    (is_private & (df['Tax_type'] == 'non-profit'), '#D2492A', 'Private Non-profit'),
    (is_private & (df['Tax_type'] == 'profit'), '#808DA0', 'Private For Profit'),
]

fig = go.Figure()
for row_mask, shade, label in group_specs:
    fig.add_trace(go.Box(
        y=df.loc[row_mask, 'FR_fitted'],
        marker_color=shade,
        name=label,
        boxmean='sd',
    ))

fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    showlegend=False,
)
fig.show()

# TODO Add a line tracing the means
fig.write_image("images/FundTaxWhiskerMean.png")

In [77]:
# FR_fitted for public versus private funding, each box with mean and SD.
funding_specs = [
    ('public', '#21314D', 'Public'),
    ('private', '#D2492A', 'Private'),
]

fig = go.Figure()
for funding_value, shade, label in funding_specs:
    fig.add_trace(go.Box(
        y=df.loc[df['Funding'] == funding_value, 'FR_fitted'],
        marker_color=shade,
        name=label,
        boxmean='sd',
    ))

fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    showlegend=False,
)
fig.show()

# TODO Add a line tracing the means
fig.write_image("images/FundWhiskerMean.png")

In [78]:
# FR_fitted for for-profit versus non-profit rows, each box with mean and SD.
tax_specs = [
    ('profit', '#21314D', 'For Profit'),
    ('non-profit', '#D2492A', 'Non-profit'),
]

fig = go.Figure()
for tax_value, shade, label in tax_specs:
    fig.add_trace(go.Box(
        y=df.loc[df['Tax_type'] == tax_value, 'FR_fitted'],
        marker_color=shade,
        name=label,
        boxmean='sd',
    ))

fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    showlegend=False,
)
fig.show()

# TODO Add a line tracing the means
fig.write_image("images/TaxWhiskerMean.png")

In [79]:
# FR_fitted per funding source, coloured by funding source.
fig = px.box(
    df,
    x="Funding",
    y="FR_fitted",
    color="Funding",
)
fig.show()

In [80]:
# FR_fitted per region, coloured by region.
fig = px.box(
    df,
    x="Region",
    y="FR_fitted",
    color="Region",
)
fig.show()

In [81]:
# FR_fitted per region, with one box per funding source.
fig = px.box(
    df,
    x="Region",
    y="FR_fitted",
    color="Funding",
)
fig.show()

In [82]:
# Untransformed FR per region, coloured by region.
fig = px.box(
    df,
    x="Region",
    y="FR",
    color="Region",
)
fig.show()

In [83]:
# Raw error counts per region, coloured by region.
fig = px.box(
    df,
    x="Region",
    y="Errors",
    color="Region",
)
fig.show()

In [84]:
# FR_fitted per tax status, coloured by tax status.
fig = px.box(
    df,
    x="Tax_type",
    y="FR_fitted",
    color="Tax_type",
)
fig.show()

In [85]:
# FR_fitted per funding source, with one box per tax status.
fig = px.box(
    df,
    x="Funding",
    y="FR_fitted",
    color="Tax_type",
)
fig.show()

In [86]:
# Distance education: yearly percentages for 2012-2021, typed in by hand.
# The value column is deliberately named ' ' so the y-axis carries no title.
de = {
    'Year': list(range(2021, 2011, -1)),
    ' ': [77, 72.8, 36.3, 34.7, 32.9, 31.2, 29.3, 27.7, 26.4, 25.5],
}
df_de = pd.DataFrame(de)

fig = px.line(
    df_de,
    x='Year',
    y=' ',
    markers=True,
    title='Percent of post-secondary students enrolled in at least one distance education course',
)
fig.update_layout(font_size=16, title_font_size=20)
fig.show()

In [87]:
# Compare the FR_fitted distributions of colleges and universities.
group_labels = ['College', 'University']
hist_data = [df.loc[df['Type'] == label, 'FR_fitted'] for label in group_labels]

# Histogram plus density curve per group, rug plot disabled. No curve_type is
# passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=px.colors.qualitative.Set2,
    bin_size=.05,
    show_rug=False,
)

# Add title
fig.update_layout(title_text='Colleges and Universities')
fig.show()

In [88]:
# Compare the FR_fitted distributions of VoTechs and universities.
group_labels = ['VoTech', 'University']
hist_data = [df.loc[df['Type'] == label, 'FR_fitted'] for label in group_labels]

# Histogram plus density curve per group, rug plot disabled. No curve_type is
# passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=px.colors.qualitative.Set2,
    bin_size=.05,
    show_rug=False,
)

# Add title
fig.update_layout(title_text='VoTechs and Universities')
fig.show()

In [89]:
# Compare the FR_fitted distributions of VoTechs and community colleges.
group_labels = ['VoTech', 'Community College']
hist_data = [df.loc[df['Type'] == label, 'FR_fitted'] for label in group_labels]

# Histogram plus density curve per group, rug plot disabled. No curve_type is
# passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=px.colors.qualitative.Set2,
    bin_size=.05,
    show_rug=False,
)

# Add title
fig.update_layout(title_text='VoTechs and Community Colleges')
fig.show()

In [90]:
# Compare the FR_fitted distributions of colleges and community colleges.
group_labels = ['College', 'Community College']
hist_data = [df.loc[df['Type'] == label, 'FR_fitted'] for label in group_labels]

# Histogram plus density curve per group, rug plot disabled. No curve_type is
# passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=px.colors.qualitative.Set2,
    bin_size=.05,
    show_rug=False,
)

# Add title
fig.update_layout(title_text='Colleges and Community Colleges')
fig.show()

In [91]:
# FR distributions of all four institution types on one figure (Set2 palette).
group_labels = ['University', 'College', 'Community College', 'VoTech']
hist_data = [df.loc[df['Type'] == label, 'FR'] for label in group_labels]

# Histogram plus density curve per group, rug plot disabled. No curve_type is
# passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=px.colors.qualitative.Set2,
    bin_size=.004,
    show_rug=False,
)

# Background, hidden legend and title.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0.1,0.03)",
    showlegend=False,
    title_text='All Institutions',
)
fig.show()

In [92]:
# FR distributions of all four institution types, in the custom palette used
# for the exported figures.
group_labels = ['University', 'College', 'Community College', 'VoTech']
hist_data = [df.loc[df['Type'] == label, 'FR'] for label in group_labels]

# Histogram plus density curve per group, rug plot disabled. No curve_type is
# passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=["#21314D", "#D2492A", "#8082a0","#656867"],
    bin_size=.004,
    show_rug=False,
)
# Background and title.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    title_text='All Institutions',
)
fig.show()
fig.write_image("images/DistInstitution.png")

In [93]:
# FR density curves per institution type: histogram bars switched off, rug
# plot switched on.
group_labels = ['University', 'College', 'Community College', 'VoTech']
hist_data = [df.loc[df['Type'] == label, 'FR'] for label in group_labels]

# No curve_type is passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=["#21314D", "#D2492A", "#8082a0","#656867"],
    bin_size=.005,
    show_hist=False,
    show_rug=True,
)
# Background and title.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    title_text='All Institutions',
)
fig.show()
fig.write_image("images/LineDistInstitutions.png")

In [94]:
# FR_fitted density curves for public versus private funding: histogram bars
# off, rug plot on.
# NOTE(review): the title says "Modified Failure Rate" but the data plotted is
# FR_fitted - confirm which one is intended.
funding_groups = [('public', 'Public'), ('private', 'Private')]
group_labels = [label for _, label in funding_groups]
hist_data = [df.loc[df['Funding'] == value, 'FR_fitted'] for value, _ in funding_groups]

# No curve_type is passed, so the library's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=["#21314D", "#D2492A"],
    bin_size=.005,
    show_hist=False,
    show_rug=True,
)
# Background and title.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0.1,0.1,0.1,0.03)",
    title_text='Modified Failure Rate by Funding',
)
fig.show()
fig.write_image("images/LineDistFunding.png")

In [95]:
# FR_fitted density curves per institution type: histogram bars switched off,
# rug plot switched on. The figure is shown and exported as a PNG.
x1 = df.loc[(df['Type'] == 'University'), ['FR_fitted']]
x2 = df.loc[(df['Type'] == 'College'), ['FR_fitted']]
x3 = df.loc[(df['Type'] == 'Community College'), ['FR_fitted']]
x4 = df.loc[(df['Type'] == 'VoTech'), ['FR_fitted']]


# One series per group, in the same order as group_labels.
hist_data = [x1['FR_fitted'], x2['FR_fitted'], x3['FR_fitted'], x4['FR_fitted']]

group_labels = ['University', 'College', 'Community College', 'VoTech']

# Create the distplot. No curve_type is passed, so the library's default curve
# is used (the earlier comment claiming curve_type='normal' was inaccurate).
fig = ff.create_distplot(hist_data, group_labels, colors=["#21314D", "#D2492A", "#8082a0","#656867"],
                         bin_size=.005, show_hist=False, show_rug=True)
fig.update_layout(paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0.1,0.1,0.1,0.03)")
# Add title (typo "Intitution" fixed; this text is rendered into the exported image)
fig.update_layout(title_text='Modified Failure Rate After Transformation by Institution Type')
fig.show()
fig.write_image("images/LineDistFittedInstitutions.png")