In [1]:
from __future__ import print_function
from student_risk import build_dev
import functools
import ipywidgets as widgets
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import statsmodels.graphics.api as smg
from imblearn.under_sampling import TomekLinks
from ipywidgets import GridspecLayout, interact, interactive, fixed, interact_manual
from IPython.display import clear_output, display, Javascript
from matplotlib.legend_handler import HandlerLine2D
from halo import HaloNotebook
from patsy import dmatrices
from sklearn.compose import make_column_transformer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from statsmodels.api import OLS
from statsmodels.discrete.discrete_model import Logit
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Shared colour settings for the notebook.
# The RGB fractions below are approximately the same colour as hex "#981e32".
wsu_color = (0.596, 0.117, 0.196)

# Light-to-dark seaborn colormap built from that hex colour.
wsu_cmap = sns.light_palette("#981e32", as_cmap=True)

# Create Datasets for Census Model Development (~ 10 min.)

In [3]:
# SAS dataset builder
def creator(b):
    """Click handler for the 'Create Dataset' button.

    Calls ``build_dev.DatasetBuilderDev.build_census_dev()`` while the
    ``create_output`` widget is the active context, so anything the build
    displays is captured in that output area.

    Args:
        b: The object passed by ``Button.on_click``; not used.
    """
    with create_output:
        run_build = build_dev.DatasetBuilderDev.build_census_dev
        run_build()

# Output area that `creator` writes into when the button is clicked.
create_output = widgets.Output()

# Button that triggers the dataset build.
create_button = widgets.Button(description='Create Dataset')
create_button.on_click(creator)

# Stack the button above its output area and render both.
button_nest = widgets.VBox(children=[create_button, create_output])
display(button_nest)

VBox(children=(Button(description='Create Dataset', style=ButtonStyle()), Output()))

# Import Saved Datasets from Disk

In [4]:
# Import pre-split data
class Importer():
    """Loads the pre-split training and testing CSV files on demand.

    Both frames start out empty and are only populated when ``importer`` is
    called (normally as a button click handler).

    Args:
        training_path: Path of the training CSV. Defaults to the original
            hard-coded location so ``Importer()`` behaves as before.
        testing_path: Path of the testing CSV. Defaults to the original
            hard-coded location.
    """

    # Original on-disk locations, kept as defaults for backward compatibility.
    DEFAULT_TRAINING_PATH = 'Z:\\Nathan\\Models\\student_risk\\datasets\\training_set.csv'
    DEFAULT_TESTING_PATH = 'Z:\\Nathan\\Models\\student_risk\\datasets\\testing_set.csv'

    def __init__(self, training_path=DEFAULT_TRAINING_PATH, testing_path=DEFAULT_TESTING_PATH):
        self.training_path = training_path
        self.testing_path = testing_path
        self.training_set = pd.DataFrame()
        self.testing_set = pd.DataFrame()

    def importer(self, b=None):
        """Read both CSV files and store them on the instance.

        Args:
            b: The object passed by ``Button.on_click``; not used. Optional so
                the method can also be called directly from code.
        """
        # Read both files before assigning so a failure on the second read
        # does not leave a new training frame paired with a stale testing frame.
        training_set = pd.read_csv(self.training_path, encoding='utf-8', low_memory=False)
        testing_set = pd.read_csv(self.testing_path, encoding='utf-8', low_memory=False)
        self.training_set = training_set
        self.testing_set = testing_set

    def training_getter(self):
        """Return the training frame (empty until ``importer`` has run)."""
        return self.training_set

    def testing_getter(self):
        """Return the testing frame (empty until ``importer`` has run)."""
        return self.testing_set


# Single shared Importer instance; the Builder cell further down reads the
# loaded frames through its training_getter()/testing_getter() methods.
im = Importer()
    
# Nothing is read from disk until this button is clicked.
importer_button = widgets.Button(description='Import Dataset')

# The bound method is the click handler, so the loaded frames end up on `im`.
importer_button.on_click(im.importer)
display(importer_button)

Button(description='Import Dataset', style=ButtonStyle())

# Select Campus for Census Model Development

In [5]:
# Campus selector. Each option is a (label, value) pair, so `dropdown.value`
# is the campus code (for example 'PULLM'), not the displayed label.
dropdown = widgets.Dropdown(
    options=[
        ('Pullman', 'PULLM'),
        ('Tri-Cities', 'TRICI'),
        ('Vancouver', 'VANCO'),
    ],
    value='PULLM',
    description='Campus:',
)

display(dropdown)

Dropdown(description='Campus:', options=(('Pullman', 'PULLM'), ('Tri-Cities', 'TRICI'), ('Vancouver', 'VANCO')…

# Select Model Variables

In [6]:
adms_features = ['acad_year','count_week_from_term_begin_dt','marital_status','resident','father_wsu_flag','mother_wsu_flag',
                 'parent1_highest_educ_lvl','parent2_highest_educ_lvl','citizenship_country','last_sch_proprietorship']

demo_features = ['age_group','age','male','race_hispanic','race_american_indian',
                 'race_alaska','race_asian','race_black','race_native_hawaiian','race_white',
                 'underrep_minority','ipeds_ethnic_group_descrshort']

acs_features = ['gini_indx','pvrt_rate','median_inc','median_value',
                 'educ_rate','pct_blk','pct_ai','pct_asn','pct_hawi','pct_oth','pct_two','pct_non','pct_hisp']
            
geo_features = ['distance','pop_dens','city_large','city_mid','city_small','suburb_large','suburb_mid','suburb_small','town_fringe',
                'town_distant','town_remote','rural_fringe','rural_distant','rural_remote']

cen_features = ['pell_eligibility_ind','pell_recipient_ind','first_gen_flag','LSAMP_STEM_Flag','anywhere_STEM_Flag',
                 'honors_program_ind','afl_greek_indicator','athlete']

plan_features = ['business','cahnrs_anml','cahnrs_envr','cahnrs_econ','cahnrext','cas_chem','cas_crim',
                 'cas_math','cas_psyc','cas_biol','cas_engl','cas_phys','cas','comm','education','medicine',
                 'nursing','pharmacy','provost','vcea_bioe','vcea_cive','vcea_desn','vcea_eecs','vcea_mech',
                 'vcea','vet_med']

perf_features = ['high_school_gpa','fall_midterm_gpa_avg','fall_midterm_gpa_avg_ind','fall_cum_gpa','spring_midterm_gpa_avg',
                 'spring_midterm_gpa_avg_ind','spring_midterm_gpa_change','spring_withdrawn_hours']
            
cls_features = ['remedial','fall_avg_difficulty','fall_avg_pct_withdrawn','fall_avg_pct_CDFW',
                'fall_lec_count','fall_lab_count','fall_lec_contact_hrs','fall_lab_contact_hrs',
                'fall_avg_pct_CDF','fall_avg_pct_DFW','fall_avg_pct_DF','total_fall_contact_hrs','total_fall_units',
                'spring_avg_difficulty','spring_avg_pct_withdrawn','spring_avg_pct_CDFW','spring_avg_pct_CDF',
                'spring_avg_pct_DFW','spring_avg_pct_DF','spring_lec_count','spring_lab_count','spring_lec_contact_hrs',
                'spring_lab_contact_hrs','total_spring_contact_hrs','cum_adj_transfer_hours','term_credit_hours']
        
tran_features = ['AD_DTA','AD_AST','AP','RS','CHS','IB','AICE','IB_AICE']

vis_features = ['attendee_alive','attendee_campus_visit','attendee_cashe','attendee_destination',
                'attendee_experience','attendee_fcd_pullman','attendee_fced','attendee_fcoc',
                'attendee_fcod','attendee_group_visit','attendee_honors_visit','attendee_imagine_tomorrow',
                'attendee_imagine_u','attendee_la_bienvenida','attendee_lvp_camp','attendee_oos_destination',
                'attendee_oos_experience','attendee_preview','attendee_preview_jrs','attendee_shaping',
                'attendee_top_scholars','attendee_transfer_day','attendee_vibes','attendee_welcome_center',
                'attendee_any_visitation_ind','attendee_total_visits']

aid_features = ['fed_efc','fed_need','unmet_need_ofr']

def _make_checkboxes(features):
    """Return one unchecked ``widgets.Checkbox`` per feature name.

    The checkbox ``description`` is the feature name, and the returned list
    keeps the order of ``features``.
    """
    return [widgets.Checkbox(value=False, description=feature) for feature in features]


# One checkbox list per feature group. Replaces eleven copy-pasted
# "create empty list, then append in a loop" stanzas; the variable names and
# the creation order are unchanged.
adms_objs = _make_checkboxes(adms_features)
demo_objs = _make_checkboxes(demo_features)
acs_objs = _make_checkboxes(acs_features)
geo_objs = _make_checkboxes(geo_features)
cen_objs = _make_checkboxes(cen_features)
plan_objs = _make_checkboxes(plan_features)
perf_objs = _make_checkboxes(perf_features)
cls_objs = _make_checkboxes(cls_features)
tran_objs = _make_checkboxes(tran_features)
vis_objs = _make_checkboxes(vis_features)
aid_objs = _make_checkboxes(aid_features)
    
def _wrap_checkboxes(checkboxes):
    """Put a list of checkboxes in a ``widgets.VBox`` laid out as 'row wrap'.

    A new ``widgets.Layout`` is created on every call, so containers never
    share a layout object.
    """
    return widgets.VBox(children=checkboxes, layout=widgets.Layout(flex_flow='row wrap'))


# One container per feature group. Replaces eleven identical VBox calls;
# the variable names and the creation order are unchanged.
adms_ui = _wrap_checkboxes(adms_objs)
demo_ui = _wrap_checkboxes(demo_objs)
acs_ui = _wrap_checkboxes(acs_objs)
geo_ui = _wrap_checkboxes(geo_objs)
cen_ui = _wrap_checkboxes(cen_objs)
plan_ui = _wrap_checkboxes(plan_objs)
perf_ui = _wrap_checkboxes(perf_objs)
cls_ui = _wrap_checkboxes(cls_objs)
tran_ui = _wrap_checkboxes(tran_objs)
vis_ui = _wrap_checkboxes(vis_objs)
aid_ui = _wrap_checkboxes(aid_objs)

def _apply_titles(container, titles):
    """Call ``container.set_title(position, title)`` for each title, in order."""
    for position, title in enumerate(titles):
        container.set_title(position, title)


# Each accordion holds the feature-group containers shown under one tab.
accordion_0 = widgets.Accordion(children=[adms_ui, demo_ui, vis_ui, tran_ui])
accordion_1 = widgets.Accordion(children=[acs_ui, geo_ui, cen_ui])
accordion_2 = widgets.Accordion(children=[plan_ui, cls_ui, perf_ui])
accordion_3 = widgets.Accordion(children=[aid_ui])

# Section titles, listed in the same order as each accordion's children.
_apply_titles(accordion_0, ['Admissions', 'Demographics', 'Visitation', 'Transfer'])
_apply_titles(accordion_1, ['Economic', 'Geographic', 'Census'])
_apply_titles(accordion_2, ['Plans', 'Class', 'Performance'])
_apply_titles(accordion_3, ['Financial Aid'])

# Top-level tab widget: one tab per accordion.
tab_nest = widgets.Tab()
tab_nest.children = [accordion_0, accordion_1, accordion_2, accordion_3]
_apply_titles(tab_nest, ['Background', 'Characteristics', 'Coursework', 'Student Data'])
display(tab_nest)

Tab(children=(Accordion(children=(VBox(children=(Checkbox(value=False, description='acad_year'), Checkbox(valu…

# Build Models

In [7]:
class Builder():
    """Builds campus-specific training and testing frames from the checked features.

    Args:
        dropdown: Object whose ``value`` is the campus code to filter on.
        im: Data source exposing ``training_getter()`` and ``testing_getter()``,
            each returning a DataFrame.
        adms_objs, demo_objs, vis_objs, tran_objs, acs_objs, geo_objs, cen_objs,
        plan_objs, cls_objs, perf_objs, aid_objs: Lists of checkbox-like objects
            with ``value`` (checked or not) and ``description`` (column name).
    """

    def __init__(self, dropdown, im, adms_objs, demo_objs, vis_objs, tran_objs, acs_objs, geo_objs, cen_objs, plan_objs, cls_objs, perf_objs, aid_objs):
        self.training_set = pd.DataFrame()
        self.testing_set = pd.DataFrame()
        self.selected_features = []
        self.dropdown = dropdown
        # Keep the injected data source. Previously this argument was dropped
        # and builder() silently depended on a global named `im`.
        self.im = im
        self.adms_objs = adms_objs
        self.demo_objs = demo_objs
        self.vis_objs = vis_objs
        self.tran_objs = tran_objs
        self.acs_objs = acs_objs
        self.geo_objs = geo_objs
        self.cen_objs = cen_objs
        self.plan_objs = plan_objs
        self.cls_objs = cls_objs
        self.perf_objs = perf_objs
        self.aid_objs = aid_objs

    def _checked_features(self):
        """Return the descriptions of all checked boxes, group by group."""
        # The group order fixes the column order of the resulting frames.
        groups = (
            self.adms_objs, self.demo_objs, self.vis_objs, self.tran_objs,
            self.acs_objs, self.geo_objs, self.cen_objs, self.plan_objs,
            self.cls_objs, self.perf_objs, self.aid_objs,
        )
        return [box.description for group in groups for box in group if box.value]

    def _subset(self, frame, campus):
        """Return rows for ``campus`` with admit type 'FRSH', limited to the
        selected columns, with any row containing a missing value dropped."""
        keep = (frame.adj_acad_prog_primary_campus == campus) & (frame.adj_admit_type_cat == 'FRSH')
        return frame.loc[keep, self.selected_features].dropna()

    def builder(self, b=None):
        """Rebuild ``selected_features`` and both filtered frames.

        The source frames must contain ``adj_acad_prog_primary_campus``,
        ``adj_admit_type_cat`` and every selected column; otherwise pandas
        raises when the filter is applied.

        Args:
            b: The object passed by ``Button.on_click``; not used. Optional so
                the method can also be called directly from code.
        """
        # 'emplid' and 'enrl_ind' are always included ahead of the checked columns.
        self.selected_features = ['emplid', 'enrl_ind'] + self._checked_features()

        campus = self.dropdown.value
        self.training_set = self._subset(self.im.training_getter(), campus)
        self.testing_set = self._subset(self.im.testing_getter(), campus)

    def training_getter(self):
        """Return the filtered training frame (empty until ``builder`` has run)."""
        return self.training_set

    def testing_getter(self):
        """Return the filtered testing frame (empty until ``builder`` has run)."""
        return self.testing_set

    
# Button that rebuilds the filtered training/testing frames on click.
builder_button = widgets.Button(description='Build Models')

# Shared Builder instance; later cells read its frames through the getters.
bd = Builder(
    dropdown=dropdown,
    im=im,
    adms_objs=adms_objs,
    demo_objs=demo_objs,
    vis_objs=vis_objs,
    tran_objs=tran_objs,
    acs_objs=acs_objs,
    geo_objs=geo_objs,
    cen_objs=cen_objs,
    plan_objs=plan_objs,
    cls_objs=cls_objs,
    perf_objs=perf_objs,
    aid_objs=aid_objs,
)

builder_button.on_click(bd.builder)
display(builder_button)

Button(description='Build Models', style=ButtonStyle())

# Descriptive Plots

In [17]:
class Plotter():
    """Interactive histograms of the columns in the built training and testing frames.

    Reads its data from the module-level ``bd`` object and renders into the
    module-level ``plotter_output`` widget.
    """

    def __init__(self):
        # (min, max) pair handed to interact() as the `y` control, which is
        # forwarded to the histogram as the bin count.
        self.y = (1, 100)

    def _draw_hist(self, values, bins, title):
        """Draw one histogram of ``values`` with ``bins`` bins and the given title."""
        plt.hist(values, color=wsu_color, bins=bins)
        plt.title(title)

    def train_func(self, x, y):
        """Plot column ``x`` of the training frame using ``y`` bins."""
        self._draw_hist(bd.training_getter()[x], y, 'TRAINING DATA')

    def test_func(self, x, y):
        """Plot column ``x`` of the testing frame using ``y`` bins."""
        self._draw_hist(bd.testing_getter()[x], y, 'TESTING DATA')

    def plotter(self, b):
        """Click handler: clear the output area and build both interactive plots.

        Args:
            b: The object passed by ``Button.on_click``; not used.
        """
        with plotter_output:
            # Remove whatever a previous click rendered before adding new controls.
            clear_output()

            training_columns = bd.training_getter().columns.tolist()
            interact(self.train_func, x=training_columns, y=self.y)

            testing_columns = bd.testing_getter().columns.tolist()
            interact(self.test_func, x=testing_columns, y=self.y)
            
# Output area that Plotter.plotter renders into, and the button that triggers it.
plotter_output = widgets.Output()
plotter_button = widgets.Button(description='Create Plots')

pl = Plotter()
plotter_button.on_click(pl.plotter)

# Stack the button above its output area and render both.
plotter_nest = widgets.VBox(children=[plotter_button, plotter_output])
display(plotter_nest)

VBox(children=(Button(description='Create Plots', style=ButtonStyle()), Output()))

# Detect and Remove Outliers