In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import hdbscan
import seaborn as sns
sns.set_style("dark")



In [2]:
# Allow up to 1000 characters per displayed cell so long strings are not
# truncated. The fully-qualified option key is used instead of the abbreviated
# 'max_colwidth', which only works through pandas' pattern matching of names.
pd.set_option('display.max_colwidth', 1000)

# Load Data

In [11]:
# Paths of the two CSV inputs (relative to the notebook's working directory).
FEATURES_CSV = 'data/scorecard_reduced_features.csv'
IMPUTED_CSV = 'data/scorecard_imputed.csv'

data = pd.read_csv(FEATURES_CSV)
imputed = pd.read_csv(IMPUTED_CSV)

# Copy every column of the second file into `data`, replacing columns of the
# same name; rows are matched on the index of the two frames.
data[imputed.columns] = imputed

In [14]:
# Index the frame by UNITID. Rebinding (instead of inplace=True) together with
# the guard makes this cell safe to re-run: once 'UNITID' has become the index
# it is no longer a column, and a second set_index call would raise KeyError.
if 'UNITID' in data.columns:
    data = data.set_index('UNITID')

In [15]:
# Preview only the first rows rather than rendering the whole frame.
data.head(10)

Unnamed: 0_level_0,INSTNM,ZIP,HCM2,CONTROL,LOCALE,CCBASIC,HBCU,PBI,ANNHI,TRIBAL,...,LO_INC_RPY_3YR_RT_SUPP,MD_INC_RPY_3YR_RT_SUPP,HI_INC_RPY_3YR_RT_SUPP,NONCOM_RPY_3YR_RT_SUPP,FIRSTGEN_RPY_3YR_RT_SUPP,PCT_LIBERAL_ARTS,PCT_VOCATIONAL,PCT_RELIGIOUS,COST,PCT_PROFESSIONAL
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,35762,0,1,12,18,1,0,0,0,...,0.448163,0.446488,0.422018,0.373790,0.423581,0.534600,6.290000e-02,0.000000e+00,18888.00,0.407400
100663,University of Alabama at Birmingham,35294-0110,0,1,12,15,0,0,0,0,...,0.724256,0.770134,0.818059,0.678749,0.751553,0.437900,0.000000e+00,9.500000e-03,19990.00,0.580500
100690,Amridge University,36117-3553,0,2,12,21,0,0,0,0,...,0.597701,0.709091,0.788065,0.613281,0.632653,0.630100,0.000000e+00,2.603000e-01,12300.00,0.109600
100706,University of Alabama in Huntsville,35899,0,1,12,15,0,0,0,0,...,0.721404,0.824742,0.851936,0.689788,0.778157,0.276400,0.000000e+00,8.200000e-03,20306.00,0.715200
100724,Alabama State University,36104-0271,0,1,12,18,1,0,0,0,...,0.310875,0.376106,0.333333,0.315997,0.315737,0.544900,0.000000e+00,0.000000e+00,17400.00,0.573300
100751,The University of Alabama,35487-0166,0,1,13,16,0,0,0,0,...,0.740355,0.815261,0.890336,0.711204,0.786885,0.401600,7.000000e-02,6.000000e-03,26717.00,0.544000
100812,Athens State University,35611,0,1,31,22,0,0,0,0,...,0.710145,0.797872,0.895833,0.692187,0.780089,0.494100,0.000000e+00,7.700000e-03,22418.29,0.530300
100830,Auburn University at Montgomery,36117-3596,0,1,12,18,0,0,0,0,...,0.553120,0.703460,0.781690,0.561475,0.637097,0.448500,0.000000e+00,0.000000e+00,16556.00,0.621000
100858,Auburn University,36849,0,1,13,16,0,0,0,0,...,0.805755,0.879596,0.934930,0.881687,0.843118,0.483300,4.750000e-02,2.800000e-03,23788.00,0.466300
100937,Birmingham Southern College,35254,0,2,12,21,0,0,0,0,...,0.756098,0.864000,0.896825,0.848024,0.859649,0.588100,0.000000e+00,3.650000e-02,44512.00,0.375400


In [16]:
# Share of non-null values in each column, relative to the fullest column,
# reported as percentages.
non_null = data.count()
availability = non_null / non_null.max()
print("Minimum:", availability.min() * 100,
      "\nMedian:", availability.median() * 100)

Minimum: 100.0 
Median: 100.0


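So for the variables we've chosen, all of the data is available: the output above shows both the minimum and the median availability at 100%. (The figures originally quoted here — a minimum of 41% and a median of 86% — no longer match that output and presumably predate merging in the imputed columns.)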

Let's look at the median and mean of the columns, just so we know how we're manipulating the data.

In [17]:
# Per-column median and mean, side by side, to inform the choice of fill value.
# numeric_only=True restricts the reductions to numeric columns: pandas >= 2.0
# raises TypeError on text columns such as INSTNM and ZIP instead of silently
# skipping them as older releases did.
school_stats = pd.DataFrame({
    'median': data.median(numeric_only=True),
    'mean': data.mean(numeric_only=True),
})
school_stats

Unnamed: 0,median,mean
HCM2,0.0,0.003235
CONTROL,2.0,1.876264
LOCALE,13.0,19.359078
CCBASIC,21.0,21.346138
HBCU,0.0,0.034371
PBI,0.0,0.002022
ANNHI,0.0,0.002022
TRIBAL,0.0,0.000809
AANAPII,0.0,0.002831
HSI,0.0,0.021836


Median is clearly the better choice for the boolean flags...

In [18]:
# Fill any remaining gaps with each column's median, then record the post-fill
# mean alongside the original statistics for comparison.
# numeric_only=True keeps text columns (INSTNM, ZIP) out of the reductions,
# which pandas >= 2.0 would otherwise reject with a TypeError. The frame is
# rebound rather than modified with inplace=True.
data = data.fillna(data.median(numeric_only=True))
school_stats['filled_mean'] = data.mean(numeric_only=True)

In [19]:
# Summarise school_stats (row count, columns, non-null counts, dtypes) to
# check that every statistics column is fully populated.
school_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54 entries, HCM2 to PCT_PROFESSIONAL
Data columns (total 3 columns):
median         54 non-null float64
mean           54 non-null float64
filled_mean    54 non-null float64
dtypes: float64(3)
memory usage: 1.7+ KB


In [121]:
# Cluster on every column except the text identifiers. The 'cluster' column is
# only present once a previous run has stored labels in `data`, so it is
# dropped with errors='ignore'; dropping it unconditionally raised KeyError on
# a fresh kernel, where that column does not exist yet.
features = (
    data
    .drop(['INSTNM', 'ZIP'], axis=1)
    .drop(['cluster'], axis=1, errors='ignore')
)
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=3)
clusterer.fit(features)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, core_dist_n_jobs=4, gen_min_span_tree=False,
    leaf_size=40, memory=Memory(cachedir=None), metric='euclidean',
    min_cluster_size=15, min_samples=3, p=None)

In [122]:
# Highest and lowest label produced by the fitted clusterer.
fitted_labels = clusterer.labels_
fitted_labels.max(), fitted_labels.min()

(4, -1)

In [123]:
# Report how many rows received each label, from the lowest label to the
# highest.
cluster_labels = clusterer.labels_
for label in range(cluster_labels.min(), cluster_labels.max() + 1):
    member_count = np.count_nonzero(cluster_labels == label)
    print("Cluster:", label, "Members:", member_count)

Cluster: -1 Members: 960
Cluster: 0 Members: 24
Cluster: 1 Members: 1332
Cluster: 2 Members: 68
Cluster: 3 Members: 74
Cluster: 4 Members: 15


In [124]:
# Store each row's label as a new 'cluster' column.
# NOTE(review): labels_ is presumably a plain NumPy array, so values are
# assigned by position; this assumes data's row order is unchanged since the
# fit. Confirm.
data['cluster'] = clusterer.labels_

In [125]:
# List the institution names (INSTNM) that fall under each label, from the
# lowest label to the highest.
for label in range(clusterer.labels_.min(), clusterer.labels_.max() + 1):
    members = data.loc[data['cluster'] == label, 'INSTNM']
    print("Cluster:", label, "\n", members, '\n')

Cluster: -1 
 UNITID
100654                                                 Alabama A & M University
100663                                      University of Alabama at Birmingham
100690                                                       Amridge University
101073                                                Concordia College Alabama
101693                                                     University of Mobile
101912                                                       Oakwood University
102049                                                       Samford University
102058                                                         Selma University
102270                                                         Stillman College
102298                                                        Talladega College
102377                                                      Tuskegee University
102614                                           University of Alaska Fairbanks
102632             