In [5]:
import numpy as np
import pandas as pd
import pickle
import tqdm.auto as tqdm
import re
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from hdbscan import HDBSCAN
from pprint import pprint

In [6]:
def load_csvs(pathname, results_dir="../results/"):
    """Load the four result/coefficient CSVs saved for one experiment.

    The files read are ``<results_dir>/<pathname>_full.csv``,
    ``..._prereq.csv``, ``..._coefs_full.csv`` and ``..._coefs_prereq.csv``.
    In each file the first, unnamed column (which pandas reads as
    ``"Unnamed: 0"``) is renamed to ``"course"`` and used as the index.

    Parameters
    ----------
    pathname : str
        Experiment prefix shared by the four file names (e.g. ``"lasso"``).
    results_dir : str, optional
        Directory that holds the CSVs. Defaults to ``"../results/"``, the
        location that was previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(results_full, results_prereq, coefs_full, coefs_prereq)``.
    """
    import os  # local import so this cell does not depend on the import cell changing

    def _read_course_csv(suffix):
        # One shared read/rename/index chain instead of four pasted copies.
        csv_path = os.path.join(results_dir, pathname + suffix)
        frame = pd.read_csv(csv_path, low_memory=False)
        return frame.rename(columns={"Unnamed: 0": "course"}).set_index("course")

    suffixes = ("_full.csv", "_prereq.csv", "_coefs_full.csv", "_coefs_prereq.csv")
    return tuple(_read_course_csv(suffix) for suffix in suffixes)

# modifies the df in place
def remove_advcourse_coefs(df):
    """Blank out coefficients of same-department, higher-level predictor courses.

    Row labels (targets) and column labels (predictors) are course codes of
    the form ``<dept><number>[suffix]``, e.g. ``CS106A``. For every row, each
    column in the same department whose number has a larger hundreds digit
    (``number // 100``) is set to ``None`` (stored as NaN in float columns),
    so that e.g. CS221 is not kept as a predictor for CS106A.

    The columns are read from ``df`` itself, so the function works on any
    coefficient frame and never adds columns that ``df`` does not have.
    Labels that contain no course number are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Coefficient matrix indexed by target course, with one column per
        predictor course. Modified in place.

    Returns
    -------
    None
    """
    RE_PATTERN = r"(.+?)(\d+)"

    def _dept_and_level(code):
        # Split "CS106A" into ("CS", 1); None when the label has no number.
        found = re.match(RE_PATTERN, code)
        if found is None:
            return None
        return found.group(1), int(found.group(2)) // 100

    # Parse every column label once and bucket it by department, so each row
    # only has to look at predictors from its own department.
    cols_by_dept = {}
    for col in df.columns:
        parsed = _dept_and_level(col)
        if parsed is not None:
            cols_by_dept.setdefault(parsed[0], []).append((col, parsed[1]))

    for ind in df.index:
        parsed = _dept_and_level(ind)
        if parsed is None:
            continue
        ind_dept, ind_level = parsed
        for col, col_level in cols_by_dept.get(ind_dept, ()):
            # same department, course code's hundreds digit is higher
            if col_level > ind_level:
                df.loc[ind, col] = None
                
def rowwise_normalize(df):
    """Return a copy of ``df`` in which every row is divided by its own sum.

    ``df`` itself is not modified. Row sums skip NaN entries (pandas default),
    and NaN entries stay NaN in the result.
    """
    row_totals = df.sum(axis=1)
    return df.divide(row_totals, axis="index")

### high-achievement

In [7]:
pathname = "high-ach"

# load_csvs returns (results_full, results_prereq, coefs_full, coefs_prereq)
(
    highach_results_full,
    highach_results_prereq,
    highach_coefs_full,
    highach_coefs_prereq,
) = load_csvs(pathname)

# Blank out same-department, higher-level predictors
# (e.g. CS221 shouldn't be a predictor for CS106A).
# Note: this edits highach_coefs_full in place.
remove_advcourse_coefs(highach_coefs_full)

# Rescale every row so that it sums to 1, then show the result.
norm_highach_coefs_full = rowwise_normalize(highach_coefs_full)
display(norm_highach_coefs_full)

Unnamed: 0_level_0,ECON51,ECON102A,ECON1,ECON102B,CS106A,ECON50,ECON136,CS106B,MATH51,ECON137,...,IPS203,CEE363B,STATS306A,LINGUIST255E,ME280,MATH138,PHIL180A,STATS363,STATS344,STATS360
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ECON141,0.166779,0.137154,0.122720,0.121299,0.096560,0.088136,0.071662,0.064967,0.059339,0.045715,...,,,,,,,,,,
ME115A,0.000000,0.003963,0.039717,0.000000,0.019435,0.005231,,0.053492,0.023046,0.000000,...,,,,,,,,,,
EE124,-0.009435,-0.005586,-0.010812,-0.010129,0.003154,-0.015060,,0.056636,0.021272,,...,,,,,,,,,,
PHYSICS65,0.000000,0.000000,0.003091,0.000000,0.016771,0.010608,0.000000,0.026600,-0.005271,,...,,,,,,,,,,
MS&E125,0.013642,0.019891,0.095273,0.005806,0.058645,0.048102,0.003566,0.051986,0.043437,0.003719,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENGR205,,,0.000000,,0.061728,,,-0.045682,0.092559,,...,,,,,,,,,,
CHEM185,,,0.018976,,0.001108,,,0.038046,0.004306,,...,,,,,,,,,,
EE114,,,0.005330,,0.018908,,,0.006054,0.009092,,...,,,,,,,,,,
EE122A,,,-0.004628,,-0.066315,,,0.036207,0.014199,,...,,,,,,,,,,


In [8]:
# the 50 most predictive courses in the dataset, in ranked order:
# column totals of the row-normalized coefficients, largest first
norm_highach_coefs_full.sum(axis=0).sort_values(ascending=False).head(50)

CS106B         13.285103
CS107          13.273584
MATH51         12.926936
CS103          12.842926
CME100          9.939556
CS109           8.959265
CHEM33          8.203102
CS106A          7.828776
MATH53          7.756249
PHYSICS43       6.414141
CS106X          6.009817
MATH52          5.710967
CHEM35          5.610881
CHEM31B         5.349581
CS161           5.140199
ECON50          5.088829
CHEM31X         5.031933
ME203           4.962514
CS110           4.913180
ENGR40M         4.797996
ENGR14          4.491092
ENGR30          4.153344
PHYSICS41       3.933600
PSYCH1          3.902713
CME102          3.808252
ME101           3.672032
PHYSICS63       3.510088
ECON102A        3.501065
CHEM31A         3.410598
CHEM131         3.389242
CME100A         3.278170
CS229           3.213821
BIO41           3.153743
CS124           3.147653
ECON1           3.089653
CHEM171         3.082893
EE102A          3.004291
PHYSICS61       3.001945
ENGR15          2.898817
ME112           2.747489


In [9]:
# Persist the row-normalized coefficients next to the other result files.
norm_highach_coefs_full.to_csv(
    path_or_buf="../results/" + pathname + "_normalized.csv"
)

In [10]:
# Cluster predictor courses: after transposing, each column of the normalized
# frame is one sample; missing coefficients are treated as 0.
clustering = DBSCAN(eps=0.1, min_samples=2).fit(
    norm_highach_coefs_full.fillna(value=0).transpose()
)

In [11]:
# Group the column names by the cluster label assigned to each of them.
clusters = {}
for column, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(column)
pprint(clusters)

{-1: ['ECON51',
      'ECON102A',
      'ECON1',
      'ECON102B',
      'CS106A',
      'ECON50',
      'ECON136',
      'CS106B',
      'MATH51',
      'ECON137',
      'CS103',
      'ECON52',
      'CS107',
      'CS109',
      'ECON135',
      'CS161',
      'PHYSICS43',
      'MS&E120',
      'CS106X',
      'PHYSICS41',
      'CS110',
      'ENGR40M',
      'CS108',
      'MS&E111',
      'CS124',
      'PSYCH50',
      'CS145',
      'STATS60',
      'CS221',
      'CME100',
      'CS147',
      'STATS116',
      'CHEM31B',
      'SYMSYS100',
      'MS&E145',
      'MS&E125',
      'CME103',
      'PSYCH45',
      'PHIL80',
      'ENGR14',
      'CS144',
      'ENGR30',
      'ECON118',
      'CS142',
      'CME102',
      'CS181',
      'CS148',
      'ECON157',
      'PSYCH90',
      'HUMBIO3B',
      'PHIL102',
      'PHIL181',
      'CHEM35',
      'PHYSICS23',
      'BIO42',
      'BIO43',
      'CHEM135',
      'PHYSICS25',
      'CHEM130',
      'PHYSICS41A',
      'CS19

In [12]:
# Same samples as the DBSCAN run (one per column, NaN -> 0), clustered with OPTICS.
clustering = OPTICS(min_samples=2).fit(
    norm_highach_coefs_full.fillna(value=0).transpose()
)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [13]:
# Group the column names by the cluster label assigned to each of them.
clusters = {}
for column, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(column)
pprint(clusters)

{-1: ['ECON51',
      'ECON102A',
      'ECON1',
      'ECON102B',
      'CS106A',
      'ECON50',
      'ECON136',
      'CS106B',
      'MATH51',
      'ECON137',
      'CS103',
      'ECON52',
      'CS107',
      'CS109',
      'ECON126',
      'ECON135',
      'CS161',
      'PHYSICS43',
      'MS&E120',
      'CS106X',
      'PHYSICS41',
      'CS110',
      'ENGR40M',
      'CS108',
      'MS&E111',
      'ECON155',
      'ECON165',
      'ECON110',
      'PSYCH50',
      'CS145',
      'STATS60',
      'CS221',
      'CME100',
      'CS147',
      'STATS116',
      'ECON154',
      'CHEM31B',
      'SYMSYS100',
      'MS&E145',
      'CME103',
      'PSYCH45',
      'PHIL80',
      'ENGR14',
      'CS144',
      'ECON118',
      'CS142',
      'CS148',
      'ECON157',
      'ECON149',
      'PSYCH90',
      'ECON121',
      'HUMBIO3B',
      'PHIL102',
      'PHIL181',
      'CHEM35',
      'PHYSICS23',
      'BIO43',
      'CHEM135',
      'PHYSICS25',
      'CHEM130',
      

In [14]:
# Same samples again (one per column, NaN -> 0), clustered with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=2, min_samples=100).fit(
    norm_highach_coefs_full.fillna(value=0).transpose()
)

In [15]:
# Group the column names by the cluster label assigned to each of them.
clusters = {}
for column, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(column)
pprint(clusters)

{-1: ['ECON51',
      'ECON102A',
      'ECON1',
      'ECON102B',
      'CS106A',
      'ECON50',
      'ECON136',
      'CS106B',
      'MATH51',
      'ECON137',
      'CS103',
      'ECON52',
      'CS107',
      'CS109',
      'ECON126',
      'ECON135',
      'CS161',
      'PHYSICS43',
      'MS&E120',
      'CS106X',
      'PHYSICS41',
      'CS110',
      'ENGR40M',
      'CS108',
      'MS&E111',
      'CS124',
      'ECON155',
      'ECON165',
      'ECON110',
      'PSYCH50',
      'CS145',
      'STATS60',
      'CS221',
      'CME100',
      'CS147',
      'STATS116',
      'ECON144',
      'ECON154',
      'CHEM31B',
      'SYMSYS100',
      'MS&E145',
      'MS&E125',
      'CME103',
      'PSYCH45',
      'PHIL80',
      'ENGR14',
      'CS144',
      'ENGR30',
      'ECON118',
      'CS142',
      'CME102',
      'CS181',
      'CS148',
      'ECON157',
      'ECON149',
      'PSYCH90',
      'ECON121',
      'HUMBIO3B',
      'PHIL102',
      'PHIL181',
      'CHEM35

### All positive lasso

In [16]:
pathname = "lasso_positive_variable_lambda"

# load_csvs returns (results_full, results_prereq, coefs_full, coefs_prereq)
(
    lasso_positive_results_full,
    lasso_positive_results_prereq,
    lasso_positive_coefs_full,
    lasso_positive_coefs_prereq,
) = load_csvs(pathname)

# Blank out same-department, higher-level predictors
# (e.g. CS221 shouldn't be a predictor for CS106A).
# Note: this edits lasso_positive_coefs_full in place.
remove_advcourse_coefs(lasso_positive_coefs_full)

# Rescale every row so that it sums to 1, then show the result.
norm_lasso_positive_coefs_full = rowwise_normalize(lasso_positive_coefs_full)
display(norm_lasso_positive_coefs_full)

Unnamed: 0_level_0,CME100,CHEM31X,CME102,CHEM33,ENGR20,CME104,CHEM35,CHEMENG100,CHEM131,CHEMENG120A,...,ENERGY222,ENERGY212,COMM213,AA214B,AA229,AA271A,AA270,AA214A,CHEM187A,CHEM187
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMENG162,,,,,,,,,,,...,,,,,,,,,,
MS&E245G,0.0,0.0,0.0,0.0,,0.0,,,,,...,,,,,,,,,,
PUBLPOL105,,,,,,,,,,,...,,,,,,,,,,
PHYSICS61,,,,,,,,,,,...,,,,,,,,,,
MS&E145,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MATH147,,,,,,,,,,,...,,,,,,,,,,
MATH175,,,,,,,,,,,...,,,,,,,,,,
MATH146,,0.0,0.0,,,,,,,,...,,,,,,,,,,
MATH145,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Labels of the rows in which every normalized coefficient is missing.
idx = norm_lasso_positive_coefs_full.index[
    norm_lasso_positive_coefs_full.isna().all(axis=1)
]
# Those rows themselves, ordered by course code (the index level 'course').
nans = norm_lasso_positive_coefs_full.loc[idx].sort_values(by='course')
nans

Unnamed: 0_level_0,CME100,CHEM31X,CME102,CHEM33,ENGR20,CME104,CHEM35,CHEMENG100,CHEM131,CHEMENG120A,...,ENERGY222,ENERGY212,COMM213,AA214B,AA229,AA271A,AA270,AA214A,CHEM187A,CHEM187
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA100,,,,,,,,,,,...,,,,,,,,,,
AA279A,,,,,,,,,,,...,,,,,,,,,,
BIO101,,,,,,,,,,,...,,,,,,,,,,
BIO115,,,,,,,,,,,...,,,,,,,,,,
BIO144,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
STATS141,,,,,,,,,,,...,,,,,,,,,,
STATS191,,,,,,,,,,,...,,,,,,,,,,
STATS203,,,,,,,,,,,...,,,,,,,,,,
STATS217,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# get all the nan courses: the row labels of `nans`, as a plain list.
# Read them from the index directly rather than via the 'MATH51' column,
# so the cell does not fail when that particular column is absent.
nan_courses = nans.index.tolist()
nan_courses

['AA100',
 'AA279A',
 'BIO101',
 'BIO115',
 'BIO144',
 'BIOE123',
 'BIOE44',
 'CEE166B',
 'CEE176B',
 'CEE180',
 'CEE181',
 'CEE70',
 'CHEM134',
 'CHEM135',
 'CHEM176',
 'CHEM31A',
 'CHEMENG120B',
 'CHEMENG162',
 'CHEMENG174',
 'CHEMENG185A',
 'CHEMENG20',
 'CHEMENG25B',
 'CME100',
 'CME100A',
 'CME102A',
 'CME104A',
 'CME108',
 'COMM106',
 'COMM108',
 'CS106A',
 'CS106B',
 'CS106X',
 'CS107E',
 'CS131',
 'CS147',
 'CS148',
 'CS181',
 'CS193A',
 'CS194H',
 'CS210B',
 'CS224N',
 'CS224S',
 'CS224U',
 'CS227B',
 'CS240',
 'CS242',
 'CS245',
 'CS247',
 'CS248',
 'CS261',
 'CS265',
 'CS279',
 'CS348B',
 'CS377U',
 'EARTHSYS111',
 'EARTHSYS112',
 'ECON1',
 'ECON111',
 'ECON112',
 'ECON126',
 'ECON136',
 'ECON140',
 'ECON141',
 'ECON143',
 'ECON145',
 'ECON159',
 'ECON178',
 'EE103',
 'EE107',
 'EE134',
 'EE142',
 'EE155',
 'EE178',
 'EE222',
 'EE271',
 'EE364A',
 'ENERGY102',
 'ENERGY104',
 'ENERGY160',
 'ENGR10',
 'ENGR20',
 'ENGR205',
 'ENGR30',
 'ENGR50',
 'ENGR62',
 'HUMBIO128',
 'HUMBI

In [74]:
# Keep only the rows that are not entirely NaN.
non_nans = norm_lasso_positive_coefs_full.drop(nans.index, axis=0)

# Every remaining row was normalized, so it must sum to 1 up to float error.
# `row_sums.all() == 1.0` would only compare True with 1.0, i.e. check that no
# row sum is zero; compare the sums themselves instead.
non_nan_row_sums = non_nans.sum(axis=1)
assert np.allclose(non_nan_row_sums, 1.0), "normalized rows must each sum to 1"
non_nans

Unnamed: 0_level_0,CME100,CHEM31X,CME102,CHEM33,ENGR20,CME104,CHEM35,CHEMENG100,CHEM131,CHEMENG120A,...,ENERGY222,ENERGY212,COMM213,AA214B,AA229,AA271A,AA270,AA214A,CHEM187A,CHEM187
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MS&E245G,0.000000,0.000000,0.000000,0.000000,,0.0,,,,,...,,,,,,,,,,
CME106,0.336257,0.000000,0.026704,0.179617,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
BIO42,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
PSYCH30,0.030488,0.000000,0.000000,0.044007,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
PUBLPOL104,0.000000,0.000000,0.000000,0.000000,,,0.000000,,0.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PHYSICS21,0.075049,0.124545,0.000000,0.061820,,0.0,0.087371,,0.0,,...,,,,,,,,,,
CHEMENG150,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.465792,0.0,0.0,...,,,,,,,,,,
STATS202,0.066800,0.000000,0.000000,0.053808,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
ECON102B,0.000000,0.003462,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,


In [44]:
# Number of columns whose total weight over all remaining rows is non-zero.
non_nans.sum(axis=0).ne(0).sum()

215

In [70]:
# list of all preparatory courses: columns with a non-zero total weight
prep_courses = non_nans.columns[non_nans.sum(axis=0).ne(0)]
prep_courses

In [71]:
# list of all target courses: the row labels of non_nans, i.e. the courses
# whose normalized coefficient row is not entirely NaN
target_courses = non_nans.index
target_courses

In [89]:
# Total weight per column, largest first, limited to as many rows as there are
# preparatory courses, with readable column names.
table = (
    non_nans.sum()
    .sort_values(ascending=False)
    .iloc[: len(prep_courses)]
    .to_frame()
    .reset_index()
    .rename(columns={"index": "Course code", 0: "Weight"})
)
# Share of the summed weight carried by each course.
table["Prop. of total weight"] = table["Weight"] / table["Weight"].sum()
table

Unnamed: 0,Course code,Weight,Prop. of total weight
0,MATH51,1.169662e+01,7.546206e-02
1,CS107,1.116848e+01,7.205473e-02
2,CS106B,6.412431e+00,4.137052e-02
3,CS103,5.225035e+00,3.370991e-02
4,CHEM33,4.919028e+00,3.173566e-02
...,...,...,...
210,GS90,4.960173e-04,3.200112e-06
211,MCS100,3.342237e-04,2.156282e-06
212,PSYCH120,8.278301e-05,5.340840e-07
213,MS&E260,2.049218e-05,1.322076e-07


In [86]:
# LaTeX source for the first 20 rows of the table.
print(table.head(20).to_latex())

\begin{tabular}{llr}
\toprule
{} & Course code &    Weight \\
\midrule
0  &      MATH51 &  0.075462 \\
1  &       CS107 &  0.072055 \\
2  &      CS106B &  0.041371 \\
3  &       CS103 &  0.033710 \\
4  &      CHEM33 &  0.031736 \\
5  &      CME102 &  0.029191 \\
6  &      CME100 &  0.027938 \\
7  &      ECON50 &  0.027140 \\
8  &      CHEM35 &  0.027070 \\
9  &       CS110 &  0.025707 \\
10 &       ECON1 &  0.022909 \\
11 &      CS106X &  0.020601 \\
12 &      PSYCH1 &  0.020377 \\
13 &      MATH53 &  0.020351 \\
14 &      CS106A &  0.019426 \\
15 &     CHEM171 &  0.017975 \\
16 &       CS161 &  0.017489 \\
17 &      EE102A &  0.017419 \\
18 &    HUMBIO2A &  0.017006 \\
19 &   PHYSICS45 &  0.015530 \\
\bottomrule
\end{tabular}



In [34]:
# NOTE(review): this cell sits under the "All positive lasso" heading but
# clusters norm_highach_coefs_full -- confirm whether
# norm_lasso_positive_coefs_full was intended.
# One sample per column, NaN treated as 0.
clustering = DBSCAN(eps=0.4, min_samples=2).fit(
    norm_highach_coefs_full.fillna(value=0).transpose()
)

# Group the column names by the cluster label assigned to each of them.
clusters = {}
for column, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(column)
pprint(clusters)

{-1: ['ECON51',
      'ECON102A',
      'ECON1',
      'CS106A',
      'ECON50',
      'CS106B',
      'MATH51',
      'CS103',
      'CS107',
      'CS109',
      'CS161',
      'PHYSICS43',
      'CS106X',
      'PHYSICS41',
      'CS110',
      'ENGR40M',
      'CS108',
      'CS124',
      'PSYCH50',
      'CS145',
      'STATS60',
      'CS221',
      'CME100',
      'CS147',
      'STATS116',
      'CHEM31B',
      'SYMSYS100',
      'CME103',
      'PSYCH45',
      'PHIL80',
      'ENGR14',
      'ENGR30',
      'CS142',
      'CME102',
      'CS148',
      'CHEM35',
      'CHEM130',
      'PHYSICS41A',
      'PHIL1',
      'ENERGY102',
      'PHYSICS45',
      'ENGR40',
      'PSYCH80',
      'LINGUIST1',
      'MATH113',
      'AA100',
      'CHEM131',
      'MATH120',
      'HUMBIO2A',
      'ME101',
      'PHIL150',
      'ENERGY104',
      'ENGR50',
      'MATH42',
      'MATH52',
      'MATH41',
      'MATH53',
      'PSYCH1',
      'CHEM31X',
      'PHYSICS21',
      'CHE

In [26]:
# NOTE(review): this cell sits under the "All positive lasso" heading but
# clusters norm_highach_coefs_full -- confirm whether
# norm_lasso_positive_coefs_full was intended.
# One sample per column, NaN treated as 0.
clustering = OPTICS(min_samples=2).fit(
    norm_highach_coefs_full.fillna(value=0).transpose()
)

# Group the column names by the cluster label assigned to each of them.
clusters = {}
for column, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(column)
pprint(clusters)

{-1: ['ECON51',
      'ECON102A',
      'ECON1',
      'ECON102B',
      'CS106A',
      'ECON50',
      'ECON136',
      'CS106B',
      'MATH51',
      'ECON137',
      'CS103',
      'ECON52',
      'CS107',
      'CS109',
      'ECON126',
      'ECON135',
      'CS161',
      'PHYSICS43',
      'MS&E120',
      'CS106X',
      'PHYSICS41',
      'CS110',
      'ENGR40M',
      'CS108',
      'MS&E111',
      'ECON155',
      'ECON165',
      'ECON110',
      'PSYCH50',
      'CS145',
      'STATS60',
      'CS221',
      'CME100',
      'CS147',
      'STATS116',
      'ECON154',
      'CHEM31B',
      'SYMSYS100',
      'MS&E145',
      'CME103',
      'PSYCH45',
      'PHIL80',
      'ENGR14',
      'CS144',
      'ECON118',
      'CS142',
      'CS148',
      'ECON157',
      'ECON149',
      'PSYCH90',
      'ECON121',
      'HUMBIO3B',
      'PHIL102',
      'PHIL181',
      'CHEM35',
      'PHYSICS23',
      'BIO43',
      'CHEM135',
      'PHYSICS25',
      'CHEM130',
      

  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [27]:
# NOTE(review): this cell sits under the "All positive lasso" heading but
# clusters norm_highach_coefs_full -- confirm whether
# norm_lasso_positive_coefs_full was intended.
# One sample per column, NaN treated as 0.
clustering = HDBSCAN(min_cluster_size=2, min_samples=100).fit(
    norm_highach_coefs_full.fillna(value=0).transpose()
)

# Group the column names by the cluster label assigned to each of them.
clusters = {}
for column, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(column)
pprint(clusters)

{-1: ['ECON51',
      'ECON102A',
      'ECON1',
      'ECON102B',
      'CS106A',
      'ECON50',
      'ECON136',
      'CS106B',
      'MATH51',
      'ECON137',
      'CS103',
      'ECON52',
      'CS107',
      'CS109',
      'ECON126',
      'ECON135',
      'CS161',
      'PHYSICS43',
      'MS&E120',
      'CS106X',
      'PHYSICS41',
      'CS110',
      'ENGR40M',
      'CS108',
      'MS&E111',
      'CS124',
      'ECON155',
      'ECON165',
      'ECON110',
      'PSYCH50',
      'CS145',
      'STATS60',
      'CS221',
      'CME100',
      'CS147',
      'STATS116',
      'ECON144',
      'ECON154',
      'CHEM31B',
      'SYMSYS100',
      'MS&E145',
      'MS&E125',
      'CME103',
      'PSYCH45',
      'PHIL80',
      'ENGR14',
      'CS144',
      'ENGR30',
      'ECON118',
      'CS142',
      'CME102',
      'CS181',
      'CS148',
      'ECON157',
      'ECON149',
      'PSYCH90',
      'ECON121',
      'HUMBIO3B',
      'PHIL102',
      'PHIL181',
      'CHEM35

### lasso

In [26]:
pathname = "lasso"

# load_csvs returns (results_full, results_prereq, coefs_full, coefs_prereq)
(
    lasso_results_full,
    lasso_results_prereq,
    lasso_coefs_full,
    lasso_coefs_prereq,
) = load_csvs(pathname)

# Blank out same-department, higher-level predictors
# (e.g. CS221 shouldn't be a predictor for CS106A).
# Note: this edits lasso_coefs_full in place.
remove_advcourse_coefs(lasso_coefs_full)

# Rescale every row so that it sums to 1, then show the result.
norm_lasso_coefs_full = rowwise_normalize(lasso_coefs_full)
display(norm_lasso_coefs_full)

Unnamed: 0_level_0,ECON1,ECON50,ECON141,ECON52,ECON178,ECON51,ECON102B,CHEM31X,CHEM33,CME100,...,ECON153,CEE274D,STATS363,MATH138,STATS306A,STATS344,STATS360,PHIL180A,IPS203,GEOPHYS281
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ECON112,0.257148,0.254945,0.199706,0.177406,0.152755,0.019466,0.013146,0.000000,0.000000,0.000000,...,,,,,,,,,,
EE364A,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.083523,...,,,,,,,,,,
EE142,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038722,...,,,,,,,,,,
BIO153,0.000000,,,,,0.000000,,0.000000,0.002607,0.000000,...,,,,,,,,,,
PSYCH60B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CEE181,-0.323450,,,,,,,0.000000,0.000000,0.000000,...,,,,,,,,,,
PHYSICS41,0.087186,-0.027982,0.000000,0.000000,0.000000,-0.211444,0.000000,0.211014,0.000000,0.223465,...,,,,,,,,,,
PSYCH50,0.078034,0.025526,0.000000,0.000000,0.000000,0.000000,0.000000,0.091675,0.005071,0.056887,...,,,,,,,,,,
CHEM31B,0.005023,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.053691,...,,,,,,,,,,


### All NaNs analysis

In [20]:
# Lookup all nan courses from lasso positive in the high-achievement
# coefficients (all columns kept)
highach_coefs_full.loc[nan_courses, :]

Unnamed: 0_level_0,ECON51,ECON102A,ECON1,ECON102B,CS106A,ECON50,ECON136,CS106B,MATH51,ECON137,...,IPS203,CEE363B,STATS306A,LINGUIST255E,ME280,MATH138,PHIL180A,STATS363,STATS344,STATS360
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA100,0.001560,0.003141,-0.015334,0.001582,0.010405,0.003066,0.000000,0.027723,0.005188,0.000000,...,,,,,,,,,,
AA279A,,0.002469,-0.007429,,-0.017177,,,-0.000088,0.007293,0.002469,...,,,,,,,,,,
BIO101,0.002665,0.000000,0.013013,0.000000,0.020250,0.000000,,0.012465,0.008355,,...,,,,,,,,,,
BIO115,,0.000915,-0.005450,0.000000,0.008959,,,0.004448,0.002652,,...,,,,,,,,,,
BIO144,0.000000,0.000000,0.007639,0.000000,-0.000403,0.001936,,-0.005925,-0.002214,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PUBLPOL105,0.033143,0.033591,0.013579,0.034903,0.020010,0.035953,0.002915,0.008075,0.019193,-0.004660,...,,,,,,,,,,
STATS191,0.001309,0.019285,0.064274,0.015250,0.051638,0.038853,-0.000799,0.018998,0.051773,0.007371,...,,,,,,,,,,
STATS203,0.011259,-0.004116,0.017954,0.001377,-0.025689,0.022412,0.002766,0.003621,0.000889,0.000000,...,,,,,,,,,,
STATS217,-0.000138,-0.003909,0.001489,-0.004076,-0.010604,-0.006037,0.000000,-0.005012,-0.012939,0.003884,...,,,,,,,,,,


In [27]:
# The same nan courses, looked up in the plain lasso coefficients.
lasso_coefs_full.loc[nan_courses, :]

Unnamed: 0_level_0,ECON1,ECON50,ECON141,ECON52,ECON178,ECON51,ECON102B,CHEM31X,CHEM33,CME100,...,ECON153,CEE274D,STATS363,MATH138,STATS306A,STATS344,STATS360,PHIL180A,IPS203,GEOPHYS281
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA100,-0.399022,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.753904,...,,,,,,,,,,
AA279A,0.000000,,,,,,,0.000000,,0.000000,...,,,,,,,,,,
BIO101,0.393485,0.000000,,0.0,,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
BIO115,0.000000,,,,,,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
BIO144,0.000000,0.000000,,0.0,,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PUBLPOL105,0.031802,0.445580,0.0,0.0,0.0,0.953616,0.469084,0.000000,0.000000,0.000000,...,,,,,,,,,,
STATS191,0.774031,0.546690,0.0,0.0,0.0,0.000000,0.000000,0.000000,1.530079,0.177377,...,,,,,,,,,,
STATS203,0.000000,1.587851,,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
STATS217,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,


In [134]:
# The 19 largest normalized lasso coefficients in the CS223A row.
norm_lasso_coefs_full.loc['CS223A'].sort_values(ascending=False).head(19)

ENGR40      0.320424
CS110       0.317050
CS107       0.269318
CME100      0.186395
CS109       0.113659
CS221       0.062363
MATH51H     0.000000
CS9         0.000000
CS106X      0.000000
PSYCH80     0.000000
ENGR14      0.000000
MATH131P    0.000000
CS181       0.000000
CS124       0.000000
ENGR50      0.000000
PSYCH30     0.000000
PSYCH147    0.000000
PHIL287     0.000000
ENGR40M     0.000000
Name: CS223A, dtype: float64

In [136]:
# The 19 largest normalized positive-lasso coefficients in the CS110 row.
norm_lasso_positive_coefs_full.loc['CS110'].sort_values(ascending=False).head(19)

CS107     0.354520
CS107E    0.181764
CS106X    0.132405
MATH51    0.076176
CS161     0.071665
CME100    0.070495
CS103     0.065506
CS109     0.047468
EE124     0.000000
EE242     0.000000
EE118     0.000000
EE264     0.000000
EE153     0.000000
EE142     0.000000
ME318     0.000000
EE103     0.000000
EE102B    0.000000
ME203     0.000000
EE102A    0.000000
Name: CS110, dtype: float64