In [3]:
import numpy as np
import pandas as pd
import pickle
import tqdm.auto as tqdm
import re
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from hdbscan import HDBSCAN
from pprint import pprint

In [4]:
def load_csvs(pathname):
    """Read the four result/coefficient CSV files belonging to one run.

    The files are looked up under ``../results/`` (relative to the current
    working directory) as ``<pathname>_full.csv``, ``<pathname>_prereq.csv``,
    ``<pathname>_coefs_full.csv`` and ``<pathname>_coefs_prereq.csv``.
    In every file the column pandas labels ``"Unnamed: 0"`` is renamed to
    ``"course"`` and becomes the index.

    Args:
        pathname: filename stem shared by the four CSV files.

    Returns:
        A 4-tuple of DataFrames in the order
        ``(results_full, results_prereq, coefs_full, coefs_prereq)``.
    """
    file_suffixes = (
        "_full.csv",
        "_prereq.csv",
        "_coefs_full.csv",
        "_coefs_prereq.csv",
    )

    frames = []
    for suffix in file_suffixes:
        csv_path = "../results/" + pathname + suffix
        raw = pd.read_csv(csv_path, low_memory=False)
        # the unnamed leading column is used as the "course" index
        indexed = raw.rename(columns={"Unnamed: 0": "course"}).set_index("course")
        frames.append(indexed)

    return tuple(frames)

# modifies the df in place
def remove_advcourse_coefs(df):
    """Blank out coefficients of higher-level courses in the same department.

    Row and column labels are split into a department prefix and a course
    number. Wherever a row and a column share the same prefix and the column's
    number falls in a higher hundreds band than the row's (e.g. row ``CS106A``
    vs. column ``CS221``), that cell is set to missing.

    Args:
        df: DataFrame whose index and column labels are course-code strings
            such as ``"CS106A"``. It is modified in place.

    Returns:
        None.

    Labels in which no ``<prefix><digits>`` pattern can be found are skipped
    and their cells are left untouched.
    """
    RE_PATTERN = r"(.+?)(\d+)"

    def split_code(label):
        # -> (department prefix, hundreds band of the course number), or None
        found = re.match(RE_PATTERN, label)
        if found is None:
            return None
        return found.group(1), int(found.group(2)) // 100

    # Parse every column of *this* frame once (not once per row) and bucket the
    # columns by department, so each row is only compared with its own
    # department's columns.
    cols_by_dept = {}
    for col in df.columns:
        parsed = split_code(col)
        if parsed is None:
            continue
        col_dept, col_level = parsed
        cols_by_dept.setdefault(col_dept, []).append((col, col_level))

    for ind in df.index:
        parsed = split_code(ind)
        if parsed is None:
            continue
        ind_dept, ind_level = parsed
        for col, col_level in cols_by_dept.get(ind_dept, ()):
            # same department, course code's hundreds digit is higher
            if col_level > ind_level:
                df.loc[ind, col] = None
                
def rowwise_normalize(df):
    """Return a new DataFrame with every row divided by that row's sum.

    The input frame is not modified. There is no guard against a row whose
    sum is zero.
    """
    row_totals = df.sum(axis=1)
    # align the totals with the row index so each row is scaled by its own sum
    return df.div(row_totals, axis="index")

### high-achievement

In [6]:
# Load the four result/coefficient tables stored under the "high-ach" stem.
pathname = "high-ach"
(
    highach_results_full,
    highach_results_prereq,
    highach_coefs_full,
    highach_coefs_prereq,
) = load_csvs(pathname)

# Drop coefficients of higher-level courses from the same department
# (e.g. CS221 shouldn't be a predictor for CS106A).
# Note: this mutates highach_coefs_full in place.
remove_advcourse_coefs(highach_coefs_full)

# Scale every row so that its coefficients sum to 1, then show the result.
norm_highach_coefs_full = rowwise_normalize(highach_coefs_full)
display(norm_highach_coefs_full)

Unnamed: 0_level_0,CS161,CS109,MATH51,ECON1,CS103,PSYCH50,CS224U,CS221,CS145,CS148,...,PHYSICS364,COMM122,ME300B,MUSIC421A,COMM274D,GEOPHYS281,LINGUIST205A,PHIL180A,PHIL350A,MATH248
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CS276,0.137297,0.131646,0.108343,0.076447,0.069367,0.065319,0.064361,0.058513,0.052324,0.047778,...,,,,,,,,,,
CEE176B,0.012248,0.000000,0.034279,0.024518,0.000000,0.000000,0.000000,-0.007560,0.000000,0.000000,...,,,,,,,,,,
MS&E252,0.018333,0.059976,0.055950,0.023787,0.072977,0.017341,0.000000,0.014533,0.019238,0.000000,...,,,,,,,,,,
EARTHSYS111,0.000000,0.000000,0.034721,0.149019,0.019651,0.001486,,0.000000,,0.000000,...,,,,,,,,,,
PSYCH131,-0.036646,0.047647,0.168412,0.042378,0.059045,-0.067730,0.005659,-0.036608,0.000000,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CHEMENG162,,0.014011,0.018987,0.020595,0.014011,,,,,,...,,,,,,,,,,
ME140,,0.009182,0.030859,0.015122,-0.002391,,,0.001963,0.000000,0.002957,...,,,,,,,,,,
HUMBIO128,,,0.063443,0.000000,0.012847,0.013107,,,,,...,,,,,,,,,,
BIO144,,,-0.024443,0.084319,,0.043354,,,,,...,,,,,,,,,,


In [7]:
# The 50 most predictive courses in the dataset, in ranked order:
# column-wise sum of the normalized coefficients, largest first.
(
    norm_highach_coefs_full
    .sum()
    .sort_values(ascending=False)
    .head(50)
)

CS106B         13.208465
CS107          13.041753
MATH51         12.990425
CS103          12.886159
CME100          9.866666
CS109           8.810540
CHEM33          8.088419
MATH53          7.836511
CS106A          7.714381
PHYSICS43       6.384442
CS106X          5.841443
MATH52          5.705910
CHEM35          5.511242
CHEM31B         5.310957
ECON50          5.156150
CS161           5.082639
CHEM31X         4.995264
ME203           4.956580
CS110           4.812288
ENGR40M         4.762297
ENGR14          4.492100
ENGR30          4.140742
PHYSICS41       3.860779
PSYCH1          3.846731
CME102          3.726181
ME101           3.677626
ECON102A        3.557849
PHYSICS63       3.526665
CHEM31A         3.399451
CHEM131         3.306694
CME100A         3.265342
ECON1           3.136320
BIO41           3.130638
CS124           3.096313
CS229           3.094813
CHEM171         3.082893
PHYSICS61       3.001130
EE102A          2.980841
ENGR15          2.895953
ME112           2.744625


In [8]:
# DBSCAN over the transposed coefficient table: each column (course) of
# norm_highach_coefs_full becomes one sample; missing coefficients count as 0.
clustering = DBSCAN(min_samples=2, eps=0.1).fit(
    norm_highach_coefs_full.fillna(0).T
)

In [9]:
# Collect column names of norm_highach_coefs_full under the cluster label
# assigned to each one (labels_ is in the same order as the columns).
clusters = {}
for course, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(course)
pprint(clusters)

{-1: ['CS161',
      'CS109',
      'MATH51',
      'ECON1',
      'CS103',
      'PSYCH50',
      'CS224U',
      'CS221',
      'CS145',
      'CS148',
      'MATH104',
      'CS229',
      'CS110',
      'CS246',
      'CS131',
      'CS107',
      'PHYSICS61',
      'CME102',
      'CS106B',
      'CS147',
      'CS124',
      'PHYSICS41',
      'CHEM33',
      'STATS116',
      'MATH115',
      'ME70',
      'ENGR30',
      'AA100',
      'MATH108',
      'LINGUIST130A',
      'CS231N',
      'ENGR10',
      'PHYSICS45',
      'ECON52',
      'ECON102A',
      'ECON102B',
      'STATS60',
      'MATH41',
      'CS106X',
      'PSYCH80',
      'CS238',
      'CME100',
      'MATH42',
      'CS224N',
      'PHYSICS43',
      'CS142',
      'CS155',
      'EE108',
      'EE263',
      'EE261',
      'PHYSICS21',
      'MATH131P',
      'STATS200',
      'MS&E111',
      'CS144',
      'CS181',
      'ENGR50',
      'CS168',
      'CS227B',
      'HUMBIO2A',
      'CS248',
      'PHYS

In [10]:
# OPTICS over the transposed coefficient table: each column (course) of
# norm_highach_coefs_full becomes one sample; missing coefficients count as 0.
clustering = OPTICS(min_samples=2).fit(
    norm_highach_coefs_full.fillna(0).T
)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [11]:
# Collect column names of norm_highach_coefs_full under the cluster label
# assigned to each one (labels_ is in the same order as the columns).
clusters = {}
for course, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(course)
pprint(clusters)

{-1: ['CS161',
      'CS109',
      'MATH51',
      'ECON1',
      'CS103',
      'PSYCH50',
      'CS224U',
      'CS221',
      'CS145',
      'CS148',
      'MATH104',
      'CS229',
      'CS110',
      'CS246',
      'CS107',
      'PHYSICS61',
      'CS106B',
      'CS147',
      'PHYSICS41',
      'CS274',
      'CHEM33',
      'STATS116',
      'MATH115',
      'ME70',
      'MATH108',
      'LINGUIST130A',
      'CS231N',
      'PHYSICS45',
      'ECON52',
      'ECON102A',
      'ECON102B',
      'STATS60',
      'MATH41',
      'CS106X',
      'PSYCH80',
      'CS238',
      'CME100',
      'MATH42',
      'CS224N',
      'PHYSICS43',
      'CS142',
      'CS155',
      'EE108',
      'LINGUIST210A',
      'EE263',
      'CS205A',
      'EE261',
      'PHYSICS21',
      'MATH131P',
      'STATS206',
      'STATS200',
      'MS&E111',
      'CS144',
      'ENGR50',
      'CS168',
      'HUMBIO2A',
      'CS248',
      'PHYSICS25',
      'ECON141',
      'CS294S',
      'CS143

In [18]:
# HDBSCAN over the transposed coefficient table: each column (course) of
# norm_highach_coefs_full becomes one sample; missing coefficients count as 0.
clustering = HDBSCAN(min_samples=100, min_cluster_size=2).fit(
    norm_highach_coefs_full.fillna(0).T
)

In [19]:
# Collect column names of norm_highach_coefs_full under the cluster label
# assigned to each one (labels_ is in the same order as the columns).
clusters = {}
for course, label in zip(norm_highach_coefs_full.columns, clustering.labels_):
    clusters.setdefault(label, []).append(course)
pprint(clusters)

{-1: ['CS161',
      'CS109',
      'MATH51',
      'ECON1',
      'CS103',
      'PSYCH50',
      'CS224U',
      'CS221',
      'CS145',
      'CS148',
      'MATH104',
      'CS229',
      'CS110',
      'CS246',
      'CS131',
      'CS107',
      'PHYSICS61',
      'CME102',
      'CS106B',
      'CS147',
      'CS124',
      'PHYSICS41',
      'CS274',
      'CHEM33',
      'STATS116',
      'MATH115',
      'ME70',
      'ENGR30',
      'AA100',
      'MATH108',
      'LINGUIST130A',
      'CS231N',
      'ENGR10',
      'PHYSICS45',
      'ECON52',
      'ECON102A',
      'ECON102B',
      'STATS60',
      'CS245',
      'MATH41',
      'CS106X',
      'PSYCH80',
      'CS238',
      'CME100',
      'MATH42',
      'CS224N',
      'PHYSICS43',
      'CS142',
      'CS155',
      'CS276',
      'EE108',
      'LINGUIST210A',
      'EE263',
      'CS205A',
      'EE261',
      'PHYSICS21',
      'MATH131P',
      'STATS206',
      'STATS200',
      'MS&E111',
      'CS144',
     