In [14]:
import os
import pickle
import re
from pprint import pprint

import numpy as np
import pandas as pd
import sklearn
import tqdm.auto as tqdm
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS

# Kept last so the names it injects shadow earlier imports exactly as before.
from explorecourses import *

In [4]:
def load_csvs(pathname, results_dir="../results/"):
    """Load the four CSV tables saved for one model run.

    The files read are ``<results_dir>/<pathname>_full.csv``,
    ``..._prereq.csv``, ``..._coefs_full.csv`` and ``..._coefs_prereq.csv``.
    In each file the unnamed first column (which pandas reads back as
    ``"Unnamed: 0"``) is renamed to ``"course"`` and used as the index.

    Parameters
    ----------
    pathname : str
        Common filename prefix of the run, without suffix or extension.
    results_dir : str, optional
        Directory containing the CSV files. Defaults to ``"../results/"``,
        which is the location this function always used before the
        parameter existed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(results_full_df, results_prereq_df, coefs_full_df, coefs_prereq_df)``
    """
    # Order matters: it defines the order of the returned tuple.
    suffixes = ("_full.csv", "_prereq.csv", "_coefs_full.csv", "_coefs_prereq.csv")

    def read_course_table(suffix):
        # One shared reader so all four tables are loaded identically.
        csv_path = os.path.join(results_dir, pathname + suffix)
        return (
            pd.read_csv(csv_path, low_memory=False)
            .rename(columns={"Unnamed: 0": "course"})
            .set_index("course")
        )

    return tuple(read_course_table(suffix) for suffix in suffixes)

# modifies the df in place
def remove_advcourse_coefs(df):
    """Blank out coefficients of more advanced courses in the same department.

    Row and column labels are course codes such as ``"CS106A"``. Each code is
    split into a department prefix and the first run of digits after it. For
    every row, any column from the same department whose hundreds level
    (``number // 100``) is strictly higher than the row's is set to NaN, e.g.
    the ``CS221`` column is blanked in the ``CS106A`` row.

    The frame is modified in place and nothing is returned.

    Labels that do not contain a number after at least one leading character
    cannot be assigned a level; such rows and columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Coefficient table indexed by course code with course-code columns.
    """
    course_re = re.compile(r"(.+?)(\d+)")

    def parse(code):
        # Returns (department, hundreds level) or None when the code has no number.
        found = course_re.match(code)
        if found is None:
            return None
        return found.group(1), int(found.group(2)) // 100

    # Parse every column label once instead of once per (row, column) pair.
    columns_by_dept = {}
    for col in df.columns.unique():
        parsed = parse(col)
        if parsed is not None:
            col_dept, col_level = parsed
            columns_by_dept.setdefault(col_dept, []).append((col_level, col))

    for ind in df.index.unique():
        parsed = parse(ind)
        if parsed is None:
            continue
        ind_dept, ind_level = parsed

        # same department, course code's hundreds digit is higher
        advanced = [
            col
            for col_level, col in columns_by_dept.get(ind_dept, [])
            if col_level > ind_level
        ]
        if advanced:
            df.loc[ind, advanced] = np.nan
                
def rowwise_normalize(df):
    """Return a copy of ``df`` with every row divided by that row's sum.

    Missing values are skipped when summing, so a row whose values total
    zero becomes NaN in every position that held a zero.
    """
    row_totals = df.sum(axis=1)
    return df.divide(row_totals, axis="index")

### LASSO positive-only, variable lambda

In [5]:
pathname = "lasso_positive_variable_lambda"

# Four tables for this run: results and coefficients, each in a "full" and a "prereq" variant.
(
    lasso_positive_results_full,
    lasso_positive_results_prereq,
    lasso_positive_coefs_full,
    lasso_positive_coefs_prereq,
) = load_csvs(pathname)

# Drop coefficients of higher-level courses from the same department
# (e.g. CS221 shouldn't be a predictor for CS106A). This edits the frame in place.
remove_advcourse_coefs(lasso_positive_coefs_full)

# Scale each row so that its coefficients sum to 1.
norm_lasso_positive_coefs_full = rowwise_normalize(lasso_positive_coefs_full)
display(norm_lasso_positive_coefs_full)

Unnamed: 0_level_0,CME100,CHEM31X,CME102,CHEM33,ENGR20,CME104,CHEM35,CHEMENG100,CHEM131,CHEMENG120A,...,COMM277A,PHYSICS364,CHEMENG345,PSYCH178,MATH138,GEOPHYS146A,CEE274D,PHIL350A,MATH248,GEOPHYS281
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMENG162,,,,,,,,,,,...,,,,,,,,,,
MS&E245G,0.0,0.0,0.0,0.0,,0.0,,,,,...,,,,,,,,,,
PUBLPOL105,,,,,,,,,,,...,,,,,,,,,,
PHYSICS61,,,,,,,,,,,...,,,,,,,,,,
MS&E145,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MATH147,,,,,,,,,,,...,,,,,,,,,,
MATH175,,,,,,,,,,,...,,,,,,,,,,
MATH146,,0.0,0.0,,,,,,,,...,,,,,,,,,,
MATH145,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Labels of the rows in which every normalized coefficient is missing.
idx = norm_lasso_positive_coefs_full.index[
    norm_lasso_positive_coefs_full.isnull().all(axis=1)
]
# Those rows, ordered by the "course" index level.
nans = norm_lasso_positive_coefs_full.loc[idx].sort_values(by="course")
nans

Unnamed: 0_level_0,CME100,CHEM31X,CME102,CHEM33,ENGR20,CME104,CHEM35,CHEMENG100,CHEM131,CHEMENG120A,...,COMM277A,PHYSICS364,CHEMENG345,PSYCH178,MATH138,GEOPHYS146A,CEE274D,PHIL350A,MATH248,GEOPHYS281
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA100,,,,,,,,,,,...,,,,,,,,,,
AA279A,,,,,,,,,,,...,,,,,,,,,,
BIO101,,,,,,,,,,,...,,,,,,,,,,
BIO115,,,,,,,,,,,...,,,,,,,,,,
BIO144,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
STATS141,,,,,,,,,,,...,,,,,,,,,,
STATS191,,,,,,,,,,,...,,,,,,,,,,
STATS203,,,,,,,,,,,...,,,,,,,,,,
STATS217,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Course codes of all rows that are entirely NaN, in the order they appear in `nans`.
# Read straight from the index so the result does not depend on any particular
# column (the previous loop required a 'MATH51' column just to iterate).
nan_courses = nans.index.tolist()
nan_courses

['AA100',
 'AA279A',
 'BIO101',
 'BIO115',
 'BIO144',
 'BIOE123',
 'BIOE44',
 'CEE166B',
 'CEE176B',
 'CEE180',
 'CEE181',
 'CEE70',
 'CHEM134',
 'CHEM135',
 'CHEM176',
 'CHEM31A',
 'CHEMENG120B',
 'CHEMENG162',
 'CHEMENG174',
 'CHEMENG185A',
 'CHEMENG20',
 'CHEMENG25B',
 'CME100',
 'CME100A',
 'CME102A',
 'CME104A',
 'CME108',
 'COMM106',
 'COMM108',
 'CS106A',
 'CS106B',
 'CS106X',
 'CS107E',
 'CS131',
 'CS147',
 'CS148',
 'CS181',
 'CS193A',
 'CS194H',
 'CS210B',
 'CS224N',
 'CS224S',
 'CS224U',
 'CS227B',
 'CS240',
 'CS242',
 'CS245',
 'CS247',
 'CS248',
 'CS261',
 'CS265',
 'CS279',
 'CS348B',
 'CS377U',
 'EARTHSYS111',
 'EARTHSYS112',
 'ECON1',
 'ECON111',
 'ECON112',
 'ECON126',
 'ECON136',
 'ECON140',
 'ECON141',
 'ECON143',
 'ECON145',
 'ECON159',
 'ECON178',
 'EE103',
 'EE107',
 'EE134',
 'EE142',
 'EE155',
 'EE178',
 'EE222',
 'EE271',
 'EE364A',
 'ENERGY102',
 'ENERGY104',
 'ENERGY160',
 'ENGR10',
 'ENGR20',
 'ENGR205',
 'ENGR30',
 'ENGR50',
 'ENGR62',
 'HUMBIO128',
 'HUMBI

In [8]:
# Keep only the rows that have at least one non-missing normalized coefficient.
non_nans = norm_lasso_positive_coefs_full.drop(nans.index, axis=0)
# Every remaining row must sum to 1 up to floating-point error.
# (`.all() == 1.0` compared a single bool with 1.0, so it only verified that
# the row sums were non-zero.)
assert np.allclose(non_nans.sum(axis=1), 1.0), "normalized rows do not all sum to 1"
non_nans

Unnamed: 0_level_0,CME100,CHEM31X,CME102,CHEM33,ENGR20,CME104,CHEM35,CHEMENG100,CHEM131,CHEMENG120A,...,COMM277A,PHYSICS364,CHEMENG345,PSYCH178,MATH138,GEOPHYS146A,CEE274D,PHIL350A,MATH248,GEOPHYS281
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MS&E245G,0.000000,0.000000,0.000000,0.000000,,0.0,,,,,...,,,,,,,,,,
CME106,0.336257,0.000000,0.026704,0.179617,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
BIO42,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
PSYCH30,0.030488,0.000000,0.000000,0.044007,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,
PUBLPOL104,0.000000,0.000000,0.000000,0.000000,,,0.000000,,0.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PHYSICS21,0.075049,0.124545,0.000000,0.061820,,0.0,0.087371,,0.0,,...,,,,,,,,,,
CHEMENG150,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.465792,0.0,0.0,...,,,,,,,,,,
STATS202,0.066800,0.000000,0.000000,0.053808,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,,,,,,,,,
ECON102B,0.000000,0.003462,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,,,,,,,,,,


In [10]:
# list of all preparatory courses
prep_courses = non_nans.columns[(non_nans.sum(axis=0) != 0)]
prep_courses

Index(['CME100', 'CHEM31X', 'CME102', 'CHEM33', 'ENGR20', 'CME104', 'CHEM35',
       'CHEMENG100', 'CHEM131', 'CS106A',
       ...
       'COMM113', 'GS4', 'PHIL50', 'STATS141', 'CS272', 'GS90', 'ECON154',
       'MATH180', 'ECON125', 'EE42'],
      dtype='object', length=215)

In [11]:
# list of all target courses
target_courses = non_nans.index
target_courses

Index(['MS&E245G', 'CME106', 'BIO42', 'PSYCH30', 'PUBLPOL104', 'MS&E252',
       'MATH42', 'BIOE41', 'AA242A', 'BIOE101',
       ...
       'PHYSICS121', 'ME227', 'CS223A', 'ME80', 'EE65', 'PHYSICS21',
       'CHEMENG150', 'STATS202', 'ECON102B', 'MATH146'],
      dtype='object', name='course', length=155)

In [41]:
# The 20 columns with the largest column-wise coefficient totals.
top_preps = non_nans.sum().sort_values(ascending=False).iloc[:20].index
set(top_preps) - set(target_courses)  # top prep courses that don't have a target course model

{'CHEM171',
 'CME100',
 'CS106A',
 'CS106B',
 'CS106X',
 'ECON1',
 'ECON50',
 'MATH51',
 'PSYCH1'}

#### Column-wise sums: total weight of each preparatory course

In [28]:
def get_course_name(row, connect=None):
    """Look up the title of the course named in ``row["Course code"]``.

    The code is split into a subject and a catalogue number, the full code is
    sent as a search query, and the title of the first result whose
    ``subject`` and ``code`` both match exactly is returned.

    Parameters
    ----------
    row : mapping
        Anything supporting ``row["Course code"]`` (e.g. a DataFrame row).
    connect : optional
        Object exposing ``get_courses_by_query(query)``. When omitted, a new
        ``CourseConnection()`` is created for this call. Pass a shared
        connection to avoid building one per row.

    Returns
    -------
    The matching course's ``title``, or ``None`` when the code cannot be
    parsed or no search result matches exactly.
    """
    # '&' is accepted in the subject so that "MS&E245G" splits into
    # ("MS&E", "245G") instead of ("E", "245G").
    # NOTE(review): presumably the service reports that subject as "MS&E" — confirm.
    BASIC_PATTERN = r"([A-Z&]+)\s*([\d]+[A-Z]*)"
    course_code = row["Course code"]

    parsed = re.search(BASIC_PATTERN, course_code)
    if parsed is None:
        # Nothing to compare search results against.
        return None
    dept, num = parsed.groups()

    if connect is None:
        connect = CourseConnection()
    for course in connect.get_courses_by_query(course_code):
        if course.subject == dept and course.code == num:
            return course.title
    return None

In [29]:
# Column-wise coefficient totals, largest first, truncated to as many entries
# as there are preparatory courses.
table = (
    non_nans.sum()
    .sort_values(ascending=False)
    .iloc[: len(prep_courses)]
    .to_frame()
    .reset_index()
    .rename(columns={"index": "Course code", 0: "Weight"})
)
# Share of the summed weight held by each course.
table["Prop. of total weight"] = table["Weight"] / table["Weight"].sum()
# One title lookup per row.
table["Course title"] = table.apply(get_course_name, axis=1)

In [31]:
# Keep only the columns to be reported, in display order.
table = table.loc[:, ["Course code", "Course title", "Prop. of total weight"]]
# Emit the first 20 rows as a LaTeX tabular; print() is needed to get the raw markup.
print(table.head(20).to_latex(index=False))

\begin{tabular}{llr}
\toprule
Course code &                                       Course title &  Prop. of total weight \\
\midrule
     MATH51 &  Linear Algebra, Multivariable Calculus, and Mo... &               0.075462 \\
      CS107 &                  Computer Organization and Systems &               0.072055 \\
     CS106B &                           Programming Abstractions &               0.041371 \\
      CS103 &              Mathematical Foundations of Computing &               0.033710 \\
     CHEM33 &      Structure and Reactivity of Organic Molecules &               0.031736 \\
     CME102 &  Ordinary Differential Equations for Engineers ... &               0.029191 \\
     CME100 &           Vector Calculus for Engineers (ENGR 154) &               0.027938 \\
     ECON50 &                                Economic Analysis I &               0.027140 \\
     CHEM35 &                                               None &               0.027070 \\
      CS110 &                  

#### Clustering experiments (exploratory; the results were not good)

In [None]:
# NOTE(review): `norm_highach_coefs_full` is not defined in any cell visible in
# this notebook — confirm where it is created before running this cell.
# The frame is transposed so that each column becomes one sample; NaNs are filled with 0.
clustering = DBSCAN(eps=0.4, min_samples=2).fit(norm_highach_coefs_full.fillna(0).T)
# Group column names by the cluster label assigned to them.
clusters = {}
for i, label in enumerate(clustering.labels_):
    clusters.setdefault(label, []).append(norm_highach_coefs_full.columns[i])
pprint(clusters)

In [None]:
# NOTE(review): `norm_highach_coefs_full` is not defined in any cell visible in
# this notebook — confirm where it is created before running this cell.
# The frame is transposed so that each column becomes one sample; NaNs are filled with 0.
clustering = OPTICS(min_samples=2).fit(norm_highach_coefs_full.fillna(0).T)
# Group column names by the cluster label assigned to them.
clusters = {}
for i, label in enumerate(clustering.labels_):
    clusters.setdefault(label, []).append(norm_highach_coefs_full.columns[i])
pprint(clusters)

In [None]:
# NOTE(review): `norm_highach_coefs_full` is not defined in any cell visible in
# this notebook — confirm where it is created before running this cell.
# The frame is transposed so that each column becomes one sample; NaNs are filled with 0.
clustering = HDBSCAN(min_cluster_size=2, min_samples=100).fit(norm_highach_coefs_full.fillna(0).T)
# Group column names by the cluster label assigned to them.
clusters = {}
for i, label in enumerate(clustering.labels_):
    clusters.setdefault(label, []).append(norm_highach_coefs_full.columns[i])
pprint(clusters)