# Libraries

In [72]:
import numpy as np
import pandas as pd
import pickle
import tqdm.auto as tqdm
from pprint import pprint
import collections
import random
import feather

# Use restricted_wcc dataset (only courses from WCC, no summer quarters; keeps only rows from the last five years or from students with a course in the last five years) from wrangling_restricted_wcc_dataset.ipynb

In [2]:
# Relative path to the pickled restricted-WCC course-record table.
WCC_PD_PATH = "../data/wcc_pd.pkl"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by our own wrangling notebook.
with open(WCC_PD_PATH, "rb") as file:
    raw_data = pickle.load(file)
    
# Display the loaded table (presumably one row per student course enrollment -- confirm upstream).
raw_data

Unnamed: 0,acad_career,strm,stdnt_enrl_status,unt_taken,unt_billing,crse_grade_input,earn_credit,emplid,subject,catalog_nbr,crse_acad_org,sex,course_name
0,UG,1182.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,MATH,51,MATH,M,MATH51
2,UG,1182.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CHEM,31X,CHEMISTRY,M,CHEM31X
4,UG,1184.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CS,106X,COMPUTSCI,M,CS106X
5,UG,1184.0,E,5.0,5.0,B+,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CHEM,33,CHEMISTRY,M,CHEM33
7,UG,1184.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,PHIL,1,PHILOSOPHY,M,PHIL1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
784334,UG,1202.0,E,5.0,5.0,B+,Y,zYPvt/OnDJZcB1qpg0uKI7zFYa5srO2CCsIU8VsIDeWYy+...,PSYCH,1,PSYCHOLOGY,F,PSYCH1
784339,UG,1192.0,E,5.0,5.0,A,Y,zYZXgYeShvA/cD8TzNRRLxzbAblBrpRdc8KW5geNAtn0Px...,PSYCH,1,PSYCHOLOGY,F,PSYCH1
784340,UG,1196.0,E,5.0,5.0,B+,Y,zYZXgYeShvA/cD8TzNRRLxzbAblBrpRdc8KW5geNAtn0Px...,STATS,60,STATISTICS,F,STATS60
784342,UG,1202.0,E,6.0,6.0,B-,Y,zz5j89pAbihrd+eaiLAddrxwyMh4sbs5K1W8uNxhwe5ome...,CME,100A,ICME,M,CME100A


In [3]:
# Distinct course identifiers present in raw_data (an unordered set), followed by how many there are.
all_courses = set(raw_data["course_name"])
pprint(all_courses)
print(len(all_courses))

{'AA100',
 'AA200',
 'AA210A',
 'AA212',
 'AA228',
 'AA240A',
 'AA240B',
 'AA241A',
 'AA241B',
 'AA241X',
 'AA242A',
 'AA256',
 'AA272C',
 'AA279A',
 'AA279B',
 'AA280',
 'APPPHYS273',
 'BIO101',
 'BIO104',
 'BIO107',
 'BIO115',
 'BIO144',
 'BIO153',
 'BIO156',
 'BIO158',
 'BIO163',
 'BIO173',
 'BIO188',
 'BIO189',
 'BIO234',
 'BIO239',
 'BIO256',
 'BIO258',
 'BIO41',
 'BIO42',
 'BIO43',
 'BIO44Y',
 'BIOE101',
 'BIOE103',
 'BIOE103B',
 'BIOE123',
 'BIOE141A',
 'BIOE141B',
 'BIOE201C',
 'BIOE210',
 'BIOE211',
 'BIOE212',
 'BIOE244',
 'BIOE313',
 'BIOE355',
 'BIOE41',
 'BIOE42',
 'BIOE44',
 'BIOE454',
 'BIOHOPK163H',
 'BIOHOPK167H',
 'BIOHOPK263H',
 'BIOMEDIN156',
 'BIOMEDIN210',
 'BIOMEDIN212',
 'BIOMEDIN214',
 'BIOMEDIN215',
 'BIOMEDIN217',
 'BIOMEDIN224',
 'BIOMEDIN225',
 'BIOMEDIN260',
 'BIOMEDIN371',
 'BIOPHYS371',
 'CEE101A',
 'CEE101B',
 'CEE101C',
 'CEE160',
 'CEE161A',
 'CEE164',
 'CEE166A',
 'CEE166B',
 'CEE169',
 'CEE171',
 'CEE172',
 'CEE172A',
 'CEE176B',
 'CEE177',
 'CEE178

In [4]:
# descriptive statistics
# NOTE: the first figure is simply the number of rows in raw_data; this cell does not de-duplicate records.
print(len(raw_data), "unique course records")
print(len(raw_data["emplid"].unique()), "unique students")
print(len(raw_data["course_name"].unique()), "unique courses")

166194 unique course records
13218 unique students
878 unique courses


In [5]:
# Grades that count as a recorded outcome for the target course; rows with any other grade are ignored.
LETTER_GRADES = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "NP", "W"]

############################################################################################################
# 2020-02-23 Initial model settings:
# Predict success in a course (B+ or better) based on having passed other courses previously (D- or better)
# Incomplete data for strm 1204 (2019-2020 Winter)
# Only consider courses taken at or after strm 1162 (2015-2016 Autumn)
# Y1GRADES = ["A+", "A", "A-", "B+"]
# X1GRADES = ["A+", "A", "A-", "B+", "B", "B-",
#             "C+", "C", "C-", "D+", "D", "D-"]
# UNFINISHED_QUARTERS = [1204]
# CUTOFF_QUARTER = 1162

Y1GRADES = ["A+", "A", "A-", "B+"] # we encode these grades as y = 1, otherwise y = 0
X1GRADES = ["A+", "A", "A-", "B+", "B", "B-",
            "C+", "C", "C-", "D+", "D", "D-"] # for previous courses, encode these grades as X_i = 1
UNFINISHED_QUARTERS = [1204] # unfinished quarter(s) if pulled during a quarter
CUTOFF_QUARTER = 1162

def getStudentFeatures(course_name, gender = False, data = None):
    """Build the feature matrix X and outcome y for one target course.

    One output row is produced per qualifying enrollment in ``course_name``:
    the grade must be in LETTER_GRADES, the enrollment status must not be 'D'
    (dropped), and the term must be at or after CUTOFF_QUARTER and not in
    UNFINISHED_QUARTERS.

    Columns, in order:
      * one 0/1 indicator per course that any of those students appears with in
        ``data`` (regardless of when it was taken, and including the target
        course itself). The indicator is 1 when that row's student earned a
        grade in X1GRADES for the course in a term strictly before the term of
        the target enrollment.
      * "strm"   -- term of the target enrollment.
      * "female" -- only when ``gender`` is True; 0.0 when sex == 'M', otherwise 1.0
                    (so missing or other values are also encoded as 1.0).
      * "emplid" -- student identifier.
      * "y"      -- 1.0 when the target grade is in Y1GRADES, otherwise 0.0.

    Parameters:
        course_name: value of the "course_name" column identifying the target course.
        gender: add the "female" column when True.
        data: course-record table to read; defaults to the notebook-level
            ``raw_data``. Needs the columns course_name, crse_grade_input,
            stdnt_enrl_status, strm and emplid (plus sex when gender is True).
            Assumes course_name is never null -- TODO confirm upstream.

    Returns:
        A DataFrame with y appended to the right of X; split them off later in
        get_and_split_data(). The strm and emplid columns must be dropped
        before fitting a model.
    """
    if data is None:
        data = raw_data

    is_target = (
        (data['course_name'] == course_name)               # rows about the course of interest
        & data['crse_grade_input'].isin(LETTER_GRADES)     # grade needs to be a letter
        & (data['stdnt_enrl_status'] != 'D')               # not dropped
        & (data['strm'] >= CUTOFF_QUARTER)                 # outcome during or after the cutoff
        & ~data['strm'].isin(UNFINISHED_QUARTERS)          # we don't have outcomes for these
    )
    course_entries = data.loc[is_target]

    # Every record of every student who has a qualifying target enrollment.
    history = data.loc[data['emplid'].isin(course_entries['emplid'])]
    course_columns = history['course_name'].unique().tolist()
    column_of = {name: position for position, name in enumerate(course_columns)}

    # Index each student's passed courses once instead of re-filtering the table per row.
    passed = history.loc[history['crse_grade_input'].isin(X1GRADES)]
    passed_by_student = {
        emplid: (records['strm'].to_numpy(),
                 records['course_name'].map(column_of).to_numpy(dtype = int))
        for emplid, records in passed.groupby('emplid')
    }

    indicators = np.zeros(shape = (len(course_entries), len(course_columns)))
    targets = zip(course_entries['emplid'], course_entries['strm'])
    for row, (emplid, quarter) in enumerate(targets):
        if emplid not in passed_by_student:
            continue
        passed_strms, passed_columns = passed_by_student[emplid]
        # Only courses from strictly earlier terms may act as predictors.
        indicators[row, passed_columns[passed_strms < quarter]] = 1

    full_matrix = pd.DataFrame(indicators, columns = course_columns)
    # Metadata columns are attached with their own dtype; writing the string emplid
    # into a float matrix column relies on deprecated silent upcasting.
    full_matrix["strm"] = course_entries['strm'].to_numpy(dtype = float)
    if gender:
        full_matrix["female"] = (course_entries['sex'] != 'M').to_numpy(dtype = float)
    full_matrix["emplid"] = course_entries['emplid'].to_numpy()
    full_matrix["y"] = course_entries['crse_grade_input'].isin(Y1GRADES).to_numpy(dtype = float)
    return full_matrix

In [6]:
# The dataset includes a y column, which is the target. It also includes strm and emplid columns
# that should be dropped before running any prediction.
features_cs110 = getStudentFeatures('CS110')
features_cs110

Unnamed: 0,MATH51,CHEM31X,CS106X,CHEM33,PHIL1,CS103,CHEM35,CS109,CS161,CS107,...,PHYSICS230,PHYSICS231,BIO153,MATH220,PHYSICS62,ME105,AA272C,strm,emplid,y
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1192.0,$2a$15$5tXEOl2owViV9E5K1pJ/Luf.44w/Ci69ZBbFT6o...,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1186.0,$2a$15$aHTw2jbPTbRKGeDBfcQ25.KPs1kJF/UqcPukXAC...,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1194.0,$2a$15$CgRoc5cbNZ0QTWMIqNSbu..3w/a0GkYDa3ktG1x...,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1186.0,$2a$15$Ejn9P.vrIOxTGiqTGICiDeLirjfUfjv4xBdI4h6...,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,YUqcVvVPHekQ3pk5dsZJb91EacfOFwSq9yrH4Gab7GbT8O...,1.0
1535,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1196.0,YX37GM6UxzX5a19UpwoQ6J4bq4x6Obi/zOIjqRmAfPWmBc...,1.0
1536,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1194.0,zeMsirklPKg0Ts2/jhhHTE4QTDubdJ6VAh420a2SxyZdWj...,1.0
1537,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,zmNzWWYP45nMMEl7vSBuaNnqWtUycLozveGSKlGoXanF8B...,1.0


In [7]:
# Sanity check on one randomly chosen CS110 row: the columns flagged 1 should match the
# student's raw records from earlier terms. Remember Y1GRADES is B+ or better for y = 1.
SAMPLE_ROW_INDEX = random.randint(0, len(features_cs110) - 1)
sample_row = features_cs110.iloc[SAMPLE_ROW_INDEX]
sample_ones = sample_row[sample_row.eq(1)]

print(sample_ones)
print(len(sample_ones), "rows with 1")

sample_emplid = sample_row["emplid"]
print("\nemplid:", sample_emplid)
sample_strm = sample_row["strm"]
print("strm for CS110:", sample_strm, "\n")

# Raw records for the same student: everything before the sampled term, plus CS110 itself.
same_student = raw_data["emplid"] == sample_emplid
earlier_or_target = (raw_data["strm"] < sample_strm) | (raw_data["course_name"] == "CS110")
sample_history = raw_data[same_student & earlier_or_target][["course_name", "crse_grade_input"]]
print(sample_history)
print(len(sample_history), "rows recorded")

CS103      1
CS109      1
CS107      1
MATH120    1
CS142      1
MATH51H    1
CS261      1
CS265      1
y          1
Name: 394, dtype: object
9 rows with 1

emplid: $2b$15$DnnD9NtJYCCrE3O7S1y1veI9Z.RwjYxxswU2jIVJYErPd/dS9uRqm
strm for CS110: 1174.0 

       course_name crse_grade_input
215974       CS103               A+
215975     MATH51H                A
215977       CS109                A
215978       CS261               A+
215980       CS107                A
215981       CS142               B+
215983       CS265               B+
215985     MATH120               A-
215986       CS110                A
9 rows recorded


In [8]:
# Class balance of the CS110 target: count of rows per label, then the proportion with y = 1.
print(collections.Counter(features_cs110.y))
print(features_cs110["y"].mean())

Counter({0.0: 870, 1.0: 669})
0.4346978557504873


In [9]:
# Same CS110 feature matrix, this time with the extra 0/1 "female" column requested via gender = True.
features_cs110_g = getStudentFeatures("CS110", gender = True)
features_cs110_g

Unnamed: 0,MATH51,CHEM31X,CS106X,CHEM33,PHIL1,CS103,CHEM35,CS109,CS161,CS107,...,PHYSICS231,BIO153,MATH220,PHYSICS62,ME105,AA272C,strm,female,emplid,y
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,0.0,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1192.0,0.0,$2a$15$5tXEOl2owViV9E5K1pJ/Luf.44w/Ci69ZBbFT6o...,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1186.0,0.0,$2a$15$aHTw2jbPTbRKGeDBfcQ25.KPs1kJF/UqcPukXAC...,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1194.0,1.0,$2a$15$CgRoc5cbNZ0QTWMIqNSbu..3w/a0GkYDa3ktG1x...,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1186.0,0.0,$2a$15$Ejn9P.vrIOxTGiqTGICiDeLirjfUfjv4xBdI4h6...,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,0.0,YUqcVvVPHekQ3pk5dsZJb91EacfOFwSq9yrH4Gab7GbT8O...,1.0
1535,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1196.0,0.0,YX37GM6UxzX5a19UpwoQ6J4bq4x6Obi/zOIjqRmAfPWmBc...,1.0
1536,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1194.0,0.0,zeMsirklPKg0Ts2/jhhHTE4QTDubdJ6VAh420a2SxyZdWj...,1.0
1537,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,0.0,zmNzWWYP45nMMEl7vSBuaNnqWtUycLozveGSKlGoXanF8B...,1.0


In [10]:
# Sanity check on one randomly chosen row of the gender-augmented CS110 matrix: the columns
# flagged 1 should match the student's raw records. Remember Y1GRADES is B+ or better for y = 1.
SAMPLE_ROW_INDEX = random.randint(0, len(features_cs110_g) - 1)
sample_row = features_cs110_g.iloc[SAMPLE_ROW_INDEX]
sample_ones = sample_row[sample_row.eq(1)]

print(sample_ones)
print(len(sample_ones), "rows with 1")

sample_emplid = sample_row["emplid"]
print("\nemplid:", sample_emplid)
sample_strm = sample_row["strm"]
print("strm for CS110:", sample_strm, "\n")

# Raw records for the same student: everything before the sampled term, plus CS110 itself.
same_student = raw_data["emplid"] == sample_emplid
earlier_or_target = (raw_data["strm"] < sample_strm) | (raw_data["course_name"] == "CS110")
sample_history = raw_data[same_student & earlier_or_target][["course_name", "crse_grade_input", "sex"]]
print(sample_history)
print(len(sample_history), "rows recorded")

MATH51       1
CS107        1
CS106A       1
CS106B       1
PHYSICS45    1
ENGR14       1
y            1
Name: 648, dtype: object
7 rows with 1

emplid: $2b$15$K1fvGQrJibI27TJVmtvJgekqeqv6WEqDWb4yE98W4.2l.MD2gI2Fq
strm for CS110: 1182.0 

       course_name crse_grade_input sex
360930      CS106A                A   M
360933   PHYSICS45                A   M
360936      ENGR14               A+   M
360937      CS106B               A+   M
360938       CS107               B+   M
360939      MATH51                A   M
360941       CS110               A-   M
7 rows recorded


# Logistic regression models

In [44]:
from sklearn.linear_model import LogisticRegression

In [49]:
UNSEEN_QUARTERS = [1204]
TEST_QUARTERS = [1194, 1196] # aka "dev" quarters

def get_and_split_data(course_name, test_quarters = TEST_QUARTERS, gender = False):
    """Build the features for ``course_name`` and split them into train/test by quarter.

    Rows whose strm is in UNSEEN_QUARTERS are discarded. Rows whose strm is in
    ``test_quarters`` form the test set and every remaining row forms the
    training set. The strm and emplid columns are dropped from both sets, and
    the y column is split off as the target.

    Returns:
        (X_train, X_test, y_train, y_test, fail_flag). When either split is
        empty the four data items are None and fail_flag is True.
    """
    def split_features_and_target(frame):
        # Drop the bookkeeping columns, then separate predictors from the target.
        modelling_frame = frame.drop(["strm", "emplid"], axis=1)
        return modelling_frame.drop(["y"], axis=1), modelling_frame["y"]

    features = getStudentFeatures(course_name, gender)
    seen = features.loc[~features['strm'].isin(UNSEEN_QUARTERS)] # for now, dropping this quarter
    in_test = seen['strm'].isin(test_quarters)

    X_train, y_train = split_features_and_target(seen.loc[~in_test])
    X_test, y_test = split_features_and_target(seen.loc[in_test])

    # X and y of a split always have the same length, so checking X covers both.
    if len(X_train) == 0 or len(X_test) == 0:
        return None, None, None, None, True
    return X_train, X_test, y_train, y_test, False

In [63]:
# Takes the train/test split produced by get_and_split_data, fits a logistic
# regression model on the training part and reports its coefficients.

def get_coefs_from_split(X_train, X_test, y_train, y_test, fail_flag = False, sort = "abs"):
    """Fit a logistic regression and return its coefficients plus accuracy figures.

    Params:
        X_train, X_test, y_train, y_test: the split predictors and targets.
        fail_flag: when True the split is unusable and nothing is fitted.
        sort: "pos", "neg" or "abs" to order the coefficients by largest
            positive, largest negative or largest absolute value. Any other
            value (including a falsy one) leaves them in column order.

    Returns:
        (coefs, train_score, test_score, train_groundtruth, test_groundtruth, fail_flag)
        where coefs is a list of (column name, coefficient) pairs and each
        groundtruth is the mean of the corresponding y. When fail_flag is
        passed in as True, or y_train holds a single distinct value, the first
        five items are None and the returned flag is True.
    """
    if fail_flag or len(y_train.unique()) == 1:
        return None, None, None, None, None, True

    model = LogisticRegression(solver = "lbfgs")
    model.fit(X_train, y_train)

    coefs = list(zip(X_test.columns, model.coef_[0]))
    orderings = {
        "pos": {"key": lambda pair: pair[1], "reverse": True},
        "neg": {"key": lambda pair: pair[1], "reverse": False},
        "abs": {"key": lambda pair: abs(pair[1]), "reverse": True},
    }
    if sort in orderings:
        coefs = sorted(coefs, **orderings[sort])

    return (
        coefs,
        model.score(X_train, y_train),
        model.score(X_test, y_test),
        y_train.mean(),
        y_test.mean(),
        False,
    )

In [54]:
# fits a simple logistic regression for one course, prints out accuracies,
# ground truths, top coefficients

def simple_fit_and_report(course_name, test_quarters = TEST_QUARTERS, gender = False, sort = 'abs'):
    """Fit the baseline model for one course and print a short report.

    Params:
        course_name: target course to model.
        test_quarters: quarters held out as the test ("dev") set.
        gender: include the 0/1 "female" predictor when True.
        sort: coefficient ordering passed to get_coefs_from_split
            ("pos", "neg" or "abs").

    Prints the course banner, then either "COULD NOT FIT MODEL" or the
    train/test accuracies, the proportion of y = 1 in each split and the first
    ten coefficients. Returns None.
    """
    # Forward the caller's test_quarters and gender; they were previously accepted but ignored.
    X_train, X_test, y_train, y_test, fail_flag = get_and_split_data(course_name, test_quarters, gender)
    coefs, train_score, test_score, train_groundtruth, test_groundtruth, fail_flag = get_coefs_from_split(X_train, X_test, y_train, y_test, fail_flag, sort)
    print("*****", course_name, "*****")
    if fail_flag:
        print("COULD NOT FIT MODEL")
        return
    print("training accuracy: {}, ground truth (proportion y = 1): {}".format(train_score, train_groundtruth))
    print("test accuracy: {}, ground truth (proportion y = 1): {}".format(test_score, test_groundtruth))
    print("Top 10 coefs, sorted by '{}':".format(sort))
    pprint(coefs[:10])

In [56]:
# Baseline report for CS221 using the default dev quarters and coefficients sorted by absolute value.
simple_fit_and_report("CS221")

***** CS221 *****
training accuracy: 0.8021978021978022, ground truth (proportion y = 1): 0.7009419152276295
test accuracy: 0.7151162790697675, ground truth (proportion y = 1): 0.7906976744186046
Top 10 coefs, sorted by 'abs':
[('ENGR25B', -1.5517167933375016),
 ('CS166', 1.3159944883706742),
 ('CS131', 1.2293081114676299),
 ('MATH20', -1.2021107919219158),
 ('CS168', -1.2005719751113781),
 ('MATH171', 1.1544036634838484),
 ('CS246', -1.1405834477335415),
 ('CS376', -1.1355977540278221),
 ('STATS202', -1.117961860830787),
 ('SYMSYS100', -1.0936395843963151)]


In [55]:
# AA200 exercises the failure path: per the recorded output no model could be fit for it.
simple_fit_and_report("AA200")

***** AA200 *****
COULD NOT FIT MODEL


In [16]:
# Baseline report for CS110, the course whose feature matrix was inspected above.
simple_fit_and_report("CS110")

***** CS110 *****
training accuracy: 0.7523809523809524, ground truth (proportion y = 1): 0.42857142857142855
test accuracy: 0.6308243727598566, ground truth (proportion y = 1): 0.46236559139784944
Top 10 coefs, sorted by 'abs':
[('STATS202', -1.3405992270183344),
 ('BIO158', -1.197531680811751),
 ('MATH121', -1.1917176797583628),
 ('CS371', 1.1742892142735595),
 ('MATH41', -1.1139220892852943),
 ('ECON137', 1.077863465447102),
 ('EE108', 0.9797048403083871),
 ('MS&E221', 0.9780947213934755),
 ('PHIL102', -0.9322476213487048),
 ('OSPKYOTO40M', 0.9307112362463519)]


In [17]:
# Baseline report for CS106B with the default settings.
simple_fit_and_report("CS106B")

***** CS106B *****
training accuracy: 0.7422207664592204, ground truth (proportion y = 1): 0.6943989518506387
test accuracy: 0.6833013435700576, ground truth (proportion y = 1): 0.6621880998080614
Top 10 coefs, sorted by 'abs':
[('PSYCH131', -1.0648165993181853),
 ('CHEM190', 1.0475033166205256),
 ('CS103', 1.0332864964141746),
 ('BIOE244', -1.0309249956822588),
 ('HUMBIO129S', -1.0180088711335997),
 ('ENGR105', -0.9940535784662791),
 ('MATH19', -0.9575385839342244),
 ('ECON112', -0.8843072346078515),
 ('ECON155', 0.8770033842038345),
 ('MS&E226', 0.8685509199471411)]


# Loop through all courses

In [61]:
# Every distinct course identifier, ordered so the loop below visits courses deterministically.
all_courses = list(set(raw_data["course_name"]))
all_courses.sort()
print("{} unique courses in this dataset:".format(len(all_courses)))
pprint(all_courses)

878 unique courses in this dataset:
['AA100',
 'AA200',
 'AA210A',
 'AA212',
 'AA228',
 'AA240A',
 'AA240B',
 'AA241A',
 'AA241B',
 'AA241X',
 'AA242A',
 'AA256',
 'AA272C',
 'AA279A',
 'AA279B',
 'AA280',
 'APPPHYS273',
 'BIO101',
 'BIO104',
 'BIO107',
 'BIO115',
 'BIO144',
 'BIO153',
 'BIO156',
 'BIO158',
 'BIO163',
 'BIO173',
 'BIO188',
 'BIO189',
 'BIO234',
 'BIO239',
 'BIO256',
 'BIO258',
 'BIO41',
 'BIO42',
 'BIO43',
 'BIO44Y',
 'BIOE101',
 'BIOE103',
 'BIOE103B',
 'BIOE123',
 'BIOE141A',
 'BIOE141B',
 'BIOE201C',
 'BIOE210',
 'BIOE211',
 'BIOE212',
 'BIOE244',
 'BIOE313',
 'BIOE355',
 'BIOE41',
 'BIOE42',
 'BIOE44',
 'BIOE454',
 'BIOHOPK163H',
 'BIOHOPK167H',
 'BIOHOPK263H',
 'BIOMEDIN156',
 'BIOMEDIN210',
 'BIOMEDIN212',
 'BIOMEDIN214',
 'BIOMEDIN215',
 'BIOMEDIN217',
 'BIOMEDIN224',
 'BIOMEDIN225',
 'BIOMEDIN260',
 'BIOMEDIN371',
 'BIOPHYS371',
 'CEE101A',
 'CEE101B',
 'CEE101C',
 'CEE160',
 'CEE161A',
 'CEE164',
 'CEE166A',
 'CEE166B',
 'CEE169',
 'CEE171',
 'CEE172',
 'CEE17

In [None]:
# may need to pip install ipywidgets for tqdm to work, but if you remove the tqdm.tqdm() wrapper
# it should run fine, you just won't see the progress bar
# One row per course is collected: name, train/test accuracy and the proportion of y = 1 in each split.
results = []

for course_name in tqdm.tqdm(all_courses):
    X_train, X_test, y_train, y_test, fail_flag = get_and_split_data(course_name, gender = False)
    coefs, train_score, test_score, train_groundtruth, test_groundtruth, fail_flag = get_coefs_from_split(X_train, X_test, y_train, y_test, fail_flag)
    # When the model could not be fit, get_coefs_from_split returns None for every score, so the row holds Nones.
    results.append([course_name, train_score, test_score, train_groundtruth, test_groundtruth])
    
results_df = pd.DataFrame(results, columns = ["course_name", "train_score", "test_score", "train_groundtruth", "test_groundtruth"])
results_df

HBox(children=(FloatProgress(value=0.0, max=878.0), HTML(value='')))

In [None]:
# Keep only the courses for which a model was actually fit (train_score is present).
results_df[results_df["train_score"].notnull()]

In [None]:
# Persist the per-course results table in Feather format so it can be reloaded without refitting.
RESULTS_PATH = "../results/simple_model.fthr"

results_df.to_feather(RESULTS_PATH)

In [None]:
# Read the saved file back to confirm the round trip worked.
test_df = pd.read_feather(RESULTS_PATH)
test_df

### TODO: 
* Write code to iterate through different model parameters (Y1GRADES, X1GRADES) and save the results
* Use the official_reqs dict to remove all columns from X except those listed in the official prereqs, fit models and save the results

## LASSO logistic regression to reduce nonzero coefficients

In [None]:
TEST_QUARTERS = [1194, 1196] # aka "dev" quarters

def one_course_lasso(course_name, test_quarters = TEST_QUARTERS, gender = False,
                     c_values = (10, 3, 1, 0.3, 0.1, 0.03, 0.01)):
    """Fit L1-penalised logistic regressions for one course over a grid of C values.

    Params:
        course_name: target course to model.
        test_quarters: quarters held out as the test ("dev") set.
        gender: include the 0/1 "female" predictor when True.
        c_values: inverse regularisation strengths to try, in the order given.

    For every C this prints the train/test accuracy, the proportion of y = 1 in
    each split, the number of nonzero/zero coefficients and the three largest
    coefficients by absolute, positive and negative value. Prints
    "COULD NOT FIT MODEL" and returns early when the split is unusable or the
    training target holds a single class. Returns None.
    """
    # get_and_split_data returns five values; the fifth is its failure flag.
    X_train, X_test, y_train, y_test, fail_flag = get_and_split_data(course_name, test_quarters, gender)
    if fail_flag or len(y_train.unique()) == 1:
        print("COULD NOT FIT MODEL")
        return
    for c_value in c_values:
        model = LogisticRegression(penalty="l1", solver="saga", C=c_value, max_iter=1000)
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        print("\n***** C = {} *****".format(c_value))
        print("training accuracy: {}, ground truth (proportion y = 1): {}".format(train_score, y_train.mean()))
        print("test accuracy: {}, ground truth (proportion y = 1): {}".format(test_score, y_test.mean()))

        coefs = list(zip(X_train.columns, model.coef_[0]))
        coefs_pos = sorted(coefs, key = lambda x: x[1], reverse = True)
        coefs_neg = sorted(coefs, key = lambda x: x[1], reverse = False)
        coefs_abs = sorted(coefs, key = lambda x: abs(x[1]), reverse = True)
        coefs_nonzero = sum(x[1] != 0 for x in coefs)
        coefs_zero = len(coefs) - coefs_nonzero
        print("Number of nonzero/zero coefficients: {}/{}".format(coefs_nonzero, coefs_zero))
        print("Largest absolute coefficients:")
        pprint(coefs_abs[:3])
        print("Largest positive coefficients:")
        pprint(coefs_pos[:3])
        print("Largest negative coefficients:")
        pprint(coefs_neg[:3])

In [None]:
# LASSO sweep for CS221 with the default dev quarters and no gender predictor.
one_course_lasso("CS221")

In [None]:
# LASSO sweep for CS229 without the gender predictor.
one_course_lasso("CS229")

In [None]:
# Same CS229 sweep with the 0/1 "female" predictor included, for comparison with the previous cell.
one_course_lasso("CS229", gender = True)

In [None]:
# LASSO sweep for CS110 without the gender predictor.
one_course_lasso("CS110", gender = False)