# Libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
import tqdm.auto as tqdm
from pprint import pprint
import collections
import random
import feather

# Use restricted_wcc dataset (only courses from WCC, no summer, keep only row from last five years or from a student with a course in the last five years) from wrangling_restricted_wcc_dataset.ipynb

In [2]:
WCC_PD_PATH = "../data/wcc_pd.pkl"

with open(WCC_PD_PATH, "rb") as file:
    raw_data = pickle.load(file)
    
raw_data

Unnamed: 0,acad_career,strm,stdnt_enrl_status,unt_taken,unt_billing,crse_grade_input,earn_credit,emplid,subject,catalog_nbr,crse_acad_org,sex,gpa,grade_points,total_units,cum_grade_points,cum_units,overall_gpa,prior_term_gpa,course_name
0,UG,1182.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,MATH,51,MATH,M,4.0,72.0,18.0,72.0,18.0,4.000000,,MATH51
1,UG,1182.0,E,0.0,0.0,,N,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CHEM,31X,CHEMISTRY,M,,72.0,18.0,72.0,18.0,4.000000,,CHEM31X
2,UG,1182.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CHEM,31X,CHEMISTRY,M,4.0,72.0,18.0,72.0,18.0,4.000000,,CHEM31X
3,UG,1182.0,E,0.0,0.0,,N,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,MATH,51,MATH,M,,72.0,18.0,72.0,18.0,4.000000,,MATH51
4,UG,1184.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CS,106X,COMPUTSCI,M,4.0,65.1,17.0,137.1,35.0,3.917143,4.0,CS106X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784350,UG,1202.0,E,5.0,5.0,B,Y,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,CHEM,31A,CHEMISTRY,F,3.0,39.0,11.0,39.0,11.0,3.545455,,CHEM31A
784351,UG,1202.0,E,0.0,0.0,,N,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,CHEM,31A,CHEMISTRY,F,,39.0,11.0,39.0,11.0,3.545455,,CHEM31A
784352,UG,1204.0,E,5.0,5.0,,Y,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,PSYCH,1,PSYCHOLOGY,F,,,,,,,,PSYCH1
784353,UG,1204.0,E,5.0,5.0,,Y,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,CHEM,31B,CHEMISTRY,F,,,,,,,,CHEM31B


In [3]:
all_courses = set(raw_data["course_name"])
pprint(all_courses)
print(len(all_courses))

{'AA100',
 'AA200',
 'AA210A',
 'AA212',
 'AA214A',
 'AA214B',
 'AA228',
 'AA229',
 'AA240A',
 'AA240B',
 'AA241A',
 'AA241B',
 'AA241X',
 'AA242A',
 'AA256',
 'AA270',
 'AA271A',
 'AA272C',
 'AA279A',
 'AA279B',
 'AA280',
 'APPPHYS220',
 'APPPHYS272',
 'APPPHYS273',
 'BIO101',
 'BIO104',
 'BIO107',
 'BIO115',
 'BIO144',
 'BIO153',
 'BIO156',
 'BIO158',
 'BIO163',
 'BIO173',
 'BIO188',
 'BIO189',
 'BIO234',
 'BIO239',
 'BIO256',
 'BIO258',
 'BIO41',
 'BIO42',
 'BIO43',
 'BIO44X',
 'BIO44Y',
 'BIOE101',
 'BIOE103',
 'BIOE103B',
 'BIOE123',
 'BIOE141A',
 'BIOE141B',
 'BIOE201C',
 'BIOE210',
 'BIOE211',
 'BIOE212',
 'BIOE244',
 'BIOE280',
 'BIOE300A',
 'BIOE301C',
 'BIOE311',
 'BIOE313',
 'BIOE355',
 'BIOE41',
 'BIOE42',
 'BIOE44',
 'BIOE454',
 'BIOHOPK163H',
 'BIOHOPK167H',
 'BIOHOPK263H',
 'BIOMEDIN156',
 'BIOMEDIN210',
 'BIOMEDIN212',
 'BIOMEDIN214',
 'BIOMEDIN215',
 'BIOMEDIN217',
 'BIOMEDIN218',
 'BIOMEDIN224',
 'BIOMEDIN225',
 'BIOMEDIN256',
 'BIOMEDIN260',
 'BIOMEDIN371',
 'BIOPHYS

 'STATS167',
 'STATS191',
 'STATS195',
 'STATS200',
 'STATS202',
 'STATS203',
 'STATS206',
 'STATS207',
 'STATS215',
 'STATS217',
 'STATS219',
 'STATS221',
 'STATS231',
 'STATS240',
 'STATS244',
 'STATS250',
 'STATS263',
 'STATS267',
 'STATS270',
 'STATS271',
 'STATS290',
 'STATS300A',
 'STATS305',
 'STATS306A',
 'STATS310A',
 'STATS310B',
 'STATS310C',
 'STATS311',
 'STATS315A',
 'STATS316',
 'STATS344',
 'STATS350',
 'STATS360',
 'STATS363',
 'STATS370',
 'STATS371',
 'STATS374',
 'STATS376A',
 'STATS50',
 'STATS60',
 'SYMSYS100'}
1012


In [4]:
# descriptive statistics
print(len(raw_data), "unique course records")
print(len(raw_data["emplid"].unique()), "unique students")
print(len(raw_data["course_name"].unique()), "unique courses")

317636 unique course records
13709 unique students
1012 unique courses


In [5]:
all_courses = set(raw_data["course_name"])
all_courses

{'CS107E',
 'CHEM173',
 'MS&E268',
 'CS240',
 'ESS246A',
 'CEE176B',
 'ECON146',
 'ME216C',
 'CHEM223',
 'COMM206',
 'CHEMENG262',
 'MS&E322',
 'CME303',
 'ECON155',
 'CHEMENG120B',
 'EE155',
 'CEE166B',
 'BIOMEDIN212',
 'MATSCI209',
 'EE372',
 'EE369A',
 'PUBLPOL268',
 'MS&E111',
 'ECON124',
 'CS267',
 'CHEM287A',
 'MATSCI190',
 'CS240H',
 'STATS263',
 'COMM122',
 'MATH244',
 'AA200',
 'PSYCH10',
 'MATH52',
 'PHYSICS152',
 'MATH226',
 'STATS344',
 'PSYCH175',
 'PHIL252',
 'CHEM185',
 'EE253',
 'PUBLPOL202',
 'MS&E221',
 'EE349',
 'PHYSICS41A',
 'MS&E251',
 'GS252',
 'BIOE300A',
 'AA242A',
 'EE136',
 'PSYCH193',
 'GS253',
 'CS246',
 'PHIL180',
 'ENGR30',
 'PSYCH30',
 'MATH215A',
 'CHEMENG120A',
 'MS&E235',
 'CS108',
 'PHYSICS373',
 'MATH19',
 'PHYSICS231',
 'PHYSICS107',
 'CS448J',
 'EE376B',
 'BIOPHYS371',
 'EARTHSYS112',
 'CME330',
 'STATS207',
 'ME352C',
 'CHEM35',
 'ME131B',
 'MUSIC424',
 'PHYSICS260',
 'ENGR62',
 'CS143',
 'BIOE454',
 'CHEMENG162',
 'BIOE103',
 'ECON5',
 'BIOE141B

Winter is 4, Spring is 6, Fall is 2, Summer is 8?
e.g. 1198 is 18-19, summer quarter. 1196 is 18-19, spring quarter

In [6]:
def getTargetGPAColumn(gpaString):
    if gpaString == "NaN":
        return "gpa_NaN"
    gpaValue = float(gpaString)
    if gpaValue > 3.7:
        return "gpa_A"
    elif gpaValue > 2.7:
        return "gpa_B"
    elif gpaValue > 1.7:
        return "gpa_C"
    elif gpaValue > 0.7:
        return "gpa_D"
    else:
        return "gpa_NP"

# def dropped_function (row):
#     if row['stdnt_enrl_status'] == 'D':
#         return 1
#     return 0

In [7]:
LETTER_GRADES = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "NP", "W"]

# Builds the feature matrix X and output vector y for a given course, using all other courses in the dataset
# as possible predictors. Only adds a column for another course if at least one student has taken that other
# course before the given course.
# Set gender = True to also add a prediction vector for female (0/1).
# Returns a dataframe with y appended to the right of X, split them off later in get_and_split_data().
############################################################################################################
# 2020-02-23 Initial model settings: 
# Predict success in a course (B+ or better) based on having passed other courses previously (D- or better)
# Incomplete data for strm 1204 (2019-2020 Winter)
# Only consider courses taken at or after strm 1162 (2015-2016 Autumn)
# Y1GRADES = ["A+", "A", "A-", "B+"]
# X1GRADES = ["A+", "A", "A-", "B+", "B", "B-", 
#             "C+", "C", "C-", "D+", "D", "D-"]
# UNFINISHED_QUARTERS = [1204]
# CUTOFF_QUARTER = 1162

Y1GRADES = ["A+", "A", "A-", "B+"] # we encode these grades as y = 1, otherwise y = 0
X1GRADES = ["A+", "A", "A-", "B+", "B", "B-", 
            "C+", "C", "C-", "D+", "D", "D-"] # for previous courses, encode these grades as X_i = 1
UNFINISHED_QUARTERS = [1204] # unfinished quarter(s) if pulled during a quarter
CUTOFF_QUARTER = 1162

def getStudentFeatures(course_name, gender = False, gpa = False, prereqSuccess = False):
    if prereqSuccess:
        X1GRADES = ["A+", "A", "A-", "B+"]
    else:
        X1GRADES = ["A+", "A", "A-", "B+", "B", "B-", 
            "C+", "C", "C-", "D+", "D", "D-"]
    course_entries = raw_data.loc[(raw_data['course_name'] == course_name) & # all course entries that are about the certain course we want
                                  (raw_data['crse_grade_input'].isin(LETTER_GRADES)) & # Grade needs to be a letter
                                  (raw_data['stdnt_enrl_status'] != 'D') & # not dropped
                                  (raw_data['strm'] >= CUTOFF_QUARTER) & # target (outcome) course during or after the cutoff
                                  (~raw_data['strm'].isin(UNFINISHED_QUARTERS))] # we don't have outcomes for these
    students = course_entries.emplid # Students from the course entries
    students_prev_classes = raw_data.loc[raw_data['emplid'].isin(students)] # Other classes taken by these students
    possible_prev_classes = students_prev_classes['course_name'].unique().tolist() # List of courses taken by these students
    possible_prev_classes.append("strm") # For now add term to the data of when the course was taken
    if gender:
        possible_prev_classes.append("female") # Add gender
    if gpa: # Add GPA
        possible_prev_classes.append("gpa_A")
        possible_prev_classes.append("gpa_B")
        possible_prev_classes.append("gpa_C")
        possible_prev_classes.append("gpa_D")
        possible_prev_classes.append("gpa_NP")
        possible_prev_classes.append("gpa_NaN")
    possible_prev_classes.append("emplid") # Add emplid
    possible_prev_classes.append("y") # Add the target to the data. Will split it later
    a = np.zeros(shape = (len(course_entries), len(possible_prev_classes))) # all 0 initialized features matrix
    full_matrix = pd.DataFrame(a, columns = possible_prev_classes) # same as above, but in pandas to allow indexing by course name
    for counter, idx in enumerate(course_entries.index): # loop over all class entries
        grade = course_entries.loc[[idx]]['crse_grade_input']
        gradeStr = grade.tolist()[0]
        if (gradeStr in Y1GRADES):
            full_matrix.iloc[counter, full_matrix.columns.get_loc("y")] = 1
        # fill predictor matrix X
        std_id = course_entries.loc[[idx]]['emplid'] # Get the student id
        quarter = course_entries.loc[[idx]]['strm'] # Get the term
        student_courses = students_prev_classes.loc[students_prev_classes['emplid'] == std_id.tolist()[0]] # Get all courses the student took
        student_courses = student_courses.loc[student_courses['strm'] < quarter.tolist()[0]] # Filter the courses to be only before the course of interest
        if gender:
            female = course_entries.loc[[idx]]['sex'] # Get the student gender
            if (female.tolist()[0] == 'M'):
                full_matrix.iloc[counter, full_matrix.columns.get_loc("female")] = 0
            else:
                full_matrix.iloc[counter, full_matrix.columns.get_loc("female")] = 1
        if gpa:
            prior_GPA = course_entries.loc[[idx]]['prior_term_gpa'] # Get the student gender
            full_matrix.iloc[counter, full_matrix.columns.get_loc(getTargetGPAColumn(prior_GPA.tolist()[0]))] = 1
        full_matrix.iloc[counter, full_matrix.columns.get_loc("strm")] = quarter.tolist()[0]
        full_matrix.iloc[counter, full_matrix.columns.get_loc("emplid")] = std_id.tolist()[0]
        for j in student_courses.index:# Loop over the student's courses
            course_name = student_courses.loc[[j]]['subject']+student_courses.loc[[j]]['catalog_nbr'] # Get the course name
            grade = student_courses.loc[[j]]['crse_grade_input']
            gradeStr = grade.tolist()[0]
            if (gradeStr in X1GRADES):
                full_matrix.iloc[counter, full_matrix.columns.get_loc(course_name.tolist()[0])] = 1 # Set the course name in X to the new value (or if it was already passed keep the value)
    X1GRADES = ["A+", "A", "A-", "B+", "B", "B-", 
            "C+", "C", "C-", "D+", "D", "D-"]
    return full_matrix

In [None]:
#dataset includes a y column, which is the target. It also includes strm and emplid columns that should
#be dropped before running any prediction
features_cs110 = getStudentFeatures('CS110', True, True, True)
features_cs110

In [7]:
# sanity check, remember Y1_GRADES here is B+ or better for y = 1
SAMPLE_ROW_INDEX = random.randint(0, len(features_cs110) - 1)

print(features_cs110.iloc[SAMPLE_ROW_INDEX][features_cs110.iloc[SAMPLE_ROW_INDEX].eq(1)])
print(len(features_cs110.iloc[SAMPLE_ROW_INDEX][features_cs110.iloc[SAMPLE_ROW_INDEX].eq(1)]), "rows with 1")

sample_emplid = features_cs110.iloc[SAMPLE_ROW_INDEX]["emplid"]
print("\nemplid:", sample_emplid)
sample_strm = features_cs110.iloc[SAMPLE_ROW_INDEX]["strm"]
print("strm for CS110:", sample_strm, "\n")

print(raw_data[(raw_data["emplid"] == sample_emplid) & ((raw_data["strm"] < sample_strm) | (raw_data["course_name"] == "CS110"))][["course_name", "crse_grade_input"]])
print(len(raw_data[(raw_data["emplid"] == sample_emplid) & ((raw_data["strm"] < sample_strm) | (raw_data["course_name"] == "CS110"))][["course_name", "crse_grade_input"]]), "rows recorded")

CS103      1
CS109      1
CS107      1
MATH120    1
CS142      1
MATH51H    1
CS261      1
CS265      1
y          1
Name: 394, dtype: object
9 rows with 1

emplid: $2b$15$DnnD9NtJYCCrE3O7S1y1veI9Z.RwjYxxswU2jIVJYErPd/dS9uRqm
strm for CS110: 1174.0 

       course_name crse_grade_input
215974       CS103               A+
215975     MATH51H                A
215977       CS109                A
215978       CS261               A+
215980       CS107                A
215981       CS142               B+
215983       CS265               B+
215985     MATH120               A-
215986       CS110                A
9 rows recorded


In [8]:
print(collections.Counter(features_cs110.y))
print(features_cs110["y"].mean())

Counter({0.0: 870, 1.0: 669})
0.4346978557504873


In [9]:
features_cs110_g = getStudentFeatures("CS110", gender = True)
features_cs110_g

Unnamed: 0,MATH51,CHEM31X,CS106X,CHEM33,PHIL1,CS103,CHEM35,CS109,CS161,CS107,...,PHYSICS231,BIO153,MATH220,PHYSICS62,ME105,AA272C,strm,female,emplid,y
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,0.0,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1192.0,0.0,$2a$15$5tXEOl2owViV9E5K1pJ/Luf.44w/Ci69ZBbFT6o...,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1186.0,0.0,$2a$15$aHTw2jbPTbRKGeDBfcQ25.KPs1kJF/UqcPukXAC...,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1194.0,1.0,$2a$15$CgRoc5cbNZ0QTWMIqNSbu..3w/a0GkYDa3ktG1x...,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1186.0,0.0,$2a$15$Ejn9P.vrIOxTGiqTGICiDeLirjfUfjv4xBdI4h6...,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,0.0,YUqcVvVPHekQ3pk5dsZJb91EacfOFwSq9yrH4Gab7GbT8O...,1.0
1535,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1196.0,0.0,YX37GM6UxzX5a19UpwoQ6J4bq4x6Obi/zOIjqRmAfPWmBc...,1.0
1536,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1194.0,0.0,zeMsirklPKg0Ts2/jhhHTE4QTDubdJ6VAh420a2SxyZdWj...,1.0
1537,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1202.0,0.0,zmNzWWYP45nMMEl7vSBuaNnqWtUycLozveGSKlGoXanF8B...,1.0


In [10]:
# sanity check, remember Y1_GRADES here is B+ or better for y = 1
SAMPLE_ROW_INDEX = random.randint(0, len(features_cs110_g) - 1)

print(features_cs110_g.iloc[SAMPLE_ROW_INDEX][features_cs110_g.iloc[SAMPLE_ROW_INDEX].eq(1)])
print(len(features_cs110_g.iloc[SAMPLE_ROW_INDEX][features_cs110_g.iloc[SAMPLE_ROW_INDEX].eq(1)]), "rows with 1")

sample_emplid = features_cs110_g.iloc[SAMPLE_ROW_INDEX]["emplid"]
print("\nemplid:", sample_emplid)
sample_strm = features_cs110_g.iloc[SAMPLE_ROW_INDEX]["strm"]
print("strm for CS110:", sample_strm, "\n")

print(raw_data[(raw_data["emplid"] == sample_emplid) & ((raw_data["strm"] < sample_strm) | (raw_data["course_name"] == "CS110"))][["course_name", "crse_grade_input", "sex"]])
print(len(raw_data[(raw_data["emplid"] == sample_emplid) & ((raw_data["strm"] < sample_strm) | (raw_data["course_name"] == "CS110"))][["course_name", "crse_grade_input", "sex"]]), "rows recorded")

MATH51       1
CS107        1
CS106A       1
CS106B       1
PHYSICS45    1
ENGR14       1
y            1
Name: 648, dtype: object
7 rows with 1

emplid: $2b$15$K1fvGQrJibI27TJVmtvJgekqeqv6WEqDWb4yE98W4.2l.MD2gI2Fq
strm for CS110: 1182.0 

       course_name crse_grade_input sex
360930      CS106A                A   M
360933   PHYSICS45                A   M
360936      ENGR14               A+   M
360937      CS106B               A+   M
360938       CS107               B+   M
360939      MATH51                A   M
360941       CS110               A-   M
7 rows recorded


# Logistic regression models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [102]:
UNSEEN_QUARTERS = [1204]
TEST_QUARTERS = [1194, 1196] # aka "dev" quarters

def get_and_split_data(course_name, test_quarters = TEST_QUARTERS, gender = False, gpa = False, prereqSuccess = False):
    dataset = getStudentFeatures(course_name, gender, gpa, prereqSuccess)
    dataset_grades = dataset.loc[~dataset['strm'].isin(UNSEEN_QUARTERS)] #For now, dropping this quarter
    dataset_train = dataset_grades.loc[~dataset_grades['strm'].isin(test_quarters)]
    dataset_test = dataset_grades.loc[dataset_grades['strm'].isin(test_quarters)]

    test_students = pd.DataFrame(list(dataset_test["emplid"]), columns = ["emplid"])
    if gender:
        test_students["female"] = list(dataset_test["female"])
    if gpa:
        test_students["gpa_A"] = list(dataset_test["gpa_A"])
        test_students["gpa_B"] = list(dataset_test["gpa_B"])
        test_students["gpa_C"] = list(dataset_test["gpa_C"])
        test_students["gpa_D"] = list(dataset_test["gpa_D"])
        test_students["gpa_NP"] = list(dataset_test["gpa_NP"])
        test_students["gpa_NaN"] = list(dataset_test["gpa_NaN"])
    dataset_train = dataset_train.drop(["strm", "emplid"], axis=1)
    dataset_test = dataset_test.drop(["strm", "emplid"], axis=1)

    X_train = dataset_train.drop(["y"], axis=1)
    y_train = dataset_train["y"]
    X_test = dataset_test.drop(["y"], axis=1)
    y_test = dataset_test["y"]
    
    if not len(X_train) or not len(X_test) or not len(y_train) or not len(y_test):
        return None, None, None, None, True
    return X_train, X_test, y_train, y_test, test_students, False

In [103]:
# Use the X predictor matrix and y outcome vector from 
# getStudentFeatures, fit a logistic regression model and return
# the list of coefficients, default sorted by absolute value

# Params: sort = ["pos", "neg", "abs"] to sort by largest positive, negative, or
# absolute value of coefficients
def get_coefs_from_split(X_train, X_test, y_train, y_test, fail_flag = False, sort = "abs"):
    if fail_flag:
        return None, None, None, None, None, None, None, None, None, True
    if len(y_train.unique()) == 1:
        return None, None, None, None, None, None, None, None, None, True
    model = LogisticRegression(solver = "lbfgs")
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train) 
    test_score = model.score(X_test, y_test)
    train_groundtruth = y_train.mean()
    test_groundtruth = y_test.mean()
    train_size = len(y_train)
    test_size = len(y_test)
    y_predict = model.predict(X_test)
    prob_1_index = 1
    if model.classes_[1] != 1:
        prob_1_index = 0
    y_prob = [x[prob_1_index] for x in model.predict_proba(X_test)]
    f1score = f1_score(y_test, y_predict)
    coefs = [(X_test.columns[i], model.coef_[0][i]) for i in range(len(X_test.columns))]
    if sort:
        if sort == "pos":
            coefs = sorted(coefs, key = lambda x: x[1], reverse = True)
        if sort == "neg":
            coefs = sorted(coefs, key = lambda x: x[1], reverse = False)
        if sort == "abs":
            coefs = sorted(coefs, key = lambda x: abs(x[1]), reverse = True)
    return coefs, y_predict, y_prob, train_score, test_score, train_groundtruth, test_groundtruth, train_size, test_size, f1score, False

In [104]:
# fits a simple logistic regression for one course, prints out accuracies,
# ground truths, top coefficients

def simple_fit_and_report(course_name, test_quarters = TEST_QUARTERS, gender = False,
                          gpa = False, prereqSuccess = False, sort = 'abs'):
    X_train, X_test, y_train, y_test, test_students, fail_flag = get_and_split_data(course_name, gender = gender, gpa = gpa, prereqSuccess = prereqSuccess)
    coefs, y_predict, y_prob, train_score, test_score, train_groundtruth, test_groundtruth, train_size, test_size, f1score, fail_flag = get_coefs_from_split(X_train, X_test, y_train, y_test, fail_flag, sort)
    test_students["y"] = list(y_test)
    test_students["y_predict"] = y_predict
    test_students["y_prob"] = y_prob
#     print("*****", course_name, "*****")
#     if fail_flag:
#         print("COULD NOT FIT MODEL")
#         return
#     print("training accuracy: {}, ground truth (proportion y = 1): {}".format(train_score, train_groundtruth))
#     print("test accuracy: {}, ground truth (proportion y = 1): {}".format(test_score, test_groundtruth))
#     print("Top 10 coefs, sorted by '{}':".format(sort))
    print("f1_score ", f1score)
    print("test_size ", test_size)
#     print("train_size ", train_size)
    print(test_students)
#     pprint(coefs)

In [105]:
simple_fit_and_report("BIO115", gender = True, gpa = True, prereqSuccess = True)

f1_score  1.0
test_size  15
                                               emplid  female  gpa_A  gpa_B  \
0   $2b$15$.lxnbmzpce109/Wso6n5zu2SYxLTLbhyv.9NRJx...     0.0    0.0    1.0   
1   $2b$15$.VbRYlCsHQx1Nrnaw3ocjercXtuvY7cc3zj.xYx...     1.0    0.0    1.0   
2   $2b$15$/K133COR4y893BHKNrKk/./SK4OVWevoGzm1.k0...     1.0    0.0    1.0   
3   $2b$15$4l5KvLasH.bR1rQgm65ome3eQyxntwYSxhKWaKS...     1.0    0.0    1.0   
4   $2b$15$7rIQzCkqsuNQ9UZq7K1GJ.lHmIWu.EYpYmLYtja...     1.0    0.0    1.0   
5   $2b$15$DDs3A4PmKlMVxT9sD4nvteyazpQh.6wJu3MlxZP...     1.0    0.0    1.0   
6   $2b$15$ikSHNtOYnK8OPkqTHYyYiuqzf.aTKYDs7bEwzwt...     1.0    1.0    0.0   
7   $2b$15$LneWNn3hXH6g4qf/C3.xNea699VsWCUtAPieyED...     1.0    0.0    1.0   
8   $2b$15$N6s6c2g9UkptQuPZSjR5Z.pv87iKyhtfMA7fVxw...     1.0    0.0    1.0   
9   $2b$15$s1cTxmCB/0GDdW7H46n4sOCa5Uwz5GMuEwD1p/r...     1.0    1.0    0.0   
10  $2b$15$tw6q07TdjX3huzcpXbNAD.z9nElc.ITBKhE3Y3A...     0.0    0.0    1.0   
11  $2b$15$WBSjprWGuANwp

In [34]:
simple_fit_and_report("CS110", gender = True, gpa = True, prereqSuccess = True)

***** CS110 *****
training accuracy: 0.8182539682539682, ground truth (proportion y = 1): 0.42857142857142855
test accuracy: 0.7096774193548387, ground truth (proportion y = 1): 0.46236559139784944
Top 10 coefs, sorted by 'abs':
[('OSPKYOTO40M', 1.3573637847116564),
 ('CS107', 1.2615005626885627),
 ('ECON137', 1.1121718975723978),
 ('CHEM131', 1.108990281788163),
 ('MATH42', 1.1023631084850947),
 ('EE108', 1.089183402023432),
 ('gpa_C', -1.0701486089444592),
 ('PHYSICS65', 1.0473242354719154),
 ('PSYCH90', 1.0040523643506354),
 ('CS371', 0.9890593269765915)]


In [37]:
simple_fit_and_report("CS106B", gender = True, gpa = True, prereqSuccess = True)

***** CS106B *****
training accuracy: 0.7946282345234196, ground truth (proportion y = 1): 0.6943989518506387
test accuracy: 0.761996161228407, ground truth (proportion y = 1): 0.6621880998080614
Top 10 coefs, sorted by 'abs':
[('gpa_C', -1.9227414537560443),
 ('gpa_NP', 1.6838914854189078),
 ('gpa_A', 1.3155595089748902),
 ('CS103', 1.1387302892388518),
 ('HUMBIO129S', -1.1115189333318594),
 ('ME80', -1.0780087296560619),
 ('PSYCH30', -1.0635156719974148),
 ('EE102A', -0.9975828226128732),
 ('PHYSICS61', 0.9827617992481658),
 ('LINGUIST105', -0.9611576415320746)]


# Loop through all courses

In [34]:
all_courses = sorted(list(set(raw_data["course_name"])))
print(len(all_courses), "unique courses in this dataset:")
pprint(all_courses)

1012 unique courses in this dataset:
['AA100',
 'AA200',
 'AA210A',
 'AA212',
 'AA214A',
 'AA214B',
 'AA228',
 'AA229',
 'AA240A',
 'AA240B',
 'AA241A',
 'AA241B',
 'AA241X',
 'AA242A',
 'AA256',
 'AA270',
 'AA271A',
 'AA272C',
 'AA279A',
 'AA279B',
 'AA280',
 'APPPHYS220',
 'APPPHYS272',
 'APPPHYS273',
 'BIO101',
 'BIO104',
 'BIO107',
 'BIO115',
 'BIO144',
 'BIO153',
 'BIO156',
 'BIO158',
 'BIO163',
 'BIO173',
 'BIO188',
 'BIO189',
 'BIO234',
 'BIO239',
 'BIO256',
 'BIO258',
 'BIO41',
 'BIO42',
 'BIO43',
 'BIO44X',
 'BIO44Y',
 'BIOE101',
 'BIOE103',
 'BIOE103B',
 'BIOE123',
 'BIOE141A',
 'BIOE141B',
 'BIOE201C',
 'BIOE210',
 'BIOE211',
 'BIOE212',
 'BIOE244',
 'BIOE280',
 'BIOE300A',
 'BIOE301C',
 'BIOE311',
 'BIOE313',
 'BIOE355',
 'BIOE41',
 'BIOE42',
 'BIOE44',
 'BIOE454',
 'BIOHOPK163H',
 'BIOHOPK167H',
 'BIOHOPK263H',
 'BIOMEDIN156',
 'BIOMEDIN210',
 'BIOMEDIN212',
 'BIOMEDIN214',
 'BIOMEDIN215',
 'BIOMEDIN217',
 'BIOMEDIN218',
 'BIOMEDIN224',
 'BIOMEDIN225',
 'BIOMEDIN256',
 'BI

 'MATSCI320',
 'MATSCI321',
 'MATSCI322',
 'MATSCI358',
 'MCS100',
 'ME101',
 'ME105',
 'ME112',
 'ME113',
 'ME115A',
 'ME115B',
 'ME116M',
 'ME131A',
 'ME131B',
 'ME140',
 'ME161',
 'ME181',
 'ME202',
 'ME203',
 'ME204A',
 'ME204B',
 'ME210',
 'ME216A',
 'ME216B',
 'ME216C',
 'ME227',
 'ME250',
 'ME260',
 'ME261',
 'ME262',
 'ME271',
 'ME280',
 'ME300A',
 'ME300B',
 'ME300C',
 'ME309',
 'ME318',
 'ME325',
 'ME335A',
 'ME346A',
 'ME351A',
 'ME351B',
 'ME352C',
 'ME70',
 'ME80',
 'MI185',
 'MI285',
 'MS&E111',
 'MS&E112',
 'MS&E120',
 'MS&E121',
 'MS&E125',
 'MS&E130',
 'MS&E145',
 'MS&E146',
 'MS&E201',
 'MS&E211',
 'MS&E212',
 'MS&E220',
 'MS&E221',
 'MS&E226',
 'MS&E233',
 'MS&E235',
 'MS&E241',
 'MS&E243',
 'MS&E245A',
 'MS&E245B',
 'MS&E245G',
 'MS&E246',
 'MS&E251',
 'MS&E252',
 'MS&E260',
 'MS&E261',
 'MS&E262',
 'MS&E268',
 'MS&E294',
 'MS&E295',
 'MS&E310',
 'MS&E311',
 'MS&E317',
 'MS&E319',
 'MS&E321',
 'MS&E322',
 'MS&E332',
 'MS&E334',
 'MS&E351',
 'MS&E352',
 'MS&E355',
 '

In [35]:
from concurrent.futures import ThreadPoolExecutor

In [37]:
#Multithreading this process to maximize speed
def run_predictions(course_name, results):
    X_train, X_test, y_train, y_test, test_students, fail_flag = get_and_split_data(course_name, gender = True, gpa = True, prereqSuccess=True)
    coefs, y_predict, y_prob, train_score, test_score, train_groundtruth, test_groundtruth, train_size, test_size, f1score, fail_flag = get_coefs_from_split(X_train, X_test, y_train, y_test, fail_flag)
    test_students[course_name + "_y"] = list(y_test)
    test_students[course_name + "_y_predict"] = y_predict
    test_students[course_name + "_y_prob"] = y_prob
    results.append([course_name, train_score, test_score, train_groundtruth, test_groundtruth, train_size, test_size, f1score])
results = []
executor = ThreadPoolExecutor()
for course_name in all_courses:
    executor.submit(run_predictions, course_name, results)
executor.shutdown()
results_df = pd.DataFrame(results, columns = ["course_name", "train_score", "test_score", "train_groundtruth", "test_groundtruth", "train_size", "test_size", "f1_score"])
results_df

  **self._backend_args)


Unnamed: 0,course_name,train_score,test_score,train_groundtruth,test_groundtruth,train_size,test_size,f1_score
0,BIOE103B,1.0,1.0,0.692308,1.0,13,4,1.0


In [15]:
results_df[~results_df["train_score"].isnull()]

Unnamed: 0,course_name,train_score,test_score,train_groundtruth,test_groundtruth
31,AA279A,1.000000,0.444444,0.916667,0.444444
33,BIO115,0.888889,1.000000,0.777778,1.000000
35,BIOE103B,1.000000,1.000000,0.692308,1.000000
45,BIO144,0.973684,0.500000,0.842105,0.571429
59,BIOMEDIN210,1.000000,1.000000,0.666667,1.000000
...,...,...,...,...,...
1007,STATS60,0.769231,0.821229,0.691142,0.826816
1008,MATH51,0.756128,0.709534,0.596788,0.618625
1009,PHYSICS41,0.780101,0.772340,0.664744,0.646809
1010,PHYSICS43,0.866318,0.836493,0.733383,0.760664


In [18]:
RESULTS_PATH = "../results/simple_model_gpa_prereqsuccess.fthr"
results_df.to_feather(RESULTS_PATH)

In [19]:
test_df = pd.read_feather(RESULTS_PATH)
test_df

Unnamed: 0,course_name,train_score,test_score,train_groundtruth,test_groundtruth
0,AA214A,,,,
1,AA214B,,,,
2,AA229,,,,
3,AA270,,,,
4,AA271A,,,,
...,...,...,...,...,...
1007,STATS60,0.769231,0.821229,0.691142,0.826816
1008,MATH51,0.756128,0.709534,0.596788,0.618625
1009,PHYSICS41,0.780101,0.772340,0.664744,0.646809
1010,PHYSICS43,0.866318,0.836493,0.733383,0.760664


### TODO: 
* Keep scores for every student raw. Keep one just with the raw score and one with the coef
* Write code to iterate through different model parameters (Y1GRADES, X1GRADES) and save the results
* Average f1 score
* Use the official_reqs dict to remove all columns from X except those listed in the official prereqs, fit models and save the results

## LASSO logistic regression to reduce nonzero coefficients

In [24]:
TEST_QUARTERS = [1194, 1196] # aka "dev" quarters

def one_course_lasso(course_name, test_quarters = TEST_QUARTERS, gender = False, gpa= False, prereqSuccess = False):
    X_train, X_test, y_train, y_test, fail_flag = get_and_split_data(course_name, test_quarters, gender, gpa = gpa, prereqSuccess = prereqSuccess)
    for i in [10, 3, 1, 0.3, 0.1, 0.03, 0.01]:
        model = LogisticRegression(penalty="l1", solver="saga", C=i, max_iter=1000) 
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test) 
        print("\n***** C = {} *****".format(i))
        print("training accuracy: {}, ground truth (proportion y = 1): {}".format(train_score, y_train.mean()))
        print("test accuracy: {}, ground truth (proportion y = 1): {}".format(test_score, y_test.mean()))
        
        coefs = [(X_train.columns[i], model.coef_[0][i]) for i in range(len(X_train.columns))]
        coefs_pos = sorted(coefs, key = lambda x: x[1], reverse = True)
        coefs_neg = sorted(coefs, key = lambda x: x[1], reverse = False)
        coefs_abs = sorted(coefs, key = lambda x: abs(x[1]), reverse = True)
        coefs_nonzero = sum([x[1] != 0 for x in coefs])
        coefs_zero = sum(x[1] == 0 for x in coefs)
        print("Number of nonzero/zero coefficients: {}/{}".format(coefs_nonzero, coefs_zero))
        print("Largest absolute coefficients:")
        pprint(coefs_abs[:3])
        print("Largest positive coefficients:")
        pprint(coefs_pos[:3])
        print("Largest negative coefficients:")
        pprint(coefs_neg[:3])

In [None]:
one_course_lasso("CS221")

In [None]:
one_course_lasso("CS229")

In [None]:
one_course_lasso("CS229", gender = True)

In [25]:
one_course_lasso("CS110", gender = True, gpa = True, prereqSuccess = True)




***** C = 10 *****
training accuracy: 0.8420634920634921, ground truth (proportion y = 1): 0.42857142857142855
test accuracy: 0.7096774193548387, ground truth (proportion y = 1): 0.46236559139784944
Number of nonzero/zero coefficients: 262/395
Largest absolute coefficients:
[('MATH162', -6.968733792165174),
 ('PHYSICS65', 6.955717891367805),
 ('OSPKYOTO40M', 6.853557844142952)]
Largest positive coefficients:
[('PHYSICS65', 6.955717891367805),
 ('OSPKYOTO40M', 6.853557844142952),
 ('ECON137', 6.4554794321204465)]
Largest negative coefficients:
[('MATH162', -6.968733792165174),
 ('PHYSICS63', -5.902355139018825),
 ('ME80', -5.463229340386131)]

***** C = 3 *****
training accuracy: 0.8373015873015873, ground truth (proportion y = 1): 0.42857142857142855
test accuracy: 0.7347670250896058, ground truth (proportion y = 1): 0.46236559139784944
Number of nonzero/zero coefficients: 181/476
Largest absolute coefficients:
[('OSPKYOTO40M', 4.702913546543071),
 ('PHYSICS65', 3.943475132681561),
 (