# Libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
import tqdm.auto as tqdm
from pprint import pprint
import collections

# Use restricted_wcc dataset (only courses from WCC, last five years, no summer)

In [2]:
WCC_PD_PATH = "../data/wcc_pd.pkl"

with open(WCC_PD_PATH, "rb") as file:
    raw_data = pickle.load(file)
    
raw_data

Unnamed: 0,acad_career,strm,stdnt_enrl_status,unt_taken,unt_billing,crse_grade_input,earn_credit,emplid,subject,catalog_nbr,crse_acad_org,sex,gpa,grade_points,total_units,cum_grade_points,cum_units,overall_gpa,prior_term_gpa,course_name
0,UG,1182.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,MATH,51,MATH,M,4.0,72.0,18.0,72.0,18.0,4.000000,,MATH51
1,UG,1182.0,E,0.0,0.0,,N,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CHEM,31X,CHEMISTRY,M,,72.0,18.0,72.0,18.0,4.000000,,CHEM31X
2,UG,1182.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CHEM,31X,CHEMISTRY,M,4.0,72.0,18.0,72.0,18.0,4.000000,,CHEM31X
3,UG,1182.0,E,0.0,0.0,,N,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,MATH,51,MATH,M,,72.0,18.0,72.0,18.0,4.000000,,MATH51
4,UG,1184.0,E,5.0,5.0,A,Y,$2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...,CS,106X,COMPUTSCI,M,4.0,65.1,17.0,137.1,35.0,3.917143,4.0,CS106X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784350,UG,1202.0,E,5.0,5.0,B,Y,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,CHEM,31A,CHEMISTRY,F,3.0,39.0,11.0,39.0,11.0,3.545455,,CHEM31A
784351,UG,1202.0,E,0.0,0.0,,N,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,CHEM,31A,CHEMISTRY,F,,39.0,11.0,39.0,11.0,3.545455,,CHEM31A
784352,UG,1204.0,E,5.0,5.0,,Y,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,PSYCH,1,PSYCHOLOGY,F,,,,,,,,PSYCH1
784353,UG,1204.0,E,5.0,5.0,,Y,ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...,CHEM,31B,CHEMISTRY,F,,,,,,,,CHEM31B


In [3]:
print(raw_data.loc[1], "\n")
print(raw_data.loc[2], "\n")
print(raw_data.loc[1] == raw_data.loc[2])

acad_career                                                         UG
strm                                                              1182
stdnt_enrl_status                                                    E
unt_taken                                                            0
unt_billing                                                          0
crse_grade_input                                                  None
earn_credit                                                          N
emplid               $2a$15$.iQPCHeeuyLD3TIqJRk4j.LU0IjGYumSdFkAEUf...
subject                                                           CHEM
catalog_nbr                                                        31X
crse_acad_org                                                CHEMISTRY
sex                                                                  M
gpa                                                                NaN
grade_points                                                        72
total_

In [4]:
print(raw_data.loc[784350], "\n")
print(raw_data.loc[784351], "\n")
print(raw_data.loc[784350] == raw_data.loc[784351])

acad_career                                                         UG
strm                                                              1202
stdnt_enrl_status                                                    E
unt_taken                                                            5
unt_billing                                                          5
crse_grade_input                                                     B
earn_credit                                                          Y
emplid               ZZCaHx+CP6JrUW8AJ/N+m9uPlGbsOEaeCC8uU50l/iVr9d...
subject                                                           CHEM
catalog_nbr                                                        31A
crse_acad_org                                                CHEMISTRY
sex                                                                  F
gpa                                                                  3
grade_points                                                        39
total_

In [5]:
all_courses = set(raw_data["course_name"])
all_courses

{'EE47',
 'PSYCH149',
 'ENERGY222',
 'EE273',
 'ME260',
 'CS277',
 'CEE101A',
 'ME280',
 'ME113',
 'AA200',
 'CHEM190',
 'HUMBIO163',
 'PSYCH101',
 'MS&E243',
 'BIOMEDIN210',
 'EARTHSYS251',
 'ECON51',
 'PHYSICS45N',
 'CHEM181',
 'ENERGY212',
 'CHEMENG281',
 'PHIL194J',
 'CS371',
 'STATS217',
 'CME207',
 'MATSCI206',
 'ME300A',
 'BIOE44',
 'PHYSICS65',
 'MS&E355',
 'CEE270B',
 'CHEMENG183',
 'MATH173',
 'CHEMENG20',
 'CHEM135',
 'CME371',
 'CS154',
 'CHEM31AC',
 'PSYCH60B',
 'ME203',
 'HUMBIO157',
 'AA214A',
 'CS148',
 'EE292L',
 'MATH151',
 'CHEM153',
 'CEE260A',
 'STATS141',
 'PHYSICS364',
 'EARTHSYS258',
 'ENGR105',
 'MS&E332',
 'EARTHSYS146B',
 'CS243',
 'CEE262C',
 'EE155',
 'STATS344',
 'ESS146A',
 'LINGUIST284',
 'HUMBIO3A',
 'PHIL287',
 'CS348B',
 'CEE181',
 'OSPPARIS40M',
 'PHYSICS42',
 'MATH42',
 'ECON178',
 'MATH154',
 'ME271',
 'PHYSICS131',
 'MS&E453',
 'EE336',
 'PHYSICS362',
 'CS106B',
 'LINGUIST105',
 'EE279',
 'COMM206',
 'CS211',
 'AA210A',
 'CS193A',
 'EE65',
 'CHEME

Winter is 4, Spring is 6, Fall is 2, Summer is 8?
e.g. 1198 is 18-19, summer quarter. 1196 is 18-19, spring quarter

In [6]:
# def get_course_name (row):
#     return row["course_name"]

# def dropped_function (row):
#     if row['stdnt_enrl_status'] == 'D':
#         return 1
#     return 0

In [7]:
LETTER_GRADES = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "NP", "W"]

# Builds the feature matrix X and output vector y for a given course, using all other courses in the dataset
# as possible predictors. Only adds a column for another course if at least one student has taken that other
# course before the given course.
# Set gender = True to also add a prediction vector for female (0/1).
# Returns a dataframe with y appended to the right of X, split them off later in get_and_split_data().
############################################################################################################
# 2020-02-23 Initial model settings: 
# Predict success in a course (B+ or better) based on having passed other courses previously (D- or better)
# Incomplete data for strm 1204 (2019-2020 Winter)
# Y1GRADES = ["A+", "A", "A-", "B+"]
# X1GRADES = ["A+", "A", "A-", "B+", "B", "B-", 
#             "C+", "C", "C-", "D+", "D", "D-"]
# UNFINISHED_QUARTERS = ["1204"]

Y1GRADES = ["A+", "A", "A-", "B+"] # we encode these grades as y = 1, otherwise y = 0
X1GRADES = ["A+", "A", "A-", "B+", "B", "B-", 
            "C+", "C", "C-", "D+", "D", "D-"] # for previous courses, encode these grades as X_i = 1
UNFINISHED_QUARTERS = ["1204"] # unfinished quarter(s) if pulled during a quarter

def getStudentFeatures(course_name, gender = False):
    course_entries = raw_data.loc[(raw_data['course_name'] == course_name) & # all course entries that are about the certain course we want
                                  (raw_data['crse_grade_input'].isin(LETTER_GRADES)) & # Grade needs to be a letter
                                  (raw_data['stdnt_enrl_status'] != 'D') & # not dropped
                                  (raw_data['strm'] >= 1162) & # output course is 
                                  (~raw_data['strm'].isin(UNFINISHED_QUARTERS))] # we don't have outcomes for these
    students = course_entries.emplid # Students from the course entries
    students_prev_classes = raw_data.loc[raw_data['emplid'].isin(students)] # Other classes taken by these students
    possible_prev_classes = students_prev_classes['course_name'].unique().tolist() # List of courses taken by these students
    possible_prev_classes.append("strm") # For now add term to the data of when the course was taken
    if gender:
        possible_prev_classes.append("female") # Add gender
    possible_prev_classes.append("emplid") # Add emplid
    possible_prev_classes.append("y") # Add the target to the data. Will split it later
    a = np.zeros(shape = (len(course_entries), len(possible_prev_classes))) # all 0 initialized features matrix
    full_matrix = pd.DataFrame(a, columns = possible_prev_classes) # same as above, but in pandas to allow indexing by course name
    
    for counter, row_idx in enumerate(course_entries.index):
        print(counter, row_idx)

#     for counter, idx in enumerate(course_entries.index): # loop over all class entries
# #     for counter, idx in course_entries.iterrows():
# #         print("student: ", counter, " of ", len(course_entries))
#         grade = course_entries.loc[[idx]]['crse_grade_input']
#         gradeStr = grade.tolist()[0]
#         if (gradeStr in Y1GRADES):
#             full_matrix['y'][counter] = 1
#         # fill predictor matrix X
#         std_id = course_entries.loc[[idx]]['emplid'] # Get the student id
#         quarter = course_entries.loc[[idx]]['strm'] # Get the term
#         if gender:
#             female = course_entries.loc[[idx]]['sex'] # Get the student gender
#         student_courses = students_prev_classes.loc[students_prev_classes['emplid'] == std_id.tolist()[0]] # Get all courses the student took
#         student_courses = student_courses.loc[student_courses['strm'] < quarter.tolist()[0]] # Filter the courses to be only before the course of interest
#         if gender:
#             if (female.tolist()[0] == 'M'):
#                 full_matrix["female"][counter] = 0
#             else:
#                 full_matrix["female"][counter] = 1
#         full_matrix["strm"][counter] = quarter.tolist()[0]
#         full_matrix["emplid"][counter] = std_id.tolist()[0]
#         for j in student_courses.index:# Loop over the student's courses
#             course_name = student_courses.loc[[j]]['subject']+student_courses.loc[[j]]['catalog_nbr'] # Get the course name
#             grade = student_courses.loc[[j]]['crse_grade_input']
#             gradeStr = grade.tolist()[0]
#             if (gradeStr in X1GRADES):
#                 full_matrix.iloc[counter, full_matrix.columns.get_loc(course_name.tolist()[0])] = 1 # Set the course name in X to the new value (or if it was already passed keep the value)
# #             dev_break = True
    return full_matrix

In [8]:
#dataset includes a y column, which is the target. It also includes strm and emplid columns that should
#be dropped before running any prediction
dataset_110 = getStudentFeatures('CS110')
dataset_110

0 15
1 115
2 160
3 198
4 212
5 343
6 360
7 404
8 547
9 647
10 745
11 769
12 790
13 918
14 1221
15 1225
16 1560
17 1673
18 2904
19 2991
20 3528
21 3944
22 5513
23 7358
24 7713
25 8336
26 8564
27 8664
28 9413
29 10363
30 11418
31 11653
32 13845
33 13942
34 14281
35 14424
36 16657
37 17546
38 17610
39 17713
40 17955
41 18626
42 19093
43 20159
44 22244
45 22320
46 22928
47 23727
48 24427
49 24471
50 25796
51 27652
52 27657
53 28295
54 28882
55 29919
56 30013
57 31674
58 32016
59 33315
60 35135
61 36045
62 36225
63 36621
64 37172
65 40304
66 40503
67 40885
68 41251
69 41432
70 41866
71 41988
72 42338
73 43531
74 44473
75 45979
76 46663
77 46734
78 47565
79 47573
80 48429
81 48538
82 49300
83 49302
84 49346
85 49577
86 50161
87 50652
88 51571
89 52179
90 52834
91 53201
92 53203
93 53778
94 54531
95 55526
96 55953
97 56930
98 58036
99 58277
100 58283
101 59060
102 59104
103 59239
104 59944
105 59948
106 60825
107 61078
108 61084
109 61630
110 61901
111 62322
112 62395
113 62527
114 62532
115 

1348 737509
1349 737659
1350 737820
1351 738115
1352 738400
1353 738877
1354 738932
1355 739500
1356 739577
1357 739881
1358 740102
1359 740175
1360 740657
1361 740692
1362 740947
1363 741030
1364 741090
1365 741728
1366 741883
1367 742164
1368 742475
1369 742602
1370 742894
1371 743470
1372 743651
1373 743735
1374 743736
1375 744312
1376 744631
1377 744812
1378 745137
1379 745231
1380 745430
1381 745537
1382 746047
1383 746421
1384 746507
1385 746954
1386 747576
1387 747700
1388 747864
1389 748190
1390 748835
1391 749479
1392 749497
1393 749592
1394 749824
1395 749933
1396 750127
1397 750138
1398 750193
1399 750381
1400 751051
1401 751161
1402 751468
1403 752311
1404 752380
1405 752609
1406 752874
1407 753174
1408 753345
1409 753463
1410 753530
1411 753601
1412 753938
1413 753970
1414 754127
1415 754250
1416 754576
1417 754643
1418 754734
1419 755007
1420 755237
1421 755284
1422 755371
1423 755633
1424 755730
1425 756219
1426 756494
1427 756622
1428 756925
1429 757496
1430 757975
1431

Unnamed: 0,MATH51,CHEM31X,CS106X,CHEM33,PHIL1,CS103,CHEM35,CS109,CS161,CS107,...,PHYSICS231,PHIL251,BIO153,PHYSICS212,PHYSICS331,LINGUIST205A,MATH226,strm,emplid,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
collections.Counter(dataset_110.y)

Counter({0.0: 1539})

# Try sklearn logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
# Use the X predictor matrix and y outcome vector from 
# getStudentFeatures, fit a logistic regression model and return
# the list of coefficients, default sorted by absolute value

# Params: sort = ["pos", "neg", "abs"] to sort by largest positive, negative, or
# absolute value of coefficients
def get_coefs(X_train, X_test, y_train, y_test, sort = "abs"):
    model = LogisticRegression(solver="lbfgs")
    model.fit(X_train, y_train)
    train_score = model.score(X_train,y_train) 
    test_score = model.score(X_test,y_test)
    print("training accuracy: %s" % train_score)
    print("test accuracy: %s" % test_score)
    coefs = [(X_test.columns[i], model.coef_[0][i]) for i in range(len(X_test.columns))]
    if sort:
        if sort == "pos":
            coefs = sorted(coefs, key = lambda x: x[1], reverse = True)
        if sort == "neg":
            coefs = sorted(coefs, key = lambda x: x[1], reverse = False)
        if sort == "abs":
            coefs = sorted(coefs, key = lambda x: abs(x[1]), reverse = True)
    return coefs

In [12]:
def get_and_split_data(course_name, test_quarters):
    dataset = getStudentFeatures(course_name)
    dataset_grades = dataset.loc[dataset['strm'] < 1204.0] #For now, dropping this quarter
    dataset_train = dataset_grades.loc[~dataset_grades['strm'].isin(test_quarters)]
    dataset_test = dataset_grades.loc[dataset_grades['strm'].isin(test_quarters)]

    dataset_train = dataset_train.drop(["strm", "emplid"], axis=1)
    dataset_test = dataset_test.drop(["strm", "emplid"], axis=1)

    X_train = dataset_train.drop(["y"], axis=1)
    y_train = dataset_train["y"]
    X_test = dataset_test.drop(["y"], axis=1)
    y_test = dataset_test["y"]
    return X_train, X_test, y_train, y_test

In [13]:
X_train, X_test, y_train, y_test = get_and_split_data('CS110', [1194.0, 1196.0])
CS110coefs = get_coefs(X_train, X_test, y_train, y_test, 'abs')
CS110coefs[:100]

0 15
1 115
2 160
3 198
4 212
5 343
6 360
7 404
8 547
9 647
10 745
11 769
12 790
13 918
14 1221
15 1225
16 1560
17 1673
18 2904
19 2991
20 3528
21 3944
22 5513
23 7358
24 7713
25 8336
26 8564
27 8664
28 9413
29 10363
30 11418
31 11653
32 13845
33 13942
34 14281
35 14424
36 16657
37 17546
38 17610
39 17713
40 17955
41 18626
42 19093
43 20159
44 22244
45 22320
46 22928
47 23727
48 24427
49 24471
50 25796
51 27652
52 27657
53 28295
54 28882
55 29919
56 30013
57 31674
58 32016
59 33315
60 35135
61 36045
62 36225
63 36621
64 37172
65 40304
66 40503
67 40885
68 41251
69 41432
70 41866
71 41988
72 42338
73 43531
74 44473
75 45979
76 46663
77 46734
78 47565
79 47573
80 48429
81 48538
82 49300
83 49302
84 49346
85 49577
86 50161
87 50652
88 51571
89 52179
90 52834
91 53201
92 53203
93 53778
94 54531
95 55526
96 55953
97 56930
98 58036
99 58277
100 58283
101 59060
102 59104
103 59239
104 59944
105 59948
106 60825
107 61078
108 61084
109 61630
110 61901
111 62322
112 62395
113 62527
114 62532
115 

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

## LASSO logistic regression to reduce nonzero coefficients

In [14]:
def one_course_lasso(course_name, test_quarters = [1194.0, 1196.0]):
    X_train, X_test, y_train, y_test = get_and_split_data(course_name, test_quarters)
    for i in [10, 3, 1, 0.3, 0.1, 0.03, 0.01]:
      model = LogisticRegression(penalty="l1", solver="saga", C=i, max_iter=1000) 
      model.fit(X_train, y_train)
      train_score = model.score(X_train, y_train)
      test_score = model.score(X_test, y_test)
      coefs = [(X_train.columns[i], model.coef_[0][i]) for i in range(len(X_train.columns))]
      coefs_pos = sorted(coefs, key = lambda x: x[1], reverse = True)
      coefs_neg = sorted(coefs, key = lambda x: x[1], reverse = False)
      coefs_abs = sorted(coefs, key = lambda x: abs(x[1]), reverse = True)
      coefs_nonzero = sum([x[1] != 0 for x in coefs])
      coefs_zero = sum(x[1] == 0 for x in coefs)
      print("\n***** C = %f *****" % i)
      print("training accuracy: %s" % train_score)
      print("test accuracy: %s" % test_score)
      print("Number of nonzero/zero coefficients: %d/%d" % (coefs_nonzero, coefs_zero))
      print("Largest absolute coefficients:")
      pprint(coefs_abs[:3])
      print("Largest positive coefficients:")
      pprint(coefs_pos[:3])
      print("Largest negative coefficients:")
      pprint(coefs_neg[:3])

In [15]:
one_course_lasso("CS221")

0 16
1 98
2 123
3 199
4 289
5 304
6 338
7 361
8 435
9 543
10 594
11 765
12 854
13 912
14 928
15 1065
16 1557
17 1675
18 2907
19 3496
20 3532
21 4700
22 5432
23 5507
24 7363
25 8324
26 8665
27 10175
28 10431
29 11205
30 11412
31 11661
32 12506
33 13947
34 14283
35 14305
36 14429
37 16265
38 16555
39 16673
40 17548
41 18636
42 19089
43 20166
44 20959
45 22380
46 23732
47 24480
48 24884
49 25793
50 27972
51 29925
52 30087
53 31678
54 32024
55 33334
56 34764
57 35140
58 36038
59 36617
60 37141
61 37745
62 39878
63 40510
64 40878
65 41264
66 41293
67 41444
68 41934
69 42345
70 45967
71 46392
72 46665
73 46738
74 47211
75 47479
76 48433
77 49086
78 49317
79 49358
80 50165
81 51580
82 52186
83 52341
84 53207
85 53618
86 53780
87 56264
88 56375
89 56941
90 58047
91 58777
92 59964
93 60842
94 61003
95 61921
96 62402
97 62522
98 63243
99 63681
100 64030
101 64118
102 64547
103 64829
104 65018
105 66906
106 67266
107 68231
108 70276
109 70537
110 70833
111 71305
112 71377
113 71979
114 72817
115 

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

In [16]:
one_course_lasso("CS229")

0 126
1 176
2 321
3 345
4 556
5 775
6 868
7 910
8 1872
9 2916
10 3002
11 3497
12 3689
13 5433
14 5515
15 7346
16 9066
17 10355
18 11671
19 14306
20 17549
21 17601
22 19074
23 20944
24 22050
25 22253
26 22983
27 24893
28 25397
29 25864
30 27541
31 27839
32 27975
33 31679
34 32023
35 33330
36 36618
37 37663
38 37855
39 39872
40 40879
41 41433
42 45964
43 46229
44 46666
45 47564
46 49091
47 49361
48 50441
49 51581
50 52184
51 52347
52 53208
53 53784
54 56376
55 60838
56 60894
57 61004
58 62325
59 63030
60 64036
61 64110
62 64839
63 65538
64 68251
65 70237
66 70283
67 70665
68 71306
69 71387
70 71980
71 72871
72 73603
73 75320
74 75579
75 76936
76 77493
77 77715
78 77864
79 78546
80 79394
81 81832
82 82284
83 82388
84 83476
85 83487
86 86366
87 87411
88 87884
89 88284
90 94165
91 94442
92 94454
93 94743
94 96808
95 97091
96 97923
97 99823
98 99954
99 100539
100 100973
101 103856
102 104505
103 105418
104 105777
105 106635
106 106990
107 107822
108 109641
109 109724
110 110105
111 111461
11

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0