In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, average_precision_score, precision_score, recall_score, f1_score, balanced_accuracy_score
%cd Raindrop/code
from models_rd import *
from utils_rd import *
import time

/../ML4ED/Raindrop/code


In [2]:
MOOCs_list = [
'villesafricaines_002.csv',
 'villesafricaines_003.csv',
 'microcontroleurs_004.csv',
 'dsp_004.csv',
 'hwts_001.csv',
 'dsp_001.csv',
 'progfun_002.csv',
 'microcontroleurs_003.csv',
 'geomatique_003.csv',
 'villesafricaines_001.csv',
 'progfun_003.csv',
 'dsp_002.csv',
 'structures_002.csv',
 'initprogcpp_001.csv',
 'analysenumerique_003.csv',
 'microcontroleurs_006.csv',
 'dsp_005.csv',
 'hwts_002.csv',
 'dsp_006.csv',
 'analysenumerique_002.csv',
 'structures_003.csv',
 'microcontroleurs_005.csv',
 'venture_001.csv',
 'analysenumerique_001.csv',
 'cpp_fr_001.csv',
 'structures_001.csv'
]
MOOCs_list = [i.replace("_", "-").split('.')[0] for i in MOOCs_list]

dims4 = [
 12,
 12,
 13,
 12,
 12,
 6,
 12,
 13,
 12,
 12,
 12,
 12,
 13,
 13,
 6,
 13,
 12,
 12,
 12,
 12,
 13,
 13,
 12,
 6,
 13,
 12
]

dims6 = [
 12,
 12,
 13,
 12,
 12,
 6,
 12,
 13,
 12,
 12,
 12,
 12,
 13,
 13,
 6,
 13,
 12,
 12,
 13,
 12,
 13,
 13,
 12,
 6,
 13,
 12
]
dims = {40: dims4, 60: dims6}
data_path = '/../data'
percentile = 40

In [3]:
torch.manual_seed(1)

for MOOC_idx, MOOC in enumerate(MOOCs_list):
    
    d_inp = dims[percentile][MOOC_idx]
    max_len = 1000
    n_classes = 2
    global_structure = torch.ones(d_inp, d_inp)
    saved_model_path = f"../models/n1_mlp2_best_raindrop_{MOOC}_{percentile}.pt"
    batch_size = 256
    n_splits = 1

    d_static = 9
    static_info = 1
    d_ob = 4
    d_model = d_inp * d_ob
    nhid = 2 * d_model
    nlayers = 1
    nhead = 2
    dropout = 0.2
    sensor_wise_mask = False
    MAX = 100
    aggreg = 'mean'
    n_runs=1

    learning_rate = 0.0001
    num_epochs = 25

    acc_arr = np.zeros((n_splits, n_runs))
    auprc_arr = np.zeros((n_splits, n_runs))
    auroc_arr = np.zeros((n_splits, n_runs))
    precision_arr = np.zeros((n_splits, n_runs))
    recall_arr = np.zeros((n_splits, n_runs))
    F1_arr = np.zeros((n_splits, n_runs))


    base_path = '/../data/prep_data'
    Pdict_list = np.load(os.path.join(base_path, f"{MOOC}_{percentile}_data_hard_fail.npy"), allow_pickle=True)
    arr_outcomes = np.load(os.path.join(base_path, f"{MOOC}_{percentile}_y_hard_fail.npy"), allow_pickle=True)

    #Ptrain, Ptest, ytrain, ytest = train_test_split(Pdict_list, arr_outcomes, test_size=0.1, random_state=1)
    #Ptrain, Pval, ytrain, yval = train_test_split(Ptrain, ytrain, test_size=1/9, random_state=1)
    args_train, args_val, args_test = np.load(os.path.join(data_path, 
                                                                   'split_args', f"split_{MOOC.replace('-', '_')}.npy"),
                                                     allow_pickle=True)
    Ptrain = Pdict_list[args_train]
    Pval = Pdict_list[args_val]
    Ptest = Pdict_list[args_test]
    ytrain = arr_outcomes[args_train, :]
    yval = arr_outcomes[args_val, :]
    ytest = arr_outcomes[args_test, :]
    
    
    zero_indices = [i for i, item in enumerate(Ptrain) if item['length'] == 0]
    #zero_Ptrain = Ptrain[zero_indices]
    Ptrain = np.delete(Ptrain, zero_indices, axis=0)
    ytrain = np.delete(ytrain, zero_indices, axis=0)

    zero_indices = [i for i, item in enumerate(Pval) if item['length'] == 0]
    zero_yval = ytest[zero_indices]
    Pval = np.delete(Pval, zero_indices, axis=0)
    yval = np.delete(yval, zero_indices, axis=0)

    zero_indices = [i for i, item in enumerate(Ptest) if item['length'] == 0]
    zero_ytest = ytest[zero_indices]
    Ptest = np.delete(Ptest, zero_indices, axis=0)  
    ytest = np.delete(ytest, zero_indices, axis=0)
    

    T, F = Ptrain[0]['arr'].shape
    D = len(Ptrain[0]['extended_static'])

    Ptrain_tensor = np.zeros((len(Ptrain), T, F))
    Ptrain_static_tensor = np.zeros((len(Ptrain), D))

    for i in range(len(Ptrain)):
        Ptrain_tensor[i] = Ptrain[i]['arr']
        Ptrain_static_tensor[i] = Ptrain[i]['extended_static']

    mf, stdf = getStats(Ptrain_tensor)
    ms, ss = getStats_static(Ptrain_static_tensor, dataset='P12')

    Ptrain_tensor, Ptrain_static_tensor, Ptrain_time_tensor, ytrain_tensor = tensorize_normalize(Ptrain, ytrain, mf,
                                                                                                 stdf, ms, ss)
    Pval_tensor, Pval_static_tensor, Pval_time_tensor, yval_tensor = tensorize_normalize(Pval, yval, mf, stdf, ms, ss)

    Ptest_tensor, Ptest_static_tensor, Ptest_time_tensor, ytest_tensor = tensorize_normalize(Ptest, ytest, mf, stdf, ms, ss)

    Ptrain_tensor = Ptrain_tensor.permute(1, 0, 2)
    Pval_tensor = Pval_tensor.permute(1, 0, 2)
    Ptest_tensor = Ptest_tensor.permute(1, 0, 2)

    Ptrain_time_tensor = Ptrain_time_tensor.squeeze(2).permute(1, 0)
    Pval_time_tensor = Pval_time_tensor.squeeze(2).permute(1, 0)
    Ptest_time_tensor = Ptest_time_tensor.squeeze(2).permute(1, 0)
    
    model = Raindrop_v2(d_inp, d_model, nhead, nhid, nlayers, dropout, max_len,
                                        d_static, MAX, 0.5, aggreg, n_classes, global_structure,
                                        sensor_wise_mask=sensor_wise_mask)
    #torch.save(model, '../models/raw_raindrop_model.pt')

    model = model.cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1,
                                                           patience=1, threshold=0.0001, threshold_mode='rel',
                                                           cooldown=0, min_lr=1e-8, eps=1e-08, verbose=True)


    idx_0 = np.where(ytrain == 0)[0]
    idx_1 = np.where(ytrain == 1)[0]

    n0, n1 = len(idx_0), len(idx_1)
    expanded_idx_1 = np.concatenate([idx_1, idx_1, idx_1], axis=0)
    expanded_n1 = len(expanded_idx_1)

    K0 = n0 // int(batch_size / 2)
    K1 = expanded_n1 // int(batch_size / 2)
    n_batches = np.min([K0, K1])

    best_aupr_val = best_auc_val = 0.0
    best_loss_val = 100.0

    print('Stop epochs: %d, Batches/epoch: %d, Total batches: %d' % (num_epochs, n_batches, num_epochs * n_batches))
    start = time.time()

    for epoch in range(num_epochs):
        model.train()


        np.random.shuffle(expanded_idx_1)
        I1 = expanded_idx_1
        np.random.shuffle(idx_0)
        I0 = idx_0

        for n in range(n_batches):

            idx0_batch = I0[n * int(batch_size / 2):(n + 1) * int(batch_size / 2)]
            idx1_batch = I1[n * int(batch_size / 2):(n + 1) * int(batch_size / 2)]
            idx = np.concatenate([idx0_batch, idx1_batch], axis=0)


            P, Ptime, Pstatic, y = Ptrain_tensor[:, idx, :].cuda(), Ptrain_time_tensor[:, idx].cuda(), \
                                   Ptrain_static_tensor[idx].cuda(), ytrain_tensor[idx].cuda()


            lengths = torch.sum(Ptime > 0, dim=0)
            #outputs, local_structure_regularization, _ = model.forward(P, Pstatic, Ptime, lengths)
            outputs = model.forward(P, Pstatic, Ptime, lengths)
            outputs = torch.nan_to_num(outputs)
            

            optimizer.zero_grad()
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()


        train_probs = torch.squeeze(torch.sigmoid(outputs))
        train_probs = train_probs.cpu().detach().numpy()
        train_y = y.cpu().detach().numpy()
        train_auroc = roc_auc_score(train_y, train_probs[:, 1])
        train_auprc = average_precision_score(train_y, train_probs[:, 1])


        if epoch == 0 or epoch == num_epochs - 1:
            print(confusion_matrix(train_y, np.argmax(train_probs, axis=1), labels=[0, 1]))

        """Validation"""
        model.eval()
        if epoch == 0 or epoch % 1 == 0:
            with torch.no_grad():
                #out_val = evaluate_standard(model, Pval_tensor, Pval_time_tensor, Pval_static_tensor, static=static_info)
                #Changed into:
                out_val = evaluate(model, Pval_tensor, Pval_time_tensor, Pval_static_tensor,batch_size=batch_size, static=static_info)
                out_val = torch.squeeze(torch.sigmoid(out_val))
                out_val = out_val.detach().cpu().numpy()

                val_loss = criterion(torch.from_numpy(out_val), torch.from_numpy(yval.squeeze(1)).long())


                auc_val = roc_auc_score(yval, out_val[:, 1])
                aupr_val = average_precision_score(yval, out_val[:, 1])

                print("Validation: Epoch %d,  val_loss:%.4f, aupr_val: %.2f, auc_val: %.2f" % (epoch,
                                                                                                val_loss.item(),
                                                                                                aupr_val * 100,
                                                                                                auc_val * 100))

                scheduler.step(aupr_val)
                if auc_val > best_auc_val:
                    best_auc_val = auc_val
                    print(
                        "**[S] Epoch %d, aupr_val: %.4f, auc_val: %.4f **" % (
                        epoch, aupr_val * 100, auc_val * 100))
                    torch.save(model.state_dict(), saved_model_path)

    end = time.time()
    time_elapsed = end - start
    print('Total Time elapsed: %.3f mins' % (time_elapsed / 60.0))
    
    # Test
    model.load_state_dict(torch.load(saved_model_path))
    model.eval()
    with torch.no_grad():
                    out_test = evaluate(model, Ptest_tensor, Ptest_time_tensor, Ptest_static_tensor, n_classes=n_classes, static=static_info, batch_size=batch_size).numpy()
                    ypred = np.argmax(out_test, axis=1)
                
                    # Adding zero interaction students
                    ytest = np.append(ytest, zero_ytest, axis=0)
                    ypred = np.append(ypred, np.zeros([1, len(zero_ytest)]))
                    

                    denoms = np.sum(np.exp(out_test), axis=1).reshape((-1, 1))
                    probs = np.exp(out_test) / denoms
                    
                    # Adding zero interaction students
                    probs = np.append(probs, np.zeros([len(zero_ytest), 2]), axis=0)

                    acc = np.sum(ytest.ravel() == ypred.ravel()) / ytest.shape[0]
                    bac = balanced_accuracy_score(ytest.ravel(), ypred.ravel())
                    f1 = f1_score(ytest.ravel(), ypred.ravel())

                    auc = roc_auc_score(ytest, probs[:, 1])
                    aupr = average_precision_score(ytest, probs[:, 1])

                    #print('Testing: AUROC = %.2f | AUPRC = %.2f | Accuracy = %.2f' % (auc * 100, aupr * 100, acc * 100))
                    #print('classification report', classification_report(ytest, ypred))
                    #print(confusion_matrix(ytest, ypred, labels=list(range(2))))
                    results = pd.DataFrame(columns=['course', 'percentile', 'acc', 'bac', 'f1', 'auc', 'auprc'])
                    results.loc[0] = [MOOC, percentile, acc, bac, f1, auc, aupr]
                    results.to_csv(f"../raindrop_results/test_{MOOC}_{percentile}.csv")
                    print(results)
                    
                    
                    
                    out_val = evaluate(model, Pval_tensor, Pval_time_tensor, Pval_static_tensor, n_classes=n_classes, static=static_info, batch_size=batch_size).numpy()
                    ypred = np.argmax(out_val, axis=1)
                
                    # Adding zero interaction students
                    ytest = np.append(yval, zero_yval, axis=0)
                    ypred = np.append(ypred, np.zeros([1, len(zero_yval)]))
                    

                    denoms = np.sum(np.exp(out_val), axis=1).reshape((-1, 1))
                    probs = np.exp(out_val) / denoms
                    
                    # Adding zero interaction students
                    probs = np.append(probs, np.zeros([len(zero_yval), 2]), axis=0)

                    acc = np.sum(yval.ravel() == ypred.ravel()) / yval.shape[0]
                    bac = balanced_accuracy_score(yval.ravel(), ypred.ravel())
                    f1 = f1_score(yval.ravel(), ypred.ravel())

                    auc = roc_auc_score(yval, probs[:, 1])
                    aupr = average_precision_score(yval, probs[:, 1])

                    results_val = pd.DataFrame(columns=['course', 'percentile', 'acc', 'bac', 'f1', 'auc', 'auprc'])
                    results_val.loc[0] = [MOOC, percentile, acc, bac, f1, auc, aupr]
                    results_val.to_csv(f"../raindrop_results/val_{MOOC}_{percentile}.csv")
                    print(results_val)
                    
                    

Stop epochs: 25, Batches/epoch: 1, Total batches: 25
[[  0 128]
 [  0 128]]
Validation: Epoch 0,  val_loss:0.7485, aupr_val: 26.86, auc_val: 89.83
**[S] Epoch 0, aupr_val: 26.8567, auc_val: 89.8336 **
Validation: Epoch 1,  val_loss:0.7448, aupr_val: 24.61, auc_val: 89.74
Validation: Epoch 2,  val_loss:0.7412, aupr_val: 24.92, auc_val: 89.47
Epoch 00003: reducing learning rate of group 0 to 1.0000e-05.
Validation: Epoch 3,  val_loss:0.7408, aupr_val: 24.72, auc_val: 89.43
Validation: Epoch 4,  val_loss:0.7404, aupr_val: 30.27, auc_val: 89.47
Validation: Epoch 5,  val_loss:0.7401, aupr_val: 30.27, auc_val: 89.47
Validation: Epoch 6,  val_loss:0.7397, aupr_val: 30.09, auc_val: 89.43
Epoch 00007: reducing learning rate of group 0 to 1.0000e-06.
Validation: Epoch 7,  val_loss:0.7397, aupr_val: 30.09, auc_val: 89.43
Validation: Epoch 8,  val_loss:0.7396, aupr_val: 30.09, auc_val: 89.43
Epoch 00009: reducing learning rate of group 0 to 1.0000e-07.
Validation: Epoch 9,  val_loss:0.7396, aupr_v