In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

## Unified train, val, and test sets to be used for both Raindrop and all the baselines
In order for the model comparisons to be fair, we use a single script to prepare the train, validation, and test indices for each of the courses. Each model then reads the same indices and uses them to split the data. We found this approach more efficient than relying on a shared random seed, for two reasons:
1. To handle missing data, students with no interactions, etc. just once and for all the models.
2. To keep track of individual students and their predictions across various models.

In [2]:
# Root folder that all input and output locations in this notebook hang off.
# NOTE(review): the leading '/' makes this an absolute path — confirm that is
# intended rather than a path relative to the notebook.
data_path = '/../data'

# Sub-folders of the data root: the per-course feature/label folders and the MOOC data.
easy_fail_path, mooc_path = (
    os.path.join(data_path, subfolder)
    for subfolder in ('extracting_concepts', 'mooc')
)

In [3]:
# The two leading components of each course's feature folder name, which is
# assembled as "<week_type>-<feature_type>-<course>".
week_type, feature_type = 'eq_week', "boroujeni_et_al"

In [4]:
# Course files to build splits for, listed with their '.csv' extension.
courses_list = [
    'villesafricaines_002.csv',
    'villesafricaines_003.csv',
    'microcontroleurs_004.csv',
    'dsp_004.csv',
    'hwts_001.csv',
    'dsp_001.csv',
    'progfun_002.csv',
    'microcontroleurs_003.csv',
    'geomatique_003.csv',
    'villesafricaines_001.csv',
    'progfun_003.csv',
    'dsp_002.csv',
    'structures_002.csv',
    'initprogcpp_001.csv',
    'analysenumerique_003.csv',
    'microcontroleurs_006.csv',
    'dsp_005.csv',
    'hwts_002.csv',
    'dsp_006.csv',
    'analysenumerique_002.csv',
    'structures_003.csv',
    'microcontroleurs_005.csv',
    'venture_001.csv',
    'analysenumerique_001.csv',
    'cpp_fr_001.csv',
    'structures_001.csv',
]
# Keep only the part before the first '.', i.e. the bare course identifier.
courses_list = [file_name.partition('.')[0] for file_name in courses_list]

In [5]:
# Intended share of each course's students in the train / validation / test partitions.
train_size, val_size, test_size = 0.8, 0.1, 0.1

In [6]:
# Build and save stratified train/val/test index splits, one file per course.
#
# For every course the label file is read, row positions 0..n-1 are split in two
# stratified steps (train vs. held-out, then held-out into val vs. test), and the
# three index arrays are saved together in split_args/split_<course>.npy.
# The saved array has dtype=object (the three parts have different lengths), so
# readers must load it with np.load(..., allow_pickle=True).

# Share of the held-out pool that goes to validation. Derived from the configured
# sizes instead of a hard-coded 0.5 so that editing val_size/test_size takes effect.
holdout_size = val_size + test_size
if holdout_size <= 0:
    raise ValueError("val_size + test_size must be positive to build val/test splits")
val_fraction_of_holdout = val_size / holdout_size

# np.save does not create missing parent directories, so make sure the target exists.
split_dir = os.path.join(data_path, 'split_args')
os.makedirs(split_dir, exist_ok=True)

for course in courses_list:
    # Folder and file names use underscores even if the course id contains dashes.
    course_key = course.replace('-', '_')

    # Read the course's label file; its row order defines the student positions.
    easy_fail_list = pd.read_csv(
        os.path.join(easy_fail_path,
                     f"{week_type}-{feature_type}-{course_key}",
                     'feature_labels.csv'))

    students_num = len(easy_fail_list)
    args = np.arange(students_num)
    # Labels are needed to stratify the splits so the class balance is preserved.
    labels = easy_fail_list['label-pass-fail'].astype(int).tolist()

    # Split only the row positions so every model can reuse them
    # (assuming no shuffling happened anywhere between this file and the models' inputs).
    # The random_state values are fixed; changing them changes the saved splits.
    args_train, args_test, _, labels_test = train_test_split(args,
                                                           labels,
                                                           train_size = train_size,
                                                           stratify = labels,
                                                           random_state = 1)
    # Second step: divide the held-out positions into validation and test.
    args_val, args_test = train_test_split(args_test,
                                           train_size = val_fraction_of_holdout,
                                           stratify = labels_test,
                                           random_state = 0)

    np.save(os.path.join(split_dir, f"split_{course_key}.npy"),
            np.asanyarray([args_train, args_val, args_test], dtype=object))
    # "balance" is the mean of the integer labels, i.e. the fraction of students labelled 1.
    print(f"course: {course} with a balance of: {sum(labels)/len(labels)}")

course: villesafricaines_002 with a balance of: 0.922
course: villesafricaines_003 with a balance of: 0.8950301904319554
course: microcontroleurs_004 with a balance of: 0.9179342058719491
course: dsp_004 with a balance of: 0.8368876080691643
course: hwts_001 with a balance of: 0.5428571428571428
course: dsp_001 with a balance of: 0.7303510960613081
course: progfun_002 with a balance of: 0.18252551020408164
course: microcontroleurs_003 with a balance of: 0.5061728395061729
course: geomatique_003 with a balance of: 0.5486725663716814
course: villesafricaines_001 with a balance of: 0.8864602307225258
course: progfun_003 with a balance of: 0.47928558276560485
course: dsp_002 with a balance of: 0.7664821338701561
course: structures_002 with a balance of: 0.15463917525773196
course: initprogcpp_001 with a balance of: 0.3658872077028886
course: analysenumerique_003 with a balance of: 0.25272331154684097
course: microcontroleurs_006 with a balance of: 0.891156462585034
course: dsp_005 with a b