In [1]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import json
import ast

In [2]:
# --- Notebook configuration -------------------------------------------------
# Root folder of the extracted feature sets and the course / week split to use.
data_path = "/Volumes/MOOC/flipped/apr21-feature-mooc-flipped/"
week_type = "eq_week"
course = "epflx_algebre2x"

# Feature sets to load, in the order they are concatenated later on.
feature_types = [
    'akpinar_et_al',
    'boroujeni_et_al',
    'chen_cui',
    'he_et_al',
    'lalle_conati',
    'lemay_doleck',
    'marras_et_al',
    'mbouzao_et_al',
    'mubarak_et_al',
    'wan_et_al',
]
# Positions of the two feature sets that receive special treatment.
marras_et_al_id = feature_types.index('marras_et_al')
akpinar_et_al_id = feature_types.index('akpinar_et_al')

# Number of failed (num_f) / passed (num_p) instances to sample at the end.
num_f = 50
num_p = 50
# When True, some marras_et_al features are dropped in the later cells.
remove_obvious = True

In [3]:
def fillNaN(feature):
    """Replace NaNs in a 3-D feature tensor by the per-feature minimum.

    Parameters
    ----------
    feature : np.ndarray
        3-D array whose last axis indexes the features.

    Returns
    -------
    np.ndarray
        New array of the same shape in which every NaN has been replaced by
        the minimum (ignoring NaNs) of its feature, taken over the first two
        axes. A feature that is NaN everywhere stays NaN.

    The input array is left untouched: the work is done on a copy, because
    reshaping a contiguous array yields a view and writing through it would
    silently overwrite the caller's data.
    """
    shape = feature.shape
    # np.array copies, so the in-place assignment below cannot leak out.
    flat = np.array(feature).reshape(-1, shape[2])
    feature_min = np.nanmin(flat, axis=0)  # min of each feature over all rows/weeks
    nan_rows, nan_cols = np.where(np.isnan(flat))
    # Each NaN cell receives the minimum of its own column (feature).
    flat[nan_rows, nan_cols] = feature_min[nan_cols]
    return flat.reshape(shape)

In [4]:
def clean_name(feature):
    """Make a raw feature name readable by collapsing an embedded function repr.

    Example: ``time_in__problem_<function sum at 0x7f3bd02cc9d0>`` becomes
    ``time_in__problem_sum``.

    Parameters
    ----------
    feature : str
        Raw feature name.

    Returns
    -------
    str
        ``feature`` unchanged when it contains no ``<``; otherwise the text
        before the first ``<`` followed by the function name. Anything after
        the function name is discarded.
    """
    start = feature.find('<')
    if start == -1:
        return feature
    marker = '<function '
    if feature.startswith(marker, start):
        # Take the complete function name (up to the next space). A fixed-width
        # slice would truncate names longer than four characters
        # ("median" -> "medi") and could make distinct features collide.
        tail = feature[start + len(marker):]
        fct = tail.split(' ', 1)[0].rstrip('>')
    else:
        # Unknown repr layout: keep the historical fixed-width slice.
        fct = feature[start+9:start+14].strip()
    return feature[0:start]+fct

def clean_akp_name(feature):
    """Normalise an akpinar_et_al feature name.

    The name is lower-cased. If it contains an opening parenthesis, its first
    and last characters are removed and every ``', '`` separator is replaced
    by ``'-'``.
    """
    lowered = feature.lower()
    if "(" not in lowered:
        return lowered
    # Strip the first/last character, then join the parts with dashes.
    return lowered[1:-1].replace(', ', '-')

In [5]:
# Loading the features
# For every feature set, read the raw 3-D tensor from disk and store it,
# flattened to 2-D (first two axes merged, one column per feature), as a
# DataFrame. The list of frames is stored under the course name.
feature_list = {}

feature_type_list = []
for feature_type in feature_types:

    filepath = data_path + week_type + '-' + feature_type + '-' + course
    # The context manager closes the .npz archive once the array has been read.
    with np.load(filepath+'/feature_values.npz') as archive:
        feature_current = archive['feature_values']
    print(feature_current.shape)
    feature_norm = feature_current.reshape(-1,feature_current.shape[2] )
    print(feature_norm.shape)
    feature_type_list.append(pd.DataFrame(feature_norm))
feature_list[course] = feature_type_list

print('course: ', course)
print('week_type: ', week_type)
print('feature_type: ', feature_types)

(170, 16, 347)
(2720, 347)
(170, 16, 3)
(2720, 3)
(170, 16, 13)
(2720, 13)
(170, 16, 3)
(2720, 3)
(170, 16, 22)
(2720, 22)
(170, 16, 10)
(2720, 10)
(170, 16, 12)
(2720, 12)
(170, 16, 3)
(2720, 3)
(170, 16, 13)
(2720, 13)
(170, 16, 14)
(2720, 14)
course:  epflx_algebre2x
week_type:  eq_week
feature_type:  ['akpinar_et_al', 'boroujeni_et_al', 'chen_cui', 'he_et_al', 'lalle_conati', 'lemay_doleck', 'marras_et_al', 'mbouzao_et_al', 'mubarak_et_al', 'wan_et_al']


In [6]:
# Loading feature names
# Builds `feature_names`: feature-set name -> cleaned feature names, read from
# each feature set's settings.txt. Also defines `akp_mask` (akpinar_et_al
# columns to keep) and, when `remove_obvious` is set, `mr_mask` (marras_et_al
# columns to drop); both masks are used again by load_features.
feature_names= dict()

for feature_type in feature_types:
    
    filepath = data_path + week_type + '-' + feature_type + '-' + course + '/settings.txt'
    # `with` guarantees the file is closed even if reading fails.
    with open(filepath, "r") as file:
        contents = file.read()
    # settings.txt holds a Python literal; literal_eval parses it without
    # executing arbitrary code.
    dictionary = ast.literal_eval(contents)
    
    feature_type_name = dictionary['feature_names']
    feature_type_name = [clean_name(x) for x in feature_type_name]
    
    if feature_type == 'akpinar_et_al': 
        feature_type_name = [clean_akp_name(x) for x in feature_type_name]
        # Keep only this hand-picked subset of the akpinar_et_al features.
        akp_mask = np.where(np.isin(feature_type_name, 
                 ["total_clicks", "number_sessions", "time_in__video_sum", "time_in__problem_sum",
                  'problem.check-problem.check-problem.check', 
                  'problem.check-problem.check-video.load', 
                  'video.play-video.play-video.play',
                  'video.play-video.pause-video.load',
                  'video.play-problem.check-problem.check',
                  'video.play-video.stop-video.play',
                  'video.pause-video.speedchange-video.play',
                  'video.stop-video.play-video.seek',
                  'video.stop-problem.check-video.load']))
        feature_type_name = list(np.array(feature_type_name)[akp_mask[0]])
        # Apply the same column selection to the already loaded raw values.
        feature_list[course][akpinar_et_al_id] = feature_list[course][akpinar_et_al_id][akp_mask[0]]
        
    feature_names[feature_type] = feature_type_name
    print(feature_type, len(feature_type_name))

if remove_obvious: 
    # drop 'student shape', 'competency strength', 'competency alignment' in marras at al
    mr_mask = np.where(np.isin(feature_names['marras_et_al'], 
                 ['student_shape', 'competency_strength', 'competency_alignment']))
    
    new_marras = np.delete(np.array(feature_names['marras_et_al']), mr_mask[0])
    print(new_marras)
    feature_names['marras_et_al'] = new_marras
    # Only the names are filtered here; the matching value columns are removed
    # inside load_features using mr_mask.

akpinar_et_al 13
boroujeni_et_al 3
chen_cui 13
he_et_al 3
lalle_conati 22
lemay_doleck 10
marras_et_al 7
mbouzao_et_al 3
mubarak_et_al 13
wan_et_al 13
['competency_anticipation' 'content_alignment' 'content_anticipation'
 'student_speed']


In [7]:
# Loading the labels
# The label file is read from the boroujeni_et_al feature-set folder.
feature_type = "boroujeni_et_al"
filepath = f"{data_path}{week_type}-{feature_type}-{course}/feature_labels.csv"
labels = pd.read_csv(filepath)["label-pass-fail"]
# Append one extra label (1) so that `y` has as many entries as the feature
# matrix, which load_features extends by one artificial row.
labels[len(labels)] = 1
y = labels.to_numpy()


In [8]:
def load_features(course):
    """Load, filter and min-max normalise every feature set of ``course``.

    For each entry of the notebook-level ``feature_types`` the function

    1. reads ``feature_values.npz`` (3-D array, axis 1 = weeks, axis 2 = features),
    2. drops the marras_et_al columns in ``mr_mask`` (when ``remove_obvious``)
       and keeps only the akpinar_et_al columns in ``akp_mask``,
    3. drops columns whose name was already contributed by an earlier feature set,
    4. drops columns that are NaN everywhere, fills the remaining NaNs with
       ``fillNaN`` and drops columns whose absolute sum is zero,
    5. appends one artificial row equal to ``1.001 * max`` and rescales with
       ``(x - min) / (1.001 * max - min)`` per (week, feature) cell.

    A feature set that raises during any of these steps is reported and
    skipped as a whole; nothing of it is added to the outputs.

    Relies on notebook globals: ``data_path``, ``week_type``, ``feature_types``,
    ``remove_obvious``, ``feature_names`` (dict: feature set -> names),
    ``akp_mask``, ``mr_mask`` and ``fillNaN``. ``feature_names`` must still be
    the per-feature-set dict when this function runs.

    Parameters
    ----------
    course : str
        Course identifier used to build the feature-set folder names.

    Returns
    -------
    features : pd.DataFrame
        One row per instance plus the artificial row; the (weeks, features)
        axes are flattened into columns.
    features_min : np.ndarray
        Flattened per-(week, feature) minimum of the normalised values.
    features_max : np.ndarray
        Per-(week, feature) maximum of the normalised values, shape (weeks, features).
    selected_features : list of np.ndarray
        Names of the kept features, one array per loaded feature set.
    num_weeks : int
        Length of the weeks axis of the first feature set (0 if it failed to load).
    num_features : int
        Total number of kept features.
    """
    feature_list = []
    selected_features = []
    total_features = set()
    num_weeks = 0
    num_features = 0
    for i,feature_type in enumerate(feature_types):
        try:
            filepath = data_path + week_type + '-' + feature_type + '-' + course 
            # Close the archive as soon as the array has been read.
            with np.load(filepath+'/feature_values.npz') as archive:
                feature_current = archive['feature_values']

            shape = feature_current.shape

            if remove_obvious and feature_type=='marras_et_al':
                feature_current = np.delete(feature_current, mr_mask[0], axis=2)

            if feature_type=='akpinar_et_al':
                # akp_mask lists the columns to KEEP, so delete its complement.
                akp_mask_dl = np.delete(list(range(shape[2])), akp_mask[0])
                feature_current = np.delete(feature_current, akp_mask_dl, axis=2)

            shape = feature_current.shape
            print(shape)
            if i==0:
                num_weeks = shape[1]

            selected = np.arange(shape[2])
            # drop existed features (names already provided by an earlier set);
            # new names are only committed once the whole set has been processed
            exist_mask = []
            new_names = set()
            for j, name in enumerate(feature_names[feature_type]):
                if name in total_features or name in new_names:
                    exist_mask.append(j)
                else:
                    new_names.add(name)
            feature_current = np.delete(feature_current, exist_mask, axis=2)
            selected = np.delete(selected, exist_mask)

            # keep columns that have at least one non-NaN value
            nonNaN = (shape[0]*shape[1] - np.isnan(feature_current.reshape(-1,feature_current.shape[2])).sum(axis=0) > 0)
            feature_current = feature_current[:,:,nonNaN]
            selected = selected[nonNaN]
            feature_current = fillNaN(feature_current)
            # keep columns that are not identically zero
            nonZero = (abs(feature_current.reshape(-1,feature_current.shape[2])).sum(axis=0)>0)
            selected = selected[nonZero]
            feature_current = feature_current[:,:,nonZero]
            # Index with the 1-D integer array itself: wrapping it in another
            # list is deprecated and yields a (1, n) result on newer NumPy.
            selected_names = np.array(feature_names[feature_type])[selected]

            ##### Normalization with min-max. Added the artifical 1.001 max row for solving the same min max problem
            ##### for features with max=0 I added 1 instead of 1.001 of maximum

            features_min = feature_current.min(axis=0).reshape(-1)
            features_max = feature_current.max(axis=0)
            features_max = np.where(features_max==0,np.ones(features_max.shape),features_max)
            max_instance = 1.001*features_max
            feature_current = np.vstack([feature_current,max_instance.reshape((1,)+max_instance.shape)])
            features_max = features_max.reshape(-1)
            feature_norm = (feature_current.reshape(shape[0]+1,-1)-features_min)/(1.001*features_max-features_min)
            feature_current = feature_norm.reshape(-1,feature_current.shape[1],feature_current.shape[2] )

            # Commit all results together so the outputs stay aligned even
            # when a feature set fails half-way through.
            feature_list.append(feature_current)
            selected_features.append(selected_names)
            num_features += len(selected_names)
            total_features |= new_names
        except Exception as err:
            # Best effort: skip this feature set, but say why.
            print('{} is not valiad'.format(feature_type))
            print('  -> {}: {}'.format(type(err).__name__, err))
        
    features = np.concatenate(feature_list, axis=2)
    features_min = features.min(axis=0).reshape(-1)
    features_max = features.max(axis=0)
    features = features.reshape(features.shape[0],-1)
    features = pd.DataFrame(features)
    
    print('features shape:', features.shape)
    print('course: ', course)
    print('week_type: ', week_type)
    print('feature_type: ', feature_types)
    print(selected_features)
    return features, features_min, features_max, selected_features, num_weeks, num_features


In [9]:
# Build the normalised feature matrix for `course`; n_weeks / n_features are
# the week count and total kept-feature count reported by load_features.
features, features_min, features_max, selected_features, n_weeks, n_features = load_features(course)

(2149, 15, 13)
(2149, 15, 3)
(2149, 15, 13)
(2149, 15, 3)
(2149, 15, 22)
(2149, 15, 10)
(2149, 15, 4)
(2149, 15, 3)
(2149, 15, 13)
(2149, 15, 13)
features shape: (2150, 990)
course:  geomatique_003
week_type:  eq_week
feature_type:  ['akpinar_et_al', 'boroujeni_et_al', 'chen_cui', 'he_et_al', 'lalle_conati', 'lemay_doleck', 'marras_et_al', 'mbouzao_et_al', 'mubarak_et_al', 'wan_et_al']
[array(['total_clicks', 'number_sessions', 'time_in__video_sum',
       'time_in__problem_sum', 'video.pause-video.speedchange-video.play',
       'video.play-video.pause-video.load',
       'video.play-video.play-video.play',
       'video.play-problem.check-problem.check',
       'problem.check-problem.check-video.load',
       'problem.check-problem.check-problem.check'], dtype='<U41'), array(['regularity_peak_dayhour', 'regularity_periodicity_m1',
       'delay_lecture'], dtype='<U25'), array(['time_sessions_sum', 'time_sessions_mean',
       'time_between_sessions_std', 'time_sessions_std',
       '

  selected_features.append(np.array(feature_names[feature_type])[[selected]])
  num_features += len(np.array(feature_names[feature_type])[[selected]])


In [10]:
# make feature names more readable
# ex: time_in__problem_<function sum at 0x7f3bd02cc9d0> -> time_in__problem_sum
# NOTE: this re-defines clean_name, which an earlier cell already defines.
def clean_name(feature):
    """Collapse an embedded function repr in a feature name to the function name.

    Returns ``feature`` unchanged when it contains no ``<``; otherwise the text
    before the first ``<`` followed by the function name. Anything after the
    function name is discarded.
    """
    start = feature.find('<')
    if start == -1:
        return feature
    marker = '<function '
    if feature.startswith(marker, start):
        # Take the complete function name (up to the next space). A fixed-width
        # slice would truncate names longer than four characters
        # ("median" -> "medi") and could make distinct features collide.
        tail = feature[start + len(marker):]
        fct = tail.split(' ', 1)[0].rstrip('>')
    else:
        # Unknown repr layout: keep the historical fixed-width slice.
        fct = feature[start+9:start+14].strip()
    return feature[0:start]+fct


# Re-key the per-feature-set name arrays by feature-set name, passing every
# name through clean_name on the way.
cleaned_selected_features = {}
for i, feature_type in enumerate(feature_types):
    cleaned_selected_features[feature_type] = [
        clean_name(name) for name in selected_features[i]
    ]

# NOTE: from here on selected_features is a dict keyed by feature set instead
# of a positional list.
selected_features = cleaned_selected_features
print(selected_features)
# file = 'selected_features/' + course + '_after.json'
# with open(file, 'w') as f: 
#     json.dump(selected_features, f)

{'akpinar_et_al': ['total_clicks', 'number_sessions', 'time_in__video_sum', 'time_in__problem_sum', 'video.pause-video.speedchange-video.play', 'video.play-video.pause-video.load', 'video.play-video.play-video.play', 'video.play-problem.check-problem.check', 'problem.check-problem.check-video.load', 'problem.check-problem.check-problem.check'], 'boroujeni_et_al': ['regularity_peak_dayhour', 'regularity_periodicity_m1', 'delay_lecture'], 'chen_cui': ['time_sessions_sum', 'time_sessions_mean', 'time_between_sessions_std', 'time_sessions_std', 'total_clicks_weekday', 'total_clicks_weekend', 'ratio_clicks_weekend_day', 'total_clicks_video', 'total_clicks_problem'], 'he_et_al': ['attendance_rate', 'utilization_rate', 'watching_ratio'], 'lalle_conati': ['total_clicks_Video.Load', 'weekly_prop_watched_mean', 'weekly_prop_replayed_mean', 'weekly_prop_interrupted_mean', 'total_clicks_Video', 'frequency_action_Video', 'frequency_action_Video.Load', 'frequency_action_Video.Play', 'frequency_actio

In [11]:
# Number of kept features contributed by each feature set, in feature_types order.
num_feature_type = [
    len(selected_features[feature_type]) for feature_type in feature_types
]
print(num_feature_type)

[10, 3, 9, 3, 13, 8, 4, 1, 4, 11]


In [12]:
# Build one column label per (week, feature) pair and attach the labels to
# `features`. Labels are generated week by week, which matches the order in
# which load_features flattens the (weeks, features) axes.
# NOTE: this rebinds `feature_names`, which until now was the per-feature-set
# dict that load_features reads.
final_features = []
for feature_type in feature_types:
    final_features.extend(selected_features[feature_type])

feature_names = np.concatenate(
    [
        [name + '_InWeek' + str(week + 1) for name in final_features]
        for week in np.arange(n_weeks)
    ],
    axis=0,
).reshape(-1)
features.columns = feature_names


## Computing prediction probabilities (predict_prob)

In [4]:
# Adapter that hands the data to the biLSTM in the 2D layout it was trained with.
def transform_x(x, num_feature_type, num_weeks, features_min, features_max, normal=True):
    """Return ``x`` as a 2-D NumPy array of shape ``(x.shape[0], x.shape[1])``.

    ``num_feature_type``, ``num_weeks``, ``features_min``, ``features_max``
    and ``normal`` are accepted but not used by this implementation.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    as_array = np.array(x)
    return as_array.reshape((n_rows, n_cols))

In [5]:
# EDIT HERE FOR OTHER MODELS
# The saved model is looked up as "<model_path>lstm_bi_<course>".
model_path = '../models/'
model_name = f"{model_path}lstm_bi_{course}"
loaded_model = keras.models.load_model(model_name)

2022-06-15 09:46:11.172107: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-15 09:46:12.435708: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2022-06-15 09:46:13.887068: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-06-15 09:46:14.475816: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes m

In [7]:
# Print the layers, output shapes and parameter counts of the loaded model.
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 14, 82)            0         
                                                                 
 masking (Masking)           (None, 14, 82)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 14, 128)          75264     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 116,545
Trainable params: 116,545
Non-trai

In [15]:
# Convert the feature frame to the array layout the model expects, then predict.
model_input = transform_x(
    np.array(features),
    num_feature_type,
    n_weeks,
    features_min=features_min,
    features_max=features_max,
)
prediction = loaded_model.predict(model_input)

In [16]:
print(prediction.shape, y.shape, features.shape)
# Work on a copy so `features` itself does not receive the extra columns.
features_with_prediction = features.copy()
features_with_prediction["prediction"] = prediction
features_with_prediction["real_label"] = y
# Absolute gap between the model output and the true label, per row.
prediction_error = (
    features_with_prediction["prediction"].values
    - features_with_prediction["real_label"].values
)
features_with_prediction["abs_difference"] = np.abs(prediction_error)


(2150, 1) (2150,) (2150, 990)


In [17]:
# Rows with a positive label form the "failed" group, ordered from the
# smallest to the largest prediction error.
failed_instances = y > 0
failed = features_with_prediction.iloc[failed_instances].sort_values(
    by="abs_difference"
)

In [18]:
# Rows with a label below 1 form the "passed" group, ordered from the
# smallest to the largest prediction error.
passed_instances = y < 1
passed = features_with_prediction.iloc[passed_instances].sort_values(
    by="abs_difference"
)

In [19]:
# Sample num_p passed and num_f failed rows at evenly spaced ranks of the
# error-sorted frames, so the selection covers the whole range of prediction
# errors. If a group has fewer rows than requested, ranks repeat.
# np.ceil returns floats; positional indexing needs integers, so cast explicitly
# instead of relying on an implicit float-to-int conversion inside .iloc.
passed_positions = np.ceil(np.linspace(0, passed.shape[0] - 1, num_p)).astype(int)
failed_positions = np.ceil(np.linspace(0, failed.shape[0] - 1, num_f)).astype(int)

chosen_p = passed.iloc[passed_positions].index.values
chosen_f = failed.iloc[failed_positions].index.values
# Row labels of the chosen instances: failed first, then passed.
instances = np.concatenate((chosen_f, chosen_p))

In [20]:
# Persist the chosen instance labels for this course.
# NOTE(review): the relative folder "uniform_data/" is not created here — it
# must already exist in the working directory.
np.save("uniform_data/uniform_" + course, instances)