In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime

### Auxiliary functions:

In [2]:
def process_bytes(x, distinction=False):
    """Clean the text form of a NUL-padded bytes value.

    ``x`` is converted with ``str()``. When the result contains ``b'`` (the
    marker produced by the repr of a bytes object) every ``b'`` is removed,
    everything from the first escaped ``\\x00`` onwards is discarded and a
    single trailing quote is stripped. Any other value is returned as its
    ``str()`` form.

    Args:
        x: Value to clean; anything ``str()`` accepts.
        distinction: When true, return whether the cleaned text is non-empty
            instead of the text itself.

    Returns:
        The cleaned string, or a ``bool`` when ``distinction`` is true.
    """
    text = str(x)
    if "b'" in text:
        text = text.replace("b'", "").split("\\x00")[0]
        # Strip only the closing quote of the repr. Testing for a quote
        # anywhere in the text would chop a real character from values that
        # merely contain an apostrophe before the NUL padding.
        if text.endswith("'"):
            text = text[:-1]
    if distinction:
        return text != ""
    return text

def trunc(x_input, max_len):
    """Shorten a user's record to ``max_len`` events by dropping video events.

    The record stores its video events first and its problem events after
    them in the ``event_type``, ``timestamp`` and ``arr_value`` sequences;
    ``video_id`` only covers the video part. When the record is too long, the
    trailing video events are removed and the problem events are kept intact.

    Args:
        x_input: Mapping (dict or pandas Series) with the keys ``video_id``,
            ``event_type``, ``timestamp``, ``arr_value``, ``video_length``
            and ``prob_length``. It is not modified.
        max_len: Maximum total number of events to aim for.

    Returns:
        A shallow copy of ``x_input`` with the video part shortened. Problem
        events are never dropped, so the result can still exceed ``max_len``
        when the problem events alone are longer than ``max_len``.
    """
    x = x_input.copy()
    x_video_len = x['video_length']
    diff = x_video_len + x['prob_length'] - max_len

    if diff > 0:
        # Clamp at zero: a negative slice bound would count from the end of
        # the combined sequence and keep the wrong events.
        keep = max(x_video_len - diff, 0)
        x['video_id'] = x['video_id'][:keep]
        for key in ('event_type', 'timestamp', 'arr_value'):
            x[key] = x[key][:keep] + x[key][x_video_len:]
        x['video_length'] = keep

    return x

# Only using video events:

In [3]:
# Build the video-only tensors for one course: each selected user becomes a
# (max_len, n_event_types + 1) matrix plus a 0/1 pass label, saved as .npy.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

# Attach the per-user outcome columns to every video event.
dataset = dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # For these files the label is derived from the grade (>= 4 passes).
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Users are kept only when min_len < number of events < max_len.
min_len = 5
max_len = 215

#======
#Adding numbers to arr_value
list_of_video_events = list(np.unique(dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = dataset[dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)

merged_video = pd.concat(temp_datasets)
dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
dataset_group = dataset.groupby('user_id')
time_event_per_user = dataset_group[['timestamp', 'event_type', 'pass-fail', 'video_id', 'arr_value']].agg(lambda x: list(x))
events_per_user = dataset_group.size()
selected_users = time_event_per_user[(events_per_user > min_len) & (events_per_user < max_len)]
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event types as column indices and
# turn the label into 1 (Passed) / 0 (anything else).
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_video_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
n_event_types = len(list_of_video_events)
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    n_events = len(events)

    # Row i carries event i: its value sits in the column of its event type,
    # the last column stores the video id.
    arr = np.zeros([max_len, n_event_types+1])
    arr[np.arange(n_events), events] = row['arr_value']
    arr[:n_events, n_event_types] = row['video_id']

    # Sort only the real events chronologically. Sorting the zero-padded
    # vector would move the padding (timestamp 0) in front of every positive
    # timestamp, so rows 0..length-1 would no longer be the real events.
    order = np.argsort(time[:n_events], kind='stable')
    indices = np.concatenate([order, np.arange(n_events, max_len)])

    P_users.append({'id': user_id,
                    'static': tuple([0, 0, 0, 0, 0, 0]),
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)),
                    'length': n_events
                    })

    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{saved_filename}_data.npy", P_users)
np.save(f"{saved_filename}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


FileNotFoundError: [Errno 2] No such file or directory: 'F:/SeFT4ED/DATASETS/mooc/coursera/video_event/progfun-002.csv'

# Video+Problem events:

In [None]:
# Build the video + problem tensors for one course: each selected user becomes
# a (max_len, n_event_types + 3) matrix (event values, video id, problem id,
# submission number) plus a 0/1 pass label, saved as .npy.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # Derive the label on video_dataset itself (the frame used below), not on
    # a `dataset` variable left over from another cell.
    video_dataset['pass-fail'] = video_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Users are kept only when min_len < number of events < max_len.
min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)

merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

    if prob_type == 'Assignment Part':
        # Assignments carry their grade; rows with any missing field are dropped.
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        # No arr_value is assigned here, so these rows become NaN after the
        # concat and are removed by the dropna() below.
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)


merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Only users with both video and problem events survive the inner merge.
merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

# Concatenate the per-user lists: video events first, problem events after.
merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

total_length = dataset['video_length'] + dataset['prob_length']
selected_users = dataset[(total_length > min_len) & (total_length < max_len)]
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event types as column indices and
# turn the label into 1 (Passed) / 0 (anything else).
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
n_event_cols = len(list_of_events)
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    n_events = len(events)
    n_video = row['video_length']
    n_total = n_video + row['prob_length']

    # Row i carries event i: its value sits in the column of its event type;
    # the three trailing columns store video id, problem id and submission
    # number for the rows they apply to.
    arr = np.zeros([max_len, n_event_cols+3])
    arr[np.arange(n_events), events] = row['arr_value']
    arr[:n_video, n_event_cols] = row['video_id']
    arr[n_video:n_total, n_event_cols+1] = row['problem_id']
    arr[n_video:n_total, n_event_cols+2] = row['submission_number']

    # Sort only the real events chronologically. Sorting the zero-padded
    # vector would move the padding (timestamp 0) in front of every positive
    # timestamp, so rows 0..length-1 would no longer be the real events.
    order = np.argsort(time[:n_events], kind='stable')
    indices = np.concatenate([order, np.arange(n_events, max_len)])

    P_users.append({'id': user_id,
                    'static': tuple([0, 0, 0, 0, 0, 0]),
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)),
                    'length': n_events
                    })

    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{saved_filename}_data.npy", P_users)
np.save(f"{saved_filename}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# Testing without assignment

In [None]:
# Build the video + problem tensors with the assignment column removed: each
# selected user becomes a (max_len, n_event_types + 2) matrix plus a 0/1 pass
# label, saved as .npy. Events after x_deadline are ignored.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

# Raw string: the Windows path contains backslashes that are not valid escapes.
meta_dataset = pd.read_csv(r'F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

# The parsed datetimes are naive, so .timestamp() interprets them in the
# machine's local timezone.
# NOTE(review): confirm the event timestamps use the same reference.
start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

# Cut-off placed at x_percent of the way from course start to course end.
x_percent = 1
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

# Only an upper bound is applied here; events before the start are kept.
video_dataset = video_dataset[(video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # Derive the label on video_dataset itself (the frame used below), not on
    # a `dataset` variable left over from another cell.
    video_dataset['pass-fail'] = video_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Users are kept only when min_len < number of events < max_len.
min_len = 1
max_len = 1000

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)

merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

    if prob_type == 'Assignment Part':
        # Assignments carry their grade; rows with any missing field are dropped.
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        # No arr_value is assigned here, so these rows become NaN after the
        # concat and are removed by the dropna() below.
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)


merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Only users with both video and problem events survive the inner merge.
merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

# Concatenate the per-user lists: video events first, problem events after.
merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

total_length = dataset['video_length'] + dataset['prob_length']
selected_users = dataset[(total_length > min_len) & (total_length < max_len)]
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event types as column indices and
# turn the label into 1 (Passed) / 0 (anything else).
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
n_event_cols = len(list_of_events)
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    n_events = len(events)
    n_video = row['video_length']
    n_total = n_video + row['prob_length']

    arr = np.zeros([max_len, n_event_cols+2])
    arr[np.arange(n_events), events] = row['arr_value']
    # Overwrite column 8 with column 9 and clear column 9, which drops the
    # values originally written to column 8.
    # NOTE(review): presumably column 8 is 'Assignment Part' and column 9 is
    # 'Quiz' (8 video event types followed by the two problem types) -
    # confirm for every course this cell is run on.
    arr[:, 8] = arr[:, 9]
    arr[:, 9] = 0
    # The id columns start one position earlier than in the full layout.
    arr[:n_video, n_event_cols-1] = row['video_id']
    arr[n_video:n_total, n_event_cols] = row['problem_id']
    arr[n_video:n_total, n_event_cols+1] = row['submission_number']

    # Sort only the real events chronologically. Sorting the zero-padded
    # vector would move the padding (timestamp 0) in front of every positive
    # timestamp, so rows 0..length-1 would no longer be the real events.
    order = np.argsort(time[:n_events], kind='stable')
    indices = np.concatenate([order, np.arange(n_events, max_len)])

    P_users.append({'id': user_id,
                    'static': tuple([0, 0, 0, 0, 0, 0]),
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)),
                    'length': n_events
                    })

    y_users.append(row['pass-fail'])

    del arr

y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{saved_filename}_data.npy", P_users)
np.save(f"{saved_filename}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# Testing without assignments (and quizzes) at all

In [None]:
# Build the video + problem tensors with both problem-value columns cleared:
# each selected user becomes a (max_len, n_event_types + 1) matrix plus a 0/1
# pass label, saved as .npy.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # Derive the label on video_dataset itself (the frame used below), not on
    # a `dataset` variable left over from another cell.
    video_dataset['pass-fail'] = video_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Users are kept only when min_len < number of events < max_len.
min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)

merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

    if prob_type == 'Assignment Part':
        # Assignments carry their grade; rows with any missing field are dropped.
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        # No arr_value is assigned here, so these rows become NaN after the
        # concat and are removed by the dropna() below.
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)


merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Only users with both video and problem events survive the inner merge.
merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

# Concatenate the per-user lists: video events first, problem events after.
merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

total_length = dataset['video_length'] + dataset['prob_length']
selected_users = dataset[(total_length > min_len) & (total_length < max_len)]
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event types as column indices and
# turn the label into 1 (Passed) / 0 (anything else).
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
n_event_cols = len(list_of_events)
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    n_events = len(events)
    n_video = row['video_length']
    n_total = n_video + row['prob_length']

    arr = np.zeros([max_len, n_event_cols+1])
    arr[np.arange(n_events), events] = row['arr_value']
    # Clear columns 8 and 9 before they are reused for the ids below.
    # NOTE(review): presumably these are the 'Assignment Part' and 'Quiz'
    # columns (8 video event types followed by the two problem types) -
    # confirm for every course this cell is run on.
    arr[:, 8] = 0
    arr[:, 9] = 0
    # The id columns start two positions earlier than in the full layout.
    arr[:n_video, n_event_cols-2] = row['video_id']
    arr[n_video:n_total, n_event_cols-1] = row['problem_id']
    arr[n_video:n_total, n_event_cols] = row['submission_number']

    # Sort only the real events chronologically. Sorting the zero-padded
    # vector would move the padding (timestamp 0) in front of every positive
    # timestamp, so rows 0..length-1 would no longer be the real events.
    order = np.argsort(time[:n_events], kind='stable')
    indices = np.concatenate([order, np.arange(n_events, max_len)])

    P_users.append({'id': user_id,
                    'static': tuple([0, 0, 0, 0, 0, 0]),
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)),
                    'length': n_events
                    })

    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{saved_filename}_data.npy", P_users)
np.save(f"{saved_filename}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# Getting first x%

In [None]:
# Build the video + problem tensors from the first x_percent of the course
# only: each selected user becomes a (max_len, n_event_types + 3) matrix plus
# a 0/1 pass label, saved as .npy.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

# Raw string: the Windows path contains backslashes that are not valid escapes.
meta_dataset = pd.read_csv(r'F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

# The parsed datetimes are naive, so .timestamp() interprets them in the
# machine's local timezone.
# NOTE(review): confirm the event timestamps use the same reference.
start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

# Cut-off placed at x_percent of the way from course start to course end.
x_percent = 0.4
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

# Keep only events inside [start_timestamp, x_deadline].
video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # Derive the label on video_dataset itself (the frame used below), not on
    # a `dataset` variable left over from another cell.
    video_dataset['pass-fail'] = video_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Users are kept only when min_len < number of events < max_len.
min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)

merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

    if prob_type == 'Assignment Part':
        # Assignments carry their grade; rows with any missing field are dropped.
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        # No arr_value is assigned here, so these rows become NaN after the
        # concat and are removed by the dropna() below.
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)


merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Only users with both video and problem events survive the inner merge.
merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])


# Concatenate the per-user lists: video events first, problem events after.
merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types
total_length = dataset['video_length'] + dataset['prob_length']
selected_users = dataset[(total_length > min_len) & (total_length < max_len)]
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event types as column indices and
# turn the label into 1 (Passed) / 0 (anything else).
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
n_event_cols = len(list_of_events)
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    n_events = len(events)
    n_video = row['video_length']
    n_total = n_video + row['prob_length']

    # Row i carries event i: its value sits in the column of its event type;
    # the three trailing columns store video id, problem id and submission
    # number for the rows they apply to.
    arr = np.zeros([max_len, n_event_cols+3])
    arr[np.arange(n_events), events] = row['arr_value']
    arr[:n_video, n_event_cols] = row['video_id']
    arr[n_video:n_total, n_event_cols+1] = row['problem_id']
    arr[n_video:n_total, n_event_cols+2] = row['submission_number']

    # Sort only the real events chronologically. Sorting the zero-padded
    # vector would move the padding (timestamp 0) in front of every positive
    # timestamp, so rows 0..length-1 would no longer be the real events.
    order = np.argsort(time[:n_events], kind='stable')
    indices = np.concatenate([order, np.arange(n_events, max_len)])

    P_users.append({'id': user_id,
                    'static': tuple([0, 0, 0, 0, 0, 0]),
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)),
                    'length': n_events
                    })

    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{saved_filename}_data.npy", P_users)
np.save(f"{saved_filename}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# Getting first x% without assignments

In [None]:
# Build the video + problem tensors from the first x_percent of the course
# with the assignment column removed: each selected user becomes a
# (max_len, n_event_types + 2) matrix plus a 0/1 pass label, saved as .npy.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

# Raw string: the Windows path contains backslashes that are not valid escapes.
meta_dataset = pd.read_csv(r'F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

# The parsed datetimes are naive, so .timestamp() interprets them in the
# machine's local timezone.
# NOTE(review): confirm the event timestamps use the same reference.
start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

# Cut-off placed at x_percent of the way from course start to course end.
x_percent = 1
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

# Keep only events inside [start_timestamp, x_deadline].
video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # Derive the label on video_dataset itself (the frame used below), not on
    # a `dataset` variable left over from another cell.
    video_dataset['pass-fail'] = video_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Users are kept only when min_len < number of events < max_len.
min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)

merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

    if prob_type == 'Assignment Part':
        # Assignments carry their grade; rows with any missing field are dropped.
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        # No arr_value is assigned here, so these rows become NaN after the
        # concat and are removed by the dropna() below.
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)


merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Only users with both video and problem events survive the inner merge.
merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

# Concatenate the per-user lists: video events first, problem events after.
merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

total_length = dataset['video_length'] + dataset['prob_length']
selected_users = dataset[(total_length > min_len) & (total_length < max_len)]
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event types as column indices and
# turn the label into 1 (Passed) / 0 (anything else).
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
n_event_cols = len(list_of_events)
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    n_events = len(events)
    n_video = row['video_length']
    n_total = n_video + row['prob_length']

    arr = np.zeros([max_len, n_event_cols+2])
    arr[np.arange(n_events), events] = row['arr_value']
    # Overwrite column 8 with column 9 and clear column 9, which drops the
    # values originally written to column 8.
    # NOTE(review): presumably column 8 is 'Assignment Part' and column 9 is
    # 'Quiz' (8 video event types followed by the two problem types) -
    # confirm for every course this cell is run on.
    arr[:, 8] = arr[:, 9]
    arr[:, 9] = 0
    # The id columns start one position earlier than in the full layout.
    arr[:n_video, n_event_cols-1] = row['video_id']
    arr[n_video:n_total, n_event_cols] = row['problem_id']
    arr[n_video:n_total, n_event_cols+1] = row['submission_number']

    # Sort only the real events chronologically. Sorting the zero-padded
    # vector would move the padding (timestamp 0) in front of every positive
    # timestamp, so rows 0..length-1 would no longer be the real events.
    order = np.argsort(time[:n_events], kind='stable')
    indices = np.concatenate([order, np.arange(n_events, max_len)])

    P_users.append({'id': user_id,
                    'static': tuple([0, 0, 0, 0, 0, 0]),
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)),
                    'length': n_events
                    })

    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{saved_filename}_data.npy", P_users)
np.save(f"{saved_filename}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# Getting the first x% after removing easy-fails (every event)

In [None]:
# Load the course data restricted to the users listed in the easy-fail
# feature file, then keep only the events from the first x_percent of the
# course.
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

# user_ids obtained by joining the feature rows with the numeric-id mapping
# on their shared 'Unnamed: 0' column.
marras_feats = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
number_id_mapping = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


# Raw string: the Windows path contains backslashes that are not valid escapes.
meta_dataset = pd.read_csv(r'F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

# The parsed datetimes are naive, so .timestamp() interprets them in the
# machine's local timezone.
# NOTE(review): confirm the event timestamps use the same reference.
start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

# Cut-off placed at x_percent of the way from course start to course end.
x_percent = 0.4
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

# Each inner merge with hard_fail keeps only rows of the listed users.
video_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/video_event/{filename}")
video_dataset = video_dataset.merge(hard_fail, on='user_id', how='inner')
outcome_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/grade/{filename}")
outcome_dataset = outcome_dataset.merge(hard_fail, on='user_id', how='inner')

prob_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/problem_event/{filename}")
prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='inner')

# Keep only events inside [start_timestamp, x_deadline].
video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    # Derive the label on video_dataset itself (the frame used below), not on
    # a `dataset` variable left over from another cell.
    video_dataset['pass-fail'] = video_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# Sequence-length bounds for this cell.
min_len = 5
max_len = 1000

#======
#Adding numbers to arr_value, VIDEO
# Each video event gets one numeric arr_value that depends on its event type.
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        # Value = position in the video.
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        # Value = signed jump between old and new position.
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        # Any other event type is only marked as "happened".
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
# Rows come back grouped by event type; rows with a missing value in any of
# the selected columns are dropped.
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
# Each problem event gets one numeric arr_value that depends on its type.
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        # Assignments carry their grade; rows with any missing field are dropped.
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        # No arr_value is assigned here, so these rows become NaN after the
        # concat and are removed by the dropna() below.
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, each column holding that user's events as a list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Only users with both video and problem events survive the inner merge;
# column names shared by both sides get the _video / _prob suffix.
merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])


# Per-user event counts, then the concatenated timestamp list (video events
# first, problem events after).
merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+3])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:row['video_length'], len(list_of_events)] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data_hard_fail.npy", P_users)
np.save(f"{filename.split('.')[0]}_y_hard_fail.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# First x%, easy-fail removed, truncate to max_len

In [None]:
# NOTE
# Note, for now I'm assuming prob_length is always less than max_len
#
# Same preprocessing as the previous section, but users with too many events
# are truncated to max_len by `trunc` instead of being dropped, and users that
# have only video OR only problem events are kept (outer merge).

filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

# user_ids found in the easy-fail feature table; the two files are joined on
# their shared 'Unnamed: 0' column.
marras_feats = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
number_id_mapping = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


# Raw string: the backslashes are Windows path separators, not escapes
# ('\S', '\D' and '\m' are invalid escape sequences in a normal literal).
meta_dataset = pd.read_csv(r'F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

# Course start/end converted to epoch seconds.
# NOTE(review): strptime returns naive datetimes, so .timestamp() interprets
# them in the local time zone of the machine -- confirm this matches the time
# base of the event timestamps.
start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

# x_percent = 1 keeps the events of the whole course.
x_percent = 1
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

# Load the three event/grade tables and restrict each of them to `hard_fail`.
video_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/video_event/{filename}")
video_dataset = video_dataset.merge(hard_fail, on='user_id', how='inner')
outcome_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/grade/{filename}")
outcome_dataset = outcome_dataset.merge(hard_fail, on='user_id', how='inner')

prob_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/problem_event/{filename}")
prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='inner')

# Keep only events inside [course start, x_deadline].
video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

#video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='outer')
if 'XXX-' in filename:
    # The labels used further down come from `outcome_dataset`, so the derived
    # 'pass-fail' column has to be written there. The original wrote to
    # `dataset`, which is only (re)defined later in this cell.
    # NOTE(review): assumes the grade table has a numeric 'grade' column -- confirm.
    outcome_dataset['pass-fail'] = outcome_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

# min_len is unused in this cell (the length filter below is commented out);
# max_len is the sequence length every user is truncated / padded to.
min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
# The numeric value stored for an event depends on its type: playback position,
# jump size (current_time - old_time), new playback speed, or a constant 1.
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
#video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
# 'Assignment Part' rows carry their grade, 'Quiz' rows a constant 1. Any other
# type gets no arr_value and is therefore removed by the dropna() below.
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


# One row per user, every column collapsed into a Python list.
prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

# Outer merge: users with only video or only problem events are kept; their
# missing side shows up as NaN and is replaced by [] a few lines below.
merged = video_group.merge(prob_group, on='user_id', how='outer', suffixes=['_video', '_prob'])


merged['video_length'] = merged['timestamp_video'].apply(lambda x: len(x) if type(x) == list else 0)
merged['prob_length'] = merged['timestamp_prob'].apply(lambda x: len(x) if type(x) == list else 0)
# Any cell that is neither a list nor an int (i.e. the NaN left by the outer
# merge) becomes an empty list so the list concatenations below work.
for column in merged.columns:
    merged[column] = merged[column].apply(lambda x: [] if type(x) not in [list, int] else x)

# Concatenate the per-user lists: video events first, problem events after.
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

# Attach the label; the inner merge drops users without any event in the period.
merged = merged.merge(outcome_dataset[['user_id', 'pass-fail']], on='user_id', how='inner')
#TODO: What to do for students without any video or problem interactions in the specified period?
#For now, delete them, because what is the model supposed to learn from?

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

# Position in this list == column index of the event type inside `arr`.
list_of_events = list_of_video_events + list_of_prob_types

#selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
# Shorten every user with more than max_len events (see `trunc`).
selected_users = dataset.apply(trunc, max_len=max_len, axis=1)
selected_users = selected_users.reset_index(level=0)
# Zero-pad the timestamps to max_len, encode event names as column indices and
# map the label to 0/1.
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    # Row i describes event i. Its arr_value goes into the column of its event
    # type; the last three columns hold video_id (video rows) and
    # problem_id / submission_number (problem rows).
    arr = np.zeros([max_len, len(list_of_events)+3])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:row['video_length'], len(list_of_events)] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']
    
    # Order all max_len rows by timestamp.
    # NOTE(review): `time` is zero-padded, so the padding rows sort BEFORE the
    # real events and the real events end up at the tail of 'arr'/'time' --
    # confirm the consumer of 'length' expects that layout.
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
# Labels as a column vector of shape (number of users, 1).
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data_hard_fail.npy", P_users)
np.save(f"{filename.split('.')[0]}_y_hard_fail.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# For all courses, not removing students with no interactions

In [4]:
# Course files to process. The names are written with underscores here and
# converted to the hyphenated form in the same statement; the '.csv'
# extension is kept.
MOOCs_list = [
    course_file.replace("_", "-")
    for course_file in (
        'villesafricaines_002.csv',
        'villesafricaines_003.csv',
        'microcontroleurs_004.csv',
        'dsp_004.csv',
        'hwts_001.csv',
        'dsp_001.csv',
        'progfun_002.csv',
        'microcontroleurs_003.csv',
        'geomatique_003.csv',
        'villesafricaines_001.csv',
        'progfun_003.csv',
        'dsp_002.csv',
        'structures_002.csv',
        'initprogcpp_001.csv',
        'analysenumerique_003.csv',
        'microcontroleurs_006.csv',
        'dsp_005.csv',
        'hwts_002.csv',
        'dsp_006.csv',
        'analysenumerique_002.csv',
        'structures_003.csv',
        'microcontroleurs_005.csv',
        'venture_001.csv',
        'analysenumerique_001.csv',
        'cpp_fr_001.csv',
        'structures_001.csv',
    )
]

In [15]:
# Create the output folders used by the cells below. os.makedirs with
# exist_ok=True can be re-run safely (a plain `mkdir` errors once the folder
# exists) and does not depend on the shell the kernel runs under.
os.makedirs('../raindrop_data/prep_data', exist_ok=True)
os.makedirs('../raindrop_data/split_args', exist_ok=True)


In [16]:
# NOTE
# Note, for now I'm assuming prob_length is always less than max_len
#
# For every course in MOOCs_list: build one padded feature array per student
# from the events of the first `x_percent` of the course and save arrays and
# labels under ../raindrop_data/prep_data. Students without any event in that
# period are kept (right merges) and end up with an all-zero array.

dims = []        # feature dimension (number of event types + 3) per course
info_dict = {}   # course -> [number of users, share of users labelled 1]

# The metadata table is the same for every course, so read it once.
all_course_metadata = pd.read_csv('../raindrop_data/mooc/metadata.csv')

for filename in MOOCs_list:

    #filename = "progfun-002.csv"
    coursetype = "mooc/coursera"
    saved_filename = filename.split('.')[0]
    
    # user_ids found in the easy-fail feature table; the two files are joined
    # on their shared 'Unnamed: 0' column.
    marras_feats = pd.read_csv(f'../raindrop_data/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
    number_id_mapping = pd.read_csv(f'../raindrop_data/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
    hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


    meta_dataset = all_course_metadata[all_course_metadata.course_id==saved_filename]

    # Course start/end converted to epoch seconds.
    # NOTE(review): strptime returns naive datetimes, so .timestamp()
    # interprets them in the local time zone of the machine -- confirm this
    # matches the time base of the event timestamps.
    start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
    start_timestamp = start_timestamp.timestamp()
    end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
    end_timestamp = end_timestamp.timestamp()

    # Only events up to this fraction of the course duration are kept.
    x_percent = 0.6
    x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

    # Right merges keep every user of `hard_fail`, even without matching rows.
    video_dataset = pd.read_csv(f"../raindrop_data/{coursetype}/video_event/{filename}")
    video_dataset = video_dataset.merge(hard_fail, on='user_id', how='right')
    outcome_dataset = pd.read_csv(f"../raindrop_data/{coursetype}/grade/{filename}")
    outcome_dataset = outcome_dataset.merge(hard_fail, on='user_id', how='right')

    prob_dataset = pd.read_csv(f"../raindrop_data/{coursetype}/problem_event/{filename}")
    prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='right')

    
    # Keep only events inside [course start, x_deadline]; rows with a missing
    # timestamp fail both comparisons and are dropped here.
    video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
    prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]
    
    #video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='outer')
    if 'XXX-' in filename:
        # The labels used further down come from `outcome_dataset`, so the
        # derived 'pass-fail' column has to be written there. The original
        # wrote to `dataset`, i.e. the frame left over from the previous
        # iteration (or an undefined name on the first one).
        # NOTE(review): assumes the grade table has a numeric 'grade' column -- confirm.
        outcome_dataset['pass-fail'] = outcome_dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

    # min_len is unused in this cell (the length filter below is commented
    # out); max_len is the sequence length users are truncated / padded to.
    min_len = 0
    max_len = 1000

    #======
    #Adding numbers to arr_value, VIDEO
    # The numeric value stored for an event depends on its type: playback
    # position, jump size (current_time - old_time), new playback speed, or 1.
    list_of_video_events = list(np.unique(video_dataset['event_type']))
    temp_datasets = []
    for event_type in list_of_video_events:
        temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
        if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
            temp_dataset['arr_value'] = temp_dataset['current_time']
        elif event_type in ['Video.Seek', 'Video.Stalled']:
            temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
        elif event_type == 'Video.SpeedChange':
            temp_dataset['arr_value'] = temp_dataset['new_speed']
        else:
            temp_dataset['arr_value'] = 1
        temp_datasets.append(temp_dataset)

    merged_video = pd.concat(temp_datasets)
    #video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
    video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'arr_value']].dropna()
    #======


    #======
    #Adding numbers to arr_value, PROBLEM
    # 'Assignment Part' rows carry their grade, 'Quiz' rows a constant 1. Any
    # other type gets no arr_value and is removed by the dropna() below.
    list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
    temp_datasets = []
    for prob_type in list_of_prob_types:
        temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

        if prob_type == 'Assignment Part':
            temp_dataset = temp_dataset.dropna()
            temp_dataset['arr_value'] = temp_dataset['grade']
        elif prob_type == 'Quiz':
            temp_dataset['arr_value'] = 1
        else:
            print("New Problem type Found!!")
        temp_datasets.append(temp_dataset)


    merged_prob = pd.concat(temp_datasets)
    prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
    #======


    # One row per user, every column collapsed into a Python list.
    prob_group = prob_dataset.groupby('user_id').agg(list)
    video_group = video_dataset.groupby('user_id').agg(list)

    # Outer merge: users with only video or only problem events are kept.
    merged = video_group.merge(prob_group, on='user_id', how='outer', suffixes=['_video', '_prob'])


    merged['video_length'] = merged['timestamp_video'].apply(lambda x: len(x) if type(x) == list else 0)
    merged['prob_length'] = merged['timestamp_prob'].apply(lambda x: len(x) if type(x) == list else 0)
    # NaN cells left by the outer merge become [] so the list concatenations
    # below work.
    for column in merged.columns:
        merged[column] = merged[column].apply(lambda x: [] if type(x) not in [list, int] else x)

    # Concatenate the per-user lists: video events first, problem events after.
    merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
    merged['event_type'] = merged['event_type'] + merged['problem_type']
    merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

    # Right merge: every user of the outcome table is kept, including students
    # without any video or problem interaction in the period (they get length
    # 0 and empty lists below).
    merged = merged.merge(outcome_dataset[['user_id', 'pass-fail']], on='user_id', how='right')
    #TODO: What to do for students without any video or problem interactions in the specified period?
    
    merged['video_length'] = merged['video_length'].fillna(0).apply(int)
    merged['prob_length'] = merged['prob_length'].fillna(0).apply(int)
    # Second fill pass for the NaN introduced by the right merge; str is
    # allowed here so the 'pass-fail' labels survive.
    for column in merged.columns:
        merged[column] = merged[column].apply(lambda x: [] if type(x) not in [list, int, str] else x)
    

    dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])
    
    # Position in this list == column index of the event type inside `arr`.
    list_of_events = list_of_video_events + list_of_prob_types

    #selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
    # Shorten every user with more than max_len events (see `trunc`).
    selected_users = dataset.apply(trunc, max_len=max_len, axis=1)
    selected_users = selected_users.reset_index(level=0)
    # Zero-pad the timestamps to max_len, encode event names as column indices
    # and map the label to 0/1 (anything other than 'Passed' becomes 0).
    selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
    selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
    selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x=='Passed' else 0)

    P_users = []
    y_users = []
    for index, row in selected_users.iterrows():
        user_id = row['user_id']
        time = row['timestamp']
        events = row['event_type']
        # Row i describes event i. Its arr_value goes into the column of its
        # event type; the last three columns hold video_id (video rows) and
        # problem_id / submission_number (problem rows).
        arr = np.zeros([max_len, len(list_of_events)+3])
        arr[np.arange(len(events)), events] = row['arr_value']
        arr[:row['video_length'], len(list_of_events)] = row['video_id']
        arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
        arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']

        # Order all max_len rows by timestamp.
        # NOTE(review): `time` is zero-padded, so the padding rows sort BEFORE
        # the real events and the real events end up at the tail of
        # 'arr'/'time' -- confirm the consumer of 'length' expects that layout.
        indices = np.argsort(time)

        P_users.append({'id': user_id, 
                        'static': tuple([0, 0, 0, 0, 0, 0]), 
                        'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                        'arr': arr[indices],
                        'time': np.reshape(time[indices], (max_len, 1)), 
                        'length': len(events)
                        })

        y_users.append(row['pass-fail'])
    # Labels as a column vector of shape (number of users, 1).
    y_users = np.reshape(np.array(y_users), (len(y_users), 1))

    
    np.save(os.path.join('../raindrop_data/prep_data', f"{filename.split('.')[0]}_{int(x_percent*100)}_data_hard_fail.npy"), P_users)
    np.save(os.path.join('../raindrop_data/prep_data', f"{filename.split('.')[0]}_{int(x_percent*100)}_y_hard_fail.npy"), y_users)

    print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")
    dims.append(len(list_of_events)+3)
    # The ratio is a one-element array here (y_users has shape (n, 1)).
    info_dict[saved_filename] = [len(P_users), sum(y_users)/len(y_users)]


Dataset: villesafricaines-002.csv 
 Users: 3000 
 1/all ratio: [0.078]
Dataset: villesafricaines-003.csv 
 Users: 2153 
 1/all ratio: [0.10496981]
Dataset: microcontroleurs-004.csv 
 Users: 2827 
 1/all ratio: [0.08206579]
Dataset: dsp-004.csv 
 Users: 1735 
 1/all ratio: [0.16311239]
Dataset: hwts-001.csv 
 Users: 1400 
 1/all ratio: [0.45714286]
Dataset: dsp-001.csv 
 Users: 5611 
 1/all ratio: [0.2696489]
Dataset: progfun-002.csv 
 Users: 7840 
 1/all ratio: [0.81747449]
Dataset: microcontroleurs-003.csv 
 Users: 567 
 1/all ratio: [0.49382716]
Dataset: geomatique-003.csv 
 Users: 452 
 1/all ratio: [0.45132743]
Dataset: villesafricaines-001.csv 
 Users: 4941 
 1/all ratio: [0.11353977]
Dataset: progfun-003.csv 
 Users: 10862 
 1/all ratio: [0.52071442]
Dataset: dsp-002.csv 
 Users: 3974 
 1/all ratio: [0.23351787]
Dataset: structures-002.csv 
 Users: 97 
 1/all ratio: [0.84536082]
Dataset: initprogcpp-001.csv 
 Users: 727 
 1/all ratio: [0.63411279]
Dataset: analysenumerique-003.cs

In [None]:
# Turn the pass ratio of every course into a plain Python float. The loop
# stores it as a one-element array; calling float() on such an array is
# deprecated since NumPy 1.25, so the scalar is extracted with .item().
# np.asarray(...) also accepts an already-converted float, so re-running this
# cell is harmless.
info_dict = {
    course: [stats[0], float(np.asarray(stats[1]).item())]
    for course, stats in info_dict.items()
}

In [None]:
# Display the per-course summary: {course: [number of users, share of users labelled 1]}.
info_dict

In [None]:
# Display the event vocabulary (video event types followed by problem types)
# as left by whichever cell assigned it last; its order gives the first
# columns of `arr`.
list_of_events

In [None]:
# Literal list of 13 names: eight Video.* event types, two problem types, then
# 'video_id', 'problem_id' and 'submission_number'.
# NOTE(review): presumably the column order of the `arr` feature matrix (event
# columns followed by the three extra id columns), pasted here for reference
# -- confirm.
['Video.Download',
 'Video.Error',
 'Video.Load',
 'Video.Pause',
 'Video.Play',
 'Video.Seek',
 'Video.SpeedChange',
 'Video.Stalled',
 'Assignment Part',
 'Quiz',
 'video_id',
 'problem_id',
 'submission_number']

In [None]:
# Display the distinct problem types of whichever problem_event table was processed last.
list_of_prob_types

In [None]:
# NOTE(review): `dims4` is not assigned in any cell visible here, so this fails
# on a fresh kernel. Presumably a copy of `dims` kept from a run with
# x_percent = 0.4 -- confirm and define it explicitly.
dims4

In [None]:
# NOTE(review): `dims6` is not assigned in any cell visible here, so this fails
# on a fresh kernel. Presumably a copy of `dims` kept from a run with
# x_percent = 0.6 -- confirm and define it explicitly.
dims6

In [None]:
# Number of entries in `dims4`.
# NOTE(review): `dims4` is not assigned in any cell visible here -- confirm where it comes from.
len(dims4)

In [None]:
# Replace every missing cell of `merged` with an empty list so the list-valued
# columns can be concatenated. The original line ('merged[.loc[...') was not
# valid Python and could never run. The isinstance check comes first because
# pd.isna() on a list would return an array instead of a single bool.
merged = merged.apply(
    lambda col: col.map(
        lambda cell: [] if not isinstance(cell, list) and pd.isna(cell) else cell
    )
)

In [None]:
# Normalise every column of `merged`: any cell that is not exactly a list
# becomes an empty list.
for column_name in merged.columns:
    merged[column_name] = merged[column_name].apply(
        lambda cell: cell if type(cell) == list else []
    )

In [None]:
# Display the merged per-user frame to inspect the result of the cell above.
merged

In [None]:
# Preview (without assigning) the 'timestamp_video' column with every non-list
# cell replaced by an empty list.
merged['timestamp_video'].apply(lambda x: [] if type(x) != list else x)

In [None]:
# (rows, columns) of the per-user problem frame; one row per user_id after the groupby.
prob_group.shape

In [None]:
# (rows, columns) of the per-user video frame; one row per user_id after the groupby.
video_group.shape

In [None]:
# Debug copy of the merge step: outer-join the per-user video and problem
# frames and concatenate their list columns.
# NOTE(review): unlike the final cells, the NaN cells produced by the outer
# merge are NOT replaced by [] before the three `+` lines, so for users
# missing one side the concatenation operates on NaN rather than on a list --
# presumably this is the failure being investigated in the cells around it.
merged = video_group.merge(prob_group, on='user_id', how='outer', suffixes=['_video', '_prob'])


# Lengths are 0 where the cell is not a list (i.e. missing after the merge).
merged['video_length'] = merged['timestamp_video'].apply(lambda x: len(x) if type(x) == list else 0)
merged['prob_length'] = merged['timestamp_prob'].apply(lambda x: len(x) if type(x) == list else 0)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

In [None]:
# Display `merged` after the outer merge above.
merged

In [None]:
# Check whether 'video_id' is null for the row whose index value is 3257038
# (the index of `merged` is user_id after the groupby/merge above).
merged[merged.index == 3257038]['video_id'].isnull()

In [None]:
# Display the per-user lists of video timestamps.
merged['timestamp_video']

In [None]:
# Demonstration cell: np.nan is a float and has no len(), so this raises
# TypeError. That is why the length computations above check
# `type(x) == list` before calling len().
# NOTE(review): this cell stops a "Run All" -- remove it or wrap it in try/except.
len(np.nan)

# Temp exploration

In [None]:
# Display the event vocabulary left by the last preprocessing cell that was run.
list_of_events

In [None]:
# Reload the inputs for a single course (exploration only).
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

# user_ids found in the easy-fail feature table; the two files are joined on
# their shared 'Unnamed: 0' column.
marras_feats = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
number_id_mapping = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


# Raw string: the backslashes are Windows path separators, not escapes
# ('\S', '\D' and '\m' are invalid escape sequences in a normal literal).
meta_dataset = pd.read_csv(r'F:\SeFT4ED\DATASETS\mooc\metadata.csv')
# Keep only the metadata row(s) of this course.
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

In [None]:
# Load the raw problem-event table of the course (no user or time filtering yet).
prob_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/problem_event/{filename}")

In [None]:
# Display the raw problem-event table.
prob_dataset

In [None]:
# Distinct problem types and the number of rows of each, before restricting to `hard_fail`.
np.unique(prob_dataset['problem_type'], return_counts=True)

In [None]:
# Keep only the problem events of users listed in `hard_fail` (inner join on
# user_id). This overwrites `prob_dataset`, so the unfiltered table is gone
# until the load cell is run again.
prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='inner')

In [None]:
# Display the problem-event table after restricting it to `hard_fail`.
prob_dataset

In [None]:
# Distinct problem types and the number of rows of each, after restricting to `hard_fail`.
np.unique(prob_dataset['problem_type'], return_counts=True)

In [None]:
# Total number of events (video + problem) per user, as a NumPy array.
lengths = dataset['video_length'].add(dataset['prob_length']).to_numpy()

In [None]:
# Sort the per-user event counts in place, ascending.
lengths.sort()

In [None]:
# The last 500 entries of `lengths`, i.e. the largest event counts once the array has been sorted.
lengths[-500:]

In [None]:
# Number of users in `dataset` (one entry of `lengths` per row).
len(lengths)

In [None]:
# Per-user number of problem events as a NumPy array. copy=True guarantees the
# array is not a view on the DataFrame: the next cell sorts `a` in place, and
# sorting a view would silently reorder dataset['prob_length'] itself.
a = dataset['prob_length'].to_numpy(copy=True)

In [None]:
# Sort the per-user problem-event counts in place, ascending.
# NOTE(review): `a` may be a view on dataset['prob_length'] (to_numpy() does
# not guarantee a copy); if so, this also reorders that column -- confirm.
a.sort()

In [None]:
# The last 500 entries of `a`, i.e. the largest problem-event counts once the array has been sorted.
a[-500:]