In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime

### Auxiliary functions:

In [2]:
def process_bytes(x, distinction=False):
    x = str(x)
    if "b'" not in x:
        val = x
    else:
        val = x.replace("b'", "").split("\\x00")[0]
        if "'" in val:
            val = val[:-1]
    if distinction:
        if val == "":
            val = False
        else:
            val = True
    return val

def trunc(x_input, max_len):
    x = x_input.copy()
    x_video_len = x['video_length']
    x_prob_len = x['prob_length']
    x_len = x_video_len + x_prob_len
    
    diff = x_len - max_len
    
    if diff > 0:
        x['video_id'] = x['video_id'][:-diff]
        x['event_type'] = x['event_type'][:x_video_len-diff] + x['event_type'][x_video_len:]
        x['pass-fail'] = x['pass-fail']
        x['timestamp'] = x['timestamp'][:x_video_len-diff] + x['timestamp'][x_video_len:]
        x['arr_value'] = x['arr_value'][:x_video_len-diff] + x['arr_value'][x_video_len:]
        x['video_length'] = x['video_length'] - diff
        
    return x

# Only using video events:

In [4]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

dataset = dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 215

#======
#Adding numbers to arr_value
list_of_video_events = list(np.unique(dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = dataset[dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


dataset_group = dataset.groupby('user_id')
time_event_per_user = dataset_group[['timestamp', 'event_type', 'pass-fail', 'video_id', 'arr_value']].agg(lambda x: list(x))
selected_users = time_event_per_user[(dataset_group.size() > min_len) & (dataset_group.size() < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_video_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_video_events)+1])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:len(events), len(list_of_video_events)] = row['video_id']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data.npy", P_users)
np.save(f"{filename.split('.')[0]}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


Dataset: progfun-002.csv 
 Users: 7902 
 1/all ratio: [0.39736775]


# Video+Problem events:

In [49]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+3])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:row['video_length'], len(list_of_events)] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data.npy", P_users)
np.save(f"{filename.split('.')[0]}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


Dataset: progfun-002.csv 
 Users: 7292 
 1/all ratio: [0.36437191]


# Testing without assignment

In [None]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

meta_dataset = pd.read_csv('F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

x_percent = 1
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset[(video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 1
max_len = 1000

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type'])[0:])
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+2])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]] = arr[:, [0, 1, 2, 3, 4, 5, 6, 7, 9]]
    arr[:, 9] = 0
    arr[:row['video_length'], len(list_of_events)-1] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
    
    del arr
    
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data.npy", P_users)
np.save(f"{filename.split('.')[0]}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


# Testing without assignment (and quizes) at all

In [13]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type'])[0:])
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+1])
    arr[np.arange(len(events)), events] = row['arr_value']
    #arr[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]] = arr[:, [0, 1, 2, 3, 4, 5, 6, 7, 9]]
    arr[:, 8] = 0
    arr[:, 9] = 0
    arr[:row['video_length'], len(list_of_events)-2] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)-1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data.npy", P_users)
np.save(f"{filename.split('.')[0]}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")



KeyboardInterrupt



# Getting first x%

In [115]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

meta_dataset = pd.read_csv('F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

x_percent = 0.4
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])


merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types
selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+3])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:row['video_length'], len(list_of_events)] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data.npy", P_users)
np.save(f"{filename.split('.')[0]}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


Dataset: progfun-002.csv 
 Users: 9408 
 1/all ratio: [0.5]


# Getting first x% without assignments

In [23]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

meta_dataset = pd.read_csv('F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

x_percent = 1
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/video_event/{filename}")
outcome_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/grade/{filename}")

prob_dataset = pd.read_csv(f"F:/SeFT4ED/DATASETS/{coursetype}/problem_event/{filename}")

video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type'])[0:])
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])

merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+2])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]] = arr[:, [0, 1, 2, 3, 4, 5, 6, 7, 9]]
    arr[:, 9] = 0
    arr[:row['video_length'], len(list_of_events)-1] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data.npy", P_users)
np.save(f"{filename.split('.')[0]}_y.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


Dataset: progfun-002.csv 
 Users: 7521 
 1/all ratio: [0.38997474]


# Getting first x% with removing easy-fails (every event)

In [74]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

marras_feats = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
number_id_mapping = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


meta_dataset = pd.read_csv('F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

x_percent = 0.4
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/video_event/{filename}")
video_dataset = video_dataset.merge(hard_fail, on='user_id', how='inner')
outcome_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/grade/{filename}")
outcome_dataset = outcome_dataset.merge(hard_fail, on='user_id', how='inner')

prob_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/problem_event/{filename}")
prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='inner')

video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='inner')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 1000

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='inner', suffixes=['_video', '_prob'])


merged['video_length'] = merged['timestamp_video'].apply(len)
merged['prob_length'] = merged['timestamp_prob'].apply(len)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x[0]=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+3])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:row['video_length'], len(list_of_events)] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data_hard_fail.npy", P_users)
np.save(f"{filename.split('.')[0]}_y_hard_fail.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


Dataset: progfun-002.csv 
 Users: 7393 
 1/all ratio: [0.83038009]


# First x%, easy-fail removed, truncate to max_len

In [6]:
# NOTE
# Note, for now I'm assuming prob_length is always less than max_len

filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

marras_feats = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
number_id_mapping = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


meta_dataset = pd.read_csv('F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
start_timestamp = start_timestamp.timestamp()
end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
end_timestamp = end_timestamp.timestamp()

x_percent = 1
x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

video_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/video_event/{filename}")
video_dataset = video_dataset.merge(hard_fail, on='user_id', how='inner')
outcome_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/grade/{filename}")
outcome_dataset = outcome_dataset.merge(hard_fail, on='user_id', how='inner')

prob_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/problem_event/{filename}")
prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='inner')

video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]

#video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='outer')
if 'XXX-' in filename:
    dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

min_len = 5
max_len = 215

#======
#Adding numbers to arr_value, VIDEO
list_of_video_events = list(np.unique(video_dataset['event_type']))
temp_datasets = []
for event_type in list_of_video_events:
    temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
    if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
        temp_dataset['arr_value'] = temp_dataset['current_time']
    elif event_type in ['Video.Seek', 'Video.Stalled']:
        temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
    elif event_type == 'Video.SpeedChange':
        temp_dataset['arr_value'] = temp_dataset['new_speed']
    else:
        temp_dataset['arr_value'] = 1
    temp_datasets.append(temp_dataset)
    
merged_video = pd.concat(temp_datasets)
#video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'arr_value']].dropna()
#======


#======
#Adding numbers to arr_value, PROBLEM
list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
temp_datasets = []
for prob_type in list_of_prob_types:
    temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()
    
    if prob_type == 'Assignment Part':
        temp_dataset = temp_dataset.dropna()
        temp_dataset['arr_value'] = temp_dataset['grade']
    elif prob_type == 'Quiz':
        temp_dataset['arr_value'] = 1
    else:
        print("New Problem type Found!!")
    temp_datasets.append(temp_dataset)
    
    
merged_prob = pd.concat(temp_datasets)
prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
#======


prob_group = prob_dataset.groupby('user_id').agg(list)
video_group = video_dataset.groupby('user_id').agg(list)

merged = video_group.merge(prob_group, on='user_id', how='outer', suffixes=['_video', '_prob'])


merged['video_length'] = merged['timestamp_video'].apply(lambda x: len(x) if type(x) == list else 0)
merged['prob_length'] = merged['timestamp_prob'].apply(lambda x: len(x) if type(x) == list else 0)
for column in merged.columns:
    merged[column] = merged[column].apply(lambda x: [] if type(x) not in [list, int] else x)

merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

merged = merged.merge(outcome_dataset[['user_id', 'pass-fail']], on='user_id', how='inner')
#TODO: What to do for students without any video or problem interactions in the specified period?
#For now, delete them, because what is the model supposed to learn from?

dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])

list_of_events = list_of_video_events + list_of_prob_types

#selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
selected_users = dataset.apply(trunc, max_len=max_len, axis=1)
selected_users = selected_users.reset_index(level=0)
selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x=='Passed' else 0)

P_users = []
y_users = []
for index, row in selected_users.iterrows():
    user_id = row['user_id']
    time = row['timestamp']
    events = row['event_type']
    arr = np.zeros([max_len, len(list_of_events)+3])
    arr[np.arange(len(events)), events] = row['arr_value']
    arr[:row['video_length'], len(list_of_events)] = row['video_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
    arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']
    
    indices = np.argsort(time)
    
    P_users.append({'id': user_id, 
                    'static': tuple([0, 0, 0, 0, 0, 0]), 
                    'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                    'arr': arr[indices],
                    'time': np.reshape(time[indices], (max_len, 1)), 
                    'length': len(events)
                    })
    
    y_users.append(row['pass-fail'])
y_users = np.reshape(np.array(y_users), (len(y_users), 1))

np.save(f"{filename.split('.')[0]}_data_hard_fail.npy", P_users)
np.save(f"{filename.split('.')[0]}_y_hard_fail.npy", y_users)

print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")


FileNotFoundError: [Errno 2] No such file or directory: 'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-progfun_002/feature_labels.csv'

# For all courses, not removing students with no interactions

In [4]:
MOOCs_list = [
'villesafricaines_002.csv',
 'villesafricaines_003.csv',
 'microcontroleurs_004.csv',
 'dsp_004.csv',
 'hwts_001.csv',
 'dsp_001.csv',
 'progfun_002.csv',
 'microcontroleurs_003.csv',
 'geomatique_003.csv',
 'villesafricaines_001.csv',
 'progfun_003.csv',
 'dsp_002.csv',
 'structures_002.csv',
 'initprogcpp_001.csv',
 'analysenumerique_003.csv',
 'microcontroleurs_006.csv',
 'dsp_005.csv',
 'hwts_002.csv',
 'dsp_006.csv',
 'analysenumerique_002.csv',
 'structures_003.csv',
 'microcontroleurs_005.csv',
 'venture_001.csv',
 'analysenumerique_001.csv',
 'cpp_fr_001.csv',
 'structures_001.csv'
]
#MOOCs_list = [i.split('.')[0] for i in MOOCs_list]
MOOCs_list = [i.replace("_", "-") for i in MOOCs_list]

In [5]:
# NOTE
# Note, for now I'm assuming prob_length is always less than max_len

dims = []
info_dict = {}
for filename in MOOCs_list:

    #filename = "progfun-002.csv"
    coursetype = "mooc/coursera"
    saved_filename = filename.split('.')[0]

    marras_feats = pd.read_csv(f'/beerslaw/raindrop_data/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
    number_id_mapping = pd.read_csv(f'/beerslaw/raindrop_data/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
    hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


    meta_dataset = pd.read_csv('/beerslaw/raindrop_data/mooc/metadata.csv')
    meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

    start_timestamp = datetime.datetime.strptime(meta_dataset.start_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
    start_timestamp = start_timestamp.timestamp()
    end_timestamp = datetime.datetime.strptime(meta_dataset.end_date.to_numpy()[0], '%Y-%m-%d %H:%M:%S')
    end_timestamp = end_timestamp.timestamp()

    x_percent = 0.6
    x_deadline = start_timestamp + x_percent*(end_timestamp-start_timestamp)

    video_dataset = pd.read_csv(f"/beerslaw/raindrop_data/{coursetype}/video_event/{filename}")
    video_dataset = video_dataset.merge(hard_fail, on='user_id', how='right')
    outcome_dataset = pd.read_csv(f"/beerslaw/raindrop_data/{coursetype}/grade/{filename}")
    outcome_dataset = outcome_dataset.merge(hard_fail, on='user_id', how='right')

    prob_dataset = pd.read_csv(f"/beerslaw/raindrop_data/{coursetype}/problem_event/{filename}")
    prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='right')

    
    video_dataset = video_dataset[(video_dataset.timestamp >= start_timestamp) & (video_dataset.timestamp <= x_deadline)]
    prob_dataset = prob_dataset[(prob_dataset.timestamp >= start_timestamp) & (prob_dataset.timestamp <= x_deadline)]
    
    #video_dataset = video_dataset.merge(outcome_dataset, on='user_id', how='outer')
    if 'XXX-' in filename:
        dataset['pass-fail'] = dataset['grade'].apply(lambda x: 'Passed' if x>=4 else 'Failed')

    min_len = 0
    max_len = 1000

    #======
    #Adding numbers to arr_value, VIDEO
    list_of_video_events = list(np.unique(video_dataset['event_type']))
    temp_datasets = []
    for event_type in list_of_video_events:
        temp_dataset = video_dataset[video_dataset.event_type == event_type].copy()
        if event_type in ['Video.Error', 'Video.Pause', 'Video.Play']:
            temp_dataset['arr_value'] = temp_dataset['current_time']
        elif event_type in ['Video.Seek', 'Video.Stalled']:
            temp_dataset['arr_value'] = temp_dataset['current_time'] - temp_dataset['old_time']
        elif event_type == 'Video.SpeedChange':
            temp_dataset['arr_value'] = temp_dataset['new_speed']
        else:
            temp_dataset['arr_value'] = 1
        temp_datasets.append(temp_dataset)

    merged_video = pd.concat(temp_datasets)
    #video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'pass-fail', 'arr_value']].dropna()
    video_dataset = merged_video[['user_id', 'video_id', 'event_type', 'timestamp', 'arr_value']].dropna()
    #======


    #======
    #Adding numbers to arr_value, PROBLEM
    list_of_prob_types = list(np.unique(prob_dataset['problem_type']))
    temp_datasets = []
    for prob_type in list_of_prob_types:
        temp_dataset = prob_dataset[prob_dataset.problem_type == prob_type].copy()

        if prob_type == 'Assignment Part':
            temp_dataset = temp_dataset.dropna()
            temp_dataset['arr_value'] = temp_dataset['grade']
        elif prob_type == 'Quiz':
            temp_dataset['arr_value'] = 1
        else:
            print("New Problem type Found!!")
        temp_datasets.append(temp_dataset)


    merged_prob = pd.concat(temp_datasets)
    prob_dataset = merged_prob[['user_id', 'problem_id', 'problem_type', 'timestamp', 'submission_number', 'arr_value']].dropna()
    #======


    prob_group = prob_dataset.groupby('user_id').agg(list)
    video_group = video_dataset.groupby('user_id').agg(list)

    merged = video_group.merge(prob_group, on='user_id', how='outer', suffixes=['_video', '_prob'])


    merged['video_length'] = merged['timestamp_video'].apply(lambda x: len(x) if type(x) == list else 0)
    merged['prob_length'] = merged['timestamp_prob'].apply(lambda x: len(x) if type(x) == list else 0)
    for column in merged.columns:
        merged[column] = merged[column].apply(lambda x: [] if type(x) not in [list, int] else x)

    merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
    merged['event_type'] = merged['event_type'] + merged['problem_type']
    merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

    merged = merged.merge(outcome_dataset[['user_id', 'pass-fail']], on='user_id', how='right')
    #TODO: What to do for students without any video or problem interactions in the specified period?
    #For now, delete them, because what is the model supposed to learn from?
    
    merged['video_length'] = merged['video_length'].fillna(0).apply(int)
    merged['prob_length'] = merged['prob_length'].fillna(0).apply(int)
    for column in merged.columns:
        merged[column] = merged[column].apply(lambda x: [] if type(x) not in [list, int, str] else x)
    

    dataset = merged.drop(columns=['timestamp_video', 'timestamp_prob', 'arr_value_video', 'arr_value_prob', 'problem_type'])
    
    list_of_events = list_of_video_events + list_of_prob_types

    #selected_users = dataset[(dataset['video_length'] + dataset['prob_length'] > min_len) & (dataset['video_length'] + dataset['prob_length'] < max_len)]
    selected_users = dataset.apply(trunc, max_len=max_len, axis=1)
    selected_users = selected_users.reset_index(level=0)
    selected_users['timestamp'] = selected_users['timestamp'].apply(lambda x: np.pad(x, (0, max_len-len(x)), 'constant', constant_values=(0, 0)))
    selected_users['event_type'] = selected_users['event_type'].apply(lambda x: [list_of_events.index(i) for i in x])
    selected_users['pass-fail'] = selected_users['pass-fail'].apply(lambda x: 1 if x=='Passed' else 0)

    P_users = []
    y_users = []
    for index, row in selected_users.iterrows():
        user_id = row['user_id']
        time = row['timestamp']
        events = row['event_type']
        arr = np.zeros([max_len, len(list_of_events)+3])
        arr[np.arange(len(events)), events] = row['arr_value']
        arr[:row['video_length'], len(list_of_events)] = row['video_id']
        arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+1] = row['problem_id']
        arr[row['video_length']:row['video_length']+row['prob_length'], len(list_of_events)+2] = row['submission_number']

        indices = np.argsort(time)

        P_users.append({'id': user_id, 
                        'static': tuple([0, 0, 0, 0, 0, 0]), 
                        'extended_static': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                        'arr': arr[indices],
                        'time': np.reshape(time[indices], (max_len, 1)), 
                        'length': len(events)
                        })

        y_users.append(row['pass-fail'])
    y_users = np.reshape(np.array(y_users), (len(y_users), 1))

    
    np.save(os.path.join('/beerslaw/raindrop_data/prep_data', f"{filename.split('.')[0]}_{int(x_percent*100)}_data_hard_fail.npy"), P_users)
    np.save(os.path.join('/beerslaw/raindrop_data/prep_data', f"{filename.split('.')[0]}_{int(x_percent*100)}_y_hard_fail.npy"), y_users)

    print(f"Dataset: {filename} \n Users: {len(P_users)} \n 1/all ratio: {sum(y_users)/len(y_users)}")
    dims.append(len(list_of_events)+3)
    info_dict[saved_filename] = [len(P_users), sum(y_users)/len(y_users)]


Dataset: villesafricaines-002.csv 
 Users: 3000 
 1/all ratio: [0.078]
Dataset: villesafricaines-003.csv 
 Users: 2153 
 1/all ratio: [0.10496981]
Dataset: microcontroleurs-004.csv 
 Users: 2827 
 1/all ratio: [0.08206579]
Dataset: dsp-004.csv 
 Users: 1735 
 1/all ratio: [0.16311239]
Dataset: hwts-001.csv 
 Users: 1400 
 1/all ratio: [0.45714286]
Dataset: dsp-001.csv 
 Users: 5611 
 1/all ratio: [0.2696489]
Dataset: progfun-002.csv 
 Users: 7840 
 1/all ratio: [0.81747449]
Dataset: microcontroleurs-003.csv 
 Users: 567 
 1/all ratio: [0.49382716]
Dataset: geomatique-003.csv 
 Users: 452 
 1/all ratio: [0.45132743]
Dataset: villesafricaines-001.csv 
 Users: 4941 
 1/all ratio: [0.11353977]
Dataset: progfun-003.csv 
 Users: 10862 
 1/all ratio: [0.52071442]
Dataset: dsp-002.csv 
 Users: 3974 
 1/all ratio: [0.23351787]
Dataset: structures-002.csv 
 Users: 97 
 1/all ratio: [0.84536082]
Dataset: initprogcpp-001.csv 
 Users: 727 
 1/all ratio: [0.63411279]
Dataset: analysenumerique-003.cs

In [8]:
info_dict = {key: [info_dict[key][0], float(info_dict[key][1])] for key in info_dict.keys()}

In [9]:
info_dict

{'villesafricaines-002': [3000, 0.078],
 'villesafricaines-003': [2153, 0.1049698095680446],
 'microcontroleurs-004': [2827, 0.08206579412805094],
 'dsp-004': [1735, 0.16311239193083574],
 'hwts-001': [1400, 0.45714285714285713],
 'dsp-001': [5611, 0.26964890393869184],
 'progfun-002': [7840, 0.8174744897959184],
 'microcontroleurs-003': [567, 0.49382716049382713],
 'geomatique-003': [452, 0.45132743362831856],
 'villesafricaines-001': [4941, 0.1135397692774742],
 'progfun-003': [10862, 0.5207144172343952],
 'dsp-002': [3974, 0.233517866129844],
 'structures-002': [97, 0.845360824742268],
 'initprogcpp-001': [727, 0.6341127922971114],
 'analysenumerique-003': [459, 0.7472766884531591],
 'microcontroleurs-006': [1470, 0.10884353741496598],
 'dsp-005': [2605, 0.17159309021113245],
 'hwts-002': [1023, 0.49364613880742914],
 'dsp-006': [1469, 0.24029952348536418],
 'analysenumerique-002': [504, 0.7162698412698413],
 'structures-003': [173, 0.31213872832369943],
 'microcontroleurs-005': [26

In [12]:
list_of_events

['Video.Download',
 'Video.Error',
 'Video.Load',
 'Video.Pause',
 'Video.Play',
 'Video.Seek',
 'Video.SpeedChange',
 'Video.Stalled',
 'Assignment Part',
 'Quiz']

In [None]:
['Video.Download',
 'Video.Error',
 'Video.Load',
 'Video.Pause',
 'Video.Play',
 'Video.Seek',
 'Video.SpeedChange',
 'Video.Stalled',
 'Assignment Part',
 'Quiz',
 'video_id',
 'problem_id',
 'submission_number']

In [13]:
list_of_prob_types

['Quiz']

In [17]:
dims4

[12,
 12,
 13,
 12,
 12,
 6,
 12,
 13,
 12,
 12,
 12,
 12,
 13,
 13,
 6,
 13,
 12,
 12,
 12,
 12,
 13,
 13,
 12,
 6,
 13,
 12]

In [20]:
dims6

[12,
 12,
 13,
 12,
 12,
 6,
 12,
 13,
 12,
 12,
 12,
 12,
 13,
 13,
 6,
 13,
 12,
 12,
 13,
 12,
 13,
 13,
 12,
 6,
 13,
 12]

In [21]:
len(dims4)

26

In [201]:
merged[.loc[merged.isnull()] = merged.loc[merged.isnull()].apply(lambda x: [])

ValueError: Cannot index with multidimensional key

In [212]:
for motherfuckingcolumn in merged.columns:
    merged[motherfuckingcolumn] = merged[motherfuckingcolumn].apply(lambda x: [] if type(x) != list else x)

In [213]:
merged

Unnamed: 0_level_0,video_id,event_type,timestamp_video,pass-fail,arr_value_video,problem_id,problem_type,timestamp_prob,submission_number,arr_value_prob,video_length,prob_length
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1578,"[3, 109, 89, 8, 5, 91, 21, 5, 93, 103, 125, 91...","[Video.Download, Video.Download, Video.Downloa...","[1364293284, 1367410652, 1367049572, 136428962...","[Passed, Passed, Passed, Passed, Passed, Passe...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[12, 20, 7, 17, 20, 12, 12, 12, 6, 12, 14, 6, ...","[Assignment Part, Assignment Part, Assignment ...","[1366038288, 1368552465, 1364898526, 136790945...","[2, 3, 1, 1, 2, 6, 5, 4, 1, 1, 1, 2, 1, 1, 3]","[9.27, 10.0, 10.0, 10.0, 0.0, 10.0, 9.27, 9.27...",[],[]
1584,"[33, 21, 103, 85, 117, 121, 41, 105, 109, 73, ...","[Video.Download, Video.Download, Video.Downloa...","[1384066837, 1384066759, 1367744683, 138406704...","[Failed, Failed, Failed, Failed, Failed, Faile...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[14, 20, 7, 17, 12, 6, 20]","[Assignment Part, Assignment Part, Assignment ...","[1367607437, 1373900387, 1364939197, 136912613...","[1, 1, 1, 1, 1, 1, 2]","[9.55, 9.65, 10.0, 10.0, 10.0, 10.0, 10.0]",[],[]
1587,"[35, 37, 101, 111, 91, 79, 95, 75, 107, 113, 2...","[Video.Download, Video.Download, Video.Downloa...","[1368215102, 1368215106, 1368215223, 136821524...","[Passed, Passed, Passed, Passed, Passed, Passe...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[7, 7, 20, 5, 6, 12, 14, 17, 12, 12]","[Assignment Part, Assignment Part, Assignment ...","[1364959846, 1364960882, 1367994193, 136425592...","[1, 2, 1, 1, 1, 1, 1, 1, 2, 3]","[10.0, 10.0, 10.0, 10.0, 10.0, 7.09, 10.0, 10....",[],[]
1727,"[125, 121, 113, 123, 115, 103, 119, 43, 71, 79...","[Video.Download, Video.Download, Video.Downloa...","[1368864326, 1368864136, 1368311193, 136886344...","[Passed, Passed, Passed, Passed, Passed, Passe...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[14, 17, 12, 7, 6, 5, 12, 12, 7, 7, 6, 6, 17, ...","[Assignment Part, Assignment Part, Assignment ...","[1367043197, 1368525256, 1366594645, 136497833...","[1, 2, 2, 2, 1, 1, 4, 3, 1, 3, 5, 3, 1, 1, 1, ...","[10.0, 10.0, 9.27, 9.98, 6.33, 10.0, 10.0, 9.2...",[],[]
1887,"[8, 4, 51, 5, 8, 43, 41, 37, 35, 3, 27, 53, 35...","[Video.Load, Video.Load, Video.Load, Video.Loa...","[1364273704, 1364878645, 1367165509, 136427404...","[Failed, Failed, Failed, Failed, Failed, Faile...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[12, 6, 6, 12, 7, 12]","[Assignment Part, Assignment Part, Assignment ...","[1367190487, 1366133246, 1366131479, 136718971...","[2, 2, 1, 1, 1, 3]","[7.09, 9.96, 7.63, 7.09, 9.8, 10.0]",[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
3187748,[],[],[],[],[],"[6, 20, 12, 12, 17, 7, 14, 20]","[Assignment Part, Assignment Part, Assignment ...","[1365926675, 1368954488, 1366861742, 136695102...","[1, 2, 1, 2, 1, 1, 1, 1]","[10.0, 10.0, 0.0, 10.0, 10.0, 10.0, 10.0, 10.0]",[],[]
3196657,[],[],[],[],[],"[5, 7, 20, 12, 14, 6, 17]","[Assignment Part, Assignment Part, Assignment ...","[1365347414, 1365347332, 1368747396, 136712167...","[1, 1, 1, 1, 1, 1, 1]","[10.0, 9.96, 10.0, 10.0, 10.0, 10.0, 9.98]",[],[]
3207804,[],[],[],[],[],"[7, 17, 12, 14, 6, 20, 14, 14, 14, 7, 12, 6, 1...","[Assignment Part, Assignment Part, Assignment ...","[1365451607, 1368206638, 1366308909, 136683457...","[2, 1, 1, 1, 3, 1, 3, 4, 2, 1, 2, 1, 5, 2, 2]","[10.0, 9.98, 7.09, 9.55, 10.0, 10.0, 9.55, 9.5...",[],[]
3238405,[],[],[],[],[],"[17, 14, 7, 12, 6, 14, 14]","[Assignment Part, Assignment Part, Assignment ...","[1367611577, 1366409905, 1365712641, 136640334...","[1, 2, 1, 1, 1, 1, 3]","[10.0, 9.55, 10.0, 10.0, 10.0, 9.55, 10.0]",[],[]


In [191]:
merged['timestamp_video'].apply(lambda x: [] if type(x) != list else x)

user_id
1578       [1364293284, 1367410652, 1367049572, 136428962...
1584       [1384066837, 1384066759, 1367744683, 138406704...
1587       [1368215102, 1368215106, 1368215223, 136821524...
1727       [1368864326, 1368864136, 1368311193, 136886344...
1887       [1364273704, 1364878645, 1367165509, 136427404...
                                 ...                        
3187748                                                   []
3196657                                                   []
3207804                                                   []
3238405                                                   []
3257038                                                   []
Name: timestamp_video, Length: 7840, dtype: object

In [149]:
prob_group.shape

(7840, 5)

In [150]:
video_group.shape

(7787, 5)

In [177]:
merged = video_group.merge(prob_group, on='user_id', how='outer', suffixes=['_video', '_prob'])


merged['video_length'] = merged['timestamp_video'].apply(lambda x: len(x) if type(x) == list else 0)
merged['prob_length'] = merged['timestamp_prob'].apply(lambda x: len(x) if type(x) == list else 0)
merged['timestamp'] = merged['timestamp_video'] + merged['timestamp_prob']
merged['event_type'] = merged['event_type'] + merged['problem_type']
merged['arr_value'] = merged['arr_value_video'] + merged['arr_value_prob']

In [178]:
merged

Unnamed: 0_level_0,video_id,event_type,timestamp_video,pass-fail,arr_value_video,problem_id,problem_type,timestamp_prob,submission_number,arr_value_prob,video_length,prob_length,timestamp,arr_value
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1578,"[3, 109, 89, 8, 5, 91, 21, 5, 93, 103, 125, 91...","[Video.Download, Video.Download, Video.Downloa...","[1364293284, 1367410652, 1367049572, 136428962...","[Passed, Passed, Passed, Passed, Passed, Passe...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[12, 20, 7, 17, 20, 12, 12, 12, 6, 12, 14, 6, ...","[Assignment Part, Assignment Part, Assignment ...","[1366038288, 1368552465, 1364898526, 136790945...","[2, 3, 1, 1, 2, 6, 5, 4, 1, 1, 1, 2, 1, 1, 3]","[9.27, 10.0, 10.0, 10.0, 0.0, 10.0, 9.27, 9.27...",285,15,"[1364293284, 1367410652, 1367049572, 136428962...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1584,"[33, 21, 103, 85, 117, 121, 41, 105, 109, 73, ...","[Video.Download, Video.Download, Video.Downloa...","[1384066837, 1384066759, 1367744683, 138406704...","[Failed, Failed, Failed, Failed, Failed, Faile...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[14, 20, 7, 17, 12, 6, 20]","[Assignment Part, Assignment Part, Assignment ...","[1367607437, 1373900387, 1364939197, 136912613...","[1, 1, 1, 1, 1, 1, 2]","[9.55, 9.65, 10.0, 10.0, 10.0, 10.0, 10.0]",143,7,"[1384066837, 1384066759, 1367744683, 138406704...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1587,"[35, 37, 101, 111, 91, 79, 95, 75, 107, 113, 2...","[Video.Download, Video.Download, Video.Downloa...","[1368215102, 1368215106, 1368215223, 136821524...","[Passed, Passed, Passed, Passed, Passed, Passe...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[7, 7, 20, 5, 6, 12, 14, 17, 12, 12]","[Assignment Part, Assignment Part, Assignment ...","[1364959846, 1364960882, 1367994193, 136425592...","[1, 2, 1, 1, 1, 1, 1, 1, 2, 3]","[10.0, 10.0, 10.0, 10.0, 10.0, 7.09, 10.0, 10....",325,10,"[1368215102, 1368215106, 1368215223, 136821524...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1727,"[125, 121, 113, 123, 115, 103, 119, 43, 71, 79...","[Video.Download, Video.Download, Video.Downloa...","[1368864326, 1368864136, 1368311193, 136886344...","[Passed, Passed, Passed, Passed, Passed, Passe...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[14, 17, 12, 7, 6, 5, 12, 12, 7, 7, 6, 6, 17, ...","[Assignment Part, Assignment Part, Assignment ...","[1367043197, 1368525256, 1366594645, 136497833...","[1, 2, 2, 2, 1, 1, 4, 3, 1, 3, 5, 3, 1, 1, 1, ...","[10.0, 10.0, 9.27, 9.98, 6.33, 10.0, 10.0, 9.2...",106,18,"[1368864326, 1368864136, 1368311193, 136886344...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1887,"[8, 4, 51, 5, 8, 43, 41, 37, 35, 3, 27, 53, 35...","[Video.Load, Video.Load, Video.Load, Video.Loa...","[1364273704, 1364878645, 1367165509, 136427404...","[Failed, Failed, Failed, Failed, Failed, Faile...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[12, 6, 6, 12, 7, 12]","[Assignment Part, Assignment Part, Assignment ...","[1367190487, 1366133246, 1366131479, 136718971...","[2, 2, 1, 1, 1, 3]","[7.09, 9.96, 7.63, 7.09, 9.8, 10.0]",16,6,"[1364273704, 1364878645, 1367165509, 136427404...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3187748,,,,,,"[6, 20, 12, 12, 17, 7, 14, 20]","[Assignment Part, Assignment Part, Assignment ...","[1365926675, 1368954488, 1366861742, 136695102...","[1, 2, 1, 2, 1, 1, 1, 1]","[10.0, 10.0, 0.0, 10.0, 10.0, 10.0, 10.0, 10.0]",0,8,,
3196657,,,,,,"[5, 7, 20, 12, 14, 6, 17]","[Assignment Part, Assignment Part, Assignment ...","[1365347414, 1365347332, 1368747396, 136712167...","[1, 1, 1, 1, 1, 1, 1]","[10.0, 9.96, 10.0, 10.0, 10.0, 10.0, 9.98]",0,7,,
3207804,,,,,,"[7, 17, 12, 14, 6, 20, 14, 14, 14, 7, 12, 6, 1...","[Assignment Part, Assignment Part, Assignment ...","[1365451607, 1368206638, 1366308909, 136683457...","[2, 1, 1, 1, 3, 1, 3, 4, 2, 1, 2, 1, 5, 2, 2]","[10.0, 9.98, 7.09, 9.55, 10.0, 10.0, 9.55, 9.5...",0,15,,
3238405,,,,,,"[17, 14, 7, 12, 6, 14, 14]","[Assignment Part, Assignment Part, Assignment ...","[1367611577, 1366409905, 1365712641, 136640334...","[1, 2, 1, 1, 1, 1, 3]","[10.0, 9.55, 10.0, 10.0, 10.0, 9.55, 10.0]",0,7,,


In [173]:
merged[merged.index == 3257038]['video_id'].isnull()

user_id
3257038    True
Name: video_id, dtype: bool

In [162]:
merged['timestamp_video']

user_id
1578       [1364293284, 1367410652, 1367049572, 136428962...
1584       [1384066837, 1384066759, 1367744683, 138406704...
1587       [1368215102, 1368215106, 1368215223, 136821524...
1727       [1368864326, 1368864136, 1368311193, 136886344...
1887       [1364273704, 1364878645, 1367165509, 136427404...
                                 ...                        
3187748                                                  NaN
3196657                                                  NaN
3207804                                                  NaN
3238405                                                  NaN
3257038                                                  NaN
Name: timestamp_video, Length: 7840, dtype: object

In [164]:
len(np.nan)

TypeError: object of type 'float' has no len()

# Temp exploration

In [53]:
list_of_events

['Video.Download',
 'Video.Error',
 'Video.Load',
 'Video.Pause',
 'Video.Play',
 'Video.Seek',
 'Video.SpeedChange',
 'Video.Stalled',
 'Assignment Part']

In [54]:
filename = "progfun-002.csv"
coursetype = "mooc/coursera"
saved_filename = filename.split('.')[0]

marras_feats = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/eq_week-marras_et_al-{saved_filename.replace("-", "_")}/feature_labels.csv')
number_id_mapping = pd.read_csv(f'F:/SeFT4ED_2/data/result/easy-fail/user_id_mapping-{saved_filename.replace("-", "_")}.csv')
hard_fail = marras_feats.merge(number_id_mapping, on='Unnamed: 0', how='inner')['user_id']


meta_dataset = pd.read_csv('F:\SeFT4ED\DATASETS\mooc\metadata.csv')
meta_dataset = meta_dataset[meta_dataset.course_id==saved_filename]

In [58]:
prob_dataset = pd.read_csv(f"F:/SeFT4ED_2/data/course/{coursetype}/problem_event/{filename}")

In [59]:
prob_dataset

Unnamed: 0.1,Unnamed: 0,user_id,problem_id,event_type,timestamp,problem_type,grade,submission_number
0,1,235771,6,Problem.Check,1366220878,Assignment Part,9.33,1
1,7,995529,17,Problem.Check,1368387177,Assignment Part,9.96,3
2,8,2787621,7,Problem.Check,1365562650,Assignment Part,9.58,5
3,12,2485475,12,Problem.Check,1367583295,Assignment Part,9.90,1
4,15,363637,6,Problem.Check,1365720098,Assignment Part,10.00,2
...,...,...,...,...,...,...,...,...
105235,458840,3031116,20,Problem.Check,1368634680,Assignment Part,9.65,1
105236,458843,1661762,6,Problem.Check,1365970799,Assignment Part,10.00,1
105237,458857,2142192,5,Problem.Check,1364393301,Assignment Part,10.00,1
105238,458863,2632602,7,Problem.Check,1365172398,Assignment Part,10.00,1


In [61]:
np.unique(prob_dataset['problem_type'], return_counts=True)

(array(['Assignment Part', 'Quiz'], dtype=object),
 array([105141,     99], dtype=int64))

In [63]:
prob_dataset = prob_dataset.merge(hard_fail, on='user_id', how='inner')

In [64]:
prob_dataset

Unnamed: 0.1,Unnamed: 0,user_id,problem_id,event_type,timestamp,problem_type,grade,submission_number
0,7,995529,17,Problem.Check,1368387177,Assignment Part,9.96,3
1,5275,995529,20,Problem.Check,1368998115,Assignment Part,5.83,1
2,45234,995529,7,Problem.Check,1364822146,Assignment Part,10.00,1
3,46912,995529,14,Problem.Check,1367150025,Assignment Part,9.96,3
4,127011,995529,17,Problem.Check,1368385263,Assignment Part,10.00,2
...,...,...,...,...,...,...,...,...
87609,454700,2201795,7,Problem.Check,1365251620,Assignment Part,9.98,1
87610,397445,2547514,6,Problem.Check,1365683557,Assignment Part,10.00,1
87611,410929,2547514,5,Problem.Check,1364313685,Assignment Part,10.00,1
87612,412969,2547514,7,Problem.Check,1364392132,Assignment Part,10.00,1


In [65]:
np.unique(prob_dataset['problem_type'], return_counts=True)

(array(['Assignment Part'], dtype=object), array([87614], dtype=int64))

In [68]:
lengths = (dataset['video_length'] + dataset['prob_length']).to_numpy()

In [69]:
lengths.sort()

In [70]:
lengths[-500:]

array([  602,   602,   602,   603,   603,   604,   604,   604,   605,
         605,   605,   606,   606,   607,   607,   607,   608,   610,
         611,   611,   611,   611,   612,   612,   612,   613,   613,
         613,   613,   614,   614,   614,   615,   616,   617,   617,
         618,   618,   618,   618,   619,   619,   619,   619,   620,
         622,   622,   622,   624,   624,   625,   625,   625,   625,
         627,   630,   630,   630,   630,   631,   631,   632,   632,
         633,   633,   634,   634,   635,   635,   635,   635,   636,
         636,   637,   637,   637,   638,   639,   640,   640,   642,
         643,   643,   643,   644,   644,   644,   645,   645,   645,
         645,   646,   646,   647,   647,   647,   649,   649,   650,
         651,   651,   654,   654,   654,   654,   655,   655,   655,
         656,   657,   658,   658,   658,   658,   658,   658,   658,
         658,   659,   660,   661,   662,   662,   662,   663,   664,
         664,   664,

In [71]:
len(lengths)

7756

In [77]:
a = dataset['prob_length'].to_numpy()

In [78]:
a.sort()

In [79]:
a[-500:]

array([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10