In [3]:
import pandas as pd
import os
import numpy as ny
import seaborn as sns
from matplotlib import pyplot as plt

# Helper functions

In [22]:
def jaccard(chosen_feats, feats):
    """Return the Jaccard similarity of two feature collections.

    The score is ``|A & B| / |A | B|`` where A and B are the *sets* of items
    in ``chosen_feats`` and ``feats``; a repeated item counts only once.

    Parameters
    ----------
    chosen_feats : iterable of hashable
        Features picked by a respondent.
    feats : iterable of hashable
        Features reported by an explanation method.

    Returns
    -------
    float
        A value between 0.0 (nothing shared) and 1.0 (identical sets).
        Returns 0.0 when both collections are empty instead of dividing by zero.
    """
    chosen_set = set(chosen_feats)
    feat_set = set(feats)

    # Size the union from the sets themselves: adding the raw lengths would
    # over-count whenever either input contains the same feature twice.
    union_size = len(chosen_set | feat_set)
    if union_size == 0:
        return 0.0
    return float(len(chosen_set & feat_set)) / union_size


In [23]:
def row_js(row, dt, course):
    """Compare one respondent's chosen features with the method they agreed with most.

    Parameters
    ----------
    row : mapping (e.g. one row of the survey DataFrame)
        Must provide the columns '1st positive feature <course>',
        '2nd positive feature <course>', '1st negative feature <course>',
        '2nd negative feature <course>' and
        'Which results you agree the most?--<course>'.
    dt : dict
        Holds 'feature_code' (feature label -> integer code) plus one list of
        codes under each '<course>_<method>_pos', '<course>_<method>_neg' and
        '<course>_<method>_top' key.
    course : str
        Course prefix used in both the column names and the ``dt`` keys.

    Returns
    -------
    tuple of float
        ``(pos_d, neg_d, top_d)``: jaccard() of the two positive picks against
        the method's ``pos`` list, of the two negative picks against its
        ``neg`` list, and of all four picks against its ``top`` list.
        jaccard() is a similarity, so larger values mean closer agreement
        despite the ``_d`` naming.

    Raises
    ------
    KeyError
        If a required column is missing from ``row`` or the chosen method has
        no matching entry in ``dt``.
    """
    feature_code = dt['feature_code']

    def _coded_picks(polarity):
        # Labels missing from the code table are kept unchanged, so they can
        # never match any of the method's integer codes.
        picks = [
            row['{} {} feature {}'.format(rank, polarity, course)]
            for rank in ('1st', '2nd')
        ]
        return [feature_code.get(item, item) for item in picks]

    pos_list = _coded_picks('positive')
    neg_list = _coded_picks('negative')
    selected_list = pos_list + neg_list

    # Normalise the free-text method name: ignore case and surrounding
    # whitespace, and map the survey's 'KernelSHAP' onto the 'shap' keys.
    selected_md = str(row['Which results you agree the most?--{}'.format(course)])
    selected_md = selected_md.strip().lower()
    if selected_md == 'kernelshap':
        selected_md = 'shap'

    pos_d = jaccard(pos_list, dt['{}_{}_pos'.format(course, selected_md)])
    neg_d = jaccard(neg_list, dt['{}_{}_neg'.format(course, selected_md)])
    top_d = jaccard(selected_list, dt['{}_{}_top'.format(course, selected_md)])

    return pos_d, neg_d, top_d

# Active Learning

In [24]:
# Feature coding for the Active Learning study.
#
#   code | long feature name                          | short label (used in the survey)
#   -----+--------------------------------------------+---------------------------------
#      0 | time_in__video_sum                         | total-time-vid
#      1 | frequency_action_Video.Pause               | freq-pause-vid
#      2 | number_submissions_distinct                | distinct-probs-quiz
#      3 | obs_duration_problem_var                   | std-correct-time-quiz
#      4 | competency_anticipation                    | eager-view-quiz
#      5 | problem.check-problem.check-problem.check  | check-check-check-quiz
#      6 | time_sessions_std                          | std-time-session
#      7 | ratio_clicks_weekend_day                   | ratio-clicks-weekend

# Feature codes per course -> explanation method -> list kind (pos / neg / top).
_AL_METHOD_FEATURES = {
    'AN': {
        'lime': {'pos': [2, 4, 7], 'neg': [1, 3, 6], 'top': [4, 6, 7]},
        'cem': {'pos': [0, 2, 4, 6, 7], 'neg': [1, 3], 'top': [2, 3, 4]},
        'shap': {'pos': [0, 3, 5, 6], 'neg': [1, 2, 4], 'top': [0, 2, 6]},
    },
    'Geom': {
        'lime': {'pos': [2, 4, 6, 7], 'neg': [5], 'top': [2, 5, 7]},
        'cem': {'pos': [2, 3, 5, 6, 7], 'neg': [0], 'top': [0, 2, 7]},
        'shap': {'pos': [1, 6], 'neg': [0, 2, 3, 4, 7], 'top': [1, 2, 6]},
    },
}

# Flatten the table into the '<course>_<method>_<kind>' keys that row_js looks up.
AL_mds_dict = {
    '{}_{}_{}'.format(course, method, kind): codes
    for course, by_method in _AL_METHOD_FEATURES.items()
    for method, by_kind in by_method.items()
    for kind, codes in by_kind.items()
}

# Short label -> integer code; a label's position in the list is its code.
AL_mds_dict['feature_code'] = {
    label: code
    for code, label in enumerate([
        'total-time-vid',
        'freq-pause-vid',
        'distinct-probs-quiz',
        'std-correct-time-quiz',
        'eager-view-quiz',
        'check-check-check-quiz',
        'std-time-session',
        'ratio-clicks-weekend',
    ])
}

In [32]:
# Load the raw expert-survey answers for the Active Learning study and keep
# only the first 11 rows.
# NOTE(review): the Modality survey keeps 10 rows; confirm 11 is intended here.
# .copy() makes the slice an independent frame, so the column assignments in
# later cells do not write into a view of the loaded data
# (avoids pandas' SettingWithCopyWarning).
AL_survey = (
    pd.read_excel('experts_survey--AL.xlsx', sheet_name='Original data')
    .iloc[:11]
    .copy()
)

In [33]:
# For each course, score every respondent with row_js and store the result
# twice: as the raw tuple in '<course>_distances' and unpacked into
# '<course>_pos_d', '<course>_neg_d' and '<course>_top_d'.
for course in ('AN', 'Geom'):
    tuple_col = '{}_distances'.format(course)
    score_cols = ['{}_{}_d'.format(course, part) for part in ('pos', 'neg', 'top')]

    AL_survey[tuple_col] = AL_survey.apply(row_js, args=(AL_mds_dict, course), axis=1)
    AL_survey[score_cols] = pd.DataFrame(
        AL_survey[tuple_col].tolist(),
        index=AL_survey.index,
    )

AL_survey

Unnamed: 0,时间戳记,Name,Gender,Age,Nationality,Title,Institution(s),Teaching background,MOOC experience,1st positive feature AN,...,label-redistribute problems_V,label-no suggestion_V,AN_distances,AN_pos_d,AN_neg_d,AN_top_d,Geom_distances,Geom_pos_d,Geom_neg_d,Geom_top_d
0,2022-08-24 10:56:50.470000,Jibril,Male,28.0,France,Post doc,EPFL,"Programming languages eg. C, Java\nprobability...",Work with MOOC data.,total-time-vid,...,0.0,0.0,"(0.4, 0.3333333333333333, 0.4)",0.4,0.333333,0.4,"(0.2, 0.0, 0.16666666666666666)",0.2,0.0,0.166667
1,2022-08-25 17:41:19.044000,Mirko,Male,30.0,Italy,professor,University of Cagliari (Italy),programming for undergrad (100+),work with MOOC data\nparticipanted as a student,std-correct-time-quiz,...,0.0,0.0,"(0.5, 0.0, 0.4)",0.5,0.0,0.4,"(0.2, 0.0, 0.16666666666666666)",0.2,0.0,0.166667
2,2022-08-26 22:20:52.972000,Suraj Rampure,Male,23.0,,Lecturer,UCSD,"4 yrs TA, 100 students\n2 yrs instructor, intr...","tried to adapted a few platforms, but not formal",total-time-vid,...,0.0,0.0,"(0.4, 0.0, 0.16666666666666666)",0.4,0.0,0.166667,"(0.0, 0.0, 0.16666666666666666)",0.0,0.0,0.166667
3,2022-08-26 23:05:59.992000,Kevin Lin,Male,26.0,,Professor,UW Seattle,"undergrad data stucture, data science and intr...",I haven't really ever taken a full course. I'v...,eager-view-quiz,...,0.0,0.0,"(0.4, 0.3333333333333333, 0.4)",0.4,0.333333,0.4,"(0.16666666666666666, 0.5, 0.4)",0.166667,0.5,0.4
4,2022-08-29 16:36:03.942000,Paolo Prandoni,Male,53.0,,Lecturer,EPFL,"Computers and Music, signal processing DSP (11...",150k students that have taken DSP moocs (start...,std-correct-time-quiz,...,1.0,0.0,"(0.2, 0.0, 0.16666666666666666)",0.2,0.0,0.166667,"(0.5, 0.5, 0.4)",0.5,0.5,0.4
5,2022-08-29 22:26:25.112000,anna rafferty,Female,37.0,,Professor,Carleton College (US),undergrad courses from intro to elective middl...,tried some but never finish them,distinct-probs-quiz,...,1.0,0.0,"(0.2, 0.25, 0.75)",0.2,0.25,0.75,"(0.2, 0.0, 0.4)",0.2,0.0,0.4
6,2022-08-30 15:59:07.428000,Ryan Baker,Male,44.0,,Professor,University of Pennsylvania,"small independent class, \nsmall seminar,\nlar...","open course, big data and education, 2013 100,...",distinct-probs-quiz,...,0.0,1.0,"(0.2, 0.25, 0.4)",0.2,0.25,0.4,"(0.16666666666666666, 0.5, 0.4)",0.166667,0.5,0.4
7,2022-08-30 17:02:26.668000,Martin Hlosta,Male,36.0,,Senior Researcher,FFHS (CH),2010-2012 40 students face-to-face university ...,Leading lecturer of the MOOC,std-time-session,...,0.0,0.0,"(0.5, 0.25, 0.4)",0.5,0.25,0.4,"(0.2, 0.0, 0.0)",0.2,0.0,0.0
8,2022-08-30 21:02:54.755000,Fahad Kamran,Male,26.0,,PhD,U of Michigan,"GSI of undergrad (500-1000, \nintro computer s...",never taken any;\nwork with a little bit in un...,total-time-vid,...,1.0,0.0,"(0.4, 0.0, 0.4)",0.4,0.0,0.4,"(0.2, 0.0, 0.16666666666666666)",0.2,0.0,0.166667
9,2022-09-12 15:03:04.645000,Maria Salamo,Female,49.0,Spanish,Professor,University of Barcelona,"20+ 30+ yrs teaching experience \nprogramming,...",taught online couple of years ago,std-time-session,...,0.0,0.0,"(0.5, 0.25, 0.16666666666666666)",0.5,0.25,0.166667,"(0.2, 0.5, 0.4)",0.2,0.5,0.4


In [36]:
# Write the Active Learning survey frame to 'AL_distance.csv' in the working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
AL_survey.to_csv('AL_distance.csv')

# Modality

In [28]:
# Feature coding for the Modality study.
#
#   code | long feature name                          | short label (used in the survey)
#   -----+--------------------------------------------+---------------------------------
#      0 | video.play-video.stop-video.play           | play-stop-play-vid
#      1 | SeekLength-mean                            | avg-len-seek-vid
#      2 | WatchingRatio                              | watch-ratio-vid
#      3 | SpeedPlayback-mean                         | speed-vid
#      4 | StudentWeeklyActiveness                    | active-participation-weekly-vid
#      5 | FrequencyEvent-video.play-played           | freq-play-vid
#      6 | NumberSubmissions-avg                      | num-submit-quiz
#      7 | NumberSubmissions-distinct                 | distinct-probs-quiz
#      8 | ObsDurationProblem-max                     | correct-time-quiz
#      9 | problem.check-problem.check-problem.check  | check-check-check-quiz
#     10 | RegPeriodicity-m1                          | hourly-freq-regular

# Feature codes per course -> explanation method -> list kind (pos / neg / top).
_MD_METHOD_FEATURES = {
    'Flip': {
        'lime': {'pos': [0, 1, 2, 3], 'neg': [4, 8, 9], 'top': [0, 8, 9]},
        'cem': {'pos': [0, 3, 8, 9], 'neg': [1], 'top': [0, 1, 9]},
        'shap': {'pos': [1, 3, 4, 8, 9, 10], 'neg': [0, 2, 5], 'top': [5, 10, 9]},
    },
    'MOOC': {
        'lime': {'pos': [2, 7], 'neg': [6], 'top': [2, 6, 7]},
        'cem': {'pos': [2, 5, 6, 7, 8, 10], 'neg': [3, 4, 9], 'top': [2, 3, 7]},
        'shap': {'pos': [3, 8], 'neg': [2, 4, 6, 7, 9, 10], 'top': [7, 4, 8]},
    },
}

# Flatten the table into the '<course>_<method>_<kind>' keys that row_js looks up.
MD_mds_dict = {
    '{}_{}_{}'.format(course, method, kind): codes
    for course, by_method in _MD_METHOD_FEATURES.items()
    for method, by_kind in by_method.items()
    for kind, codes in by_kind.items()
}

# Short label -> integer code; a label's position in the list is its code.
MD_mds_dict['feature_code'] = {
    label: code
    for code, label in enumerate([
        'play-stop-play-vid',
        'avg-len-seek-vid',
        'watch-ratio-vid',
        'speed-vid',
        'active-participation-weekly-vid',
        'freq-play-vid',
        'num-submit-quiz',
        'distinct-probs-quiz',
        'correct-time-quiz',
        'check-check-check-quiz',
        'hourly-freq-regular',
    ])
}

In [31]:
# Load the raw expert-survey answers for the Modality study and keep only the
# first 10 rows.
# .copy() makes the slice an independent frame, so the column assignments in
# later cells do not write into a view of the loaded data
# (avoids pandas' SettingWithCopyWarning).
MD_survey = (
    pd.read_excel('experts_survey--MD.xlsx', sheet_name='Original')
    .iloc[:10]
    .copy()
)

In [35]:
# For each course, score every respondent with row_js and store the result
# twice: as the raw tuple in '<course>_distances' and unpacked into
# '<course>_pos_d', '<course>_neg_d' and '<course>_top_d'.
for course in ('Flip', 'MOOC'):
    tuple_col = '{}_distances'.format(course)
    score_cols = ['{}_{}_d'.format(course, part) for part in ('pos', 'neg', 'top')]

    MD_survey[tuple_col] = MD_survey.apply(row_js, args=(MD_mds_dict, course), axis=1)
    MD_survey[score_cols] = pd.DataFrame(
        MD_survey[tuple_col].tolist(),
        index=MD_survey.index,
    )

MD_survey

Unnamed: 0,时间戳记,Name,Gender,Age,Nationality,Title,Institution(s),Teaching background,MOOC experiences,1st positive feature Flip,...,skye suggestion labels,vini labels,Flip_distances,Flip_pos_d,Flip_neg_d,Flip_top_d,MOOC_distances,MOOC_pos_d,MOOC_neg_d,MOOC_top_d
0,2022-09-05 09:27:09.091000,Veronika Bogina,Female,42.0,,Researcher,Hafia University (Israel),"Soft Engineering (100), logic for master and a...","Couresara, take courses and mentor for NLP",correct-time-quiz,...,rich content\nshort videos\nredistribution quizzs,1. pause videos / shorter videos\n4. modify cu...,"(0.3333333333333333, 0.0, 0.16666666666666666)",0.333333,0.0,0.166667,"(0.3333333333333333, 0.3333333333333333, 0.4)",0.333333,0.333333,0.4
1,2022-09-05 10:41:13.016000,Flammarion Nicolas,Male,32.0,,Professor,EPFL,"ML (500), OPT ML (200) master for 3 years",No,active-participation-weekly-vid,...,quizzs design\nredistribution quizzs,2. quizzes as part of videos\n4. modify curric...,"(0.14285714285714285, 0.0, 0.16666666666666666)",0.142857,0.0,0.166667,"(0.0, 0.14285714285714285, 0.75)",0.0,0.142857,0.75
2,2022-09-05 22:40:51.959000,Sam,Male,26.0,,Lecturer,UCSD,"intro to CS 100 students, \nintro data science...",take a bunch,watch-ratio-vid,...,quizzs design,4. modify curriculum\n5. redistribute problems...,"(0.14285714285714285, 0.25, 0.0)",0.142857,0.25,0.0,"(0.3333333333333333, 0.0, 0.16666666666666666)",0.333333,0.0,0.166667
3,2022-09-06 08:00:25.187000,Ashish Aggarwal,Male,30.0,,Professor,Unversity of Florida,intro programming courses for engineering stud...,flipped class \nsome online class\ndeveloped r...,active-participation-weekly-vid,...,rich content\nshort videos\nredistribution quizzs,1. pause videos / shorter videos\n2. quizzes a...,"(0.0, 0.25, 0.16666666666666666)",0.0,0.25,0.166667,"(0.3333333333333333, 0.3333333333333333, 0.4)",0.333333,0.333333,0.4
4,2022-09-06 10:03:32.442000,Stefano Faralli,Male,47.0,,Professor,Sapienza University of Rome,4.5 yrs teaching at university\nteacher remote...,teaching online class,watch-ratio-vid,...,,6. No suggestions,"(0.14285714285714285, 0.0, 0.0)",0.142857,0.0,0.0,"(0.3333333333333333, 0.25, 0.16666666666666666)",0.333333,0.25,0.166667
5,2022-09-06 11:05:08.759000,Viktor Kuncak,Male,45.0,,Professor,EPFL,"undergrad and master, programming languages, f...",function programming for 2nd year is a MOOC,avg-len-seek-vid,...,change grading principle,4. modify curriculum\n5. redistribute problems...,"(0.14285714285714285, 0.25, 0.16666666666666666)",0.142857,0.25,0.166667,"(0.0, 0.25, 0.4)",0.0,0.25,0.4
6,2022-09-06 16:01:08.041000,Guilherme Ramos,Male,33.0,,Professor,"Instituto Superior Técnico, Universidade de Li...","computer science, programming, undergrad, 30-4...","during covid teaching through Zoom, google pla...",speed-vid,...,teacher improvement,3. reminders\n\n,"(0.14285714285714285, 0.25, 0.16666666666666666)",0.142857,0.25,0.166667,"(0.3333333333333333, 0.0, 0.4)",0.333333,0.0,0.4
7,2022-09-07 09:42:42.014000,Moriah Ariely,Female,41.0,Israel,Senior Intern,Weizmann Institute of Science,Teach teachers for 6 yrs \nbiology high school...,as participant,active-participation-weekly-vid,...,no idea,6. No suggestions,"(0.3333333333333333, 0.0, 0.4)",0.333333,0.0,0.4,"(0.0, 0.5, 0.16666666666666666)",0.0,0.5,0.166667
8,2022-09-13 10:32:56.181000,Danilo Dessi,Male,30.0,Italy,Assistant Professor,University of Cagliari (Italy) / University of...,"computer science, bachelor level",no experience,hourly-freq-regular,...,rich content,4. modify curriculum,"(0.3333333333333333, 0.6666666666666666, 0.4)",0.333333,0.666667,0.4,"(0.14285714285714285, 0.0, 0.4)",0.142857,0.0,0.4
9,2022-09-14 15:39:24.880000,Giora,Male,47.0,Israel,Professor,Weizmann Institute of Science,"computing science to high school, undergrad, \...","MIT, intro to physics, 2014-2016\npost doc wor...",num-submit-quiz,...,add deadlines,3. reminders\n5. redistribute problems\nacross...,"(0.0, 0.0, 0.0)",0.0,0.0,0.0,"(0.0, 0.5, 0.16666666666666666)",0.0,0.5,0.166667


In [37]:
# Write the Modality survey frame to 'MD_distance.csv' in the working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
MD_survey.to_csv('MD_distance.csv')