In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA


In [None]:
# Switch for the loading cells below: when True, importances are min-max scaled
# per column and then rescaled so that every row sums to 1.
normalization=True

In [None]:
# Gather the feature names of every feature set into one flat array.
feature_types = [ "boroujeni_et_al", "chen_cui", "marras_et_al", "lalle_conati"]
feature_names = []

for feature_type in feature_types:
    filepath = '../ex-epfl-mooc/scripts/feature_names/' + feature_type + '.csv'
    # The file is read without a header row and flattened to one dimension.
    feature_type_name = pd.read_csv(filepath, header=None).values.reshape(-1)
    print(feature_type_name.shape)
    feature_names.append(feature_type_name)

feature_names = np.concatenate(feature_names)
feature_names

In [None]:
# Create normalized LIME results: one CSV file and one (course, frame) entry per course.
folder = "../ex-epfl-mooc/uniform_eq_results"
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
lime_normalized = []
for course in courses:
    extract_file = folder + "/LIME/" + course + "/dataframes/all_important_features.csv"
    # Read the file once; `df` keeps the raw frame including the id/label columns.
    df = pd.read_csv(extract_file, header=0)
    importances = df.drop(columns=['exp number', 'real value'])
    if normalization:
        # Min-max scale every column, then rescale every row to sum to 1.
        # A column whose values are all equal turns into NaN here (0/0).
        normalized_df = (importances - importances.min()) / (importances.max() - importances.min())
        normalized_df = normalized_df.div(normalized_df.sum(axis=1), axis=0)
    else:
        normalized_df = importances
    # Re-attach the columns that were kept out of the normalization.
    normalized_df['exp number'] = df['exp number']
    normalized_df['real value'] = df['real value']
    normalized_df.to_csv('normalized_LIME_' + course +".csv")
    lime_normalized.append((course,normalized_df))

In [None]:
# Normalized Permutation SHAP results: one CSV file and one (course, frame) entry per course.
folder = "../ex-epfl-mooc/uniform_eq_results"
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
perm_shap_normalized = []
for course in courses:
    extract_file = folder + "/SHAP/Permutation/" + course + ".csv"
    # Read the file once; `df` keeps the raw frame including the id columns.
    df = pd.read_csv(extract_file, header=0)
    importances = df.drop(columns=['Unnamed: 0', 'exp_num'])
    if normalization:
        # Min-max scale every column, then rescale every row to sum to 1.
        # A column whose values are all equal turns into NaN here (0/0).
        normalized_df = (importances - importances.min()) / (importances.max() - importances.min())
        normalized_df = normalized_df.div(normalized_df.sum(axis=1), axis=0)
    else:
        normalized_df = importances
    # Re-attach the explanation number under the name the other cells expect.
    normalized_df['exp number'] = df['exp_num']
    # The counterfactual cell reads this file back to get the explanation numbers.
    normalized_df.to_csv('normalized_SHAP_' + course +".csv")
    perm_shap_normalized.append((course,normalized_df))

In [None]:
# Counterfactuals: gather the per-instance importance files of every course,
# normalize them and tag each row with the explanation number of its file.
# NOTE(review): later cells drop 'exp_num' and 'Unnamed: 0' from these frames,
# which suggests the instance files carry those id columns and that they are
# normalized together with the features here — confirm this is intended.
folder = "../ex-epfl-mooc/uniform_eq_results"
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
counter_normalized = []
for course in courses:
    # The explanation numbers are taken from the normalized SHAP file of the course.
    shap_df = pd.read_csv('normalized_SHAP_' + course +".csv", header=0)
    cf_list = []
    exp_numbers = []  # explanation number of every concatenated row, in row order
    for i in shap_df['exp number']:
        extract_file = folder + "/Counterfactuals/" + course + "/feature_importances_" + str(i) + ".csv"
        try:
            instance = pd.read_csv(extract_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable file for this instance: keep an empty placeholder.
            instance = pd.DataFrame()
        cf_list.append(instance)
        exp_numbers.extend([i] * len(instance))
    if len(cf_list) > 1:
        df = pd.concat(cf_list, axis=0)
        if normalization:
            # Min-max scale every column, then rescale every row to sum to 1.
            normalized_df=(df-df.min())/(df.max()-df.min())
            normalized_df = normalized_df.div(normalized_df.sum(axis=1), axis=0)
        else:
            normalized_df = df
        # Assign by position: the concatenated index restarts for every file, so
        # aligning a Series on it would not pair rows with their own number.
        normalized_df['exp number'] = exp_numbers
        normalized_df.to_csv('normalized_Counterfactuals_' + course +".csv")
        counter_normalized.append((course, normalized_df))

In [None]:
# CEM: one normalized CSV file and one (course, frame) entry per course.
folder = "../ex-epfl-mooc/uniform_eq_results"
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
cem_normalized = []
for course in courses:
    extract_file = folder + "/CEM/" + course + "/importances.csv"
    # Read the file once; `df` keeps the raw frame including the id columns.
    df = pd.read_csv(extract_file, header=0)
    importances = df.drop(columns=['Unnamed: 0', 'exp_num'])
    if normalization:
        # Min-max scale every column, then rescale every row to sum to 1.
        # A column whose values are all equal turns into NaN here (0/0).
        normalized_df = (importances - importances.min()) / (importances.max() - importances.min())
        normalized_df = normalized_df.div(normalized_df.sum(axis=1), axis=0)
    else:
        normalized_df = importances
    # Re-attach the explanation number under the name the other cells expect.
    normalized_df['exp number'] = df['exp_num']
    normalized_df.to_csv('normalized_CEM_' + course +".csv")
    cem_normalized.append((course,normalized_df))

In [None]:
# Normalized Kernel SHAP results: one CSV file and one (course, frame) entry per
# course. Courses without a readable Kernel SHAP file get an empty frame.
folder = "../ex-epfl-mooc/uniform_eq_results"
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
kernel_shap_normalized = []
for course in courses:
    extract_file = folder + "/SHAP/Kernel/" + course + ".csv"
    try:
        # Read the file once; `df` keeps the raw frame including the id columns.
        df = pd.read_csv(extract_file, header=0)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        kernel_shap_normalized.append((course, pd.DataFrame()))
        continue
    importances = df.drop(columns=['Unnamed: 0', 'exp_num'])
    if normalization:
        # Min-max scale every column, then rescale every row to sum to 1.
        # A column whose values are all equal turns into NaN here (0/0).
        normalized_df = (importances - importances.min()) / (importances.max() - importances.min())
        normalized_df = normalized_df.div(normalized_df.sum(axis=1), axis=0)
    else:
        normalized_df = importances
    normalized_df['exp number'] = df['exp_num']
    # Written under its own name: 'normalized_SHAP_<course>.csv' already holds the
    # Permutation SHAP output and must not be overwritten.
    normalized_df.to_csv('normalized_KernelSHAP_' + course +".csv")
    kernel_shap_normalized.append((course,normalized_df))

In [None]:
# Course identifiers; kept identical to the `courses` lists used by the other cells.
course_names = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']

In [None]:
# One list per explanation method; the loop below appends one `fit_pca` result per course.
cem_vals, perm_shap_vals, lime_vals, dice_vals, kernel_shap_vals = [], [], [], [], []

def fit_pca(pca, df):
    """Fit ``pca`` on the numeric columns of ``df`` and return its singular values.

    Args:
        pca: estimator exposing ``fit(X)`` and, once fitted, ``singular_values_``.
            It is refitted in place on every call.
        df: DataFrame to fit on. Non-numeric columns are ignored and missing
            values are replaced by 0 before fitting.

    Returns:
        ``pca.singular_values_`` after fitting, or an empty list when ``df``
        has no rows.
    """
    if len(df) < 1:
        return []
    numeric = df.select_dtypes(['number']).fillna(0)
    pca.fit(numeric)
    return pca.singular_values_

# Two-component PCA, refitted for every (course, method) table in `all_points`
# (`all_points` is built by a cell further down in this notebook).
# An earlier variant fitted on the `*_normalized` frames directly.
pca = PCA(n_components=2)

pca_buckets = (
    ('CEM', cem_vals),
    ('KernelSHAP', kernel_shap_vals),
    ('PermSHAP', perm_shap_vals),
    ('LIME', lime_vals),
    ('DiCE', dice_vals),
)
for i, course in enumerate(courses):
    for method_label, bucket in pca_buckets:
        bucket.append(fit_pca(pca, all_points[(course, method_label)]))


In [None]:
# Flatten the per-method lists into one list, method after method
# (CEM, KernelSHAP, PermSHAP, LIME, DiCE), keeping the course order inside each.
vals = [cem_vals, kernel_shap_vals, perm_shap_vals, lime_vals, dice_vals]
total_list = [course_vals for method_vals in vals for course_vals in method_vals]

In [None]:
# Inspect the values collected for DiCE (one `fit_pca` result per course).
dice_vals

In [None]:
# One row per entry of `total_list`, holding the two singular values.
# Note: this rebinds `vals` from a list of lists to a DataFrame.
vals = pd.DataFrame(total_list, columns=['pca1', 'pca2'])

In [None]:
# Inspect the `vals` table.
vals

In [None]:
# Label every row of `vals`. Rows are grouped by method (CEM, KernelSHAP,
# PermSHAP, LIME, DiCE) and, inside each group, follow the order of `courses`.
pca_dim = 5  # number of rows each method contributes
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
method_keys = ['cem', 'kernel_shap', 'perm_shap', 'lime', 'dice']
method_pretty = ['CEM', 'KernelSHAP', 'PermSHAP', 'LIME', 'DiCE']
total_names = [key for key in method_keys for _ in range(pca_dim)]
vals['method'] = total_names
vals['course'] = courses * 5
vals['method_pretty'] = [label for label in method_pretty for _ in range(pca_dim)]

In [None]:
# Inspect the columns of whichever frame `df` currently refers to (many cells rebind it).
df.columns

In [None]:
# Add one more list of feature names to `all_cols`.
# NOTE(review): `all_cols` is not created in the visible part of this notebook
# before this call — confirm where it is meant to be initialised.
all_cols.append(['AvgReplayedWeeklyProp',
 'AvgTimeSessions',
 'CompetencyAlignment',
 'CompetencyAnticipation',
 'CompetencyStrength',
 'ContentAlignment',
 'ContentAnticipation',
 'DelayLecture',
 'RatioClicksWeekendDay',
 'RegPeakTimeDayHour',
 'RegPeriodicityM1',
 'StdTimeSessions',
 'StudentShape',
 'StudentSpeed',
 'TotalTimeProblem',
 'TotalTimeVideo'])

In [None]:
# Flatten the collected name lists into a sorted list of unique names.
# A list (rather than a set) gives a deterministic column order and can be
# used directly as a column selector (`d[all_cols]`).
all_cols = sorted(set(np.concatenate(all_cols)))

In [None]:
# Inspect the column labels of `d` (`d` is assigned inside the heatmap loop of the next cell).
d.columns.values

In [None]:
# Heatmap of the top-ten feature scores per week, one figure per explanation
# method, for a single course. `mapping`, `all_cols` and the `*_normalized`
# lists must already exist (`mapping` is defined in a later cell of this notebook).
# For LIME, df is the explanations of every instance stacked together; features
# that are not present in an instance have NaN as their value.
course_index = 0
num_weeks = 10
course = course_names[course_index]
exp_sets = [
    ('LIME', lime_normalized),
    ('KernelSHAP', kernel_shap_normalized),
    ('PermSHAP', perm_shap_normalized),
    ('CEM', cem_normalized),
    ('DiCE', counter_normalized)
]
for method, overall_df in exp_sets:
    df = overall_df[course_index][1]
    # Remove bookkeeping columns when present.
    if 'exp number' in df.columns:
        df = df.drop(['exp number'], axis=1)

    if 'exp_num' in df.columns:
        df = df.drop(['exp_num'], axis=1)

    if 'real value' in df.columns:
        df = df.drop(['real value'], axis=1)

    df = abs(df)
    if "LIME" not in method:
        # Column positions of every row, ordered by ascending importance.
        sorted_values = np.argsort(df.to_numpy(), axis=1)
        # All positions except the ten largest of each row.
        ind = sorted_values[:, :df.shape[1]-10]
        for i in np.arange(df.shape[0]):
            df.iloc[i, ind[i]] = np.nan  # features outside the top ten are replaced with NaN
        df = df.iloc[:,~((df.isnull().sum(axis=0)==df.shape[0]).values)]  # eliminate features that are NaN everywhere

    ai = np.argsort(df.values)  # NaNs come after numbers
    for j,c in enumerate(list(ai[:,:10])):  # change feature importances to scores 1..10
        df.iloc[j,c] = np.arange(1,11)
    top_features = df.columns
    # Split every column label into its feature part and its week number.
    top_features_type = np.array([s[0:s.find('_InWeek')].split(' ')[-1] for s in top_features])
    top_features_week = np.array([s[s.find('_InWeek')+7::].split(' ')[0] for s in top_features])
    count = np.nansum(df.values,axis=0)  # total score per column, ignoring NaNs
    top_features_type_unique = list({ k for k in top_features_type })
    top_features_week_unique = list({ k for k in top_features_week })
    ######
    # Weeks x feature types table of accumulated scores.
    zero_data = np.zeros(shape=(num_weeks,len(top_features_type_unique)))
    d = pd.DataFrame(zero_data, columns=top_features_type_unique)
    for i,f in enumerate(top_features_type):
        # Single .loc assignment; chained `d[f][row] += ...` may write to a copy.
        d.loc[int(top_features_week[i])-1, f] += count[i]
    d = d/(df.shape[0]*10)
    d = d.iloc[:,((d>=0.333333).sum(axis=0)>0).values]  # cutting criteria

    col_rename = {}
    for col in df.columns:
        col_rename[col.split('_InWeek')[0]] = mapping(col.split('_InWeek')[0])
    d = d.rename(columns=col_rename)

    # Make sure every name in `all_cols` has a column, filled with zeros when absent.
    for col in all_cols:
        if col not in d.columns:
            d.insert(0, col, np.zeros(len(d)))

    # list(...) so that `all_cols` may be a set or a list.
    d = d[list(all_cols)]
    ######
    fig, ax = plt.subplots(figsize=(10, 4),facecolor='white')
    cmap = sns.light_palette("purple",n_colors=20)
    d[d.values<0.01]=np.nan
    annot = np.vectorize(lambda x: '' if x<0.01 else str(round(x,2)))(d.T.to_numpy())
    heatmap_font = 12
    g = sns.heatmap(d.values.T, annot=annot, fmt="", vmin=0, vmax=1, annot_kws={"fontsize":heatmap_font, "weight":'bold'},cmap=cmap, square=True)
    g.set_xticklabels(np.arange(1,num_weeks+1),rotation=0)
    g.set_yticklabels(d.columns.values,rotation=0)
#     border = 0.5
#     g.axhline(y = 0, color='k',linewidth = border)
#     g.axhline(y = d.shape[1], color = 'k',
#                     linewidth = border)
#     g.axvline(x = 0, color = 'k',
#                     linewidth = border)
#     g.axvline(x = d.shape[0],
#                     color = 'k', linewidth = border)
    g.set_title( 'Important features heatmap '+ course + " " +  method, fontsize=15)
    plt.savefig('plots/important_features_' + course + "_" + method + ".svg",  bbox_inches = 'tight', facecolor=fig.get_facecolor())

In [None]:
# Columns of the last table `d` built by the heatmap loop.
d.columns

In [None]:
# Inspect `vals_df`; in the visible notebook it is only assigned by the plotting cells further down.
vals_df

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from matplotlib import rc
import matplotlib
# Start from matplotlib's default style, then switch all text to LaTeX rendering.
matplotlib.rc_file_defaults()

rc('font', **{'family': 'serif', 'serif': ['Computer Modern Bold']})
rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{amsmath}\usepackage{amssymb}')

matplotlib.rcParams['text.usetex'] = True
# Display names of the courses, in the same order as `courses`.
course_names = ['DSP 1', 'DSP 2', 'Geomatique', 'Villes Africaines', 'Microcontroleurs']

# One scatter plot per course: the five methods plotted by their two singular values.
for course_index in np.arange(5):
    course = courses[course_index]
    vals_df = vals[vals['course'] == course]
    x = vals_df['pca1']
    y = vals_df['pca2']
    t = np.arange(5)
    plt.figure()
    plt.scatter(x, y, c=t, cmap="rainbow", marker='*')
    for i, txt in enumerate(vals_df['method_pretty']):
        # Row label inside `vals` of method i for this course.
        row = i*5 + course_index
        if 'Kernel' in txt:
            dx, dy = -0.4, 0.03
        elif 'LIME' in txt:
            dx, dy = 0.05, -0.07
        else:
            dx, dy = 0.05, 0
        plt.annotate(txt, (x[row] + dx, y[row] + dy))
    plt.title('PCA Comparison of Feature Importances - '+ course_names[course_index] + '\n')
    plt.xlim(0, 2.6)
    plt.ylim(0, 1.25)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.savefig("plots/" + course + "_PCA_analysis.svg")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Scatter plot of the singular values averaged over courses, one point per method.
course = courses[course_index]
# Average only the two numeric columns; the string columns cannot be averaged.
vals_df = vals.groupby('method_pretty')[['pca1', 'pca2']].mean()
x=vals_df['pca1']
y=vals_df['pca2']
t=np.arange(5)
plt.scatter(x, y, c=t, cmap=cm.plasma)
for i, txt in enumerate(vals_df.index):
    # Positional access: the Series are indexed by method name, not by integer.
    xi, yi = x.iloc[i], y.iloc[i]
    if xi > 1.2:
        plt.annotate(txt, (xi-0.25, yi))
    elif 'Kernel' in txt:
        plt.annotate(txt, (xi-0.01, yi+0.05))
    else:
        plt.annotate(txt, (xi+0.05, yi))
plt.xlim(0, 2.6)
plt.ylim(0, 1.25)
plt.title('Averaged PCA Analysis of Explainability Method Feature Importances \n')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.savefig('plots/averaged_pca_analysis.png')

In [None]:
# Scatter the singular values of every course in one figure (one marker style
# per course) and label the five methods once, next to the points of the
# second course.
vals_methods = vals
markers = ['*', '.', '+', '1', 'd']
# (substring of the method label, position of that method inside a course
#  subset, x offset, y offset) for the text annotations.
label_specs = [
    ('LIME', 3, -0.4, 0.07),
    ('CEM', 0, -0.1, 0.08),
    ('DiCE', 4, -0.25, -0.08),
    ('Kernel', 1, -0.1, 0.08),
    ('Perm', 2, 0.18, -0.05),
]
for j,course in enumerate(courses):
    subset = vals_methods[vals_methods['course'] == course]
    x=np.array(subset['pca1'])
    y=np.array(subset['pca2'])
    t=np.arange(5)
    plt.scatter(x, y, c=t, cmap='rainbow', marker=markers[j])
    if j == 1:
        # Every fifth row of `vals` starts the block of a new method.
        for txt in vals_methods['method_pretty'].iloc[::5]:
            for key, pos, dx, dy in label_specs:
                if key in txt:
                    plt.annotate(txt, (x[pos] + dx, y[pos] + dy))
                    break
# Decorate and save once, after every course has been drawn.
plt.title('All Courses - PCA Analysis of Explainability Method Feature Importances\n')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.savefig('plots/all_courses_pca.svg')

### Quantitative Metrics: Frobenius Norm + Cosine Distance

In [None]:
# Shape of `exp_sets` when coerced to an array.
np.array(exp_sets).shape

In [None]:
# Mean importance per feature and week for the first course, one heatmap per method.
for method, experiment_set in exp_sets:
    print(method)
    # Take the first-course frame of *this* method on every iteration.
    df = experiment_set[0][1]
    if len(df) > 1:
        plt.figure()
        # Drop whichever bookkeeping columns this method's frame carries, then average.
        df = df.drop(columns=['exp number', 'exp_num', 'Unnamed: 0', 'real value'], errors='ignore').mean()
        df = pd.DataFrame(df)
        # Same label clean-up as the expanded-heatmap cell: keep the longest
        # space-separated token of every label and drop repeated labels.
        df.index = [max(label.split(' '), key=len) for label in df.index]
        df = df[~df.index.duplicated()]
        print(df.index)
        df['week'] = [int(label.split('_InWeek')[-1]) for label in df.index]
        df['feature_name'] = [label.split('_InWeek')[0] for label in df.index]
        heatmap_df = df.pivot(index="feature_name", columns="week", values=0)
        ax = sns.heatmap(heatmap_df)
        ax.set_title(courses[0] + "- " + method)
        plt.savefig("Heatmaps/" + "horizontal_" + courses[0] + "- " + method + ".png", bbox_inches='tight')

In [None]:
# (method label, list of per-course (course, frame) pairs) for every explanation method.
exp_sets = list(zip(
    ('LIME', 'KernelSHAP', 'PermSHAP', 'CEM', 'DiCE'),
    (lime_normalized, kernel_shap_normalized, perm_shap_normalized, cem_normalized, counter_normalized),
))

In [None]:
# Names of the first five entries of every Series in `feature_sums`, flattened
# into one list (`feature_sums` is created by the expanded-heatmap cell below).
top_5 = [name for f in feature_sums for name in f[:5].index.values]
set(top_5)

In [None]:
from matplotlib.colors import LogNorm

# Log-scaled heatmap of the mean importance per feature and week for the first
# course, one figure per method, restricted to the features listed in `top_5`.
# `top_5`, `index` and `mapping` are defined in other cells of this notebook.
exp_sets = [
    ('LIME', lime_normalized),
    ('KernelSHAP', kernel_shap_normalized),
    ('PermSHAP', perm_shap_normalized),
    ('CEM', cem_normalized),
    ('DiCE', counter_normalized)
]
# Per-method totals of every feature across weeks, largest first; the `top_5`
# cell is computed from this list.
feature_sums = []
for method, experiment_set in exp_sets:
    print(method)
    fig, ax = plt.subplots(figsize=(10, 5),facecolor='white')
    i = 0  # only the first course is plotted
    df = experiment_set[i][1]
    if len(df) > 1:
        # Average every feature-week column over all rows.
        if method == 'DiCE':
            df = df.drop('exp number',axis=1).drop('exp_num',axis=1).drop('Unnamed: 0',axis=1).mean()
        else:
            if method == 'LIME':
                df = df.drop('real value',axis=1)
            df = df.drop('exp number',axis=1).mean()

        df = pd.DataFrame(df)
        # Keep the longest space-separated token of every label and drop repeated labels.
        df.index = [max(label.split(' '), key=len) for label in df.index]
        df = df[~df.index.duplicated()]
        if method == 'LIME':
            df['week'] = [int(label.split('_InWeek')[1].split(' ')[0]) for label in df.index]
            df['feature_name'] = [label.split('_InWeek')[0] for label in df.index]
        else:
            df['week'] = [int(label.split('_InWeek')[1]) for label in df.index]
            df['feature_name'] = [label.split('_InWeek')[0].split(' ')[-1] for label in df.index]
            if method == 'CEM' or method == 'DiCE':
                df['feature_name'] = [mapping(label.split('_InWeek')[0].split(' ')[-1]) for label in df.index]
        heatmap_df = df.pivot(index="feature_name", columns="week", values=0)
        if method == 'LIME':
            # Add a zero column for every week 1..10 that is absent.
            # NOTE(review): inserting at position 0 leaves the week columns
            # unsorted whenever a week is missing — confirm this is intended.
            for j in np.arange(10, 0, -1):
                if j not in heatmap_df.columns:
                    heatmap_df.insert(0, j, np.zeros(len(heatmap_df)))
            # Add an all-zero row for every label of `index` that is absent.
            missing = [col for col in dict.fromkeys(index) if col not in heatmap_df.index]
            if missing:
                filler = pd.DataFrame(
                    0.0,
                    index=pd.Index(missing, name=heatmap_df.index.name),
                    columns=heatmap_df.columns,
                )
                heatmap_df = pd.concat([heatmap_df, filler])
            heatmap_df = heatmap_df.drop(columns=0, errors='ignore')
        heatmap_df = heatmap_df.fillna(0)
        feature_sums.append(heatmap_df.sum(axis=1).sort_values(ascending=False))
        # log10 of the importances; zeros become -inf and fall below vmin.
        numeric_df = heatmap_df.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x)
        numeric_df = numeric_df[numeric_df.index.isin(set(top_5))]
        ax = sns.heatmap(numeric_df, vmax=1, vmin=-4, cmap='magma_r', square=True)
        ax.set_title(courses[i] + "- " + method)
        plt.savefig("Heatmaps/" + "expanded_" + courses[i] + "- " + method + ".svg", bbox_inches='tight')

In [None]:
# Inspect the normalized LIME frame of the first course.
lime_normalized[0][1]

In [None]:
# Inspect the first stored counterfactual frame.
counter_normalized[0][1]

In [None]:
from matplotlib.colors import LogNorm

# Build `all_points`: for every (course, method) a table with one row per
# (feature_name, week) and one column per row of the method's normalized frame.
# Feature/week pairs missing from a method are added as all-zero rows so that
# the tables of one course share the pairs expected from `feature_names`.
exp_sets = [
    ('LIME', lime_normalized),
    ('KernelSHAP', kernel_shap_normalized),
    ('PermSHAP', perm_shap_normalized),
    ('CEM', cem_normalized),
    ('DiCE', counter_normalized)
]
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
courses_len = [10, 10, 15, 13, 13]  # number of weeks used for each course
all_points = {}
for i in np.arange(5):
    for method, experiment_set in exp_sets:
        print(method)
        print(courses[i])
        df = experiment_set[i][1]
        # Frames with at most one row are skipped, so their key is absent from `all_points`.
        if len(df) > 1:
            if method == 'DiCE':
                df = df.drop('exp number',axis=1).drop('exp_num',axis=1).drop('Unnamed: 0',axis=1)
            else:
                if method == 'LIME':
                    df = df.drop('real value',axis=1)
                df = df.drop('exp number',axis=1)

            df = pd.DataFrame(df)
            df = df.T
            # Keep the longest space-separated token of every label and drop repeated labels.
            df.index = [max(label.split(' '), key=len) for label in df.index]
            df = df[~df.index.duplicated()]
            if method == 'LIME':
                df.insert(0, 'week', [int(label.split('_InWeek')[1].split(' ')[0]) for label in df.index])
                df['feature_name'] = [label.split('_InWeek')[0] for label in df.index]
            else:
                df.insert(0, 'week', [int(label.split('_InWeek')[1]) for label in df.index])
                df['feature_name'] = [label.split('_InWeek')[0].split(' ')[-1] for label in df.index]
                if method == 'CEM' or method == 'DiCE':
                    df['feature_name'] = [mapping(label.split('_InWeek')[0].split(' ')[-1]) for label in df.index]
            df.set_index(['feature_name', 'week'], drop=True, inplace=True)
            # Add every missing (feature, week) pair in one step instead of
            # appending rows one at a time.
            expected = pd.MultiIndex.from_product(
                [feature_names, np.arange(1, courses_len[i]+1)],
                names=df.index.names,
            )
            missing = expected.difference(df.index)
            if len(missing) > 0:
                filler = pd.DataFrame(0.0, index=missing, columns=df.columns)
                df = pd.concat([df, filler])
            df = df.fillna(0)
            df = df.sort_index()
            all_points[(courses[i], method)] = df

In [None]:
# Inspect the aligned table of one (course, method) pair.
all_points[('dsp_002', 'PermSHAP')]

In [None]:
from matplotlib.colors import LogNorm

# Build `heatmaps`: for every (course, method) a feature_name x week table of
# mean importances, padded with zeros for missing weeks and for the feature
# names of `master_list` (defined in another cell) that the method lacks.
exp_sets = [
    ('LIME', lime_normalized),
    ('KernelSHAP', kernel_shap_normalized),
    ('PermSHAP', perm_shap_normalized),
    ('CEM', cem_normalized),
    ('DiCE', counter_normalized)
]
courses = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
courses_len = [10, 10, 15,13,13]  # number of weeks used for each course
heatmaps = {}
for i in np.arange(5):
    for method, experiment_set in exp_sets:
        print(method)
        print(courses[i])
        df = experiment_set[i][1]
        if len(df) > 1:
            # Average every feature-week column over all rows.
            if method == 'DiCE':
                df = df.drop('exp number',axis=1).drop('exp_num',axis=1).drop('Unnamed: 0',axis=1).mean()
            else:
                if method == 'LIME':
                    df = df.drop('real value',axis=1)
                df = df.drop('exp number',axis=1).mean()

            df = pd.DataFrame(df)
            # Keep the longest space-separated token of every label and drop repeated labels.
            df.index = [max(label.split(' '), key=len) for label in df.index]
            df = df[~df.index.duplicated()]
            if method == 'LIME':
                df['week'] = [int(label.split('_InWeek')[1].split(' ')[0]) for label in df.index]
                df['feature_name'] = [label.split('_InWeek')[0] for label in df.index]
            else:
                df['week'] = [int(label.split('_InWeek')[1]) for label in df.index]
                df['feature_name'] = [label.split('_InWeek')[0].split(' ')[-1] for label in df.index]
                if method == 'CEM' or method == 'DiCE':
                    df['feature_name'] = [mapping(label.split('_InWeek')[0].split(' ')[-1]) for label in df.index]
            print(df)
            heatmap_df = df.pivot(index="feature_name", columns="week", values=0)
            # Add a zero column for every week of the course that is absent.
            for j in np.arange(1, courses_len[i]+1):
                if j not in heatmap_df.columns:
                    heatmap_df.insert(int(j)-1, j, np.zeros(len(heatmap_df)))
            # Add an all-zero row for every name of `master_list` that is absent.
            missing = [col for col in dict.fromkeys(master_list) if col not in heatmap_df.index]
            if missing:
                filler = pd.DataFrame(
                    0.0,
                    index=pd.Index(missing, name=heatmap_df.index.name),
                    columns=heatmap_df.columns,
                )
                heatmap_df = pd.concat([heatmap_df, filler])
            if 0 in heatmap_df.columns:
                heatmap_df = heatmap_df.drop(0, axis=1)
            heatmap_df = heatmap_df.fillna(0)
            heatmaps[(courses[i], method)] = heatmap_df

In [None]:
# Flatten `master_index` by two levels into one list of names, then show the distinct ones.
# NOTE(review): `master_index` is not assigned in the visible part of this notebook.
master_list = [item for group in master_index for sublist in group for item in sublist]
set(master_list)

In [None]:
# Inspect one stored heatmap table.
heatmaps[('microcontroleurs_003', 'PermSHAP')]

In [None]:
# Remember the row labels of the current `heatmap_df`; the LIME branch of the
# expanded-heatmap cell uses `index` to add the rows it is missing.
index = heatmap_df.index
index

In [None]:
# Raw feature identifiers and the display names they are replaced with.
# Built once instead of on every call to `mapping`.
_FEATURE_NAME_MAPPING = {
    'competency_anticipation':  'CompetencyAnticipation',
    'content_alignment':  'ContentAlignment',
    'content_anticipation':  'ContentAnticipation',
    'delay_lecture':  'DelayLecture',
    'frequency_action_Video.Load':  'FrequencyEventLoad',
    'frequency_action_Video':  'FrequencyEventVideo',
    'frequency_action_Video.Play':  'FrequencyEventVideoPlay',
    'frequency_action_Video.Pause':  'FrequencyEventVideoPause',
    'number_sessions':  'NumberOfSessions',
    'ratio_clicks_weekend_day':  'RatioClicksWeekendDay',
    'regularity_peak_dayhour':  'RegPeakTimeDayHour',
    'regularity_periodicity_m1':  'RegPeriodicityM1',
    'student_speed':  'StudentSpeed',
    'time_between_sessions_std':  'StdTimeBetweenSessions',
    'time_in__problem_sum':  'TotalTimeProblem',
    'time_in__video_sum':  'TotalTimeVideo',
    'time_sessions_mean':  'AvgTimeSessions',
    'time_sessions_std':  'StdTimeSessions',
    'time_sessions_sum':  'TotalTimeSessions',
    'total_clicks':  'TotalClicks',
    'total_clicks_Video':  'TotalClicksVideoChen',
    'total_clicks_Video.Load':  'TotalClicksVideoLoad',
    'total_clicks_problem':  'TotalClicksProblem',
    'total_clicks_video':  'TotalClicksVideoConati',
    'total_clicks_weekday':  'TotalClicksWeekday',
    'total_clicks_weekend': 'TotalClicksWeekend',
    'weekly_prop_replayed_mean': 'AvgReplayedWeeklyProp',
    'weekly_prop_watched_mean':  'AvgWatchedWeeklyProp',
    'weekly_prop_interrupted_mean': 'AvgInterruptedWeeklyProp',
    'pause_duration_mean': 'AvgPauseDuration',
    'pause_duration_std': 'StdPauseDuration',
    'time_speeding_up_mean': 'AvgTimeSpeedingUp',
    'time_speeding_up_std': 'StdTimeSpeedingUp',
}


def mapping(feature):
    """Return the display name of ``feature``.

    Args:
        feature: raw feature identifier (the keys are case-sensitive).

    Returns:
        The mapped display name, or ``feature`` itself when it has no entry.
    """
    return _FEATURE_NAME_MAPPING.get(feature, feature)

In [None]:
# Scratch check: number of distinct names in the combined list below.
len(set(['AvgReplayedWeeklyProp',
 'CompetencyAnticipation',
 'DelayLecture',
 'FrequencyEventLoad',
 'NumberOfSessions',
 'RegPeakTimeDayHour',
 'RegPeriodicityM1',
 'TotalClicksProblem',
 'TotalTimeProblem',
'AvgReplayedWeeklyProp',
 'AvgTimeSessions',
 'CompetencyAlignment',
 'CompetencyAnticipation',
 'CompetencyStrength',
 'ContentAlignment',
 'ContentAnticipation',
 'DelayLecture',
 'RatioClicksWeekendDay',
 'RegPeakTimeDayHour',
 'RegPeriodicityM1',
 'StdTimeSessions',
 'StudentShape',
 'StudentSpeed',
 'TotalTimeProblem',
 'TotalTimeVideo']))

# Metrics

In [None]:
from scipy.spatial.distance import cosine, jensenshannon
from scipy.stats import spearmanr

def _nonzero_copy(vector):
    """Return a copy of ``vector``; if it sums to zero, its first entry is set to 0.0001."""
    vector = vector.copy()
    if len(vector) > 0 and np.asarray(vector).sum() == 0:
        if hasattr(vector, 'iloc'):
            vector.iloc[0] = 0.0001  # positional, independent of the index labels
        else:
            vector[0] = 0.0001
    return vector


def extract(m1, m2, i):
    """Return column ``i`` of ``m1`` and of ``m2`` as a pair of vectors.

    A vector that sums to zero gets a tiny value in its first position so that
    the distance measures applied afterwards are defined. Both vectors are
    treated the same way, and copies are returned: ``m1`` and ``m2`` are left
    untouched.

    Args:
        m1, m2: tables supporting ``m[i]`` column lookup (DataFrames in this notebook).
        i: column label to extract from both tables.

    Returns:
        Tuple ``(x, y)`` with the two extracted vectors.
    """
    return _nonzero_copy(m1[i]), _nonzero_copy(m2[i])

# cosine distance
def cosine_vector(m1, m2):
    """
    Cosine distance between matching columns of two importance tables.

    in:
    m1, m2: DataFrames; the columns labelled 0 .. k-1 (k = the smaller of the
            two column counts) are fetched with `extract` and compared pairwise

    out:
    list with one cosine distance per compared column
    """
    n_shared = min(len(m1.columns), len(m2.columns))
    return [cosine(*extract(m1, m2, i)) for i in range(n_shared)]

# norm of differences
def norm_differences(m1, m2):
    """
    Frobenius norm of the element-wise difference of two tables.

    in:
    m1, m2: two tables of equal shape holding feature importance scores
            (DataFrames are subtracted label by label)

    out:
    a single number: the Frobenius norm of m1 - m2
    """
    difference = m1 - m2
    return np.linalg.norm(difference, ord='fro')

# jensen_shannon distance
def jensen_shannon(m1, m2):
    """
    Jensen-Shannon distance between matching columns of two importance tables.

    in:
    m1, m2: DataFrames; the columns labelled 0 .. k-1 (k = the smaller of the
            two column counts) are fetched with `extract` and compared pairwise

    out:
    numpy array with one Jensen-Shannon distance per compared column
    """
    n_shared = min(len(m1.columns), len(m2.columns))
    return np.array([jensenshannon(*extract(m1, m2, i)) for i in range(n_shared)])

# rank-correlation (spearman's rank correlation coefficient)
def spearman_rank_correlation(m1, m2):
    """
    Spearman rank correlation between matching columns of two importance tables.

    in:
    m1, m2: DataFrames; the columns labelled 0 .. k-1 (k = the smaller of the
            two column counts) are fetched with `extract` and compared pairwise

    out:
    numpy array with one correlation coefficient per compared column; a
    coefficient that cannot be computed is kept as NaN and the two vectors are
    printed for inspection
    """
    dists = []
    for i in range(min(len(m1.columns), len(m2.columns))):
        x, y = extract(m1, m2, i)
        corr, pval = spearmanr(x, y)
        # np.isnan, not `is np.nan`: a computed NaN is not the np.nan object itself.
        if np.isnan(corr):
            print(x, y)
            print(sum(x), sum(y), corr)
        dists.append(corr)
    return np.array(dists)

from sklearn.metrics import mutual_info_score
# mutual information
def mutual_info(m1, m2):
    """
    Mutual information score between matching columns of two importance tables.

    in:
    m1, m2: DataFrames; the columns labelled 0 .. k-1 (k = the smaller of the
            two column counts) are fetched with `extract` and compared pairwise

    out:
    numpy array with one mutual information score per compared column

    NOTE(review): `mutual_info_score` is documented for label assignments;
    confirm it is meaningful for continuous importance values.
    """
    n_shared = min(len(m1.columns), len(m2.columns))
    return np.array([mutual_info_score(*extract(m1, m2, i)) for i in range(n_shared)])

In [None]:
# Row labels present in the CEM table of dsp_002 but absent from its LIME table.
set(all_points[('dsp_002', 'CEM')].index) - set(all_points[('dsp_002', 'LIME')].index)

In [None]:
# Apply the chosen metric to every ordered pair of methods, course by course.
# `all_results[c]` lists the results for course c with the first method varying
# slowest: (LIME, LIME), (LIME, PermSHAP), ..., (CEM, CEM).
method_list = ['LIME', 'PermSHAP', 'KernelSHAP', 'DiCE', 'CEM']
course_names = ['dsp_001', 'dsp_002', 'geomatique_003', 'villesafricaines_001', 'microcontroleurs_003']
distance_metric = mutual_info
all_results = []
for course in course_names:
    comparison_results = []
    for method_i in method_list:
        for method_j in method_list:
            print(course, method_i, method_j)
            pair = (all_points[(course, method_i)], all_points[(course, method_j)])
            comparison_results.append(distance_metric(*pair))
    all_results.append(comparison_results)

In [None]:
import json

In [None]:
# Plot, for every course, the mean pairwise metric between methods as a heatmap
# and store the means in '<method>.json'.
method = 'mutual_info'
pretty_name = "Mutual Info"
# Mean of every pairwise comparison, keyed by the position of the course in
# `all_results`; computed once for all figures.
dist_dict = {}
for i, course in enumerate(all_results):
    dist_dict[i] = [np.mean(comparison) for comparison in course]

n_methods = len(method_list)
for course_index in np.arange(5):
    plt.figure()
    # Rows and columns follow `method_list`, the order in which the pairs of
    # `all_results` were computed.
    heatmap_df = pd.DataFrame(
        np.array(dist_dict[course_index]).reshape(n_methods, n_methods),
        index=pd.Index(method_list, name='method'),
        columns=method_list,
    )
    sns.heatmap(heatmap_df, cmap='magma', annot=True, vmin=0, vmax=1, fmt="0.2f", annot_kws={"fontsize":15})
    plt.title(pretty_name + ' between \nexplainability methods in '+courses[course_index])
    plt.savefig("plots/" + courses[course_index] + "_" + method + ".png", bbox_inches='tight')

data = json.dumps(dist_dict)
# The context manager closes the file even if writing fails.
with open(method +".json","w") as f:
    f.write(data)