# Evaluation of Userv Survey answers

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from statsmodels.stats.inter_rater import fleiss_kappa
import krippendorff
import numpy as np

# Fleiss' kappa

In [8]:
# Divide data_filtered into two DataFrames based on group type
group_a_data = data[data['Group'] == 'a']
group_b_data = data[data['Group'] == 'b']

# Function to convert data into a format suitable for Fleiss' kappa
def prepare_fleiss_data(df, column_name, n_raters):
    # Create a contingency table
    contingency_table = df.groupby(['Question Number', column_name]).size().unstack(fill_value=0)
    
    # Ensure all categories are present
    for category in ['strong top', 'weak top', 'equal', 'weak bottom', 'strong bottom']:
        if category not in contingency_table.columns:
            contingency_table[category] = 0
    
    # Sort columns to ensure consistent order
    contingency_table = contingency_table[['strong top', 'weak top', 'equal', 'weak bottom', 'strong bottom']]
    # print(contingency_table)
    
    # Ensure each question has n_raters ratings
    for idx in contingency_table.index:
        total_ratings = contingency_table.loc[idx].sum()
        # print('total ratings', idx, total_ratings)
        if total_ratings < n_raters:
            contingency_table.loc[idx, 'fill'] = n_raters - total_ratings
        elif total_ratings > n_raters:
            contingency_table = contingency_table.drop(idx)
    
    return contingency_table

In [10]:
# Function to calculate Fleiss' kappa for a group
def calculate_fleiss_kappa_for_group(group_data, n_raters):
    fleiss_kappa_results = []

    # Prepare data for Rep Saliency and Pro Saliency
    rep_contingency_table = prepare_fleiss_data(group_data, 'Rep Saliency', n_raters)
    pro_contingency_table = prepare_fleiss_data(group_data, 'Pro Saliency', n_raters)
    
    # Calculate Fleiss' kappa for Rep Saliency
    rep_fleiss_kappa = fleiss_kappa(rep_contingency_table.to_numpy())
    
    # Calculate Fleiss' kappa for Pro Saliency
    pro_fleiss_kappa = fleiss_kappa(pro_contingency_table.to_numpy())
    
    fleiss_kappa_results.append({
        'Rep Saliency Kappa': rep_fleiss_kappa,
        'Pro Saliency Kappa': pro_fleiss_kappa
    })
    
    return fleiss_kappa_results

# Calculate Fleiss' kappa for group A (5 raters)
group_a_kappa_results = calculate_fleiss_kappa_for_group(group_a_data, 5)

# Calculate Fleiss' kappa for group B (3 raters)
group_b_kappa_results = calculate_fleiss_kappa_for_group(group_b_data, 3)

# Combine the results
all_kappa_results = {
    'Group A': group_a_kappa_results,
    'Group B': group_b_kappa_results
}

# Convert results to DataFrame and display
group_a_kappa_df = pd.DataFrame(group_a_kappa_results)
group_b_kappa_df = pd.DataFrame(group_b_kappa_results)

print("Fleiss' kappa results for Group A:")
print(group_a_kappa_df)

print("Fleiss' kappa results for Group B:")
print(group_b_kappa_df)

# Save the results to a CSV file
output_folder = '../evaluation'
os.makedirs(output_folder, exist_ok=True)
group_a_kappa_df.to_csv(os.path.join(output_folder, 'general_fleiss_kappa_results_group_a.csv'), index=False)
group_b_kappa_df.to_csv(os.path.join(output_folder, 'general_fleiss_kappa_results_group_b.csv'), index=False)

Fleiss' kappa results for Group A:
   Rep Saliency Kappa  Pro Saliency Kappa
0            0.160812            0.014687
Fleiss' kappa results for Group B:
   Rep Saliency Kappa  Pro Saliency Kappa
0            0.073491           -0.061552


## Krippendorff's alpha

In [13]:
# Combine group and question number to ensure unique question identifiers
data['Question ID'] = data['Group'] + '-' + data['Question Number'].astype(str)

# Create a matrix of item-category counts
categories = ['strong bottom', 'weak bottom', 'equal', 'weak top', 'strong top']
category_mapping = {category: i for i, category in enumerate(categories)}

# Gather all ratings into a list of lists
ratings = []

for question_id in data['Question ID'].unique():
    question_ratings = data[data['Question ID'] == question_id][['Rep Saliency', 'Pro Saliency']].values.flatten()
    ratings.append([category_mapping.get(rating, -1) for rating in question_ratings])
    

# Determine the maximum length of ratings
max_len = max(len(row) for row in ratings)

# Pad each row to the maximum length with -1
padded_ratings = [row + [-1] * (max_len - len(row)) for row in ratings]

# Convert to numpy array
ratings_array = np.array(padded_ratings)

# Calculate Krippendorff's alpha
alpha = krippendorff.alpha(reliability_data=ratings_array, level_of_measurement='nominal')
print(f'General Krippendorff\'s alpha: {alpha}')


General Krippendorff's alpha: 0.10844421118543002


In [14]:
# Create a mapping for categories
categories = ['strong bottom', 'weak bottom', 'equal', 'weak top', 'strong top']
category_mapping = {category: i for i, category in enumerate(categories)}

def prepare_ratings(saliency_column):
    # Gather all ratings into a list of lists
    ratings = []
    for question_id in data['Question ID'].unique():
        question_ratings = data[data['Question ID'] == question_id][saliency_column].values
        ratings.append([category_mapping.get(rating, -1) for rating in question_ratings])
    
    # Determine the maximum length of ratings
    max_len = max(len(row) for row in ratings)

    # Pad each row to the maximum length with -1
    padded_ratings = [row + [-1] * (max_len - len(row)) for row in ratings]

    # Convert to numpy array
    ratings_array = np.array(padded_ratings)
    
    return ratings_array

# Prepare ratings for Rep Saliency
rep_ratings_array = prepare_ratings('Rep Saliency')

# Calculate Krippendorff's alpha for Rep Saliency
alpha_rep = krippendorff.alpha(reliability_data=rep_ratings_array, level_of_measurement='nominal')
print(f'Krippendorff\'s alpha for Rep Saliency: {alpha_rep}')

# Prepare ratings for Pro Saliency
pro_ratings_array = prepare_ratings('Pro Saliency')

# Calculate Krippendorff's alpha for Pro Saliency
alpha_pro = krippendorff.alpha(reliability_data=pro_ratings_array, level_of_measurement='nominal')
print(f'Krippendorff\'s alpha for Pro Saliency: {alpha_pro}')

Krippendorff's alpha for Rep Saliency: 0.1066581472583652
Krippendorff's alpha for Pro Saliency: 0.10878203315274781


## Majority Vote
We want to see the **majority vote**. Divided by group (A and B), for each question we check the answers of users, and get the agreement of the majority. In case of a tie, we take the worst.

In [15]:
from collections import Counter

data = pd.read_csv('../evaluation/filtered_saliency_survey.csv')

# Map saliency levels to broader categories
saliency_mapping = {
    'strong top': 'top',
    'weak top': 'top',
    'equal': 'equal',
    'weak bottom': 'bottom',
    'strong bottom': 'bottom'
}

# Apply the mapping to the saliency columns
data['Rep Saliency Merged'] = data['Rep Saliency'].map(saliency_mapping)
data['Pro Saliency Merged'] = data['Pro Saliency'].map(saliency_mapping)

# Save the majority votes to a CSV file
# output_folder = '../evaluation/majority/by_group'
output_folder = '../evaluation'
os.makedirs(output_folder, exist_ok=True)

# Function to determine the majority vote
def majority_vote(saliency_list):
    count = Counter(saliency_list)
    most_common = count.most_common()
    if len(most_common) == 1:
        return most_common[0][0]
    if most_common[0][1] > most_common[1][1]:
        return most_common[0][0]
    # Handle tie by choosing the worst
    tie_candidates = [item for item in most_common if item[1] == most_common[0][1]]
    priority = {'top': 1, 'equal': 0, 'bottom': -1}
    return min(tie_candidates, key=lambda x: priority[x[0]])[0]

# Group by Method, Chart, and Question Number
grouped = data.groupby(['Method', 'Chart', 'Question Number', 'Group'])

# Extract the majority vote for each question
majority_votes = grouped.apply(lambda x: pd.Series({
    'Rep Majority': majority_vote(x['Rep Saliency Merged']),
    'Pro Majority': majority_vote(x['Pro Saliency Merged'])
})).reset_index()

majority_votes.to_csv(os.path.join(output_folder, 'saliency_majority_votes.csv'), index=False)

  majority_votes = grouped.apply(lambda x: pd.Series({


## Golden standard
For each group, given all the answers from users in the majority data frame, create two lists, one per "representative" and one for "prominent" for a golden standard.

In [16]:
import json
import pandas as pd

# Load the origin file (assuming it's a JSON file)
with open('../a_group_data.json', 'r') as file:
    group_a_data = json.load(file)

with open('../b_group_data.json', 'r') as file:
    group_b_data = json.load(file)

# Function to create the dictionary for a group
def create_question_dict(group_data):
    question_dict = {}
    question_number = 1

    for method, charts in group_data.items():
        for chart, pairs in charts.items():
            for pair in pairs:
                top_concept_id = None
                bottom_concept_id = None
                
                for cluster in pair:
                    if cluster['saliency'] == 'top':
                        top_concept_id = cluster['id']
                    elif cluster['saliency'] == 'bottom':
                        bottom_concept_id = cluster['id']
                
                question_dict[question_number] = {
                    "top": top_concept_id,
                    "bottom": bottom_concept_id
                }
                question_number += 1

    return question_dict

# Create the dictionaries for each group
group_a_question_dict = create_question_dict(group_a_data)
group_b_question_dict = create_question_dict(group_b_data)

# Display the results
print("Group A Question Dictionary:")
print(group_a_question_dict)

print("\nGroup B Question Dictionary:")
print(group_b_question_dict)

Group A Question Dictionary:
{1: {'top': 1, 'bottom': 53}, 2: {'top': 35, 'bottom': 44}, 3: {'top': 32, 'bottom': 13}, 4: {'top': 56, 'bottom': 77}, 5: {'top': 1, 'bottom': 65}, 6: {'top': 79, 'bottom': 71}, 7: {'top': 40, 'bottom': 5}, 8: {'top': 4, 'bottom': 62}, 9: {'top': 7, 'bottom': 8}, 10: {'top': 26, 'bottom': 47}, 11: {'top': 40, 'bottom': 39}, 12: {'top': 35, 'bottom': 10}, 13: {'top': 7, 'bottom': 65}, 14: {'top': 35, 'bottom': 53}, 15: {'top': 35, 'bottom': 47}, 16: {'top': 32, 'bottom': 39}, 17: {'top': 1, 'bottom': 77}, 18: {'top': 79, 'bottom': 5}, 19: {'top': 4, 'bottom': 10}, 20: {'top': 79, 'bottom': 77}, 21: {'top': 79, 'bottom': 13}, 22: {'top': 56, 'bottom': 39}, 23: {'top': 56, 'bottom': 71}, 24: {'top': 56, 'bottom': 65}, 25: {'top': 79, 'bottom': 8}, 26: {'top': 36, 'bottom': 16}, 27: {'top': 82, 'bottom': 6}, 28: {'top': 29, 'bottom': 16}, 29: {'top': 42, 'bottom': 72}, 30: {'top': 21, 'bottom': 60}, 31: {'top': 26, 'bottom': 2}, 32: {'top': 67, 'bottom': 68}, 

In [17]:
data = pd.read_csv('../evaluation/saliency_majority_votes.csv')

# Combine question dictionaries for easier access
question_dict = {
    'a': group_a_question_dict,
    'b': group_b_question_dict
}

# Initialize global lists
rep_list = []
pro_list = []
removed_rep_list = []
removed_pro_list = []
not_added_rep_list = []
not_added_pro_list = []

# Function to update the global lists based on the majority votes
def update_golden_lists(df, question_dict):
    global rep_list, pro_list, removed_rep_list, removed_pro_list, not_added_rep_list, not_added_pro_list

    for index, row in df.iterrows():
        question_number = row['Question Number']
        group = row['Group']
        rep_majority = row['Rep Majority']
        pro_majority = row['Pro Majority']

        if rep_majority != 'equal':
            rep_concept_id = question_dict[group][question_number][rep_majority]
            rep_other_concept_id = question_dict[group][question_number]['bottom'] if rep_majority == 'top' else question_dict[group][question_number]['top']
            if rep_concept_id not in rep_list and rep_concept_id not in removed_rep_list and rep_concept_id not in not_added_rep_list:
                if rep_other_concept_id in rep_list:
                    rep_list.remove(rep_other_concept_id)
                    removed_rep_list.append(rep_other_concept_id)
                    not_added_rep_list.append(rep_concept_id)
                else:
                    rep_list.append(rep_concept_id)

        if pro_majority != 'equal':
            pro_concept_id = question_dict[group][question_number][pro_majority]
            pro_other_concept_id = question_dict[group][question_number]['bottom'] if pro_majority == 'top' else question_dict[group][question_number]['top']
            if pro_concept_id not in pro_list and pro_concept_id not in removed_pro_list and pro_concept_id not in not_added_pro_list:
                if pro_other_concept_id in pro_list:
                    pro_list.remove(pro_other_concept_id)
                    removed_pro_list.append(pro_other_concept_id)
                    not_added_pro_list.append(pro_concept_id)
                else:
                    pro_list.append(pro_concept_id)

        if rep_majority == 'equal':
            rep_concept_id = question_dict[group][question_number]['top']
            rep_other_concept_id = question_dict[group][question_number]['bottom']
            if rep_concept_id in rep_list:
                rep_list.remove(rep_concept_id)
                removed_rep_list.append(rep_concept_id)
            else:
                not_added_rep_list.append(rep_concept_id)
            if rep_other_concept_id in rep_list:
                rep_list.remove(rep_other_concept_id)
                removed_rep_list.append(rep_other_concept_id)
            else:
                not_added_rep_list.append(rep_other_concept_id) 
        if pro_majority == 'equal':
            pro_concept_id = question_dict[group][question_number]['top']
            pro_other_concept_id = question_dict[group][question_number]['bottom']
            if pro_concept_id in pro_list:
                pro_list.remove(pro_concept_id)
                removed_pro_list.append(pro_concept_id)
            else:
                not_added_pro_list.append(pro_concept_id)
            if pro_other_concept_id in pro_list:
                pro_list.remove(pro_other_concept_id)
                removed_pro_list.append(pro_other_concept_id)
            else:
                not_added_pro_list.append(pro_other_concept_id) 

# Separate the data by group
group_a_data = data[data['Group'] == 'a']
group_b_data = data[data['Group'] == 'b']

# Update the global lists using the majority votes data
update_golden_lists(group_a_data, question_dict)
update_golden_lists(group_b_data, question_dict)

# Display the results
print("Representative Golden List:", set(rep_list))
print("Prominent Golden List:", set(pro_list))
print("Removed Representative List:", set(removed_rep_list))
print("Removed Prominent List:", set(removed_pro_list))
print("Not Added Representative List:", set(not_added_rep_list))
print("Not Added Prominent List:", set(not_added_pro_list))

Representative Golden List: {65, 1, 6, 38, 70, 12, 45, 50, 83, 21}
Prominent Golden List: {7, 8, 13, 15, 18, 19, 22, 25, 30, 32, 36, 40, 45, 46, 52, 53, 65, 66, 69, 70, 72}
Removed Representative List: {2, 3, 4, 5, 7, 8, 10, 11, 14, 15, 16, 18, 22, 27, 28, 31, 33, 37, 39, 47, 49, 54, 55, 57, 77, 78, 82, 84}
Removed Prominent List: {2, 4, 5, 10, 14, 16, 24, 27, 29, 38, 39, 44, 49, 50, 54, 55, 57, 60, 61, 62, 64, 68, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84}
Not Added Representative List: {4, 5, 7, 9, 11, 13, 14, 15, 16, 19, 20, 23, 24, 25, 26, 28, 29, 30, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 48, 49, 51, 52, 53, 55, 56, 58, 59, 60, 61, 62, 64, 66, 67, 68, 69, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85}
Not Added Prominent List: {1, 3, 4, 6, 9, 10, 11, 12, 14, 20, 26, 28, 33, 34, 35, 37, 39, 42, 43, 44, 47, 48, 50, 54, 55, 56, 58, 59, 60, 64, 67, 68, 71, 75, 76, 81, 82, 85}


Compare how many of the concepts in the golden standards are also present in our top concepts.

In [18]:
# find intersection
def intersection(lst1, lst2):
    return set(lst1).intersection(lst2)

methods = ['VIG', 'SIG', 'XRAY']
charts = ['lattice', 'scatterplot', 'barchart']

# Initialize a dictionary to store the results
intersection_results = {}


# Initialize a list to store the results for CSV
csv_results = []

for method in methods:
    # Load the JSON file for the current method
    with open(f'../results/{method}_results.json', 'r') as file:
        method_data = json.load(file)

    for chart in charts:
        if chart in method_data and '01' in method_data[chart] and len(method_data[chart]['01']) > 0:
            combination_top = method_data[chart]['01']
            representative_intersection = intersection(combination_top, rep_list)
            prominent_intersection = intersection(combination_top, pro_list)

            # Calculate the percentage of intersection
            rep_percentage = round((len(representative_intersection) / len(combination_top)) * 100, 3)
            pro_percentage = round((len(prominent_intersection) / len(combination_top)) * 100, 3)

            # Store the results in a dictionary
            result = {
                'Method': method,
                'Chart': chart,
                'Representative Intersection': list(representative_intersection),
                'Prominent Intersection': list(prominent_intersection),
                'Representative Percentage': rep_percentage,
                'Prominent Percentage': pro_percentage
            }

            # Add to the list for CSV
            csv_results.append(result)

            # Print the results
            print(f"Method: {method}, Chart: {chart}")
            print(f"  Representative Intersection: {representative_intersection}")
            print(f"  Representative Percentage: {rep_percentage:.3f}%")
            print(f"  Prominent Intersection: {prominent_intersection}")
            print(f"  Prominent Percentage: {pro_percentage:.3f}%")

# Save the results to a new JSON file
output_json_path = '../evaluation/golden_intersection/intersection_results.json'
with open(output_json_path, 'w') as outfile:
    json.dump(csv_results, outfile, indent=4)

# Convert the results to a DataFrame and save as CSV
df = pd.DataFrame(csv_results)
output_csv_path = '../evaluation/golden_intersection/intersection_results.csv'
df.to_csv(output_csv_path, index=False)



Method: VIG, Chart: lattice
  Representative Intersection: {1}
  Representative Percentage: 11.111%
  Prominent Intersection: {40, 32, 7}
  Prominent Percentage: 33.333%
Method: VIG, Chart: scatterplot
  Representative Intersection: {50, 21, 38}
  Representative Percentage: 11.538%
  Prominent Intersection: {32, 36, 30}
  Prominent Percentage: 11.538%
Method: SIG, Chart: lattice
  Representative Intersection: set()
  Representative Percentage: 0.000%
  Prominent Intersection: {8, 53, 15}
  Prominent Percentage: 27.273%
Method: SIG, Chart: scatterplot
  Representative Intersection: {65, 50}
  Representative Percentage: 13.333%
  Prominent Intersection: {65, 66, 69, 22}
  Prominent Percentage: 26.667%
Method: SIG, Chart: barchart
  Representative Intersection: {12, 21, 6}
  Representative Percentage: 13.636%
  Prominent Intersection: {7, 8, 13, 15, 18}
  Prominent Percentage: 22.727%
Method: XRAY, Chart: lattice
  Representative Intersection: {83, 45, 70}
  Representative Percentage: 10.