In [3]:
import ast

import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# Paths of the three Excel workbooks; each is read below sheet by sheet
# ('direct', 'single', 'multi').
# NOTE(review): the file names spell "retreival" (sic) - presumably this matches
# the files on disk; confirm before "fixing" the spelling here.
batch_one_path = './data/retreival_batch_one.xlsx'
batch_two_path = './data/retreival_batch_two.xlsx'
batch_three_path = './data/retreival_batch_three.xlsx'

In [17]:
def _load_batch_sheets(workbook_path):
    """Read the 'direct', 'single' and 'multi' sheets of one workbook.

    Returns a dict mapping each sheet name to its DataFrame.
    """
    return {
        sheet_name: pd.read_excel(workbook_path, sheet_name=sheet_name)
        for sheet_name in ('direct', 'single', 'multi')
    }


batch_one = _load_batch_sheets(batch_one_path)
batch_two = _load_batch_sheets(batch_two_path)
batch_three = _load_batch_sheets(batch_three_path)

In [18]:
# Peek at a few raw rows of the 'direct' sheet. The fixed seed makes the cell show
# the same rows on every Restart & Run All instead of a fresh random draw.
batch_one['direct'].sample(3, random_state=0)

Unnamed: 0,query,target,graph,text-embedding-3-large,multilingual-e5-large
91,dense feature,dense feature,Concept ID: E.10 Concept Label: example,Concept ID: E.9 Concept Label: discrete feature,Concept ID: D.4 Concept Label: dense feature
55,underfitting,underfitting,Concept ID: B.26 Concept Label: underfitting,Concept ID: B.26 Concept Label: underfitting,Concept ID: B.26 Concept Label: underfitting
22,retrieval-augmented generation,retrieval-augmented generation,Concept ID: A.29 Concept Label: retrieval-augm...,Concept ID: A.29 Concept Label: retrieval-augm...,Concept ID: A.29 Concept Label: retrieval-augm...


In [None]:
def extract_labels(text):
    """Reduce a raw retrieval result to its bare concept label(s).

    Raw cells look like ``Concept ID: B.26 Concept Label: underfitting``; cells
    holding several results put one such entry per line.

    Parameters
    ----------
    text : object
        A cell value. Only strings are parsed.

    Returns
    -------
    str
        For a single-line string: the stripped text after the first
        ``Concept Label:`` (or, failing that, ``Label:``) marker, or the
        stripped string itself when it carries no marker.
    list of str
        For a multi-line string: one label per line that carries a marker.
        If no line carries a marker, the stripped original string is returned
        instead of a list.
    object
        Any non-string input (NaN/None, numbers, lists) is returned unchanged.
    """
    # Explicit type guard instead of a blanket `except:`. Passing lists through
    # also makes the function idempotent: multi-line cells become lists on the
    # first pass, and re-applying the function must not choke on them.
    if not isinstance(text, str):
        return text

    def label_from(entry):
        """Return the stripped label after the first marker in `entry`, else None."""
        # 'Concept Label:' has to be tried first because 'Label:' is a substring of it.
        for marker in ('Concept Label:', 'Label:'):
            if marker in entry:
                return entry.split(marker)[1].strip()
        return None

    if '\n' in text:
        candidates = (label_from(entry) for entry in text.split('\n'))
        labels = [label for label in candidates if label is not None]
        return labels if labels else text.strip()

    label = label_from(text)
    return text.strip() if label is None else label

In [6]:
# Replace the raw "Concept ID ... Concept Label ..." strings with bare labels,
# in place, in every prediction column of every sheet of every batch.
prediction_columns = ('graph', 'text-embedding-3-large', 'multilingual-e5-large')

for batch in (batch_one, batch_two, batch_three):
    for sheet in batch.values():
        for column in prediction_columns:
            sheet[column] = sheet[column].apply(extract_labels)

In [7]:
# First rows of the 'direct' sheet after label extraction.
batch_one['direct'].head(n=3)

Unnamed: 0,query,target,graph,text-embedding-3-large,multilingual-e5-large
0,relu,relu,relu,relu,relu
1,artificial intelligence,artificial intelligence,artificial intelligence,artificial intelligence,artificial intelligence
2,inference,inference,inference,inference,weight


In [None]:
def check_matches(df):
    """Flag, row by row, whether each method's prediction equals the target.

    Three 0/1 integer columns are written onto ``df`` itself:
    ``graph_match``, ``text-embedding_match`` and ``multilingual-e5_match``.
    The comparison is plain equality against the ``target`` column.

    Returns the same (mutated) DataFrame that was passed in.
    """
    match_column_for = {
        'graph': 'graph_match',
        'text-embedding-3-large': 'text-embedding_match',
        'multilingual-e5-large': 'multilingual-e5_match',
    }
    for prediction_column, match_column in match_column_for.items():
        is_hit = df[prediction_column] == df['target']
        df[match_column] = is_hit.astype(int)
    return df

In [None]:
# Only the 'direct' and 'single' sheets get top-1 match flags; the 'multi'
# sheets are scored further down with precision@k.
for batch in (batch_one, batch_two, batch_three):
    batch.update({
        sheet_name: check_matches(batch[sheet_name])
        for sheet_name in ('direct', 'single')
    })

In [10]:
# Spot-check the match flags on the 'single' sheet. The fixed seed keeps the
# displayed rows stable across re-runs.
display(batch_two['single'].sample(10, random_state=0))

Unnamed: 0,query,target,graph,text-embedding-3-large,multilingual-e5-large,graph_match,text-embedding_match,multilingual-e5_match
34,How do you define a feature in machine learning?,feature,feature,feature set,feature set,1,0,0
87,What does a bias term do in a neural network?,bias,bias,bias,bias,1,1,1
41,What is the purpose of L2 loss?,l2 loss,squared loss,squared loss,squared loss,0,0,0
56,How is validation loss measured?,validation loss,log loss,test loss,test loss,0,0,0
4,What is a linear model in machine learning?,linear,linear model,linear regression,linear regression,0,0,0
25,What does supervised machine learning mean?,supervised machine learning,supervised machine learning,supervised machine learning,supervised machine learning,1,1,1
64,How does gradient descent work?,gradient descent,gradient descent,gradient descent,gradient descent,1,1,1
30,How is a classification threshold determined?,classification threshold,roc curve,classification threshold,classification threshold,0,1,1
78,What defines a neural network?,neural network,neural network,neural network,neural network,1,1,1
66,What counts as a true positive in classification?,true positive,true positive,negative class,true positive rate,1,0,0


In [None]:
def calculate_batch_accuracy(batch):
    """Top-1 accuracy, in percent, of each method over one batch.

    The 'direct' and 'single' sheets are pooled and each method's 0/1 match
    column is averaged.

    Parameters
    ----------
    batch : dict
        Must hold DataFrames under 'direct' and 'single', each with a 'target'
        column and the match columns 'graph_match', 'text-embedding_match'
        and 'multilingual-e5_match'.

    Returns
    -------
    dict
        Keys 'graph', 'text-embedding', 'multilingual-e5'; values are
        accuracies in percent (NaN when no row has a target).
    """
    combined_results = pd.concat(
        [batch['direct'], batch['single']],
        ignore_index=True,
    )

    # Hits and sample count must come from the same rows. Only rows without a
    # target are unscorable and dropped; a row where some method returned
    # nothing stays in and counts as a miss for that method. (Dropping every
    # row with any NaN from the denominator alone would inflate the accuracy.)
    scorable = combined_results.dropna(subset=['target'])

    match_columns = {
        'graph': 'graph_match',
        'text-embedding': 'text-embedding_match',
        'multilingual-e5': 'multilingual-e5_match',
    }
    return {
        method: scorable[column].mean() * 100
        for method, column in match_columns.items()
    }

# Per-batch top-1 accuracy table: one row per method, one column per batch.
batches_by_label = {
    'batch one': batch_one,
    'batch two': batch_two,
    'batch three': batch_three,
}
batch_accuracies = {
    label: calculate_batch_accuracy(batch_frames)
    for label, batch_frames in batches_by_label.items()
}

accuracy_df = pd.DataFrame(batch_accuracies).round(2)

display(accuracy_df)

Unnamed: 0,batch one,batch two,batch three
graph,83.0,85.0,82.0
text-embedding,69.5,70.0,69.5
multilingual-e5,67.5,68.5,67.0


In [None]:
def calculate_metrics_with_ci(df, metric_type='accuracy', confidence=0.95):
    """Summarise per-batch scores as mean +/- a Student-t confidence margin.

    Each method's row of ``df`` is treated as one sample (one observation per
    column). The two-sided interval is ``mean +/- t * SEM`` with ``n - 1``
    degrees of freedom.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per method. Row labels must be 'graph', 'text-embedding',
        'multilingual-e5' for ``metric_type='accuracy'`` and 'Graph P@3',
        'Text-Embedding P@3', 'Multilingual-E5 P@3' for
        ``metric_type='precision'``. Values are percentages.
    metric_type : {'accuracy', 'precision'}
        Selects the expected row labels and the header of the result.
    confidence : float, default 0.95
        Confidence level of the interval, strictly between 0 and 1.

    Returns
    -------
    pandas.DataFrame
        Indexed by display name ('Graph-based Method', 'OpenAI (v3-large)',
        'Multilingual E5 (large)') with two string columns:
        '<metric> (<level> CI)' holding 'mean% ± margin%', and
        '<level> Confidence Interval' holding '[low%, high%]'.

    Raises
    ------
    ValueError
        If ``metric_type`` is not recognised or ``confidence`` is outside (0, 1).
    KeyError
        If ``df`` lacks one of the expected row labels.
    """
    layout_by_metric = {
        'accuracy': ('Top-1 Accuracy',
                     ['graph', 'text-embedding', 'multilingual-e5']),
        'precision': ('Precision@3',
                      ['Graph P@3', 'Text-Embedding P@3', 'Multilingual-E5 P@3']),
    }
    # Reject unknown metric types rather than silently pairing accuracy rows
    # with a 'Precision@3' header.
    if metric_type not in layout_by_metric:
        raise ValueError(
            f"metric_type must be 'accuracy' or 'precision', got {metric_type!r}"
        )
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must lie strictly between 0 and 1, got {confidence!r}")

    metric_name, row_labels = layout_by_metric[metric_type]
    # Same order as the row labels above.
    display_names = ['Graph-based Method', 'OpenAI (v3-large)', 'Multilingual E5 (large)']

    records = []
    for row_label in row_labels:
        observations = df.loc[row_label]
        mean_value = np.mean(observations)

        degrees_of_freedom = len(observations) - 1
        # Two-sided interval: the critical value sits at the (1 + confidence) / 2 quantile.
        t_critical = stats.t.ppf((1 + confidence) / 2, degrees_of_freedom)
        margin_of_error = t_critical * stats.sem(observations)

        # Fixed two-decimal formatting keeps trailing zeros ("1.90%", not "1.9%").
        records.append((
            f"{mean_value:.2f}% ± {margin_of_error:.2f}%",
            f"[{mean_value - margin_of_error:.2f}%, {mean_value + margin_of_error:.2f}%]",
        ))

    level = f"{confidence * 100:g}%"
    return pd.DataFrame(
        records,
        index=display_names,
        columns=[f'{metric_name} ({level} CI)', f'{level} Confidence Interval'],
    )

# Mean top-1 accuracy across the batches, with its t-based confidence interval.
accuracy_results = calculate_metrics_with_ci(accuracy_df, 'accuracy')
display(accuracy_results)

Unnamed: 0,Top-1 Accuracy (95% CI),95% Confidence Interval
Graph-based Method,83.33% ± 3.79%,"[79.54%, 87.13%]"
OpenAI (v3-large),69.67% ± 0.72%,"[68.95%, 70.38%]"
Multilingual E5 (large),67.67% ± 1.9%,"[65.77%, 69.56%]"


In [None]:
def clean_list_items(items):
    """Coerce a cell value into a list of stripped strings.

    Parameters
    ----------
    items : str, list, tuple, set, None or float
        * a string holding a Python list/tuple/set literal is parsed into its
          elements;
        * any other string is treated as a single label;
        * an existing iterable is used as is;
        * None / NaN (missing cell) yields an empty list.

    Returns
    -------
    list of str
        Every element converted with ``str()`` and stripped of surrounding
        whitespace.
    """
    # A missing cell has no labels. NaN is the only float that differs from itself.
    if items is None or (isinstance(items, float) and items != items):
        return []

    if isinstance(items, str):
        try:
            # literal_eval accepts only Python literals. Unlike eval(), it never
            # resolves an identifier-like label (e.g. 'batch', 'len') against
            # notebook globals or builtins, and cannot execute spreadsheet text.
            parsed = ast.literal_eval(items.strip())
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        # Anything that is not a collection literal is one plain label.
        items = list(parsed) if isinstance(parsed, (list, tuple, set)) else [items]

    return [str(item).strip() for item in items]

# Turn the target and every prediction column of each 'multi' sheet into lists
# of stripped strings (written back onto the sheet).
for batch in (batch_one, batch_two, batch_three):
    multi_df = batch['multi']
    for column in ('target', 'graph', 'text-embedding-3-large', 'multilingual-e5-large'):
        multi_df[column] = multi_df[column].apply(clean_list_items)

In [None]:
def calculate_precision_at_k(true_labels, predicted_labels, k=3):
    """Precision@k in percent: share of the top-k predictions that are relevant.

    Parameters
    ----------
    true_labels : str or sequence of str
        All relevant labels for the query; a bare string counts as one label.
    predicted_labels : str or sequence of str
        Predictions in rank order; a bare string counts as one prediction.
        Only the first ``k`` are considered.
    k : int, default 3
        Cut-off rank. Also the denominator, so fewer than ``k`` predictions
        lower the score.

    Returns
    -------
    float
        ``100 * (distinct top-k predictions that are relevant) / k``.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    if isinstance(true_labels, str):
        true_labels = [true_labels]
    if isinstance(predicted_labels, str):
        predicted_labels = [predicted_labels]

    # Every true label is relevant. Truncating the ground truth to k would make
    # a correct prediction count as a miss whenever its label sits past
    # position k in the target list.
    relevant = set(true_labels)
    top_k_predictions = set(predicted_labels[:k])

    matches = len(relevant & top_k_predictions)
    return (matches / k) * 100

def process_multi_sheets(batches=None):
    """Score the 'multi' sheet of every batch with precision@3.

    For each batch, a per-row precision column is written onto its 'multi'
    DataFrame for every method ('graph_precision',
    'text_embedding_precision', 'multilingual_precision'), and the column
    means are collected.

    Parameters
    ----------
    batches : dict, optional
        Maps a batch name to a dict holding that batch's 'multi' DataFrame.
        When omitted, the notebook-level ``batch_one``, ``batch_two`` and
        ``batch_three`` are used under the names 'batch_one', 'batch_two'
        and 'batch_three'.

    Returns
    -------
    pandas.DataFrame
        Rows 'Graph P@3', 'Text-Embedding P@3', 'Multilingual-E5 P@3'; one
        column per batch; mean precision in percent, rounded to 2 decimals.
    """
    if batches is None:
        batches = {
            'batch_one': batch_one,
            'batch_two': batch_two,
            'batch_three': batch_three,
        }

    # prediction column -> (per-row precision column, row label in the summary)
    methods = {
        'graph': ('graph_precision', 'Graph P@3'),
        'text-embedding-3-large': ('text_embedding_precision', 'Text-Embedding P@3'),
        'multilingual-e5-large': ('multilingual_precision', 'Multilingual-E5 P@3'),
    }

    results = {}
    for batch_name, batch in batches.items():
        multi_df = batch['multi']

        summary = {}
        for prediction_column, (precision_column, row_label) in methods.items():
            multi_df[precision_column] = [
                calculate_precision_at_k(target, predicted)
                for target, predicted in zip(multi_df['target'], multi_df[prediction_column])
            ]
            summary[row_label] = multi_df[precision_column].mean()

        results[batch_name] = summary

    return pd.DataFrame(results).round(2)

precision_results = process_multi_sheets()
display(precision_results)

Unnamed: 0,batch_one,batch_two,batch_three
Graph P@3,75.33,74.0,74.67
Text-Embedding P@3,81.67,82.67,82.0
Multilingual-E5 P@3,80.33,78.0,81.33


In [15]:
# Spot-check the per-row precision columns on the 'multi' sheet. The fixed seed
# keeps the displayed rows stable across re-runs.
display(batch_one['multi'].sample(3, random_state=0))

Unnamed: 0,query,target,graph,text-embedding-3-large,multilingual-e5-large,graph_precision,text_embedding_precision,multilingual_precision
7,What impact do hyperparameters have on the mod...,"[hyperparameter, model, training loss]","[learning rate, training loss, gradient descent]","[parameter, training loss, loss function]","[parameter, training loss, loss function]",33.333333,33.333333,33.333333
3,How do labels in a dataset influence the effec...,"[label, dataset, classification model]","[label, class, accuracy]","[label, classification model, class]","[label, class, multi-class classification]",33.333333,66.666667,33.333333
24,How does early stopping based on validation lo...,"[early stopping, validation loss, model]","[early stopping, validation loss, generalization]","[early stopping, validation loss, generalization]","[early stopping, validation loss, generalization]",66.666667,66.666667,66.666667


In [None]:
# Feed the CI helper a freshly computed per-batch table rather than the current
# value of `precision_results`. After this cell runs, that name holds the CI
# table (indexed by display names), so reading it back as input would raise a
# KeyError on a second run. Recomputing keeps the cell re-runnable while the
# name still ends up bound to the CI table.
precision_results = calculate_metrics_with_ci(process_multi_sheets(), metric_type='precision')
display(precision_results)

Unnamed: 0,Precision@3 (95% CI),95% Confidence Interval
Graph-based Method,74.67% ± 1.65%,"[73.01%, 76.32%]"
OpenAI (v3-large),82.11% ± 1.27%,"[80.85%, 83.38%]"
Multilingual E5 (large),79.89% ± 4.24%,"[75.64%, 84.13%]"


In [17]:
# Show the top-1 accuracy summary again (the table built earlier with
# calculate_metrics_with_ci on accuracy_df).
accuracy_results

Unnamed: 0,Top-1 Accuracy (95% CI),95% Confidence Interval
Graph-based Method,83.33% ± 3.79%,"[79.54%, 87.13%]"
OpenAI (v3-large),69.67% ± 0.72%,"[68.95%, 70.38%]"
Multilingual E5 (large),67.67% ± 1.9%,"[65.77%, 69.56%]"
