In [1]:
import itertools
import pickle

import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import cohen_kappa_score, f1_score

In [2]:
# Load the full dataset, then carve out the rows tagged 'train' and 'test' in
# the `subset` column. Each part is copied so that columns added to one later
# do not write through to `df`.
df = pd.read_csv('sigcse_2024.csv')

train_df, test_df = (
    df.loc[df['subset'] == subset_name].copy()
    for subset_name in ('train', 'test')
)

In [3]:
# Names of the embedding sets and of the pooling modes. A name joined to a mode
# with '_' gives keys such as 'sbert_last_token', which is how the result
# dictionaries and the embedding pickle files are addressed further down.
embedding_names = [
    'bert_base',
    'bert_large',
    'sbert',
    'gpt2',
    'gpt2_medium',
    'gpt2_large',
    'gpt2_xl',
    'llama_7b',
    'llama_13b',
    'llama_30b',
    'llama_65b',
    'llama2_7b',
    'llama2_13b',
    'llama2_70b',
    'llama2_7b_chat',
    'llama2_13b_chat',
    'llama2_70b_chat',
    'vicuna_7b',
    'vicuna_13b',
    'vicuna_33b',
]
embedding_modes = [
    'last_token',
    'mean_pooling',
]

In [4]:
# Kernel names; each one appears in the file names of the result pickles that
# the merge loop reads (embedding_to_config_to_metrics_{kernel}_{i}.pkl).
kernels = [
    'linear',
    'poly',
    'rbf',
]

In [5]:
# Combine the result pickles (six numbered files per kernel) into a single
# {embedding: {config: metrics}} mapping.
embedding_to_config_to_metrics = {}
for kernel in kernels:
    for i in range(6):
        # `with` closes the handle right after loading instead of leaving it
        # to the garbage collector.
        with open(f'embeddings/embedding_to_config_to_metrics_{kernel}_{i}.pkl', 'rb') as f:
            split = pickle.load(f)
        # Merge one level down. A top-level `dict | dict` replaces an
        # embedding's entire config table whenever a later file contains the
        # same embedding key, which would silently drop the configs loaded from
        # earlier kernels. When the files never share an embedding key this
        # produces the same mapping as the top-level merge.
        for embedding, config_to_metrics in split.items():
            embedding_to_config_to_metrics.setdefault(embedding, {}).update(config_to_metrics)

In [6]:
# Persist the merged results. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
with open('embedding_to_config_to_metrics.pkl', 'wb') as f:
    pickle.dump(embedding_to_config_to_metrics, f)

In [7]:
# For every embedding, find the config that scores highest at each of the three
# metric positions. Disagreements between the positions are printed for
# inspection; the winner at position 0 is the one that gets recorded.
embedding_to_best_config = {}
embedding_to_metrics = {}
for embedding, config_to_metrics in embedding_to_config_to_metrics.items():
    best_configs = []
    for metric_idx in range(3):
        winner = max(config_to_metrics,
                     key=lambda config: config_to_metrics[config][metric_idx])
        best_configs.append(winner)

    if len(set(best_configs)) > 1:
        print(embedding)
        print(best_configs)

    chosen_config = best_configs[0]
    embedding_to_best_config[embedding] = chosen_config
    embedding_to_metrics[embedding] = config_to_metrics[chosen_config]

bert_base_mean_pooling
['rbf_8', 'rbf_8', 'rbf_16']
gpt2_mean_pooling
['rbf_4096', 'rbf_2048', 'rbf_2048']
llama_65b_last_token
['rbf_16', 'rbf_16', 'rbf_8']
vicuna_7b_last_token
['rbf_4', 'rbf_16', 'rbf_16']
vicuna_33b_last_token
['rbf_8', 'rbf_8', 'rbf_4']


In [8]:
# 'openai_api' is evaluated in its own cell further down, so it is removed from
# the pool used by the best-embedding searches. The membership guard keeps this
# cell re-runnable: a bare `del` raises KeyError on the second execution.
if 'openai_api' in embedding_to_metrics:
    del embedding_to_metrics['openai_api']

In [9]:
# Best 'last_token' embedding at each of the three metric positions.
# Candidates are filtered up front rather than scoring every other key as 0:
# with the 0 sentinel, a non-candidate could win whenever all last-token scores
# are <= 0. An empty candidate list now fails loudly inside max().
candidates = [name for name in embedding_to_metrics if 'last_token' in name]
best_embeddings = [max(candidates, key=lambda x: embedding_to_metrics[x][i]) for i in range(3)]
# The cells below use best_embeddings[0], so all three positions must agree.
assert len(set(best_embeddings)) == 1, f'metrics disagree on the best embedding: {best_embeddings}'

In [10]:
# Display the winner at each metric position for the 'last_token' search.
best_embeddings

['sbert_last_token', 'sbert_last_token', 'sbert_last_token']

In [11]:
# Fit one SVC per question (qid) on the best last-token embedding and store its
# predictions for that question's test rows in `embedding_last_token_svc`.
# NOTE(review): this cell is repeated almost verbatim for mean_pooling and
# openai_api; a shared function would remove the duplication.
with open(f'embeddings/{best_embeddings[0]}.pkl', 'rb') as f:
    embeddings = pickle.load(f)
# Convert once so each split is gathered with a single index operation.
# float64 matches the np.zeros buffers that the per-row copy used to fill.
embeddings = np.asarray(embeddings, dtype=float)

best_config = embedding_to_best_config[best_embeddings[0]]
# Config strings have the form '<kernel>_<C>' (e.g. 'rbf_8'); parse once
# instead of once per question.
svc_kernel = best_config.split('_')[0]
svc_C = int(best_config.split('_')[1])

# Test rows whose qid never appears in train_df keep this default of 0.
test_df['embedding_last_token_svc'] = 0
for qid, sub_train_df in train_df.groupby('qid'):
    sub_test_df = test_df[test_df.qid == qid]
    if sub_test_df.empty:
        # Nothing to predict for this question, and SVC.predict raises on an
        # array with zero samples.
        continue

    # DataFrame index labels are used directly as row positions in the
    # embedding matrix -- assumes the CSV row order matches the pickle.
    train_X = embeddings[sub_train_df.index.to_numpy()]
    train_y = sub_train_df.binary_ground_truth.to_numpy(dtype=int)

    svc = SVC(kernel=svc_kernel, C=svc_C)
    svc.fit(train_X, train_y)

    test_X = embeddings[sub_test_df.index.to_numpy()]
    predicted = svc.predict(test_X)

    test_df.loc[sub_test_df.index, 'embedding_last_token_svc'] = predicted

In [12]:
# Save the last-token predictions in test_df row order. The context manager
# ensures the file is flushed and closed.
with open('embedding_last_token_svc.pkl', 'wb') as f:
    pickle.dump(list(test_df.embedding_last_token_svc), f)

In [13]:
# Best 'mean_pooling' embedding at each of the three metric positions.
# Candidates are filtered up front rather than scoring every other key as 0:
# with the 0 sentinel, a non-candidate could win whenever all mean-pooling
# scores are <= 0. An empty candidate list now fails loudly inside max().
candidates = [name for name in embedding_to_metrics if 'mean_pooling' in name]
best_embeddings = [max(candidates, key=lambda x: embedding_to_metrics[x][i]) for i in range(3)]
# The cells below use best_embeddings[0], so all three positions must agree.
assert len(set(best_embeddings)) == 1, f'metrics disagree on the best embedding: {best_embeddings}'

In [14]:
# Display the winner at each metric position for the 'mean_pooling' search.
best_embeddings

['sbert_mean_pooling', 'sbert_mean_pooling', 'sbert_mean_pooling']

In [15]:
# Fit one SVC per question (qid) on the best mean-pooling embedding and store
# its predictions for that question's test rows in `embedding_mean_pooling_svc`.
# NOTE(review): this cell is repeated almost verbatim for last_token and
# openai_api; a shared function would remove the duplication.
with open(f'embeddings/{best_embeddings[0]}.pkl', 'rb') as f:
    embeddings = pickle.load(f)
# Convert once so each split is gathered with a single index operation.
# float64 matches the np.zeros buffers that the per-row copy used to fill.
embeddings = np.asarray(embeddings, dtype=float)

best_config = embedding_to_best_config[best_embeddings[0]]
# Config strings have the form '<kernel>_<C>' (e.g. 'rbf_8'); parse once
# instead of once per question.
svc_kernel = best_config.split('_')[0]
svc_C = int(best_config.split('_')[1])

# Test rows whose qid never appears in train_df keep this default of 0.
test_df['embedding_mean_pooling_svc'] = 0
for qid, sub_train_df in train_df.groupby('qid'):
    sub_test_df = test_df[test_df.qid == qid]
    if sub_test_df.empty:
        # Nothing to predict for this question, and SVC.predict raises on an
        # array with zero samples.
        continue

    # DataFrame index labels are used directly as row positions in the
    # embedding matrix -- assumes the CSV row order matches the pickle.
    train_X = embeddings[sub_train_df.index.to_numpy()]
    train_y = sub_train_df.binary_ground_truth.to_numpy(dtype=int)

    svc = SVC(kernel=svc_kernel, C=svc_C)
    svc.fit(train_X, train_y)

    test_X = embeddings[sub_test_df.index.to_numpy()]
    predicted = svc.predict(test_X)

    test_df.loc[sub_test_df.index, 'embedding_mean_pooling_svc'] = predicted

In [16]:
# Save the mean-pooling predictions in test_df row order. The context manager
# ensures the file is flushed and closed.
with open('embedding_mean_pooling_svc.pkl', 'wb') as f:
    pickle.dump(list(test_df.embedding_mean_pooling_svc), f)

In [17]:
# Fit one SVC per question (qid) on the openai_api embedding and store its
# predictions for that question's test rows in `embedding_openai_api_svc`.
# NOTE(review): this cell is repeated almost verbatim for last_token and
# mean_pooling; a shared function would remove the duplication.
with open('embeddings/openai_api.pkl', 'rb') as f:
    embeddings = pickle.load(f)
# Convert once so each split is gathered with a single index operation.
# float64 matches the np.zeros buffers that the per-row copy used to fill.
embeddings = np.asarray(embeddings, dtype=float)

best_config = embedding_to_best_config['openai_api']
# Config strings have the form '<kernel>_<C>' (e.g. 'rbf_8'); parse once
# instead of once per question.
svc_kernel = best_config.split('_')[0]
svc_C = int(best_config.split('_')[1])

# Test rows whose qid never appears in train_df keep this default of 0.
test_df['embedding_openai_api_svc'] = 0
for qid, sub_train_df in train_df.groupby('qid'):
    sub_test_df = test_df[test_df.qid == qid]
    if sub_test_df.empty:
        # Nothing to predict for this question, and SVC.predict raises on an
        # array with zero samples.
        continue

    # DataFrame index labels are used directly as row positions in the
    # embedding matrix -- assumes the CSV row order matches the pickle.
    train_X = embeddings[sub_train_df.index.to_numpy()]
    train_y = sub_train_df.binary_ground_truth.to_numpy(dtype=int)

    svc = SVC(kernel=svc_kernel, C=svc_C)
    svc.fit(train_X, train_y)

    test_X = embeddings[sub_test_df.index.to_numpy()]
    predicted = svc.predict(test_X)

    test_df.loc[sub_test_df.index, 'embedding_openai_api_svc'] = predicted

In [18]:
# Save the openai_api predictions in test_df row order. The context manager
# ensures the file is flushed and closed.
with open('embedding_openai_api_svc.pkl', 'wb') as f:
    pickle.dump(list(test_df.embedding_openai_api_svc), f)