In [1]:
import itertools
import pickle

import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import cohen_kappa_score, f1_score

In [2]:
# Load the full dataset; the `subset` column marks each row as 'train' or 'test'.
df = pd.read_csv('sigcse_2024.csv')

# Take independent copies so columns added later (e.g. predictions) never
# write through to `df` or trigger SettingWithCopyWarning.
is_train_row = df['subset'] == 'train'
is_test_row = df['subset'] == 'test'
train_df = df.loc[is_train_row].copy()
test_df = df.loc[is_test_row].copy()

In [3]:
# Embedding model identifiers, one model family per row. Each name is later
# joined with a mode as '<name>_<mode>' to locate 'embeddings/<name>_<mode>.pkl'.
embedding_names = [
    'bert_base', 'bert_large', 'sbert',
    'gpt2', 'gpt2_medium', 'gpt2_large', 'gpt2_xl',
    'llama_7b', 'llama_13b', 'llama_30b', 'llama_65b',
    'llama2_7b', 'llama2_13b', 'llama2_70b',
    'llama2_7b_chat', 'llama2_13b_chat', 'llama2_70b_chat',
    'vicuna_7b', 'vicuna_13b', 'vicuna_33b',
]

# Suffixes distinguishing the two embedding variants stored for every model.
embedding_modes = [
    'last_token',
    'mean_pooling',
]

In [4]:
# SVC kernels explored by the grid search.
kernels = [
    'linear',
    'poly',
    'rbf',
]

# Candidate values for the SVC `C` parameter: powers of two from 2**0 (1) to 2**13 (8192).
cs = [2 ** exponent for exponent in range(14)]

In [5]:
# Grid search: for every embedding and every (kernel, C) pair, run a per-question
# 5-fold cross-validation on the training rows and record
# (accuracy, Cohen's kappa, F1) computed over all out-of-fold predictions.
embedding_to_config_to_metrics = {}

# Every '<model>_<mode>' combination, plus the single 'openai_api' embedding.
embedding_keys = [
    f'{name}_{mode}'
    for name, mode in itertools.product(embedding_names, embedding_modes)
]
embedding_keys.append('openai_api')

for embedding in embedding_keys:
    print(f'{embedding}')

    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the handle deterministically (the bare
    # open(...) used before left that to the garbage collector).
    with open(f'embeddings/{embedding}.pkl', 'rb') as embedding_file:
        embeddings = pickle.load(embedding_file)

    # Assumes `embeddings` is a 2-D array-like addressed by train_df's integer
    # index labels (it was previously read as embeddings[row_index] with width
    # np.shape(embeddings)[1]) -- TODO confirm for every pickle.
    # float64 matches the np.zeros buffers the rows used to be copied into.
    embedding_matrix = np.asarray(embeddings, dtype=np.float64)

    # The shuffle is seeded, so the folds are identical for every (kernel, C)
    # pair: build them once per embedding instead of once per configuration.
    # Splitting the index labels yields the same sections as splitting the
    # shuffled DataFrame itself.
    qid_folds = []
    for qid, qid_df in train_df.groupby('qid'):
        shuffled_df = qid_df.sample(frac=1., random_state=64)
        fold_rows = np.array_split(shuffled_df.index.to_numpy(), 5)
        fold_targets = np.array_split(
            shuffled_df['binary_ground_truth'].to_numpy(dtype=int), 5)
        qid_folds.append((fold_rows, fold_targets))

    config_to_metrics = {}
    for kernel in kernels:
        for c in cs:
            print(f'{kernel}_{c}')
            for fold_rows, fold_targets in qid_folds:
                for i in range(5):
                    validate_rows = fold_rows[i]
                    if len(validate_rows) == 0:
                        # A question with fewer than 5 rows leaves some folds
                        # empty; there is nothing to predict and SVC.predict
                        # would reject an empty matrix.
                        continue

                    # Train on the other four folds, in their original order.
                    train_rows = np.concatenate(fold_rows[:i] + fold_rows[i + 1:])
                    train_y = np.concatenate(fold_targets[:i] + fold_targets[i + 1:])

                    svc = SVC(kernel=kernel, C=c)
                    svc.fit(embedding_matrix[train_rows], train_y)

                    predicted = svc.predict(embedding_matrix[validate_rows])

                    # Every row is a validation row exactly once, so after the
                    # fold loops `predicted` holds out-of-fold predictions for
                    # this configuration.
                    train_df.loc[validate_rows, 'predicted'] = predicted

            matches = train_df.binary_ground_truth == train_df.predicted
            accuracy = int(matches.sum()) / len(train_df)
            kappa = cohen_kappa_score(train_df.binary_ground_truth, train_df.predicted)
            f1 = f1_score(train_df.binary_ground_truth, train_df.predicted)

            config_to_metrics[f'{kernel}_{c}'] = accuracy, kappa, f1

    embedding_to_config_to_metrics[embedding] = config_to_metrics

bert_base_last_token
poly_1
poly_2
poly_4
poly_8


KeyboardInterrupt: 

In [None]:
# Persist the grid-search results; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('embedding_to_config_to_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(embedding_to_config_to_metrics, metrics_file)

In [None]:
# For each embedding, pick the (kernel, C) configuration with the best
# cross-validation metrics and remember both the configuration and its scores.
embedding_to_best_config = {}
embedding_to_metrics = {}
for embedding, config_to_metrics in embedding_to_config_to_metrics.items():
    # Winner under each position of the (accuracy, kappa, f1) tuple.
    best_configs = []
    for metric_position in range(3):
        best_configs.append(
            max(config_to_metrics,
                key=lambda config: config_to_metrics[config][metric_position])
        )

    # The selection is only unambiguous when all three metrics agree.
    assert len(set(best_configs)) == 1
    winning_config = best_configs[0]

    print(embedding)
    print(winning_config)

    embedding_to_best_config[embedding] = winning_config
    embedding_to_metrics[embedding] = config_to_metrics[winning_config]

In [None]:
# Find the embedding whose best configuration scores highest under each
# position of the (accuracy, kappa, f1) tuple.
best_embeddings = []
for metric_position in range(3):
    best_embeddings.append(
        max(embedding_to_metrics,
            key=lambda name: embedding_to_metrics[name][metric_position])
    )

# All three metrics must crown the same embedding for the choice to be unambiguous.
assert len(set(best_embeddings)) == 1

In [None]:
# Final evaluation: with the winning embedding and (kernel, C) configuration,
# fit one SVC per question on all of its training rows and store the
# predictions for that question's test rows in test_df['embedding_svc'].

# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the handle deterministically.
with open(f'embeddings/{best_embeddings[0]}.pkl', 'rb') as embedding_file:
    embeddings = pickle.load(embedding_file)

# Assumes `embeddings` is a 2-D array-like addressed by the DataFrame's integer
# index labels (it was previously read as embeddings[row_index]) -- TODO confirm.
# float64 matches the np.zeros buffers the rows used to be copied into.
embedding_matrix = np.asarray(embeddings, dtype=np.float64)

# Config keys have the form '<kernel>_<C>'; split on the last underscore only.
best_config = embedding_to_best_config[best_embeddings[0]]
best_kernel, best_c = best_config.rsplit('_', 1)

# Default prediction of 0; rows whose qid never appears in train_df keep it.
test_df['embedding_svc'] = 0
for qid, sub_train_df in train_df.groupby('qid'):
    sub_test_df = test_df[test_df.qid == qid]
    if sub_test_df.empty:
        # No test rows for this question: nothing to predict, and SVC.predict
        # would reject an empty matrix.
        continue

    svc = SVC(kernel=best_kernel, C=int(best_c))

    train_X = embedding_matrix[sub_train_df.index.to_numpy()]
    train_y = sub_train_df['binary_ground_truth'].to_numpy(dtype=int)
    svc.fit(train_X, train_y)

    test_X = embedding_matrix[sub_test_df.index.to_numpy()]
    predicted = svc.predict(test_X)

    test_df.loc[sub_test_df.index, 'embedding_svc'] = predicted

In [None]:
# Save the SVC predictions computed in this notebook. The previous code dumped
# `test_df.bigram_lr`, a column this notebook never creates, into
# 'embedding_svc.pkl'.
with open('embedding_svc.pkl', 'wb') as predictions_file:
    pickle.dump(list(test_df.embedding_svc), predictions_file)