In [1]:
import os
import warnings

# Process-wide environment switches. They are set in this first cell, before
# torch / sentence_transformers are imported in the next one.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0,1,2,3",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Seed constant intended for the stochastic steps of the notebook.
SEED = 42

In [2]:
import torch
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer
from sklearn.base import clone

# Environment report: PyTorch build, CUDA availability and visible device count.
# One print call with a newline separator emits the same three lines.
print(
    f"pytorch version: {torch.__version__}",
    f"cuda available: {torch.cuda.is_available()}",
    f"devices count: {torch.cuda.device_count()}",
    sep="\n",
)

pytorch version: 2.1.2+cu118
cuda available: False
devices count: 0


In [3]:
# Experiment grid: every (embedder, classifier, annotator) combination is evaluated.

# Annotator IDs; used to build per-annotator file names.
# Note that A006 and A011 are not part of this list.
annotators = [
    "A001",
    "A002",
    "A003",
    "A004",
    "A005",
    "A007",
    "A008",
    "A009",
    "A010",
    "A012",
]

# Embedding model names; each one maps to a pickled embedding table on disk.
embedders = [
    "me5-large",
    "t-gbert-lpc",
]

# Short classifier keys used in the persisted model file names.
classifiers = [
    "svc",
    "rfc",
    "mlp",
]

In [15]:
# Score every persisted (embedder, classifier, annotator) model on that
# annotator's validation split, accumulating macro-F1 per (embedder, classifier).
#
# NOTE: each entry of `score` is the SUM of the per-annotator macro-F1 values,
# not their mean -- divide by len(annotators) to obtain an average in [0, 1].

# Build the accumulator from the experiment grid itself so it can never drift
# out of sync with `embedders` / `classifiers`.
score = {
    embedder: {classifier: 0 for classifier in classifiers}
    for embedder in embedders
}

for embedder in embedders:
    # SECURITY: read_pickle / pickle.load execute arbitrary code embedded in the
    # file; only load artefacts produced by this project.
    embeddings = pd.read_pickle(f"data/embeddings/{embedder}.pkl")
    for annotator in annotators:
        # Validation labels for this annotator, indexed by sample id.
        y_val = pd.read_json(f"data/training/y_val_{annotator}.jsonl", lines=True).set_index('id')
        # Select the embedding rows whose index matches the validation ids.
        X_val = embeddings.loc[y_val.index]

        # Features and labels do not depend on the classifier, so convert them
        # once per annotator instead of once per (annotator, classifier).
        X_val_list = X_val['Embedding'].to_list()
        y_true = y_val[annotator].to_list()

        for classifier in classifiers:
            with open(f"data/models/{embedder}_{classifier}_{annotator}.pkl", 'rb') as f:
                model = pickle.load(f)
            predictions = model.predict(X_val_list)
            score[embedder][classifier] += f1_score(y_true, predictions, average='macro')
score

{'me5-large': {'svc': 2.8585116706747855,
  'rfc': 1.8957931230231722,
  'mlp': 2.9432966921892927},
 't-gbert-lpc': {'svc': 2.7827020256966764,
  'rfc': 1.9671971517976103,
  'mlp': 2.8283099490656523}}