In [None]:
import os

# Both variables are exported before the ML libraries are imported in the
# following cell, so those libraries see them when they initialise.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0,1,2,3",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Seed constant for the notebook (not referenced by the cells visible here).
SEED = 42

In [None]:
import torch
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer

In [None]:
# Sanity-check the runtime: library version, CUDA availability, GPU count.
for label, value in (
    ("pytorch version", torch.__version__),
    ("cuda available", torch.cuda.is_available()),
    ("devices count", torch.cuda.device_count()),
):
    print(f"{label}: {value}")

In [None]:
def embedJsonLInputFile(model, path):
    """Embed the ``text`` field of every record in a JSON Lines file.

    Args:
        model: Object exposing ``encode(sentence)``; it is called once per
            record with that record's ``text`` value (presumably a
            ``SentenceTransformer`` -- confirm against the caller).
        path: JSON Lines file whose records carry ``id`` and ``text`` fields.

    Returns:
        pd.DataFrame indexed by ``id`` with a single object column
        ``'Embedding'`` holding whatever ``model.encode`` returned per record.
    """
    records = pd.read_json(path, lines=True).set_index('id')

    # Collect the embeddings first and build the frame in one go. The previous
    # ``e.loc[idx]['Embedding'] = ...`` was chained assignment: it writes to a
    # temporary object and is a silent no-op under pandas Copy-on-Write.
    vectors = [model.encode(sentence) for sentence in tqdm(records['text'])]

    # dtype=object keeps every embedding as a single cell value instead of
    # letting pandas try to unpack the sequences.
    return pd.DataFrame(
        {'Embedding': pd.Series(vectors, index=records.index, dtype=object)}
    )

In [None]:
# Annotator ids; one pickled classifier file is loaded per id below.
annotators = [
    "A001", "A002", "A003", "A004", "A005",
    "A007", "A008", "A009", "A010", "A012",
]

# Cached embeddings for the training and the test data.
embeddings_train = pd.read_pickle("created_data/embeddings/t-gbert-lpc.pkl")
embeddings_test = pd.read_pickle("created_data/embeddings/t-gbert-lpc_test.pkl")

# NOTE: unpickling executes arbitrary code -- only load model files you trust.
classifiers = {}
for annotator in annotators:
    model_path = f"models/t-gbert-lpc_svc_{annotator}.pkl"
    with open(model_path, 'rb') as f:
        classifiers[annotator] = pickle.load(f)

classifiers

In [None]:
# Refit each annotator's classifier on its training and validation labels combined.
for annotator in annotators:
    y_train_split = pd.read_json(
        f"created_data/training_data/y_train_{annotator}.jsonl", lines=True
    ).set_index('id')
    y_val_split = pd.read_json(
        f"created_data/training_data/y_val_{annotator}.jsonl", lines=True
    ).set_index('id')

    # Stack both label splits, then pull the matching embedding rows by id.
    y_train_all = pd.concat([y_train_split, y_val_split])
    X_train_all = embeddings_train.loc[y_train_all.index]

    print(f"{annotator}: {len(X_train_all)}")

    classifier = classifiers[annotator]
    features = X_train_all['Embedding'].to_list()
    targets = y_train_all[annotator].to_list()
    classifier.fit(features, targets)

In [None]:
def output_st1(predictions):
    """Aggregate per-annotator predictions into one summary row per item.

    Args:
        predictions: pd.DataFrame indexed by item id with one column per
            annotator. Cells hold a numeric label in 0..4, or NaN where that
            annotator gave no prediction for the item.

    Returns:
        pd.DataFrame sharing the index of ``predictions`` with the columns
        ``bin_maj``      - 1 if the most frequent binarised label is 1, else 0,
        ``bin_one``      - 1 if at least one binarised label is 1,
        ``bin_all``      - 1 if no binarised label is 0,
        ``multi_maj``    - text label of the most frequent numeric label,
        ``disagree_bin`` - 1 if ``bin_one`` holds but ``bin_all`` does not.
        Ties in either majority vote resolve to the smallest label, because
        the first column of ``DataFrame.mode`` is used.

    Raises:
        KeyError: if a non-missing cell is not one of 0..4, or if an item has
            no mode to look up (e.g. a row without any prediction).
    """
    QUANT_TO_QUAL = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
    NUMBER_TO_LABEL = {0: "0-Kein", 1: "1-Gering", 2: "2-Vorhanden", 3: "3-Stark", 4: "4-Extrem"}

    def to_binary(value):
        # Missing predictions stay missing; unknown labels still raise KeyError.
        return value if pd.isna(value) else QUANT_TO_QUAL[value]

    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; this form works on old and new pandas alike.
    predictions_qual: pd.DataFrame = predictions.apply(lambda column: column.map(to_binary))

    output = pd.DataFrame(index=predictions.index)

    # NaN never compares equal to 0 or 1, so missing predictions are ignored
    # by the two boolean reductions below.
    output["bin_maj"] = predictions_qual.mode(axis='columns')[0]
    output["bin_one"] = predictions_qual.eq(1).any(axis='columns')
    output["bin_all"] = ~predictions_qual.eq(0).any(axis='columns')
    output["multi_maj"] = predictions.mode(axis='columns')[0].apply(lambda label: NUMBER_TO_LABEL[label])
    output["disagree_bin"] = output["bin_one"] & ~output["bin_all"]

    # Convert truthy/falsy values and booleans to plain 0/1 integers.
    output["bin_maj"] = output["bin_maj"].apply(lambda value: 1 if value else 0)
    for flag in ("bin_one", "bin_all", "disagree_bin"):
        output[flag] = output[flag].astype(int)

    return output

In [None]:
# Predict each test item with the classifiers of the annotators listed for it.
X_test = embeddings_test
predictions = pd.DataFrame(index=X_test.index, columns=annotators)

# Per-item list of annotator ids, read from the "annotators" field of X_test.jsonl.
row_annotators = pd.read_json("created_data/training_data/X_test.jsonl", lines=True).set_index('id')["annotators"]
for idx, row in X_test.iterrows():
    annos = row_annotators.loc[idx]
    sample = [row['Embedding']]  # predict() is called with a one-element batch
    for anno in annos:
        # Single-step .loc write: the previous `predictions.loc[idx][anno] = ...`
        # was chained assignment, which writes to a temporary object and is a
        # silent no-op under pandas Copy-on-Write (leaving the frame all-NaN).
        predictions.loc[idx, anno] = classifiers[anno].predict(sample)[0]

# Cells of annotators not listed for an item remain NaN.
output = output_st1(predictions)

In [None]:
# Write the aggregated predictions as a tab-separated file, then display them.
results_path = 'created_data/results/t-gbert-lpc_svc_X-test.tsv'
output.to_csv(results_path, sep="\t")

output