In [1]:
import os

# Process-wide environment configuration.
# NOTE(review): both variables are set before torch / sentence_transformers are
# imported in the following cell - presumably on purpose, so keep this cell first.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

# Suppresses every Python warning for the rest of the session, which also
# hides deprecation and chained-assignment warnings from the libraries used below.
warnings.filterwarnings("ignore")

# NOTE(review): SEED is not referenced by any of the cells visible here; the
# loaded classifiers display their own random_state=42.
SEED = 42

In [2]:
import torch
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer

In [3]:
# Environment sanity check: report the installed PyTorch build, whether CUDA
# is usable, and how many CUDA devices this process can see.
print(f"pytorch version: {torch.__version__}")
print(f"cuda available: {torch.cuda.is_available()}")
print(f"devices count: {torch.cuda.device_count()}")

pytorch version: 2.1.2+cu118
cuda available: True
devices count: 4


In [4]:
def embedJsonLInputFile(model, path):
    """Embed the ``text`` field of every record in a JSON Lines file.

    Parameters
    ----------
    model
        Any object exposing ``encode(sentence)``; it is called once per
        record. (Presumably a ``SentenceTransformer`` - confirm with callers.)
    path
        Location of a JSON Lines file whose records carry an ``id`` and a
        ``text`` field.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id`` (same order as the file) with a single object
        column ``Embedding`` holding whatever ``model.encode`` returned for
        that record's text.
    """
    records = pd.read_json(path, lines=True).set_index('id')

    # Encode one sentence per call, exactly as before, but gather the results
    # in a list. The previous ``e.loc[idx]['Embedding'] = ...`` was a chained
    # assignment: pandas does not guarantee that it writes into the frame, and
    # with Copy-on-Write enabled it silently writes to a temporary instead.
    vectors = [model.encode(sentence) for sentence in tqdm(records['text'])]

    # dtype=object keeps each embedding as one opaque cell value instead of
    # letting pandas try to interpret the list of vectors as 2-D data.
    return pd.Series(vectors, index=records.index, dtype=object).to_frame('Embedding')

In [5]:
# Annotator IDs handled by this notebook; one classifier file is loaded per ID.
annotators = ["A001", "A002", "A003", "A004", "A005", "A007", "A008", "A009", "A010", "A012"]

# Pre-computed embedding tables for the training and the test items. Later
# cells index them by item id and read their 'Embedding' column.
embeddings_train = pd.read_pickle("created_data/embeddings/t-gbert-lpc.pkl")
embeddings_test = pd.read_pickle("created_data/embeddings/t-gbert-lpc_test.pkl")

# One stored estimator per annotator, keyed by annotator ID.
# SECURITY: pickle.load (and pd.read_pickle above) can execute arbitrary code
# embedded in the file - only load artefacts that this project produced itself.
classifiers = {}
for annotator in annotators:
    with open(f"models/t-gbert-lpc_svc_{annotator}.pkl", 'rb') as f:
        classifiers[annotator] = pickle.load(f)

# Display the loaded estimators and their hyper-parameters.
classifiers

{'A001': SVC(C=13, class_weight='balanced', random_state=42),
 'A002': SVC(C=35, class_weight='balanced', random_state=42),
 'A003': SVC(C=2, class_weight='balanced', random_state=42),
 'A004': SVC(C=13, class_weight='balanced', random_state=42),
 'A005': SVC(C=5, class_weight='balanced', random_state=42),
 'A007': SVC(C=13, class_weight='balanced', random_state=42),
 'A008': SVC(C=13, class_weight='balanced', random_state=42),
 'A009': SVC(C=56, class_weight='balanced', random_state=42),
 'A010': SVC(C=13, class_weight='balanced', random_state=42),
 'A012': SVC(C=35, class_weight='balanced', random_state=42)}

In [6]:
# Refit each annotator's classifier on that annotator's training and
# validation labels combined.
for annotator in annotators:
    # Label files are JSON Lines keyed by item id; the column named after the
    # annotator is used as the target below.
    y_train_split = pd.read_json(f"created_data/training_data/y_train_{annotator}.jsonl", lines=True).set_index('id')
    y_val_split = pd.read_json(f"created_data/training_data/y_val_{annotator}.jsonl", lines=True).set_index('id')
    y_train_all = pd.concat([y_train_split, y_val_split])
    # Pick the embedding rows of exactly the labelled items, in label order.
    X_train_all = embeddings_train.loc[y_train_all.index]
    
    # Report how many items this annotator's classifier is fitted on.
    print(f"{annotator}: {len(X_train_all)}")
        
    # fit() updates the estimator object stored in `classifiers` in place.
    classifier = classifiers[annotator]
    classifier.fit(X_train_all['Embedding'].to_list(), y_train_all[annotator].to_list())

A001: 970
A002: 5998
A003: 1242
A004: 1394
A005: 1552
A007: 1246
A008: 1849
A009: 2923
A010: 5998
A012: 5998


In [7]:
def output_st1(predictions):
    """Aggregate per-annotator predictions into the five per-item result columns.

    Parameters
    ----------
    predictions : pandas.DataFrame
        One row per item and one column per annotator. Each cell holds a
        numeric label 0-4, or is missing (NaN) when there is no prediction
        for that annotator on that item. Missing cells never count as a vote.

    Returns
    -------
    pandas.DataFrame
        Same index as ``predictions`` with the columns, in this order:

        * ``bin_maj`` - most frequent binarised label (0 stays 0, 1-4 become 1)
        * ``bin_one`` - 1 if at least one binarised label is 1, else 0
        * ``bin_all`` - 1 if no binarised label is 0, else 0
        * ``multi_maj`` - most frequent 0-4 label, written as its label string
        * ``disagree_bin`` - 1 if ``bin_one`` is 1 while ``bin_all`` is 0

        Majority columns take the first column returned by
        ``DataFrame.mode(axis='columns')``.

    Raises
    ------
    KeyError
        If a cell holds a value outside 0-4.
        NOTE(review): a row without any prediction appears to end in a
        KeyError as well (its majority is NaN) - confirm whether such rows
        can occur.
    """
    QUANT_TO_QUAL = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
    NUMBER_TO_LABEL = {0: "0-Kein", 1: "1-Gering", 2: "2-Vorhanden", 3: "3-Stark", 4: "4-Extrem"}

    def _binarise(value):
        # Missing predictions stay missing so they are ignored downstream.
        return value if pd.isna(value) else QUANT_TO_QUAL[value]

    # Column-wise Series.map does what DataFrame.applymap did; applymap is
    # deprecated since pandas 2.1, while Series.map works on every version.
    predictions_qual = predictions.apply(lambda column: column.map(_binarise))

    majority_qual = predictions_qual.mode(axis='columns')[0]
    # Resolve the label strings before any integer cast so that an invalid or
    # missing majority still surfaces as the KeyError raised by the lookup.
    majority_label = predictions.mode(axis='columns')[0].apply(lambda value: NUMBER_TO_LABEL[value])

    # NaN compares unequal to both 0 and 1, so missing cells drop out here.
    any_positive = (predictions_qual == 1).any(axis='columns')
    any_negative = (predictions_qual == 0).any(axis='columns')

    output = pd.DataFrame(index=predictions.index)
    output["bin_maj"] = (majority_qual == 1).astype(int)
    output["bin_one"] = any_positive.astype(int)
    output["bin_all"] = (~any_negative).astype(int)
    output["multi_maj"] = majority_label
    # bin_one and not bin_all  <=>  at least one 1 and at least one 0.
    output["disagree_bin"] = (any_positive & any_negative).astype(int)

    return output

In [8]:
X_test = embeddings_test
# Starts out all-missing: one row per test item, one column per annotator.
# Only the (item, annotator) pairs listed in X_test.jsonl get filled in.
predictions = pd.DataFrame(index=X_test.index, columns=annotators)

# For every test item id, the annotators a prediction is required for.
row_annotators = pd.read_json("created_data/training_data/X_test.jsonl", lines=True).set_index('id')["annotators"]
for idx, row in X_test.iterrows():
    annos = row_annotators.loc[idx]
    for anno in annos:
        # Write through a single .loc[row, column] call. The former
        # predictions.loc[idx][anno] = ... was a chained assignment, which
        # pandas does not guarantee to store in `predictions`; with
        # Copy-on-Write it is a silent no-op, and warnings are suppressed in
        # this notebook, so the frame would stay all-NaN without any hint.
        predictions.loc[idx, anno] = classifiers[anno].predict([row['Embedding']])[0]

output = output_st1(predictions)

In [9]:
# Persist the aggregated predictions as a tab-separated file (the item-id
# index is written as the first column), then display the table.
output.to_csv(f'created_data/results/t-gbert-lpc_svc_X-test.tsv', sep="\t")

output

Unnamed: 0_level_0,bin_maj,bin_one,bin_all,multi_maj,disagree_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
f3b81af2f6852bf1b9896629525d2f41,0,1,0,0-Kein,1
cf8b8bac7165144bb62b399a98843366,0,0,0,0-Kein,0
0c45cdf4cca5eec566d6dd53653b532b,1,1,0,2-Vorhanden,1
3a60877d2c04ba65f457f7cc3e003169,0,1,0,0-Kein,1
f389b63364d8da93860e3c7e6569bf5b,0,1,0,0-Kein,1
...,...,...,...,...,...
2f7322c62b63ff74ec945bb38ed9f258,0,0,0,0-Kein,0
ec5fe35f542aac2f3155177dbf2731c2,0,0,0,0-Kein,0
6674986a02bab67b011df90cc7396a96,0,0,0,0-Kein,0
2a3774eba33afe18af2f0d312d081bb3,0,0,0,0-Kein,0
