In [60]:
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import re

## Binary classification

In [None]:
# Load the table of per-task scores stored in deepseek_evaluation.csv.
ev_df = pd.read_csv(filepath_or_buffer='deepseek_evaluation.csv')

In [None]:
def binary_word_label(file, label, not_label, deepseek_column):
  """Score free-text model answers for a two-class task and record the result.

  Each answer in ``deepseek_column`` is mapped to ``not_label`` or ``label`` by
  substring search and stored in a new ``deepseek_final`` column. Rows in which
  neither string occurs keep the placeholder ``'None'`` and are left out of the
  scoring. Accuracy, macro-F1 and micro-F1 against the ``annotation`` column are
  printed together with the number of unlabelled rows. The labelled frame is
  saved as ``<stem>_final.xlsx`` (falling back to CSV), and the task's row in
  ``deepseek_evaluation.csv`` is created or updated with accuracy and macro-F1
  expressed as percentages.

  Args:
    file: path to a ``.csv`` or ``.xlsx`` file containing ``annotation`` and
      ``deepseek_column``.
    label: text that marks the positive class in an answer.
    not_label: text that marks the negative class in an answer.
    deepseek_column: name of the column holding the raw model answers.

  Raises:
    ValueError: if ``file`` does not end in ``.csv`` or ``.xlsx``.
  """
  # Decide by the real extension; a substring test would misread names such as
  # 'my_csv_dump.xlsx' and would leave `df` unbound for anything else.
  lowered = file.lower()
  if lowered.endswith('.csv'):
    df = pd.read_csv(file)
  elif lowered.endswith('.xlsx'):
    df = pd.read_excel(file)
  else:
    raise ValueError(f'Unsupported file type (expected .csv or .xlsx): {file}')

  answers = df[deepseek_column].astype(str)
  df[deepseek_column] = answers

  mentions_label = answers.str.contains(label, regex=False)
  mentions_not_label = answers.str.contains(not_label, regex=False)

  # The negative label is applied last so that it wins when both match
  # (e.g. 'aggressive' is a substring of 'non-aggressive').
  df['deepseek_final'] = 'None'
  df.loc[mentions_label, 'deepseek_final'] = label
  df.loc[mentions_not_label, 'deepseek_final'] = not_label

  df_filtered = df[df['deepseek_final'] != 'None']

  gold = df_filtered['annotation'].tolist()
  system = df_filtered['deepseek_final'].tolist()

  f1_macro = f1_score(gold, system, average='macro')
  f1_micro = f1_score(gold, system, average='micro')
  accuracy = accuracy_score(gold, system)

  print(f'Accuracy: {accuracy}')
  print(f'F1-macro: {f1_macro}')
  print(f'F1-micro: {f1_micro}')
  print(f'None: {len(df) - len(df_filtered)}')

  # Strip only the final extension so dots elsewhere in the path survive.
  stem = file.rsplit('.', 1)[0]
  try:
    df.to_excel(f'{stem}_final.xlsx', index=False)
  except Exception as exc:
    # Best-effort fallback (e.g. no Excel writer installed); report it instead
    # of hiding the reason, and let KeyboardInterrupt/SystemExit propagate.
    print(f'Could not write {stem}_final.xlsx ({exc}); saving CSV instead')
    df.to_csv(f'{stem}_final.csv', index=False)

  ev_df = pd.read_csv('deepseek_evaluation.csv')
  task = stem
  if task not in ev_df['task'].values:
    # Float placeholders keep the score columns floating-point.
    ev_df = pd.concat([
        ev_df,
        pd.DataFrame([{'task': task, 'accuracy': 0.0, 'f1_macro': 0.0}])
    ], ignore_index=True)

  ev_df.loc[ev_df['task'] == task, ['accuracy', 'f1_macro']] = [
    round(accuracy * 100, 2),
    round(f1_macro * 100, 2)
  ]
  ev_df.to_csv('deepseek_evaluation.csv', index=False)

## 1. Aggression

In [None]:
# Score the 'aggressive' / 'non-aggressive' answers stored in aggression.xlsx.
binary_word_label(
    file='aggression.xlsx',
    label='aggressive',
    not_label='non-aggressive',
    deepseek_column='deepseek',
)

Accuracy: 0.5757271815446339
F1-macro: 0.5290095185126864
F1-micro: 0.5757271815446339
None: 3


## 2. AggressionPer

In [None]:
# Score the 'aggressive' / 'non-aggressive' answers stored in aggressionper.xlsx.
binary_word_label(
    file='aggressionper.xlsx',
    label='aggressive',
    not_label='non-aggressive',
    deepseek_column='deepseek',
)

Accuracy: 0.6863727454909819
F1-macro: 0.6460090544912657
F1-micro: 0.6863727454909819
None: 2


## 3. Sarcasm

In [None]:
# Score the 'funny' / 'not funny' answers stored in sarcasm.xlsx.
binary_word_label(
    file='sarcasm.xlsx',
    label='funny',
    not_label='not funny',
    deepseek_column='deepseek',
)

Accuracy: 0.4894894894894895
F1-macro: 0.48948488565038595
F1-micro: 0.4894894894894895
None: 1


## 4. ColBERT

In [None]:
# Score the 'funny' / 'not funny' answers stored in colbert.xlsx.
binary_word_label(
    file='colbert.xlsx',
    label='funny',
    not_label='not funny',
    deepseek_column='deepseek',
)

Accuracy: 0.845
F1-macro: 0.8446854881134297
F1-micro: 0.845
None: 0


## 5. Spam

In [None]:
# Score the 'spam' / 'not spam' answers stored in spam.xlsx.
binary_word_label(
    file='spam.xlsx',
    label='spam',
    not_label='not spam',
    deepseek_column='deepseek',
)

Accuracy: 0.7336322869955157
F1-macro: 0.6566352944348073
F1-micro: 0.7336322869955157
None: 0


## 6. TextEntail

In [37]:
# Load the TextEntail sheet (all splits).
df = pd.read_excel(io='textentail2.xlsx')

In [38]:
# Keep only the dev split. The explicit copy makes the result an independent
# frame, so columns added to it later are not written into a slice of the
# loaded sheet (which triggers pandas' SettingWithCopyWarning).
df = df[df['split'] == 'dev'].copy()

In [39]:
# Map each raw DeepSeek answer to a label by substring search; answers that
# mention neither label keep the placeholder 'None'.
# Casting to str means non-string cells (e.g. NaN) no longer raise TypeError
# on the substring test.
responses = df['deepseek'].astype(str)
says_not_entailment = responses.str.contains('not_entailment', regex=False)
says_entailment = (
    responses.str.contains('entailment', regex=False)
    # Presumably a misspelling observed in the model output; kept as a match.
    | responses.str.contains('entailemtent', regex=False)
)

# 'not_entailment' is applied last because it contains 'entailment'.
deepseek_final = (
    pd.Series('None', index=df.index)
    .mask(says_entailment, 'entailment')
    .mask(says_not_entailment, 'not_entailment')
)

# assign() returns a new frame, so nothing is written into a filtered slice.
df = df.assign(deepseek_final=deepseek_final)

In [47]:
# Integer codes for the two entailment classes.
LABEL_MAP = dict(entailment=1, not_entailment=0)

In [48]:
# Encode gold and DeepSeek labels as integers; the lookup is case-insensitive.
gold_labels = [LABEL_MAP[name.lower()] for name in df['label']]
deepseek_preds = [LABEL_MAP[name.lower()] for name in df['deepseek_final']]

In [55]:
# Score the DeepSeek predictions against the gold labels.
accuracy = accuracy_score(gold_labels, deepseek_preds)
f1_macro, f1_micro = (
    f1_score(gold_labels, deepseek_preds, average=mode)
    for mode in ('macro', 'micro')
)
print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.851985559566787
F1-macro: 0.8470004984574762
F1-micro: 0.851985559566787


In [44]:
# Record the DeepSeek TextEntail scores (as percentages) in the results table.
ev_df = pd.read_csv('deepseek_evaluation.csv')

# If the 'textentail' row is missing, the .loc assignment below would match
# nothing and the scores would be dropped silently, so create the row first.
if 'textentail' not in ev_df['task'].values:
  ev_df = pd.concat([
      ev_df,
      pd.DataFrame([{'task': 'textentail', 'accuracy': 0.0, 'f1_macro': 0.0}])
  ], ignore_index=True)

ev_df.loc[ev_df['task'] == 'textentail', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

### TextEntail: GPT-4




In [50]:
# Encode gold and GPT-4 labels as integers; the lookup is case-insensitive.
gold_labels = [LABEL_MAP[name.lower()] for name in df['label']]
gpt4_preds = [LABEL_MAP[name.lower()] for name in df['gpt4']]

In [56]:
# Score the GPT-4 predictions against the gold labels.
accuracy = accuracy_score(gold_labels, gpt4_preds)
f1_macro, f1_micro = (
    f1_score(gold_labels, gpt4_preds, average=mode)
    for mode in ('macro', 'micro')
)
print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.9061371841155235
F1-macro: 0.9049466793369232
F1-micro: 0.9061371841155235


In [57]:
# Record the GPT-4 TextEntail scores (as percentages) in the results table.
ev_df = pd.read_csv('deepseek_evaluation.csv')

# If the 'textentail' row is missing, the .loc assignment below would match
# nothing and the scores would be dropped silently, so create the row first.
if 'textentail' not in ev_df['task'].values:
  ev_df = pd.concat([
      ev_df,
      pd.DataFrame([{'task': 'textentail', 'accuracy_gpt4': 0.0, 'f1_macro_gpt4': 0.0}])
  ], ignore_index=True)

ev_df.loc[ev_df['task'] == 'textentail', ['accuracy_gpt4', 'f1_macro_gpt4']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

## 7. CoLA

In [66]:
# Load the CoLA sheet with gold annotations and DeepSeek answers.
cola = pd.read_excel(io='cola.xlsx')

In [67]:
# Keep only the rows whose DeepSeek answer is exactly 0 or 1.
has_binary_answer = cola['deepseek'].isin([0, 1])
cola_filtered = cola[has_binary_answer]

In [69]:
# Plain Python lists of gold labels and DeepSeek predictions for scoring.
gold_labels, deepseek_preds = (
    cola_filtered[column].tolist() for column in ('annotation', 'deepseek')
)

In [70]:
# Score the DeepSeek predictions against the gold labels.
accuracy = accuracy_score(gold_labels, deepseek_preds)
f1_macro, f1_micro = (
    f1_score(gold_labels, deepseek_preds, average=mode)
    for mode in ('macro', 'micro')
)
print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.8559077809798271
F1-macro: 0.8292994883903975
F1-micro: 0.8559077809798271


In [71]:
# Record the CoLA scores (as percentages) in the results table.
# Re-read the CSV first: the in-memory `ev_df` may be older than the file,
# and writing it back would discard rows updated since it was loaded.
ev_df = pd.read_csv('deepseek_evaluation.csv')

# Create the row if it is missing; otherwise the assignment matches nothing.
if 'cola' not in ev_df['task'].values:
  ev_df = pd.concat([
      ev_df,
      pd.DataFrame([{'task': 'cola', 'accuracy': 0.0, 'f1_macro': 0.0}])
  ], ignore_index=True)

ev_df.loc[ev_df['task'] == 'cola', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

## 8. WSD

In [98]:
# Load the WSD sheet with gold annotations and DeepSeek answers.
df = pd.read_excel(io='wsd.xlsx')

In [99]:
# Work on string answers so the substring test below is well-defined.
df['deepseek'] = df['deepseek'].astype(str)

# An answer is credited with the gold annotation when that annotation occurs
# in it; any other non-blank answer becomes 'wrong'; a blank one stays 'None'.
df['deepseek_final'] = [
    gold if gold in answer else ('None' if answer.strip() == '' else 'wrong')
    for gold, answer in zip(df['annotation'], df['deepseek'])
]

gold_labels, deepseek_preds = (
    df[column].tolist() for column in ('annotation', 'deepseek_final')
)

accuracy = accuracy_score(gold_labels, deepseek_preds)
f1_macro, f1_micro = (
    f1_score(gold_labels, deepseek_preds, average=mode)
    for mode in ('macro', 'micro')
)

print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')



Accuracy: 0.7836757203915621
F1-macro: 0.772345014123017
F1-micro: 0.7836757203915621


In [100]:
# Record the WSD scores (as percentages) in the results table.
# Re-read the CSV first: the in-memory `ev_df` may be older than the file,
# and writing it back would discard rows updated since it was loaded.
ev_df = pd.read_csv('deepseek_evaluation.csv')

# Create the row if it is missing; otherwise the assignment matches nothing.
if 'wsd' not in ev_df['task'].values:
  ev_df = pd.concat([
      ev_df,
      pd.DataFrame([{'task': 'wsd', 'accuracy': 0.0, 'f1_macro': 0.0}])
  ], ignore_index=True)

ev_df.loc[ev_df['task'] == 'wsd', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

## 9. ClarinEmo

In [101]:
import json
import ast

from torchmetrics import Accuracy, F1Score, Precision, Recall
import pandas as pd
import numpy as np
import torch

In [102]:
# Label inventory (Polish); the position in this list fixes the column order
# of the multi-hot vectors built from it.
emotion_labels = [
    'radość',        # joy
    'zaufanie',      # trust
    'przeczuwanie',  # anticipation
    'zdziwienie',    # surprise
    'strach',        # fear
    'smutek',        # sadness
    'wstręt',        # disgust
    'gniew',         # anger
    'pozytywny',     # positive
    'negatywny',     # negative
    'neutralny',     # neutral
]

num_labels = len(emotion_labels)

In [103]:
# One torchmetrics object per score; all share the same multilabel setup.
_multilabel = dict(task='multilabel', num_labels=num_labels)

macro_prec = Precision(average="macro", **_multilabel)
macro_recall = Recall(average="macro", **_multilabel)
micro_prec = Precision(average="micro", **_multilabel)
micro_recall = Recall(average="micro", **_multilabel)
macro_f1 = F1Score(average='macro', **_multilabel)
weighted_f1 = F1Score(average='weighted', **_multilabel)
micro_f1 = F1Score(average='micro', **_multilabel)
f1_scores = F1Score(average='none', **_multilabel)
# NOTE: this rebinds `accuracy`, which earlier cells use for a float score.
accuracy = Accuracy(**_multilabel)

In [104]:
# Load the ClarinEmo sheet with gold annotations and DeepSeek answers.
df = pd.read_excel(io='clarinemo.xlsx')

In [105]:
def clean_json_cell(cell):
    """Strip a surrounding Markdown code fence from a model answer.

    For a string, leading/trailing whitespace is removed, then an opening
    fence (three backticks, optionally tagged ``json``) at the start and a
    closing fence at the end are dropped. The remaining text is returned as a
    string; it is not parsed here. Non-string cells (e.g. NaN) are returned
    unchanged.
    """
    if not isinstance(cell, str):
        return cell
    # `(?:json)?` also covers answers fenced without a language tag.
    return re.sub(r'^```(?:json)?\s*|\s*```$', '', cell.strip())


# Remove code fences from every DeepSeek answer, element by element.
df['deepseek'] = df['deepseek'].map(clean_json_cell)

In [106]:
def handle_output(x):
    """Normalise one parsed prediction into a flat list of labels.

    Args:
        x: a list of labels, or a dict whose values are labels or iterables
            of labels.

    Returns:
        ``x`` itself when it is a list. For a dict, a list of the distinct
        labels found across all of its values (order not guaranteed).

    Raises:
        AssertionError: if ``x`` is neither a list nor a dict.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, dict):
        annotations = set()
        for element in x.values():
            if isinstance(element, str):
                # A bare string is one label; iterating over it would add its
                # individual characters instead.
                annotations.add(element)
            else:
                annotations.update(element)
        return list(annotations)
    # Raised explicitly: an `assert` statement is removed under `python -O`,
    # which would let unexpected types fall through and return None.
    raise AssertionError(f'unsupported prediction type: {type(x).__name__}')

In [107]:
# Build aligned multi-hot rows (one boolean per entry of `emotion_labels`)
# for the gold annotations and the DeepSeek predictions.
y_true = []
y_pred = []
indexes = []
for gold_cell, answer_cell in zip(df['annotation'], df['deepseek']):
    gold_items = ast.literal_eval(gold_cell)
    gold_rows = [[label in item for label in emotion_labels] for item in gold_items]

    parsed_answer = ast.literal_eval(answer_cell)
    # NOTE(review): keys are put in default sort order; string keys such as
    # '10' sort before '2' — confirm this matches the annotation order.
    ordered_keys = sorted(parsed_answer.keys())
    predicted_items = [handle_output(parsed_answer[key]) for key in ordered_keys]
    pred_rows = [[label in item for label in emotion_labels] for item in predicted_items]

    # Rows whose prediction count differs from the annotation count are skipped.
    if len(gold_rows) == len(pred_rows):
        y_true.extend(gold_rows)
        y_pred.extend(pred_rows)

In [108]:
# Convert the collected rows into tensors for the torchmetrics objects.
y_true, y_pred = torch.Tensor(y_true), torch.Tensor(y_pred)

In [109]:
# Per-label F1 scores keyed by a display name built from each label.
f1_names = [f'{label.capitalize()} f1:' for label in emotion_labels]
emotion_f1 = {
    name: score for name, score in zip(f1_names, f1_scores(y_pred, y_true))
}

In [110]:
# Overall multilabel scores, scaled to percentages.
acc, f1_macro_score, f1_micro_score = (
    float(metric(y_pred, y_true) * 100)
    for metric in (accuracy, macro_f1, micro_f1)
)
print(f'Accuracy: {acc}')
print(f'F1-macro: {f1_macro_score}')
print(f'F1-micro: {f1_micro_score}')

Accuracy: 83.26692962646484
F1-macro: 53.890533447265625
F1-micro: 63.449363708496094


In [111]:
# Record the ClarinEmo scores (already percentages) in the results table.
# Re-read the CSV first: the in-memory `ev_df` may be older than the file,
# and writing it back would discard rows updated since it was loaded.
ev_df = pd.read_csv('deepseek_evaluation.csv')

# Create the row if it is missing; otherwise the assignment matches nothing.
if 'clarinemo' not in ev_df['task'].values:
  ev_df = pd.concat([
      ev_df,
      pd.DataFrame([{'task': 'clarinemo', 'accuracy': 0.0, 'f1_macro': 0.0}])
  ], ignore_index=True)

ev_df.loc[ev_df['task'] == 'clarinemo', ['accuracy', 'f1_macro']] = [
  round(acc, 2),
  round(f1_macro_score, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)