In [1]:
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import re

##Binary classification

In [37]:
ev_df = pd.read_csv('deepseek_evaluation.csv')

In [None]:
def binary_word_label(file, label, not_label, deepseek_column):
  if 'csv' in file:
    df = pd.read_csv(file)
  elif 'xlsx' in file:
    df = pd.read_excel(file)


  df[deepseek_column] = df[deepseek_column].astype(str)
  df['deepseek_final'] = 'None'

  for index, row in df.iterrows():
    if not_label in row[deepseek_column]:
      df.loc[index, 'deepseek_final'] = not_label
    elif label in row[deepseek_column]:
      df.loc[index, 'deepseek_final'] = label

  df_filtered = df[df.deepseek_final != 'None']

  gold = df_filtered["annotation"].tolist()
  system = df_filtered["deepseek_final"].tolist()

  f1_macro = f1_score(gold, system, average='macro')
  f1_micro = f1_score(gold, system, average='micro')
  accuracy = accuracy_score(gold, system)

  print(f'Accuracy: {accuracy}')
  print(f'F1-macro: {f1_macro}')
  print(f'F1-micro: {f1_micro}')
  print(f'None: {len(df) - len(df_filtered)}')

  filename = file.split('.')[0]
  try:
    df.to_excel(f'{filename}_final.xlsx', index=False)
  except:
    df.to_csv(f'{filename}_final.csv', index=False)

  ev_df = pd.read_csv('deepseek_evaluation.csv')
  task = file.replace('.csv', '').replace('.xlsx', '')
  if task not in ev_df['task'].values:
    ev_df = pd.concat([
        ev_df,
        pd.DataFrame([{'task': task, 'accuracy': 0, 'f1_macro': 0}])
    ], ignore_index=True)

  ev_df.loc[ev_df['task'] == task, ['accuracy', 'f1_macro']] = [
    round(accuracy * 100, 2),
    round(f1_macro * 100, 2)
  ]
  ev_df.to_csv('deepseek_evaluation.csv', index=False)

##1.Aggression

In [None]:
binary_word_label('aggression.xlsx', 'aggressive', 'non-aggressive', 'deepseek')

Accuracy: 0.5757271815446339
F1-macro: 0.5290095185126864
F1-micro: 0.5757271815446339
None: 3


## 2.AggressionPer

In [None]:
binary_word_label('aggressionper.xlsx', 'aggressive', 'non-aggressive', 'deepseek')

Accuracy: 0.6863727454909819
F1-macro: 0.6460090544912657
F1-micro: 0.6863727454909819
None: 2


##3.Sarcasm

In [None]:
binary_word_label('sarcasm.xlsx', 'funny', 'not funny', 'deepseek')

Accuracy: 0.4894894894894895
F1-macro: 0.48948488565038595
F1-micro: 0.4894894894894895
None: 1


##4.ColBERT

In [None]:
binary_word_label('colbert.xlsx', 'funny', 'not funny', 'deepseek')

Accuracy: 0.845
F1-macro: 0.8446854881134297
F1-micro: 0.845
None: 0


##5.Spam

In [None]:
binary_word_label('spam.xlsx', 'spam', 'not spam', 'deepseek')

Accuracy: 0.7336322869955157
F1-macro: 0.6566352944348073
F1-micro: 0.7336322869955157
None: 0


##6.TextEntail

In [37]:
df = pd.read_excel('textentail2.xlsx')

In [38]:
df = df[df['split'] == 'dev']

In [39]:
df['deepseek_final'] = "None"
for index, row in df.iterrows():
    if 'not_entailment' in row['deepseek']:
      df.loc[index, 'deepseek_final'] = 'not_entailment'
    elif 'entailment' in row['deepseek'] or 'entailemtent' in row['deepseek']:
      df.loc[index, 'deepseek_final'] = 'entailment'

In [47]:
LABEL_MAP = {
    "entailment": 1,
    "not_entailment": 0,
}

In [48]:
gold_labels = [LABEL_MAP[val.lower()] for val in df.label.values]
deepseek_preds = [LABEL_MAP[val.lower()] for val in df.deepseek_final.values]

In [55]:
f1_macro = f1_score(gold_labels, deepseek_preds, average='macro')
f1_micro = f1_score(gold_labels, deepseek_preds, average='micro')
accuracy = accuracy_score(gold_labels, deepseek_preds)
print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.851985559566787
F1-macro: 0.8470004984574762
F1-micro: 0.851985559566787


In [44]:
ev_df = pd.read_csv('deepseek_evaluation.csv')

ev_df.loc[ev_df['task'] == 'textentail', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

TextEntail GPT4




In [50]:
gold_labels = [LABEL_MAP[val.lower()] for val in df.label.values]
gpt4_preds = [LABEL_MAP[val.lower()] for val in df.gpt4.values]

In [56]:
f1_macro = f1_score(gold_labels, gpt4_preds, average='macro')
f1_micro = f1_score(gold_labels, gpt4_preds, average='micro')
accuracy = accuracy_score(gold_labels, gpt4_preds)
print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.9061371841155235
F1-macro: 0.9049466793369232
F1-micro: 0.9061371841155235


In [57]:
ev_df = pd.read_csv('deepseek_evaluation.csv')

ev_df.loc[ev_df['task'] == 'textentail', ['accuracy_gpt4', 'f1_macro_gpt4']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##7.Cola

In [66]:
cola = pd.read_excel('cola.xlsx')

In [67]:
cola_filtered = cola[cola.deepseek.isin([0, 1])]

In [69]:
gold_labels = cola_filtered['annotation'].tolist()
deepseek_preds = cola_filtered['deepseek'].tolist()

In [70]:
f1_macro = f1_score(gold_labels, deepseek_preds, average='macro')
f1_micro = f1_score(gold_labels, deepseek_preds, average='micro')
accuracy = accuracy_score(gold_labels, deepseek_preds)
print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.8559077809798271
F1-macro: 0.8292994883903975
F1-micro: 0.8559077809798271


In [71]:
ev_df.loc[ev_df['task'] == 'cola', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##8.WSD

In [98]:
df = pd.read_excel('wsd.xlsx')

In [99]:
df['deepseek'] = df['deepseek'].astype(str)
df['deepseek_final'] = 'None'

for index, row in df.iterrows():
    if row['annotation'] in row['deepseek']:
        df.loc[index, 'deepseek_final'] = row['annotation']
    elif row['deepseek'].strip() != '':
        df.loc[index, 'deepseek_final'] = 'wrong'

gold_labels = df['annotation'].tolist()
deepseek_preds = df['deepseek_final'].tolist()

f1_macro = f1_score(gold_labels, deepseek_preds, average='macro')
f1_micro = f1_score(gold_labels, deepseek_preds, average='micro')
accuracy = accuracy_score(gold_labels, deepseek_preds)

print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')



Accuracy: 0.7836757203915621
F1-macro: 0.772345014123017
F1-micro: 0.7836757203915621


In [100]:
ev_df.loc[ev_df['task'] == 'wsd', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##9.ClarinEmo

In [101]:
import json
import ast

from torchmetrics import Accuracy, F1Score, Precision, Recall
import pandas as pd
import numpy as np
import torch

In [102]:
emotion_labels = ['radość', 'zaufanie', 'przeczuwanie', 'zdziwienie', 'strach', 'smutek', 'wstręt', 'gniew', 'pozytywny', 'negatywny', 'neutralny']

num_labels = len(emotion_labels)

In [103]:
macro_prec = Precision(num_labels=num_labels, average="macro", task='multilabel')
macro_recall = Recall(num_labels=num_labels, average="macro", task='multilabel')
micro_prec = Precision(num_labels=num_labels, average="micro", task='multilabel')
micro_recall = Recall(num_labels=num_labels, average="micro", task='multilabel')
macro_f1 = F1Score(num_labels=num_labels, average='macro', task='multilabel')
weighted_f1 = F1Score(num_labels=num_labels, average='weighted', task='multilabel')
micro_f1 = F1Score(num_labels=num_labels, average='micro', task='multilabel')
f1_scores = F1Score(num_labels=num_labels, average='none', task='multilabel')
accuracy = Accuracy(num_labels=num_labels, task='multilabel')

In [104]:
df = pd.read_excel('clarinemo.xlsx')

In [105]:
def clean_json_cell(cell):
    if isinstance(cell, str):
        cleaned = re.sub(r'^```json\s*|\s*```$', '', cell.strip())
        try:
            return cleaned
        except Exception as e:
            print(f"Ошибка при обработке строки: {cell}\n{e}")
            return None
    return cell


df['deepseek'] = df['deepseek'].apply(clean_json_cell)

In [106]:
def handle_output(x):
    if isinstance(x, list):
        return x
    elif isinstance(x, dict):
        annotations = set()
        for element in x.values():
            [annotations.add(e) for e in element]
        return list(annotations)
    assert(False)

In [107]:
y_true = []
y_pred = []
indexes = []
for index, row in df.iterrows():
    annotations = ast.literal_eval(row.annotation)
    y_true_example = [[x in anno for x in emotion_labels] for anno in annotations]
    chat_gpt =  ast.literal_eval(row.deepseek)
    sentences = chat_gpt.keys()
    sentences = sorted(sentences)
    y_pred_example = [handle_output(chat_gpt[key]) for key in sentences]
    y_pred_example = [[x in anno for x in emotion_labels] for anno in y_pred_example]
    if len(y_true_example) != len(y_pred_example):
        continue
    [y_true.append(example) for example in y_true_example]
    [y_pred.append(example) for example in y_pred_example]

In [108]:
y_true = torch.Tensor(y_true)
y_pred = torch.Tensor(y_pred)

In [109]:
f1_names = [k.capitalize() + ' f1:' for k in emotion_labels]
emotion_f1 = dict(zip(f1_names, f1_scores(y_pred, y_true)))

In [110]:
acc = float(accuracy(y_pred, y_true) * 100)
f1_macro_score = float(macro_f1(y_pred, y_true) * 100)
f1_micro_score = float(micro_f1(y_pred, y_true) * 100)
print(f'Accuracy: {acc}')
print(f'F1-macro: {f1_macro_score}')
print(f'F1-micro: {f1_micro_score}')

Accuracy: 83.26692962646484
F1-macro: 53.890533447265625
F1-micro: 63.449363708496094


In [111]:
ev_df.loc[ev_df['task'] == 'clarinemo', ['accuracy', 'f1_macro']] = [
  round(acc, 2),
  round(f1_macro_score, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##10.GoEmo

In [22]:
from ast import literal_eval
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, accuracy_score

In [23]:
df = pd.read_excel('goemo.xlsx')

In [24]:
unique_gold = set()
for item in df["annotation"].tolist():
  for i in item.split(','):
    unique_gold.add(i)

In [25]:
unique_answers = set()
for item in set(df["deepseek"].tolist()):
  try:
    l = ast.literal_eval(item)
    for i in l:
      unique_answers.add(i)
  except:
    unique_answers.add(item)

In [26]:
classes = 'admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral'.split(', ')

In [27]:
df["deepseek_final"] = "None"

In [28]:
for idx, evaluation in enumerate(df["deepseek"]):
    try:
      ast.literal_eval(evaluation)
    except Exception:
      try:
        m = re.search(r'\[.*?\]', evaluation)
        m = m.group()
        m = m[1:-1].split(', ')
        df.loc[idx, "deepseek"] = str(m)
      except:
        print(idx, evaluation)

In [29]:
df.deepseek.tolist()

["['caring']",
 '["confusion"]',
 '["optimism"]',
 "['gratitude']",
 '["amusement"]',
 '["gratitude"]',
 '["gratitude"]',
 '["joy"]',
 "['sadness']",
 "['annoyance']",
 '["disapproval", "anger"]',
 "['amusement', 'joy']",
 '["sadness"]',
 "['disapproval', 'confusion']",
 "['fear']",
 '["anger"]',
 '["disgust"]',
 "['fear', 'annoyance']",
 '["annoyance"]',
 '["annoyance"]',
 '["anger"]',
 "['confusion']",
 '["disgust"]',
 "['amusement']",
 '["disapproval", "annoyance"]',
 '["annoyance"]',
 '["love", "amusement"]',
 '["annoyance"]',
 '["optimism"]',
 "['curiosity']",
 '["disappointment"]',
 '["confusion"]',
 '["disgust"]',
 "['optimism']",
 '["neutral", "approval"]',
 '["confusion"]',
 "['amusement']",
 "['amusement']",
 '["anger"]',
 "['amusement', 'approval']",
 '["disapproval"]',
 '["disgust"]',
 '["annoyance"]',
 '["anger"]',
 "['joy', 'surprise']",
 "['disappointment']",
 "['amusement']",
 "['amusement']",
 '["anger"]',
 '["annoyance"]',
 '["anger"]',
 "['anger', 'disapproval']",
 "

In [30]:
final_map_ds = {'"joy"': 'joy',
 "'amusement'": 'amusement',
 "'annoyance'": 'annoyance',
 "'approval'": 'approval',
 "'desire'": 'desire',
 "'remorse'": 'remorse',
 'admiration': 'admiration',
 'amusement': 'amusement',
 'anger': 'anger',
 'annoyance': 'annoyance',
 'approval': 'approval',
 'caring': 'caring',
 'confusion': 'confusion',
 'curiosity': 'curiosity',
 'desire': 'desire',
 'determination': 'none',
 'disappointment': 'disappointment',
 'disapproval': 'disapproval',
 'discomfort': 'none',
 'disgust': 'disgust',
 'doubt': 'none',
 'embarrassment': 'embarrassment',
 'excitement': 'excitement',
 'fear': 'fear',
 'gratitude': 'gratitude',
 'grief': 'grief',
 'hope': 'none',
 'joy': 'joy',
 'love': 'love',
 'nervousness': 'nervousness',
 'neutral': 'neutral',
 'nostalgia': 'none',
 'optimism': 'optimism',
 'pain': 'none',
 'pride': 'pride',
 'realization': 'realization',
 'relief': 'relief',
 'remorse': 'remorse',
 'sadness': 'sadness',
 'surprise': 'surprise',
 'worry': 'none'
}

In [31]:
df["deepseek_final"] = df["deepseek"].apply(literal_eval)

In [32]:
gold = []
gold_all = []
for item in df["annotation"].tolist():
  gold.append(item.split(','))
  gold_all.extend(item.split(','))

In [33]:
system = []
system_all = []
for index, item in enumerate(df["deepseek"].tolist()):
  resp = []
  try:
    l = ast.literal_eval(item)
    for i in l:
      resp.append(final_map_ds[i])
  except:
    resp.append(final_map_ds[item])
  system.append(resp)
  system_all.extend(resp)

In [34]:
lb = MultiLabelBinarizer(classes=classes)
y_true = lb.fit_transform(gold)
y_pred = lb.fit_transform(system)



In [35]:
f1_macro = f1_score(y_true, y_pred, average='macro') # f1_macro
f1_micro = f1_score(y_true, y_pred, average='micro')
accuracy = accuracy_score(y_true, y_pred)


print(f'Accuracy: {accuracy}')
print(f'F1-macro: {f1_macro}')
print(f'F1-micro: {f1_micro}')

Accuracy: 0.214
F1-macro: 0.2940646496983506
F1-micro: 0.27059843885516044


In [38]:
ev_df.loc[ev_df['task'] == 'goemo', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##11. GoEmoPer0

In [140]:
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt

--2025-04-25 20:33:13--  https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248 [text/plain]
Saving to: ‘emotions.txt.2’


2025-04-25 20:33:14 (4.10 MB/s) - ‘emotions.txt.2’ saved [248/248]



In [141]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from ast import literal_eval
import re

In [142]:
ds_evaluation_df = pd.read_excel('goemoper0.xlsx')

In [143]:
labels_df = pd.read_csv('emotions.txt', header=None)
ID_TO_EMOTION = dict(labels_df[0])
EMOTION_TO_ID = {emotion: emotion_id for emotion_id, emotion in ID_TO_EMOTION.items()}

In [144]:
ds_evaluation_df["annotation"] = ds_evaluation_df["annotation"].apply(literal_eval)

In [145]:
ds_evaluation_df['deepseek_final'] = "None"
for index, row in ds_evaluation_df.iterrows():
    final = re.search(r'\[.*?\]', row['deepseek'])
    if final:
        f = final.group().replace("\"", "'")
        ds_evaluation_df.loc[index, 'deepseek_final'] = f

In [146]:
ds_evaluation_df.loc[329, "deepseek_final"] = "['disapproval']"
ds_evaluation_df["deepseek_final"] = ds_evaluation_df["deepseek_final"].apply(literal_eval)

In [147]:
ds_evaluation_df = ds_evaluation_df.dropna(subset=["deepseek_final"])

In [148]:
unique_ds_answers = set()
for evaluation in ds_evaluation_df["deepseek_final"]:
    unique_ds_answers = unique_ds_answers.union(set(evaluation))

In [149]:
is_valid_label_map = {
    label: label in EMOTION_TO_ID
    for label in unique_ds_answers
}

In [150]:
ds_evaluation_df = ds_evaluation_df.loc[
    ds_evaluation_df["deepseek_final"].apply(
        lambda labels: all((is_valid_label_map[label] for label in labels))
    )
]

In [151]:
def encode_one_hot(annotations: list[str]):
    one_hot = np.zeros(len(ID_TO_EMOTION), dtype="bool")
    for emotion in annotations:
        one_hot[EMOTION_TO_ID[emotion]] = 1
    return one_hot

In [152]:
ds_evaluation_df["true_labels"] = ds_evaluation_df["annotation"].apply(encode_one_hot)
ds_evaluation_df["ds_labels"] = ds_evaluation_df["deepseek_final"].apply(encode_one_hot)

In [153]:
accuracy = accuracy_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
)

f1 = f1_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
    average='macro',
)

print("Accuracy:", accuracy)
print("F1 (macro):", f1)

Accuracy: 0.19700967458223395
F1 (macro): 0.23437347705362196


In [154]:
ev_df.loc[ev_df['task'] == 'goemoper0', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1 * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##12.GoEmoPer1

In [226]:
ds_evaluation_df = pd.read_excel('goemoper1.xlsx')

In [227]:
labels_df = pd.read_csv('emotions.txt', header=None)
ID_TO_EMOTION = dict(labels_df[0])
EMOTION_TO_ID = {emotion: emotion_id for emotion_id, emotion in ID_TO_EMOTION.items()}

In [228]:
ds_evaluation_df["annotation"] = ds_evaluation_df["annotation"].apply(literal_eval)

In [229]:
ds_evaluation_df['deepseek_final'] = None
for index, row in ds_evaluation_df.iterrows():
    if isinstance(row['deepseek'], str):
      final = re.search(r'\[.*?\]', row['deepseek'])
      if final:
          f = final.group().replace("\"", "'")
          ds_evaluation_df.loc[index, 'deepseek_final'] = f

In [230]:
ds_evaluation_df.loc[329, "deepseek_final"] = "['disapproval']"
from ast import literal_eval

def safe_literal_eval(val):
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        try:
            return literal_eval(val)
        except:
            return None
    return None

ds_evaluation_df["deepseek_final"] = ds_evaluation_df["deepseek_final"].apply(safe_literal_eval)


In [231]:
ds_evaluation_df = ds_evaluation_df.dropna(subset=["deepseek_final"])

In [232]:
unique_ds_answers = set()
for evaluation in ds_evaluation_df["deepseek_final"]:
    unique_ds_answers = unique_ds_answers.union(set(evaluation))

In [233]:
is_valid_label_map = {
    label: label in EMOTION_TO_ID
    for label in unique_ds_answers
}

In [234]:
ds_evaluation_df = ds_evaluation_df.loc[
    ds_evaluation_df["deepseek_final"].apply(
        lambda labels: all((is_valid_label_map[label] for label in labels))
    )
]

In [235]:
def encode_one_hot(annotations: list[str]):
    one_hot = np.zeros(len(ID_TO_EMOTION), dtype="bool")
    for emotion in annotations:
        one_hot[EMOTION_TO_ID[emotion]] = 1
    return one_hot

In [236]:
ds_evaluation_df["true_labels"] = ds_evaluation_df["annotation"].apply(encode_one_hot)
ds_evaluation_df["ds_labels"] = ds_evaluation_df["deepseek_final"].apply(encode_one_hot)

In [237]:
accuracy = accuracy_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
)

f1 = f1_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
    average='macro',
)

print("Accuracy:", accuracy)
print("F1 (macro):", f1)

Accuracy: 0.19469026548672566
F1 (macro): 0.2502515862839795


In [238]:
ev_df.loc[ev_df['task'] == 'goemoper1', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1 * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##13.GoEmoPer2

In [168]:
ds_evaluation_df = pd.read_excel('goemoper2.xlsx')

In [169]:
labels_df = pd.read_csv('emotions.txt', header=None)
ID_TO_EMOTION = dict(labels_df[0])
EMOTION_TO_ID = {emotion: emotion_id for emotion_id, emotion in ID_TO_EMOTION.items()}

In [170]:
ds_evaluation_df["annotation"] = ds_evaluation_df["annotation"].apply(literal_eval)

In [171]:
ds_evaluation_df['deepseek_final'] = None
for index, row in ds_evaluation_df.iterrows():
    if isinstance(row['deepseek'], str):
      final = re.search(r'\[.*?\]', row['deepseek'])
      if final:
          f = final.group().replace("\"", "'")
          ds_evaluation_df.loc[index, 'deepseek_final'] = f

In [172]:
ds_evaluation_df.loc[329, "deepseek_final"] = "['disapproval']"
from ast import literal_eval

def safe_literal_eval(val):
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        try:
            return literal_eval(val)
        except:
            return None
    return None

ds_evaluation_df["deepseek_final"] = ds_evaluation_df["deepseek_final"].apply(safe_literal_eval)

In [173]:
ds_evaluation_df = ds_evaluation_df.dropna(subset=["deepseek_final"])

In [174]:
unique_ds_answers = set()
for evaluation in ds_evaluation_df["deepseek_final"]:
    unique_ds_answers = unique_ds_answers.union(set(evaluation))

In [175]:
is_valid_label_map = {
    label: label in EMOTION_TO_ID
    for label in unique_ds_answers
}

In [176]:
ds_evaluation_df = ds_evaluation_df.loc[
    ds_evaluation_df["deepseek_final"].apply(
        lambda labels: all((is_valid_label_map[label] for label in labels))
    )
]

In [177]:
def encode_one_hot(annotations: list[str]):
    one_hot = np.zeros(len(ID_TO_EMOTION), dtype="bool")
    for emotion in annotations:
        one_hot[EMOTION_TO_ID[emotion]] = 1
    return one_hot

In [178]:
ds_evaluation_df["true_labels"] = ds_evaluation_df["annotation"].apply(encode_one_hot)
ds_evaluation_df["ds_labels"] = ds_evaluation_df["deepseek_final"].apply(encode_one_hot)

In [179]:
accuracy = accuracy_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
)

f1 = f1_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
    average='macro',
)

print("Accuracy:", accuracy)
print("F1 (macro):", f1)

Accuracy: 0.1961301671064204
F1 (macro): 0.2552321956235119


In [181]:
ev_df.loc[ev_df['task'] == 'goemoper2', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1 * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##14.GoemoPer3

In [213]:
ds_evaluation_df = pd.read_excel('goemoper3.xlsx')

In [214]:
labels_df = pd.read_csv('emotions.txt', header=None)
ID_TO_EMOTION = dict(labels_df[0])
EMOTION_TO_ID = {emotion: emotion_id for emotion_id, emotion in ID_TO_EMOTION.items()}

In [215]:
ds_evaluation_df["annotation"] = ds_evaluation_df["annotation"].apply(literal_eval)

In [216]:
ds_evaluation_df['deepseek_final'] = None
for index, row in ds_evaluation_df.iterrows():
    if isinstance(row['deepseek'], str):
      final = re.search(r'\[.*?\]', row['deepseek'])
      if final:
          f = final.group().replace("\"", "'")
          ds_evaluation_df.loc[index, 'deepseek_final'] = f

In [217]:
ds_evaluation_df.loc[329, "deepseek_final"] = "['disapproval']"
from ast import literal_eval

def safe_literal_eval(val):
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        try:
            return literal_eval(val)
        except:
            return None
    return None

ds_evaluation_df["deepseek_final"] = ds_evaluation_df["deepseek_final"].apply(safe_literal_eval)

In [218]:
ds_evaluation_df = ds_evaluation_df.dropna(subset=["deepseek_final"])

In [219]:
unique_ds_answers = set()
for evaluation in ds_evaluation_df["deepseek_final"]:
    unique_ds_answers = unique_ds_answers.union(set(evaluation))

In [220]:
is_valid_label_map = {
    label: label in EMOTION_TO_ID
    for label in unique_ds_answers
}

In [221]:
ds_evaluation_df = ds_evaluation_df.loc[
    ds_evaluation_df["deepseek_final"].apply(
        lambda labels: all((is_valid_label_map[label] for label in labels))
    )
]

In [222]:
def encode_one_hot(annotations: list[str]):
    one_hot = np.zeros(len(ID_TO_EMOTION), dtype="bool")
    for emotion in annotations:
        one_hot[EMOTION_TO_ID[emotion]] = 1
    return one_hot

In [223]:
ds_evaluation_df["true_labels"] = ds_evaluation_df["annotation"].apply(encode_one_hot)
ds_evaluation_df["ds_labels"] = ds_evaluation_df["deepseek_final"].apply(encode_one_hot)

In [224]:
accuracy = accuracy_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
)

f1 = f1_score(
    y_true=np.array(ds_evaluation_df["true_labels"].tolist()),
    y_pred=np.array(ds_evaluation_df["ds_labels"].tolist()),
    average='macro',
)

print("Accuracy:", accuracy)
print("F1 (macro):", f1)

Accuracy: 0.18761061946902655
F1 (macro): 0.2414172357091951


In [225]:
ev_df.loc[ev_df['task'] == 'goemoper3', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1 * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##15.Reading

In [239]:
from sklearn.metrics import classification_report

In [240]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/reading.xlsx')

In [241]:
df.loc[24, 'deepseek'] = 'A'
df.loc[40, 'deepseek'] = 'C'
df.loc[69, 'deepseek'] = 'C'
df.loc[98, 'deepseek'] = 'D'
df.loc[807, 'deepseek'] = 'None'
df.loc[879, 'deepseek'] = 'A'
df.loc[882, 'deepseek'] = 'D'
df.loc[894, 'deepseek'] = 'B'
df.loc[913, 'deepseek'] = 'D'
df.loc[917, 'deepseek'] = 'D'
df.loc[923, 'deepseek'] = 'None'
df.loc[932, 'deepseek'] = 'None'
df.loc[958, 'deepseek'] = 'D'
df.loc[967, 'deepseek'] = 'None'
df.loc[975, 'deepseek'] = 'D'

In [242]:
df_filtered = df[df.deepseek != 'None']

In [243]:
def evaluation(y_true, y_pred):
    results = classification_report(y_true, y_pred, output_dict=True)
    accuracy = results['accuracy']
    macro_f1 = results['macro avg']['f1-score']
    return accuracy, macro_f1


In [244]:
y_true = df_filtered.annotation.apply(lambda x: ", ".join(sorted(x))).tolist()
y_pred = df_filtered.deepseek.apply(lambda x: ", ".join(sorted(x))).tolist()

In [246]:
accuracy, f1_macro = evaluation(y_true, y_pred)

print(f"Accuracy:", accuracy)
print(f"Macro F1:", f1_macro)

Accuracy: 0.9106425702811245
Macro F1: 0.909927739116316


In [247]:
ev_df.loc[ev_df['task'] == 'reading', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

## 16.Polemo

In [286]:
import re
from ast import literal_eval
import pandas as pd
from sklearn.metrics import f1_score

In [287]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/polemo.xlsx')

In [288]:
options = 'pozytywny, negatywny, neutralny, ambiwalentny'.split(', ')

In [289]:
for idx, evaluation in enumerate(df["deepseek"]):
    try:
      literal_eval(evaluation)
    except Exception:
      #print(idx, evaluation)
      try:
        m = re.search(r'\[.*?\]', evaluation)
        m = m.group()
        m = m[1:-1].split(', ')
        df.loc[idx, "deepseek"] = str(m)
      except:
        print(idx, evaluation)

155 negatywny
211  Porównywane też były wyniki tych badań pod kątem sezonowości zmian . Aby zaś rozgraniczyć wpływ zanieczyszczeń np . od czynników alergenowych ( roztocza , kurz , pyłki roślin ) , obserwacje prowadzono tylko w miesiącach zimowych . W tym czasie bowiem dzięki ogrzewaniu domów do powietrza dostaje się mnóstwo szkodliwych substancji , takich jak benzo ( a ) piren , sadza czy tlenki siarki i azotu .
273 goraco polecam za to chirurga ze szpitala w ostrodzie doktorant w Gorzowie oraz rehabilitanta tegoz tez z szpitala w ostrodzie tego dodam ze nie ma do kogo sie o
307 negatywny
325 nan
357 nan
406  Po raz kolejny okazuje się , że status w programie lojalnościowym nie ma żadnego znaczenia . Już od 2 tygodni czekam na korektę faktury .
448 nan
475 ambitwalentny
510 nan
548 ambiwalentny
551 nan
556 i wydać receptę.
560 dostosowanie się do możliwości (studentów)"negatywny
564 nan
581 nan
643 negatywny
647  Klinika ok , ale mogłby być lepszy dostęp do lekarza ambiwalentny
678 na

In [290]:
df.loc[155, 'deepseek'] = "['negatywny']"
df.loc[211, 'deepseek'] = "None"
df.loc[273, 'deepseek'] = "None"
df.loc[307, 'deepseek'] = "['negatywny']"
df.loc[325, 'deepseek'] = "None"
df.loc[357, 'deepseek'] = "None"
df.loc[406, 'deepseek'] = "None"
df.loc[448, 'deepseek'] = "None"
df.loc[475, 'deepseek'] = "['ambitwalentny']"
df.loc[510, 'deepseek'] = "None"
df.loc[548, 'deepseek'] = "['ambitwalentny']"
df.loc[551, 'deepseek'] = "None"
df.loc[556, 'deepseek'] = "None"
df.loc[560, 'deepseek'] = "['negatywny']"
df.loc[564, 'deepseek'] = "None"
df.loc[581, 'deepseek'] = "None"
df.loc[643, 'deepseek'] = "['negatywny']"
df.loc[647, 'deepseek'] = "['ambitwalentny']"
df.loc[678, 'deepseek'] = "None"
df.loc[735, 'deepseek'] = "None"
df.loc[795, 'deepseek'] = "None"
df.loc[814, 'deepseek'] = "['pozytywny']"

In [291]:
set(df["deepseek"].tolist())

{'\n\n["ambiwalentny"]',
 '\n["pozytywny"]',
 "\n['negatywny']",
 " \n\n['ambiwalentny']",
 " \n\n['negatywny']",
 " \n\n['neutralny']",
 " \n\n['pozytywny']",
 "  \n\n['negatywny']",
 "  \n\n['pozytywny']",
 ' ["ambiwalentny"]',
 ' ["negatywny"]',
 ' ["pozytywny"]',
 'None',
 '["\'ambiwalentny\'"]',
 '["\'negatywny\'"]',
 '["\'neutral\'"]',
 '["\'neutralny\'"]',
 '["\'pozytywny\'"]',
 '["ambiwalentny"]',
 '["negatywny"]',
 '["pozytywny", "negatywny", "neutralny", "ambiwalentny"]',
 '["pozytywny"]',
 '[\' "ambiwalentny" \']',
 '[\'"ambiwalentny"\']',
 '[\'"negatywny"\']',
 '[\'"neutralny"\']',
 '[\'"pozytywny"\', \'"negatywny"\']',
 '[\'"pozytywny"\']',
 "['_negatywny']",
 "['`negatywny`']",
 "['ambitwalentny']",
 "['ambiwalentny']",
 "['dodatnia']",
 "['negatywny']",
 "['neutral']",
 "['neutralny']",
 "['pozytywny']",
 "[['negatywny']]"}

In [292]:
final_map_ds = {
    '\n\n["ambiwalentny"]': 'ambiwalentny',
 '\n["pozytywny"]': 'pozytywny',
 "\n['negatywny']": 'negatywny',
 " \n\n['ambiwalentny']": 'ambiwalentny',
 " \n\n['negatywny']": 'negatywny',
 " \n\n['neutralny']": 'neutralny',
 " \n\n['pozytywny']": 'pozytywny',
 "  \n\n['negatywny']": 'negatywny',
 "  \n\n['pozytywny']": 'pozytywny',
 ' ["ambiwalentny"]': 'ambiwalentny',
 ' ["negatywny"]': 'negatywny',
 ' ["pozytywny"]': 'pozytywny',
 'None': 'None',
 '["\'ambiwalentny\'"]': 'ambiwalentny',
 '["\'negatywny\'"]': 'negatywny',
 '["\'neutral\'"]': 'neutralny',
 '["\'neutralny\'"]': 'neutralny',
 '["\'pozytywny\'"]': 'pozytywny',
 '["ambiwalentny"]': 'ambiwalentny',
 '["negatywny"]': 'negatywny',
 '["pozytywny", "negatywny", "neutralny", "ambiwalentny"]': 'None',
 '["pozytywny"]': 'pozytywny',
 '[\' "ambiwalentny" \']': 'ambiwalentny',
 '[\'"ambiwalentny"\']': 'ambiwalentny',
 '[\'"negatywny"\']': 'negatywny',
 '[\'"neutralny"\']': 'neutralny',
 '[\'"pozytywny"\', \'"negatywny"\']': 'None',
 '[\'"pozytywny"\']': 'pozytywny',
 "['_negatywny']": 'negatywny',
 "['`negatywny`']": 'negatywny',
 "['ambitwalentny']": 'ambiwalentny',
 "['ambiwalentny']": 'ambiwalentny',
 "['dodatnia']": 'None',
 "['negatywny']": 'negatywny',
 "['neutral']": 'neutralny',
 "['neutralny']": 'neutralny',
 "['pozytywny']": 'pozytywny',
 "[['negatywny']]": 'negatywny'
}

In [293]:
df["deepseek"] = df["deepseek"].apply(lambda x: final_map_ds[x])

In [294]:
df_filtered = df[df.deepseek != 'None']

In [295]:
gold = df_filtered["annotation"].tolist()
system = df_filtered["deepseek"].tolist()

In [299]:
f1_macro = f1_score(gold, system, average='macro')
accuracy = f1_score(gold, system, average='micro')

print(f"Accuracy:", accuracy)
print(f"Macro F1:", f1_macro)

Accuracy: 0.775840597758406
Macro F1: 0.6682117448130658


In [300]:
ev_df.loc[ev_df['task'] == 'polemo', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##17.WordContext

In [301]:
import pandas as pd
import numpy as np

In [302]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/wordcontext.xlsx')

In [303]:
df = df[df['split'] == 'dev']

In [304]:
for index, evaluation in enumerate(df['deepseek']):
  if evaluation not in ['T', 'F']:
    print(index, evaluation)

352 T correct answer is: "T"


In [305]:
df.loc[352, 'deepseek'] = 'T'

In [306]:
df['deepseek'] = df['deepseek'].astype(str)

In [307]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
F,319
T,319


In [308]:
LABEL_MAP = {
    "T": 1,
    "F": 0,
}

gold_labels = [LABEL_MAP[val] for val in df.label.values]
ds_preds = [LABEL_MAP[val] for val in df.deepseek.values]

In [309]:
from sklearn.metrics import classification_report

report_lines = classification_report(y_true=gold_labels, y_pred=ds_preds, digits=4).split('\n')

for line in report_lines:
  print(line)

              precision    recall  f1-score   support

           0     0.5842    0.9248    0.7160       319
           1     0.8195    0.3417    0.4823       319

    accuracy                         0.6332       638
   macro avg     0.7019    0.6332    0.5992       638
weighted avg     0.7019    0.6332    0.5992       638



In [313]:
report = classification_report(y_true=gold_labels, y_pred=ds_preds, digits=4, output_dict=True)

accuracy = report['accuracy']
f1_macro = report['macro avg']['f1-score']

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.6332288401253918
Macro F1: 0.5991601512157402


In [314]:
ev_df.loc[ev_df['task'] == 'wordcontext', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##18.TweetSent

In [433]:
import pandas as pd
from sklearn.metrics import classification_report
import re
from ast import literal_eval

In [434]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/tweetsent.xlsx')

In [435]:
df["deepseek_final"] = "None"

In [436]:
for index, evaluation in enumerate(df["deepseek"]):
    try:
        literal_eval(str(evaluation))
    except Exception:
        try:
            m = re.search(r'\d', str(evaluation))
            m = m.group()
            df.loc[index, 'deepseek'] = m
        except Exception:
            df.loc[index, 'deepseek'] = 'None'

In [437]:
def to_int(value):
    try:
        res = round(float(value))
        if res in [1, 2, 0]:
            return res
        else:
            return "None"
    except Exception:
        return "None"

In [438]:
df['deepseek'] = df['deepseek'].apply(to_int)

In [439]:
df = df[df['deepseek'] != "None"]
df['annotation'] = df['annotation'].astype(int)
df['deepseek'] = df['deepseek'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['annotation'] = df['annotation'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['deepseek'] = df['deepseek'].astype(int)


In [440]:
df_clean = df.copy()

In [441]:
def evaluation(df: pd.DataFrame, y_col: str):
    results = classification_report(
        df["annotation"],
        df[y_col],
        labels=[0,1,2],
        output_dict=True
    )
    accuracy = results['accuracy']
    macro_f1 = results['macro avg']['f1-score']
    return accuracy, macro_f1, results

In [442]:
accuracy, f1_macro, results = evaluation(df_clean, y_col="deepseek")

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.5986984815618221
Macro F1: 0.5892705855483159


In [443]:
ev_df.loc[ev_df['task'] == 'tweetsent', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##19.TweetEmoji

In [447]:
import ast
import json
import numpy as np
import pandas as pd
import pathlib
#import swifter
import re
from ast import literal_eval
from sklearn.metrics import accuracy_score, f1_score

In [448]:
DATA_DIR = pathlib.Path("data")
EMOJI_MAPPING = {
    "❤": 0,
    "😍": 1,
    "😂": 2,
    "💕": 3,
    "🔥": 4,
    "😊": 5,
    "😎": 6,
    "✨": 7,
    "💙": 8,
    "😘": 9,
    "📷": 10,
    "🇺🇸": 11,
    "☀": 12,
    "💜": 13,
    "😉": 14,
    "💯": 15,
    "😁": 16,
    "🎄": 17,
    "📸": 18,
    "😜": 19
}

In [450]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/tweetemoji.xlsx')

In [451]:
df["deepseek_final"] = 'None'

In [452]:
for idx, evaluation in enumerate(df["deepseek"]):
    try:
      ast.literal_eval(evaluation)
    except Exception:
      try:
        m = re.search(r'\[.*?\]', evaluation)
        m = m.group()
        #m = m[1:-1].split(', ')
        df.loc[idx, "deepseek"] = m
        print(m)
      except:
        print('Ошибка', idx, evaluation)

[2,0,1]
[9, 3, 5]
[3, 0, 9]
[0, 1, 2]
[4, 6, 15]
[0, 1, 2]


In [458]:
def get_results(df: pd.DataFrame) -> pd.DataFrame:
    #df["chatgpt_final"] = df["chatgpt_final"].swifter.apply(lambda x: ast.literal_eval(x))
    df["deepseek_final"] = df["deepseek"].apply(literal_eval)

    y_true = df["annotation"].to_numpy()

    ds_answer = df["deepseek_final"].to_list()
    y_pred_top1 = np.array([answer[0] for answer in ds_answer])
    y_pred_top3 = np.array(
        [
            y_true[i] if y_true[i] in answer else answer[0]
            for i, answer in enumerate(ds_answer)
        ]
    )

    f1_macro = f1_score(y_true=y_true, y_pred=y_pred_top1, average="macro")
    f1_micro = f1_score(y_true=y_true, y_pred=y_pred_top1, average="micro")
    acc_top1 = accuracy_score(y_true=y_true, y_pred=y_pred_top1)
    acc_top3 = accuracy_score(y_true=y_true, y_pred=y_pred_top3)

    return acc_top1, f1_macro

In [462]:
df = df.drop(df[df["chatgpt_final"] == "[-1]"].index)
accuracy, f1_macro = get_results(df)

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.2734375
Macro F1: 0.206987998306092


In [463]:
ev_df.loc[ev_df['task'] == 'tweetemoji', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##20. WNLI

In [475]:
import pandas as pd
import numpy as np

In [476]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/wnli2.xlsx')

In [477]:
df = df[df['split'] == 'dev']

In [478]:
df['deepseek'] = df['deepseek'].apply(round)

In [479]:
df['deepseek'] = df['deepseek'].astype(int)
df['label'] = df['label'].astype(int)

gold_labels = df.label.values
deepseek_preds = df.deepseek.values

In [480]:
from sklearn.metrics import classification_report

report_lines = classification_report(y_true=gold_labels, y_pred=deepseek_preds, digits=4, output_dict=True)

accuracy = report_lines['accuracy']
f1_macro = report_lines['macro avg']['f1-score']

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.8873239436619719
Macro F1: 0.8862179487179487


In [481]:
ev_df.loc[ev_df['task'] == 'wnli', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

GPT4

In [483]:
df['gpt4'] = df['gpt4'].apply(round)

In [486]:
df['gpt4'] = df['gpt4'].astype(int)
df['label'] = df['label'].astype(int)

gold_labels = df.label.values
gpt4_preds = df.gpt4.values

In [487]:
from sklearn.metrics import classification_report

report_lines = classification_report(y_true=gold_labels, y_pred=gpt4_preds, digits=4, output_dict=True)

accuracy = report_lines['accuracy']
f1_macro = report_lines['macro avg']['f1-score']

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.8732394366197183
Macro F1: 0.8706739526411658


In [488]:
ev_df.loc[ev_df['task'] == 'wnli', ['accuracy_gpt4', 'f1_macro_gpt4']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

GPT3_5

In [489]:
df['gpt3_5'] = df['gpt3_5'].apply(round)

In [490]:
df['gpt3_5'] = df['gpt3_5'].astype(int)
df['label'] = df['label'].astype(int)

gold_labels = df.label.values
gpt3_5_preds = df.gpt3_5.values

In [491]:
from sklearn.metrics import classification_report

report_lines = classification_report(y_true=gold_labels, y_pred=gpt3_5_preds, digits=4, output_dict=True)

accuracy = report_lines['accuracy']
f1_macro = report_lines['macro avg']['f1-score']

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.6197183098591549
Macro F1: 0.5935976256094976


In [492]:
ev_df.loc[ev_df['task'] == 'wnli', ['accuracy_gpt3_5', 'f1_macro_gpt3_5']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

##21.MathQA

## MathQA evaluation

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import re
from ast import literal_eval

In [510]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/mathqa2.xlsx')

In [511]:
df['deepseek_final'] = 'None'

In [512]:
for index, row in df.iterrows():
  a = row['deepseek']
  if "Answer:" in a:
    try:
      a = re.sub(r'\n', ' ', a)
      a = re.sub(r'\*\*', '', a)
      a = re.split(r'Answer:', a)[-1]
      res = re.search(r'[\d\.\,]+', a)
      ans = res.group()
      ans = re.sub(r',', '', ans)
      if '.' in ans:
        ans = ans.split('.')[0]
      df.loc[index, 'deepseek_final'] = ans
    except:
      df.loc[index, 'deepseek_final'] = "None"

In [513]:
df_filtered = df[df.deepseek_final != 'None']

In [514]:
gold = df_filtered["expected_res"].tolist()
system = df_filtered["deepseek_final"].tolist()

In [515]:
gold = list(map(str, gold))
system = list(map(str, system))

In [516]:
f1_macro = f1_score(gold, system, average='macro')
accuracy = accuracy_score(gold, system)

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.9590163934426229
Macro F1: 0.9183024837321901


In [535]:
ev_df.loc[ev_df['task'] == 'mathqa', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)

model - openai/gpt-4.1

In [529]:
df['gpt_final'] = 'None'

In [530]:
for index, row in df.iterrows():
  a = row['gpt']
  if "Answer:" in a:
    try:
      a = re.sub(r'\n', ' ', a)
      a = re.sub(r'\*\*', '', a)
      a = re.split(r'Answer:', a)[-1]
      res = re.search(r'[\d\.\,]+', a)
      ans = res.group()
      ans = re.sub(r',', '', ans)
      if '.' in ans:
        ans = ans.split('.')[0]
      df.loc[index, 'gpt_final'] = ans
    except:
      df.loc[index, 'gpt_final'] = "None"

In [531]:
df_filtered = df[df.gpt_final != 'None']

In [532]:
gold = df_filtered["expected_res"].tolist()
system = df_filtered["gpt_final"].tolist()

In [533]:
gold = list(map(str, gold))
system = list(map(str, system))

In [534]:
f1_macro = f1_score(gold, system, average='macro')
accuracy = accuracy_score(gold, system)

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.959
Macro F1: 0.905427456227366


##22.TweetStance

In [561]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report

import seaborn as sns
from scipy.stats import entropy
from collections import Counter

In [563]:
df = pd.read_excel('https://github.com/ddariath/gpt_deepseek_evaluation/raw/refs/heads/main/deepseek_raw/tweetstance.xlsx')

In [564]:
TOPICS = [
    "abortion",
    "atheism",
    "climate",
    "feminist",
    "hillary",
]

MAPPING = {
    0: "none",
    1: "against",
    2: "favor"
}

RESULTS = ["0", "1", "2"]

In [565]:
def evaluation(df_stats: pd.DataFrame, y_col: str = "deepseek"):
    try:
        results = classification_report(df_stats["annotation"], df_stats[y_col].astype(int), output_dict=True)

        f1_against = results['1']['f1-score']
        f1_favor = results['2']['f1-score']
        tweeteval_result = (f1_against+f1_favor) / 2

    except Exception as ex:
        print(f"Issues with task: {ex}")

    return tweeteval_result, results

In [574]:
df = df[df["deepseek"].isin([0, 1, 2])]

In [570]:
result, results = evaluation(df)

In [571]:
pd.DataFrame(results).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.507614,0.438596,0.470588,228.0
1,0.770468,0.73913,0.754474,713.0
2,0.623626,0.746711,0.679641,304.0
accuracy,0.685944,0.685944,0.685944,0.685944
macro avg,0.633903,0.641479,0.634901,1245.0
weighted avg,0.686476,0.685944,0.684213,1245.0


In [573]:
accuracy = results['accuracy']
f1_macro = results['macro avg']['f1-score']

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)

Accuracy: 0.6859437751004016
Macro F1: 0.6349009421470336


In [575]:
ev_df.loc[ev_df['task'] == 'tweetstance', ['accuracy', 'f1_macro']] = [
  round(accuracy * 100, 2),
  round(f1_macro * 100, 2)
]
ev_df.to_csv('deepseek_evaluation.csv', index=False)