#Install and import libraries

In [67]:
%%capture
!pip install --upgrade openai
!pip install pywer

In [180]:
import pandas as pd
import pywer
import numpy as np

from openai import OpenAI
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
#@title Init gpt-client
GPT_API_KEY = "YOUR_GPT_API_KEY"
client = OpenAI(api_key=GPT_API_KEY)

#Helper functions

In [69]:
#@title the prompt function

def get_prompts(system_content, two_shots_list):
  shots = {
      "zero-shot": [
                {
                  "role": "system",
                  "content": system_content
                }
            ],
      "two-shot": [
                {
                  "role": "system",
                  "content": system_content
                }
            ]
  }
  for (role, content) in two_shots_list:
    shots['two-shot'].append(
        {
          "role": role,
          "content": content
        }
    )
  return shots

In [70]:
#@title gpt predictions
def get_gpt_response(prompt_dict, gpt_model, x, y_true):
  res_dict = {"zero-shot": [], "two-shot": []}

  for shot in prompt_dict:
    for i, instance in enumerate(x):
        messages = prompt_dict[shot].copy()
        messages.append({"role": "user", "content": instance})
        response = client.chat.completions.create(
          model=gpt_model,
          messages=messages,
          temperature=0.2,
          seed=42,
          top_p=1,
          frequency_penalty=0,
          presence_penalty=0,
          logprobs=True
        )
        y_pred = response.choices[0].message.content
        res_dict[shot].append({"x": instance, "y_true": y_true.iloc[i], "y_pred": y_pred})
    res_dict[shot] = pd.DataFrame(res_dict[shot])

  df_pred = pd.merge(res_dict['zero-shot'], res_dict['two-shot'], on=["x", "y_true"], how="inner")
  df_pred.columns = ["x", "y_true", "y_pred_zero_shot", "y_pred_two_shot"]
  return df_pred

#GEC

In [5]:
%%capture
!gdown 1-f-Rpz9AV-svNPD5hFofKSgMWwQ49skD
df = pd.read_csv("korre.csv")

In [None]:
#@title Predictions
x, y_true = df.original_text, df.corrected_text
system_content = "Given a sentence, correct it for grammatical errors, including punctuation, spelling, and morphology of word. Generate only the corrected text."
two_shots_list = [
    ('user', "Δεν ήθελε να θεωρηθεί προκατειλημένος και για αυτό δε συνέχισε τη συνεργασία περεταίρω."),
    ('assistant', "Δεν ήθελε να θεωρηθεί προκατειλημμένος και για αυτό δε συνέχισε τη συνεργασία περαιτέρω."),
    ('user', "Το περιθώριο των κερδών τους δεν αλλάζουν εύκολα."),
    ('assistant', "Το περιθώριο των κερδών τους δεν αλλάζει εύκολα.")
]
prompt_dict = get_prompts(system_content, two_shots_list)

df_gpt_3_5 = get_gpt_response(prompt_dict, "gpt-3.5-turbo", x, y_true)
df_gpt_4o = get_gpt_response(prompt_dict, "gpt-4o", x, y_true)

In [51]:
#@title Results
print(f"gpt-model \t shot \t\t cer \t\t wer")
cer = df_gpt_3_5.apply(lambda row: pywer.cer([row.y_true], [row.y_pred_zero_shot]),1).agg(['mean', 'sem'])
wer = df_gpt_3_5.apply(lambda row: pywer.wer([row.y_true], [row.y_pred_zero_shot]),1).agg(['mean', 'sem'])
print(f"3.5 \t\t zero-shot \t {cer['mean']:.2f}±{cer['sem']:.2f} \t {wer['mean']:.2f}±{wer['sem']:.2f}")
cer = df_gpt_3_5.apply(lambda row: pywer.cer([row.y_true], [row.y_pred_two_shot]),1).agg(['mean', 'sem'])
wer = df_gpt_3_5.apply(lambda row: pywer.wer([row.y_true], [row.y_pred_two_shot]),1).agg(['mean', 'sem'])
print(f"3.5 \t\t two-shot \t {cer['mean']:.2f}±{cer['sem']:.2f} \t {wer['mean']:.2f}±{wer['sem']:.2f}")
cer = df_gpt_4o.apply(lambda row: pywer.cer([row.y_true], [row.y_pred_zero_shot]),1).agg(['mean', 'sem'])
wer = df_gpt_4o.apply(lambda row: pywer.wer([row.y_true], [row.y_pred_zero_shot]),1).agg(['mean', 'sem'])
print(f"4o \t\t zero-shot \t {cer['mean']:.2f}±{cer['sem']:.2f} \t {wer['mean']:.2f}±{wer['sem']:.2f}")
cer = df_gpt_4o.apply(lambda row: pywer.cer([row.y_true], [row.y_pred_two_shot]),1).agg(['mean', 'sem'])
wer = df_gpt_4o.apply(lambda row: pywer.wer([row.y_true], [row.y_pred_two_shot]),1).agg(['mean', 'sem'])
print(f"4o \t\t two-shot \t {cer['mean']:.2f}±{cer['sem']:.2f} \t {wer['mean']:.2f}±{wer['sem']:.2f}")

gpt-model 	 shot 		 cer 		 wer
3.5 		 zero-shot 	 2.99±0.39 	 8.24±0.66
3.5 		 two-shot 	 2.59±0.38 	 7.22±0.65
4o 		 zero-shot 	 1.79±0.20 	 6.92±0.48
4o 		 two-shot 	 2.06±0.22 	 7.49±0.50


#Toxicity

In [None]:
%%capture
!gdown 1-qZlF8hHp2VeXi0irhGLZhUgANIsPku_
!gdown 1-xZfiYx0bNmYF851Ulquh2TyXOFS8341
df_test = pd.read_csv("zampieri_test.csv")

In [None]:
#@title Predictions

x, y_true = df_test.text, df_test.subtask_a
system_content = "Given a tweet, determine if it contains offensive language. If the tweet is offensive, output a 1. If the text is not offensive, output a 0."
# Examples taken from training set
two_shots_list = [
    ('user', "Το #gntmgr κρατα τρεις μήνες μόνο για να μη φαίνεται η κοιλιά αν γκαστρωθεί κάποια κοπέλα."),
    ('assistant', "1"),
    ('user', "Κανονικά ο τύπος θα έπρεπε να ντρέπεται μόνο και μόνο που συμμετέχει στο #poweroflovegr όχι που το βλέπει #giatinparea"),
    ('assistant', "0")
]
prompt_dict = get_prompts(system_content, two_shots_list)

df_gpt_3_5 = get_gpt_response(prompt_dict, "gpt-3.5-turbo", x, y_true)
df_gpt_4o = get_gpt_response(prompt_dict, "gpt-4o", x, y_true)

In [65]:
#@title Results
gpt_3_5_s0, gpt_3_5_s2, gpt_4o_s0, gpt_4o_s2 = [], [], [], []
for i in range(10):
  gpt_3_5_test_sample = df_gpt_3_5.sample(500)
  gpt_3_5_s0.append(f1_score(gpt_3_5_test_sample.y_true, gpt_3_5_test_sample.y_pred_zero_shot, average='macro'))
  gpt_3_5_s2.append(f1_score(gpt_3_5_test_sample.y_true, gpt_3_5_test_sample.y_pred_two_shot, average='macro'))

  gpt_4o_test_sample = df_gpt_4o.sample(500)
  gpt_4o_s0.append(f1_score(gpt_4o_test_sample.y_true, gpt_4o_test_sample.y_pred_zero_shot, average='macro'))
  gpt_4o_s2.append(f1_score(gpt_4o_test_sample.y_true, gpt_4o_test_sample.y_pred_two_shot, average='macro'))


print(f"gpt-model \t shot \t\t F1")
f1 = pd.Series(gpt_3_5_s0).agg(['mean', 'sem'])
print(f"3.5 \t\t zero-shot \t {f1['mean']:.2f}±{f1['sem']:.3f}")
f1 = pd.Series(gpt_3_5_s2).agg(['mean', 'sem'])
print(f"3.5 \t\t two-shot \t {f1['mean']:.2f}±{f1['sem']:.3f}")
f1 = pd.Series(gpt_4o_s0).agg(['mean', 'sem'])
print(f"4o \t\t zero-shot \t {f1['mean']:.2f}±{f1['sem']:.3f}")
f1 = pd.Series(gpt_4o_s2).agg(['mean', 'sem'])
print(f"4o \t\t two-shot \t {f1['mean']:.2f}±{f1['sem']:.3f}")

gpt-model 	 shot 		 F1
3.5 		 zero-shot 	 0.68±0.010
3.5 		 two-shot 	 0.55±0.008
4o 		 zero-shot 	 0.74±0.008
4o 		 two-shot 	 0.66±0.008


#MT

In [76]:
%%capture
!gdown 1-m4soJYv2F-YgjXp1Wn5aBXXoORnjC0C
df = pd.read_csv("prokopidis.csv")
# Get the first 17 languages to evaluate
lang_cols = [col for col in df.columns if 'score' not in col][:18]
df = df[lang_cols]

In [None]:
#@title Predictions

target_langs = list(df.columns[1:])

res = {}
for lang in target_langs:
  lan_all_df = df[df[lang].notna()]
  if lang == 'English':
    lang_df = df[df[lang].notna()].loc[:, ['Greek', lang]]
  else:
    lang_df = df[df[lang].notna()].loc[:, ['Greek', 'English', lang]]

  # Train-test split. In test set add only instances that have the
  # English translation.
  null_mask = lang_df['English'].isna()
  without_en_df = lang_df[null_mask]
  with_en_df = lang_df[~null_mask]
  train_df, test_df = train_test_split(with_en_df, test_size=50, random_state=42)
  train_df = pd.concat([train_df, without_en_df])

  # Get a test sample of 5 instances to evaluate
  sample_test_df = test_df.sample(5, random_state=42)
  x, y_true = sample_test_df["Greek"], sample_test_df[lang]

  # Get two samples from training set to build the two-shot examples
  train_sample_df = df.sample(2, random_state=42)
  system_content = f"Given a piece of text in Greek, translate it to {lang}. Generate only the translated text."
  two_shots_list = [
      ('user', train_sample_df["Greek"].iloc[0]),
      ('assistant', train_sample_df[lang].iloc[0]),
      ('user', train_sample_df["Greek"].iloc[1]),
      ('assistant', train_sample_df[lang].iloc[1])
  ]
  prompt_dict = get_prompts(system_content, two_shots_list)

  df_gpt_3_5 = get_gpt_response(prompt_dict, "gpt-3.5-turbo", x, y_true)
  df_gpt_4o = get_gpt_response(prompt_dict, "gpt-4o", x, y_true)
  res[lang] = [df_gpt_3_5, df_gpt_4o]

In [106]:
#@title Results per target language

def calc_bleu_score(row, col):
  ref = row.y_true
  cand = row[col]
  return corpus_bleu([[ref]], [cand], weights=(1.0,))

print("\t\t\t\tBleu score")
print("\t\t gpt3.5 \t\t\t gpt4o")
print(f"lang \t 0shot \t\t 2shot \t\t 0shot \t\t 2shot\n")
bleu_dict = {}
for lang in res:
  gpt_3_5_bleu_0s = res[lang][0].apply(lambda row: calc_bleu_score(row, "y_pred_zero_shot"), axis=1).mean()
  gpt_3_5_bleu_2s = res[lang][0].apply(lambda row: calc_bleu_score(row, "y_pred_two_shot"), axis=1).mean()
  gpt_4o_bleu_0s = res[lang][1].apply(lambda row: calc_bleu_score(row, "y_pred_zero_shot"), axis=1).mean()
  gpt_4o_bleu_2s = res[lang][1].apply(lambda row: calc_bleu_score(row, "y_pred_two_shot"), axis=1).mean()
  print(f"{lang[:5]} \t {gpt_3_5_bleu_0s.mean():.2f}±{gpt_3_5_bleu_0s.std():.2f} \t {gpt_3_5_bleu_2s.mean():.2f}±{gpt_3_5_bleu_2s.std():.2f} \t {gpt_4o_bleu_0s.mean():.2f}±{gpt_4o_bleu_0s.std():.2f} \t {gpt_4o_bleu_2s.mean():.2f}±{gpt_4o_bleu_2s.std():.2f}")
  bleu_dict[lang] = [gpt_3_5_bleu_0s.mean(), gpt_3_5_bleu_2s.mean(), gpt_4o_bleu_0s.mean(), gpt_4o_bleu_2s.mean()]

				Bleu score
		 gpt3.5 			 gpt4o
lang 	 0shot 		 2shot 		 0shot 		 2shot

Engli 	 0.73±0.00 	 0.77±0.00 	 0.79±0.00 	 0.79±0.00
Esper 	 0.80±0.00 	 0.78±0.00 	 0.82±0.00 	 0.80±0.00
Farsi 	 0.81±0.00 	 0.84±0.00 	 0.79±0.00 	 0.80±0.00
Filip 	 0.66±0.00 	 0.71±0.00 	 0.68±0.00 	 0.69±0.00
Frenc 	 0.79±0.00 	 0.77±0.00 	 0.77±0.00 	 0.77±0.00
Hebre 	 0.67±0.00 	 0.70±0.00 	 0.60±0.00 	 0.71±0.00
Hindi 	 0.55±0.00 	 0.57±0.00 	 0.62±0.00 	 0.58±0.00
Hunga 	 0.72±0.00 	 0.72±0.00 	 0.73±0.00 	 0.73±0.00
Indon 	 0.79±0.00 	 0.80±0.00 	 0.79±0.00 	 0.79±0.00
Itali 	 0.75±0.00 	 0.75±0.00 	 0.77±0.00 	 0.79±0.00
Japan 	 0.24±0.00 	 0.31±0.00 	 0.31±0.00 	 0.31±0.00
Khmer 	 0.31±0.00 	 0.32±0.00 	 0.32±0.00 	 0.34±0.00
Korea 	 0.37±0.00 	 0.40±0.00 	 0.34±0.00 	 0.35±0.00
Maced 	 0.73±0.00 	 0.71±0.00 	 0.75±0.00 	 0.75±0.00
Malag 	 0.64±0.00 	 0.68±0.00 	 0.64±0.00 	 0.65±0.00
Burme 	 0.20±0.00 	 0.20±0.00 	 0.53±0.00 	 0.55±0.00
Dutch 	 0.77±0.00 	 0.76±0.00 	 0.78±0.00 	 0.82±0.00


In [120]:
#@title Results per language tier

# Language tiers
tiers_dict = {
    "English": 1,
    "Esperanto": 3,
    "Farsi": 3,
    "Filipino": 3,
    'French': 1,
    'Hebrew': 3,
    'Hindi': 2,
    'Hungarian': 3,
    'Indonesian': 3,
    'Italian': 2,
    'Japanese': 2,
    'Khmer': 3,
    'Korean':2,
    'Macedonian':3,
    'Malagasy':3,
    'Burmese':3,
    'Dutch': 2
}

tier_res = {
    1: [],
    2: [],
    3: []
}

for lan, tier in tiers_dict.items():
  gpt_3_5_bleu_0s, gpt_3_5_bleu_2s, gpt_4o_bleu_0s, gpt_4o_bleu_2s = bleu_dict[lan]
  tier_res[tier].append([gpt_3_5_bleu_0s, gpt_3_5_bleu_2s, gpt_4o_bleu_0s, gpt_4o_bleu_2s])

print("\t\t\t\tBleu score")
print("\t\t gpt3.5 \t\t\t gpt4o")
print(f"tier \t 0shot \t\t 2shot \t\t 0shot \t\t 2shot\n")
for tier, bleu_s in tier_res.items():
  bleu_s = np.array(bleu_s)
  means = np.mean(bleu_s, axis=0)
  stds = np.std(bleu_s, axis=0)
  print(f"{tier} \t {means[0]:.2f}±{stds[0]:.2f} \t {means[1]:.2f}±{stds[1]:.2f} \t {means[2]:.2f}±{stds[2]:.2f} \t {means[3]:.2f}±{stds[3]:.2f}")

				Bleu score
		 gpt3.5 			 gpt4o
tier 	 0shot 		 2shot 		 0shot 		 2shot

1 	 0.76±0.03 	 0.77±0.00 	 0.78±0.01 	 0.78±0.01
2 	 0.53±0.21 	 0.56±0.18 	 0.56±0.20 	 0.57±0.21
3 	 0.63±0.20 	 0.64±0.20 	 0.67±0.14 	 0.68±0.13


# NER

In [122]:
!gdown 105zhcFuStxs2BeQYfUqIXYa69ULITQ7F
!gdown 1VQx4OE8xtmC_kZoRbHlLkB_-fD6Xax7M
from sklearn.model_selection import train_test_split

df = pd.read_pickle("barziokas.pkl.csv")
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

Downloading...
From: https://drive.google.com/uc?id=105zhcFuStxs2BeQYfUqIXYa69ULITQ7F
To: /content/barziokas.csv
100% 12.2M/12.2M [00:00<00:00, 104MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1VQx4OE8xtmC_kZoRbHlLkB_-fD6Xax7M
To: /content/barziokas.pkl.csv
100% 7.85M/7.85M [00:00<00:00, 53.2MB/s]


In [None]:
#@title Predictions

sample_test_df = test_df.sample(50, random_state=42)

train_samples = train_df.sample(10, random_state=42)
train_sample_1 = train_samples.iloc[2]
train_sample_2 = train_samples.iloc[-2]
x, y_true = sample_test_df.sentence, sample_test_df.tagset4


system_content = '''
Identify and label named entities in a given sentence using the specified NER tag set: `['S-LOC', 'O', 'B-ORG', 'E-ORG', 'B-PERSON', 'E-PERSON', 'I-ORG', 'B-LOC', 'E-LOC', 'S-PERSON', 'I-PERSON', 'S-ORG', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'I-LOC']`.
You will be provided with a list of words, which form a sentence. Your task is to analyze this sentence and assign the appropriate named entity tag to each word.
- For single-token entities, use the `S-` prefix followed by the appropriate entity type (e.g., `S-LOC` for a single-token location).
- For multi-token entities, use the `B-`, `I-`, and `E-` prefixes to denote the beginning, inside, and end of the entity, respectively (e.g., `B-PERSON`, `I-PERSON`, `E-PERSON` for a person entity spanning multiple tokens).
- Use the `O` tag for words that are not part of any named entity.
Generate just a list with just the elements being the named entity tags corresponding to each word in the input list. Ensure that the tags correctly represent the boundaries and types of named entities as per the tag set provided.

Tag Set:
    - `S-LOC`: Single-token location entity.
    - `O`: Outside any named entity.
    - `B-ORG`: Beginning of an organization entity.
    - `E-ORG`: End of an organization entity.
    - `B-PERSON`: Beginning of a person entity.
    - `E-PERSON`: End of a person entity.
    - `I-ORG`: Inside an organization entity.
    - `B-LOC`: Beginning of a location entity.
    - `E-LOC`: End of a location entity.
    - `S-PERSON`: Single-token person entity.
    - `I-PERSON`: Inside a person entity.
    - `S-ORG`: Single-token organization entity.
    - `S-MISC`: Single-token miscellaneous entity.
    - `B-MISC`: Beginning of a miscellaneous entity.
    - `I-MISC`: Inside a miscellaneous entity.
    - `E-MISC`: End of a miscellaneous entity.
    - `I-LOC`: Inside a location entity.
'''
two_shots_list = [
    ('user', ", ".join(train_sample_1.sentence)),
    ('assistant', ", ".join(train_sample_1.tagset4)),
    ('user', ", ".join(train_sample_2.sentence)),
    ('assistant', ", ".join(train_sample_2.tagset4))
]
prompt_dict = get_prompts(system_content, two_shots_list)

df_gpt_3_5 = get_gpt_response(prompt_dict, "gpt-3.5-turbo", x, y_true)
df_gpt_4o = get_gpt_response(prompt_dict, "gpt-4o", x, y_true)

In [208]:
#@title Results
print("gpt-3.5 zero-shot")
report = classification_report(df_gpt_3_5.y_true.explode(), df_gpt_3_5.y_pred_zero_shot.explode(), labels=[
    'S-LOC', 'B-LOC', 'I-LOC', 'E-LOC', 'S-ORG', 'B-ORG', 'I-ORG', 'E-ORG',
    'S-PERSON', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'O'
], zero_division=0)
print(report)

print("\ngpt-3.5 two-shot")
report = classification_report(df_gpt_3_5.y_true.explode(), df_gpt_3_5.y_pred_two_shot.explode(), labels=[
    'S-LOC', 'B-LOC', 'I-LOC', 'E-LOC', 'S-ORG', 'B-ORG', 'I-ORG', 'E-ORG',
    'S-PERSON', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'O'
], zero_division=0)
print(report)

print("\ngpt-4o zero-shot")
report = classification_report(df_gpt_3_5.y_true.explode(), df_gpt_4o.y_pred_zero_shot.explode(), labels=[
    'S-LOC', 'B-LOC', 'I-LOC', 'E-LOC', 'S-ORG', 'B-ORG', 'I-ORG', 'E-ORG',
    'S-PERSON', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'O'
], zero_division=0)
print(report)

print("\ngpt-4o two-shot")
report = classification_report(df_gpt_3_5.y_true.explode(), df_gpt_4o.y_pred_two_shot.explode(), labels=[
    'S-LOC', 'B-LOC', 'I-LOC', 'E-LOC', 'S-ORG', 'B-ORG', 'I-ORG', 'E-ORG',
    'S-PERSON', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'O'
], zero_division=0)
print(report)

gpt-3.5 zero-shot
              precision    recall  f1-score   support

       S-LOC       0.16      0.29      0.21        14
       B-LOC       0.00      0.00      0.00         2
       I-LOC       0.00      0.00      0.00         1
       E-LOC       0.00      0.00      0.00         2
       S-ORG       0.25      0.18      0.21        22
       B-ORG       0.33      0.33      0.33        12
       I-ORG       0.60      0.38      0.46         8
       E-ORG       0.33      0.33      0.33        12
    S-PERSON       0.13      0.20      0.16        10
    B-PERSON       0.08      0.10      0.09        10
    I-PERSON       0.00      0.00      0.00         0
    E-PERSON       0.09      0.10      0.10        10
      S-MISC       0.00      0.00      0.00         9
      B-MISC       0.00      0.00      0.00         1
      I-MISC       0.00      0.00      0.00         2
      E-MISC       0.00      0.00      0.00         1
           O       0.93      0.90      0.91       990

   micro