In [None]:
%pip install openai

In [15]:
import pandas as pd
import json
import numpy as np
from time import sleep
import time
from openai import OpenAI
from tqdm import tqdm
client = OpenAI(api_key=open('API_key').read())

In [16]:
# The code to open the test dataset - copied from the directory `systems/web`
import json

import re
mention_re=re.compile(r'@\S+')
hashtag_re=re.compile(r'#\S+')
url_re=re.compile(r'http\S+')
space_re=re.compile(r'\s+')

def clean_tweets(text):
    return space_re.sub(' ',url_re.sub(' ',hashtag_re.sub(' ',mention_re.sub(' ',text))).replace(' RT ',' ').lower()).strip()


def load_twitter_dataset(categories=['bs','hr','sr','me']):
    twitter_dataset=json.load(open('../../data/Twitter-HBS.json'))
    X={'train':[],'dev':[],'test':[]}
    y={'train':[],'dev':[],'test':[]}
    for instance in twitter_dataset:
        lang=instance['language']
        split=instance['split']
        if lang in categories:
            y[split].append(lang)
            X[split].append(clean_tweets(' '.join(instance['tweets'])))
    return X,y

def load_setimes_dataset():
    setimes_dataset=json.load(open('../../data/SETimes.HBS.json'))
    X={'train':[],'dev':[],'test':[]}
    y={'train':[],'dev':[],'test':[]}
    for instance in setimes_dataset:
        lang=instance['language']
        split=instance['split']
        y[split].append(lang)
        X[split].append(instance['text'].lower())
    return X,y

LABELS = ["hr", "bs", "sr", "me"]

In [17]:
twitter_X,twitter_y=load_twitter_dataset()
twitter3_X,twitter3_y=load_twitter_dataset(['bs','hr','sr'])
setimes_X,setimes_y=load_setimes_dataset()

twitter3_test = pd.DataFrame({"labels": twitter3_y["test"], "text": twitter3_X["test"]})

twitter_test = pd.DataFrame({"labels": twitter_y["test"], "text": twitter_X["test"]})

setimes_test = pd.DataFrame({"labels": setimes_y["test"], "text": setimes_X["test"]})

print(twitter3_test.shape, twitter_test.shape, setimes_test.shape)

(112, 2) (123, 2) (921, 2)


In [18]:
display(twitter_test.labels.value_counts())

twitter_test.head(3)

labels
sr    79
hr    18
bs    15
me    11
Name: count, dtype: int64

Unnamed: 0,labels,text
0,sr,oj srbijo medju sljivama :d uvek je premnogo l...
1,sr,"kiseli kupus, pozz ;3 ja uvek zaboravim da se ..."
2,sr,jos 263 dana do pocetka leta. da borka pavicev...


In [19]:
display(twitter3_test.labels.value_counts())

twitter3_test.head(3)

labels
sr    79
hr    18
bs    15
Name: count, dtype: int64

Unnamed: 0,labels,text
0,sr,oj srbijo medju sljivama :d uvek je premnogo l...
1,sr,"kiseli kupus, pozz ;3 ja uvek zaboravim da se ..."
2,sr,jos 263 dana do pocetka leta. da borka pavicev...


In [20]:
display(setimes_test.labels.value_counts())

setimes_test.head(3)

labels
hr    313
bs    312
sr    296
Name: count, dtype: int64

Unnamed: 0,labels,text
0,sr,diplomatski dnevnik: makedonija i nemačka razg...
1,sr,skulpture su postavljene u specijalno projekto...
2,sr,kosovski zvaničnici aktivno raspravljaju o reš...


In [24]:
def predict_gpt(df_test_name, gpt_model):
	"""
	Args:
	- df_test_name: Can be either setimes, twitter, or twitter3.
	"""

	df_dict = {
		"setimes": setimes_test,
		"twitter": twitter_test,
		"twitter3": twitter3_test
	}

	df = df_dict[df_test_name]

	responses = []
	
	texts = df["text"].to_list()

	start_time = time.time()

	labels_dict = {"Croatian": 1, "Serbian": 2, "Montenegrin": 3, "Bosnian": 4}
	final_labels_dict = {1: "hr", 2: "sr", 3: "me", 4: "bs"}

	for text in texts:
		try:
			completion = client.chat.completions.create(model=gpt_model,
			response_format= {"type": "json_object"},
			messages= [
			{
				"role": "user",
				"content": f"""
				### Task
				Your task is to determine which South Slavic language is used in the provided text: Croatian, Serbian, Montenegrin, or Bosnian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian dialects), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. If multiple language labels are plausible, choose the most likely one based on the text content. Always provide a language label, even if you are not sure.

				### Output format
					Return a valid JSON dictionary with the following key: 'lang' and a value should be an integer which represents one of the labels according to the following dictionary: {labels_dict}.

					
					Text: '{text}'
			"""
				}
			],
			temperature = 0)

			response=completion.choices[0].message.content

			response = response.replace("\n", "")
			response = response.replace("\t", "")

			# Convert the string into a dictionary
			response = json.loads(response)

			# Get out a label
			try:
				predicted = final_labels_dict[response["lang"]]
				responses.append(predicted)
			# add a possibility of something going wrong
			except:
				predicted = "error"
				print("error with extracting a label - issue with parsing the prediction")
				responses.append(predicted)
		except:
			predicted = "error"
			print("error with extracting a label - issue with the text")
			responses.append(predicted)

	end_time = time.time()
	elapsed_time_min = end_time-start_time

	print(f"Prediction finished. It took {elapsed_time_min/60} min for {df.shape[0]} instances - {elapsed_time_min/df.shape[0]} s per instance.")

	cur_results = {
			'train':'NA (zero-shot)',
			'test': df_test_name,
			'predictions': responses }

	return cur_results

In [25]:
for model in ["gpt-4o-2024-08-06", "gpt-3.5-turbo-0125", "gpt-4o-mini-2024-07-18"]:
	print(model)

	final_results = {
		"system": model,
		"predictions": []
	}

	for test in ["setimes", "twitter", "twitter3"]:
		cur_results = predict_gpt(test, model)
		final_results["predictions"].append(cur_results)

	# Save the results as a new json
	with open("{}.predictions.json".format(model), "w") as file:
		json.dump(final_results, file)


gpt-4o-2024-08-06
Prediction finished. It took 10.19646878639857 min for 921 instances - 0.6642650675178221 s per instance.
Prediction finished. It took 2.3601268410682676 min for 123 instances - 1.15128138588696 s per instance.
Prediction finished. It took 2.0119425296783446 min for 112 instances - 1.0778263551848275 s per instance.
gpt-3.5-turbo-0125
Prediction finished. It took 8.697613469759624 min for 921 instances - 0.5666197700169135 s per instance.
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracting a label - issue with the text
error with extracti