In [None]:
# Install the OpenAI Python client used by the cells below (note: no version is pinned here).
%pip install openai

In [1]:
import pandas as pd
import json
import numpy as np
from time import sleep
import time
from openai import OpenAI
from tqdm import tqdm
# Read the API key from the local `API_key` file. The `with` block closes the file handle
# immediately, and surrounding whitespace (typically a trailing newline left by an editor)
# is stripped so that it cannot end up inside the key sent to the API.
with open('API_key') as key_file:
    client = OpenAI(api_key=key_file.read().strip())

In [2]:
# The code to open the test dataset - copied from the directory `systems/web`
import json

import re
# Patterns for the Twitter-specific noise that is blanked out before classification.
mention_re = re.compile(r'@\S+')   # "@" followed by any run of non-whitespace
hashtag_re = re.compile(r'#\S+')   # "#" followed by any run of non-whitespace
url_re = re.compile(r'http\S+')    # "http" followed by any run of non-whitespace
space_re = re.compile(r'\s+')      # one or more whitespace characters


def clean_tweets(text):
    """Blank out mentions, hashtags, URLs and ' RT ' markers, lowercase, and squeeze whitespace."""
    cleaned = text
    # Applied in this order: mentions, hashtags, URLs - each match becomes a single space.
    for noise_re in (mention_re, hashtag_re, url_re):
        cleaned = noise_re.sub(' ', cleaned)
    # ' RT ' is matched case-sensitively (and only when surrounded by spaces),
    # so it has to be removed before the text is lowercased.
    cleaned = cleaned.replace(' RT ', ' ').lower()
    # Collapse every whitespace run to one space and trim both ends.
    return space_re.sub(' ', cleaned).strip()


def load_twitter_dataset(categories=['bs','hr','sr','me'], path='../../data/Twitter-HBS.json'):
    """Load the Twitter-HBS corpus and group its instances by data split.

    Args:
        categories: Language codes to keep; instances whose 'language' is not in this
            collection are skipped. The default list is only read, never modified.
        path: Location of the Twitter-HBS JSON file. Defaults to the path that was
            previously hard-coded.

    Returns:
        A pair ``(X, y)`` of dicts keyed by 'train', 'dev' and 'test'. ``X[split]`` holds
        one string per kept instance (its 'tweets' joined with single spaces and passed
        through ``clean_tweets``); ``y[split]`` holds the matching language codes in the
        same order.

    Raises:
        KeyError: If a kept instance has a 'split' other than 'train', 'dev' or 'test',
            or if an instance lacks one of the keys read here.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default, and let the
    # `with` block close the handle (the previous bare open() never closed it).
    with open(path, encoding='utf-8') as dataset_file:
        twitter_dataset = json.load(dataset_file)

    X = {'train': [], 'dev': [], 'test': []}
    y = {'train': [], 'dev': [], 'test': []}
    for instance in twitter_dataset:
        lang = instance['language']
        split = instance['split']
        if lang in categories:
            y[split].append(lang)
            # All tweets of one instance are merged into a single document before cleaning.
            X[split].append(clean_tweets(' '.join(instance['tweets'])))
    return X, y

def load_setimes_dataset(path='../../data/SETimes.HBS.json'):
    """Load the SETimes.HBS corpus and group its instances by data split.

    Args:
        path: Location of the SETimes.HBS JSON file. Defaults to the path that was
            previously hard-coded.

    Returns:
        A pair ``(X, y)`` of dicts keyed by 'train', 'dev' and 'test'. ``X[split]`` holds
        the lowercased 'text' of every instance of that split; ``y[split]`` holds the
        matching 'language' values in the same order.

    Raises:
        KeyError: If an instance has a 'split' other than 'train', 'dev' or 'test',
            or lacks one of the keys read here.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default, and let the
    # `with` block close the handle (the previous bare open() never closed it).
    with open(path, encoding='utf-8') as dataset_file:
        setimes_dataset = json.load(dataset_file)

    X = {'train': [], 'dev': [], 'test': []}
    y = {'train': [], 'dev': [], 'test': []}
    for instance in setimes_dataset:
        lang = instance['language']
        split = instance['split']
        y[split].append(lang)
        X[split].append(instance['text'].lower())
    return X, y

In [3]:
# Load each corpus once; every loader returns a (texts, labels) pair of dicts keyed by split.
twitter_X, twitter_y = load_twitter_dataset()
twitter3_X, twitter3_y = load_twitter_dataset(['bs', 'hr', 'sr'])
setimes_X, setimes_y = load_setimes_dataset()

# Only the test split is turned into a DataFrame: one row per instance, gold label + text.
twitter3_test = pd.DataFrame(dict(labels=twitter3_y["test"], text=twitter3_X["test"]))
twitter_test = pd.DataFrame(dict(labels=twitter_y["test"], text=twitter_X["test"]))
setimes_test = pd.DataFrame(dict(labels=setimes_y["test"], text=setimes_X["test"]))

# Quick size check of the three test sets.
print(twitter3_test.shape, twitter_test.shape, setimes_test.shape)

(112, 2) (123, 2) (921, 2)


In [4]:
# Label distribution of the four-language Twitter test set, then a preview of its first rows.
display(twitter_test["labels"].value_counts())

twitter_test.head(n=3)

labels
sr    79
hr    18
bs    15
me    11
Name: count, dtype: int64

Unnamed: 0,labels,text
0,sr,oj srbijo medju sljivama :d uvek je premnogo l...
1,sr,"kiseli kupus, pozz ;3 ja uvek zaboravim da se ..."
2,sr,jos 263 dana do pocetka leta. da borka pavicev...


In [5]:
# Label distribution of the three-language Twitter test set, then a preview of its first rows.
display(twitter3_test["labels"].value_counts())

twitter3_test.head(n=3)

labels
sr    79
hr    18
bs    15
Name: count, dtype: int64

Unnamed: 0,labels,text
0,sr,oj srbijo medju sljivama :d uvek je premnogo l...
1,sr,"kiseli kupus, pozz ;3 ja uvek zaboravim da se ..."
2,sr,jos 263 dana do pocetka leta. da borka pavicev...


In [6]:
# Label distribution of the SETimes test set, then a preview of its first rows.
display(setimes_test["labels"].value_counts())

setimes_test.head(n=3)

labels
hr    313
bs    312
sr    296
Name: count, dtype: int64

Unnamed: 0,labels,text
0,sr,diplomatski dnevnik: makedonija i nemačka razg...
1,sr,skulpture su postavljene u specijalno projekto...
2,sr,kosovski zvaničnici aktivno raspravljaju o reš...


In [7]:
def predict_gpt(df_test_name, gpt_model):
	"""
	Classify every text of one test set with an OpenAI chat model (zero-shot).

	One chat-completion request is sent per text. The model is asked to answer with a
	JSON dictionary whose 'lang' value is an integer label code; that code is mapped to
	a language code ("bs", "hr", "sr" or "me"). A text for which the request fails, the
	reply cannot be parsed, or the returned code is not one of the labels offered for
	this dataset is recorded as "error", so the output always has one entry per text.

	Args:
	- df_test_name: Can be either setimes, twitter, or twitter3.
	- gpt_model: Name of the chat model, passed as `model` to the completions API.

	Returns:
	- A dictionary with the keys 'train' (always 'NA (zero-shot)'), 'test' (df_test_name)
	  and 'predictions' (list of predicted language codes / "error", in dataset order).

	Raises:
	- KeyError: if df_test_name is not one of the three supported dataset names.
	"""
	LABELS_3 = {"Bosnian": 1, "Croatian": 2, "Serbian": 3, }
	LABELS_4 = {"Bosnian": 1, "Croatian": 2, "Serbian": 3, "Montenegrin": 4}

	# NOTE: the whitespace inside the prompt strings is part of the text sent to the model;
	# do not re-indent these literals.
	df_dict = {
		"setimes": {
			"dataset": setimes_test,
			"labels": LABELS_3,
			"prompt": """
				### Task
					You will be provided with a news article text. Your task is to determine which South Slavic language is used in the provided text: Bosnian, Croatian, or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants) and vocabulary choices (e.g., hljeb vs. kruh). Do not base your decision on specific local or national references - the decision should be based solely on linguistic differences and vocabulary choices. Always provide a language label, even if you are not sure.
				"""
			},
		"twitter": {
			"dataset": twitter_test,
			"labels": LABELS_4,
			"prompt": """
				### Task
					You will be provided with a collection of social media texts written by the same author. Your task is to determine which South Slavic language is used in the provided texts: Bosnian, Croatian, Montenegrin or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. Always provide a language label, even if you are not sure.
				"""
			},
		"twitter3": {
			"dataset": twitter3_test,
			"labels": LABELS_3,
			"prompt": """
				### Task
					You will be provided with a collection of social media texts written by the same author. Your task is to determine which South Slavic language is used in the provided texts: Bosnian, Croatian, or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. Always provide a language label, even if you are not sure.
				"""
			}
	}

	print(df_test_name)
	df = df_dict[df_test_name]["dataset"]

	labels_dict = df_dict[df_test_name]["labels"]
	print(labels_dict)

	final_labels_dict = {1: "bs", 2: "hr", 3: "sr", 4: "me"}
	# Only the codes offered in this dataset's prompt count as valid answers. Without this
	# restriction a stray "4" on a three-language dataset would silently become "me".
	valid_labels = {code: final_labels_dict[code] for code in labels_dict.values()}

	prompt = df_dict[df_test_name]["prompt"]
	print(prompt)

	responses = []

	texts = df["text"].to_list()

	start_time = time.time()

	for text in tqdm(texts):
		# `except Exception` (rather than a bare `except:`) keeps the best-effort behaviour
		# for API and parsing failures while still letting Ctrl-C / kernel interrupts stop
		# the loop.
		try:
			completion = client.chat.completions.create(
				model=gpt_model,
				response_format={"type": "json_object"},
				messages=[
					{
						"role": "user",
						"content": prompt + f"""

				### Output format
					Return a valid JSON dictionary with the following key: 'lang' and a value should be an integer which represents one of the labels according to the following dictionary: {labels_dict}.

					
					Text: '{text}'
			"""
					}
				],
				temperature=0)

			response = completion.choices[0].message.content

			response = response.replace("\n", "")
			response = response.replace("\t", "")

			# Convert the string into a dictionary
			response = json.loads(response)

			# Get out a label
			try:
				predicted = valid_labels[response["lang"]]
				responses.append(predicted)
			# The reply was valid JSON but did not contain a usable label code
			except Exception as err:
				predicted = "error"
				print(f"error with extracting a label - issue with parsing the prediction: {err!r}")
				responses.append(predicted)
		# The request itself failed or the reply was not valid JSON
		except Exception as err:
			predicted = "error"
			print(f"error with extracting a label - issue with the text: {err!r}")
			responses.append(predicted)

	# time.time() differences are in seconds; minutes are derived only for the report.
	elapsed_time_s = time.time() - start_time
	n_instances = df.shape[0]
	# Guard against an empty dataset so the summary line cannot divide by zero.
	s_per_instance = elapsed_time_s / n_instances if n_instances else 0.0

	print(f"Prediction finished. It took {elapsed_time_s/60} min for {n_instances} instances - {s_per_instance} s per instance.")

	cur_results = {
			'train':'NA (zero-shot)',
			'test': df_test_name,
			'predictions': responses }

	return cur_results

In [8]:
# Evaluate each model on all three test sets and store its predictions in one JSON file.
for model in ("gpt-4o-2024-08-06", "gpt-4o-mini-2024-07-18"):
	print(model)

	final_results = {"system": model, "predictions": []}

	# Order matters for the output file: twitter, then twitter3, then setimes.
	for test in ("twitter", "twitter3", "setimes"):
		cur_results = predict_gpt(test, model)
		final_results["predictions"] += [cur_results]

	# Save the results as a new json, named after the model
	with open(f"{model}.predictions.json", "w") as file:
		json.dump(final_results, file)


gpt-4o-2024-08-06
twitter
{'Bosnian': 1, 'Croatian': 2, 'Serbian': 3, 'Montenegrin': 4}

				### Task
					You will be provided with a collection of social media texts written by the same author. Your task is to determine which South Slavic language is used in the provided texts: Bosnian, Croatian, Montenegrin or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. Always provide a language label, even if you are not sure.
				


100%|██████████| 123/123 [02:33<00:00,  1.25s/it]


Prediction finished. It took 2.5597654700279238 min for 123 instances - 1.2486660829404506 s per instance.
twitter3
{'Bosnian': 1, 'Croatian': 2, 'Serbian': 3}

				### Task
					You will be provided with a collection of social media texts written by the same author. Your task is to determine which South Slavic language is used in the provided texts: Bosnian, Croatian, or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. Always provide a language label, even if you are not sure.
				


100%|██████████| 112/112 [02:30<00:00,  1.34s/it]


Prediction finished. It took 2.501479657491048 min for 112 instances - 1.340078387941633 s per instance.
setimes
{'Bosnian': 1, 'Croatian': 2, 'Serbian': 3}

				### Task
					You will be provided with a news article text. Your task is to determine which South Slavic language is used in the provided text: Bosnian, Croatian, or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants) and vocabulary choices (e.g., hljeb vs. kruh). Do not base your decision on specific local or national references - the decision should be based solely on linguistic differences and vocabulary choices. Always provide a language label, even if you are not sure.
				


100%|██████████| 921/921 [10:38<00:00,  1.44it/s]


Prediction finished. It took 10.636692833900451 min for 921 instances - 0.69294415856029 s per instance.
gpt-4o-mini-2024-07-18
twitter
{'Bosnian': 1, 'Croatian': 2, 'Serbian': 3, 'Montenegrin': 4}

				### Task
					You will be provided with a collection of social media texts written by the same author. Your task is to determine which South Slavic language is used in the provided texts: Bosnian, Croatian, Montenegrin or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. Always provide a language label, even if you are not sure.
				


100%|██████████| 123/123 [02:57<00:00,  1.44s/it]


Prediction finished. It took 2.9570902824401855 min for 123 instances - 1.4424830646049687 s per instance.
twitter3
{'Bosnian': 1, 'Croatian': 2, 'Serbian': 3}

				### Task
					You will be provided with a collection of social media texts written by the same author. Your task is to determine which South Slavic language is used in the provided texts: Bosnian, Croatian, or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants), vocabulary choices (e.g., hljeb vs. kruh) and any specific local or national references. Always provide a language label, even if you are not sure.
				


100%|██████████| 112/112 [02:57<00:00,  1.58s/it]


Prediction finished. It took 2.9507838169733684 min for 112 instances - 1.5807770448071616 s per instance.
setimes
{'Bosnian': 1, 'Croatian': 2, 'Serbian': 3}

				### Task
					You will be provided with a news article text. Your task is to determine which South Slavic language is used in the provided text: Bosnian, Croatian, or Serbian. Your choice should be based on the linguistic differences (e.g., the use of ijekavian vs. ekavian variants) and vocabulary choices (e.g., hljeb vs. kruh). Do not base your decision on specific local or national references - the decision should be based solely on linguistic differences and vocabulary choices. Always provide a language label, even if you are not sure.
				


100%|██████████| 921/921 [11:50<00:00,  1.30it/s]

Prediction finished. It took 11.848440170288086 min for 921 instances - 0.7718853531132304 s per instance.



