In [1]:
import pandas as pd
import openai
import random
from collections import defaultdict


In [None]:
# Load dataset; the code below reads its "Language" and "Sentence" columns.
df = pd.read_csv("genalaskan2k.csv")

# Required number of sentences per language. The positional 30/40/30 split
# applied to language_data later in the notebook relies on this exact count.
EXPECTED_SENTENCES_PER_LANGUAGE = 100

# Group sentences by language, keeping file order within each language.
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which constructs a full Series for every row just to read two fields.
language_data = defaultdict(list)
for language, sentence in zip(df["Language"], df["Sentence"]):
    language_data[language].append(sentence)

# Ensure each language has exactly the expected number of sentences
for lang, sentences in language_data.items():
    assert len(sentences) == EXPECTED_SENTENCES_PER_LANGUAGE, (
        f"Language {lang} has {len(sentences)} sentences, "
        f"expected {EXPECTED_SENTENCES_PER_LANGUAGE}."
    )
    

In [3]:
# Split dataset based on 3-4-3 setup: each language's sentence list is cut
# positionally into three consecutive, non-overlapping slices.
def _slice_by_language(start, stop=None):
    """Return {language: sentences[start:stop]} for every entry in language_data."""
    return {name: items[start:stop] for name, items in language_data.items()}


zero_shot_samples = _slice_by_language(0, 30)    # first 30 sentences
few_shot_samples = _slice_by_language(30, 70)    # next 40, used as prompt examples
validation_samples = _slice_by_language(70)      # everything from index 70 on, for the final test


In [None]:
# Initialize OpenAI client (for API version 1.0.0+).
# No key is written in the notebook: when `api_key` is omitted the client reads
# it from the OPENAI_API_KEY environment variable, so the secret never ends up
# in the saved .ipynb or in version control. Export OPENAI_API_KEY before
# starting the kernel; construction fails immediately if it is missing.
client = openai.OpenAI()

In [None]:
def query_gpt(prompt, model="gpt-4o-mini"):
    """Send `prompt` as a single user message and return the model's text reply.

    Args:
        prompt: Text placed in the one user message of the chat request.
        model: Chat model name passed to the API (e.g. GPT-4o or GPT-4o-mini).
            Defaults to "gpt-4o-mini", the value that used to be hard-coded,
            so existing one-argument calls behave as before.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or "" when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content is optional in the response; calling .strip() on
    # None would raise AttributeError and abort a long evaluation loop.
    content = response.choices[0].message.content
    return content.strip() if content is not None else ""

In [6]:
# Zero-Shot Evaluation
def _zero_shot_prompt(sentence):
    """Build the zero-shot language-identification prompt for one sentence."""
    return f"You are a linguistics expert who knows every single language that exists in this world. What language is this sentence in?\n\nSentence: {sentence}. Reply with only the language itself and nothing else."


# One row per queried sentence: [true language, sentence, model's answer].
# Rows are appended one at a time so finished rows survive a failure mid-run.
zero_shot_results = []
for true_lang in zero_shot_samples:
    for text in zero_shot_samples[true_lang]:
        zero_shot_results.append([true_lang, text, query_gpt(_zero_shot_prompt(text))])

In [None]:
# Few-Shot Learning with GPT
#
# The labelled examples are drawn from EVERY language's few-shot pool. Building
# them only from the pool of the sentence's own (true) language would put the
# correct label on every example in the prompt, leaking the answer to the model
# and inflating few-shot accuracy.

# Cap on examples taken per language; None uses each language's whole pool.
# Lower it to shrink the prompt (and the token cost of every request).
FEW_SHOT_EXAMPLES_PER_LANGUAGE = None

labelled_examples = [
    (example, example_lang)
    for example_lang, pool in few_shot_samples.items()
    for example in pool[:FEW_SHOT_EXAMPLES_PER_LANGUAGE]
]

# Format the few-shot examples once: the block is identical for every test
# sentence, so it is built outside the evaluation loops.
examples = "\n".join(
    f"Example {i+1}: {sent} (Language: {example_lang})"
    for i, (sent, example_lang) in enumerate(labelled_examples)
)

# One row per queried sentence: [true language, sentence, model's answer].
few_shot_results = []
for lang, test_sentences in validation_samples.items():
    for sentence in test_sentences:
        prompt = f"""
        Here are examples of sentences and their languages:
        {examples}
        
        You are a linguistics expert who knows every single language that exists in this world. Now, what language is this sentence in?\n\nSentence: {sentence}. Reply with only the language itself and nothing else.
        """
        predicted_lang = query_gpt(prompt)
        few_shot_results.append([lang, sentence, predicted_lang])
        

In [8]:
# Convert results to DataFrames; both result lists share the same row layout.
result_columns = ["True Language", "Sentence", "Predicted Language"]
zero_shot_df = pd.DataFrame(zero_shot_results, columns=result_columns)
few_shot_df = pd.DataFrame(few_shot_results, columns=result_columns)

# Save outputs (both frames are built before either file is written)
csv_outputs = (
    (zero_shot_df, "zero_shot_results2k_mini.csv"),
    (few_shot_df, "few_shot_results2k_mini.csv"),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

print("Experiment complete! Results saved.")


Experiment complete! Results saved.
