<a href="https://colab.research.google.com/github/doanhieung/colab_notebooks/blob/main/Label_IMDb_unsupervised_dataset_with_LLama_3_1_8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import dependencies
import os
import random
import pandas as pd
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from together import Together
from datasets import load_dataset

# Define configuration shared by the cells below
# Dataset identifier passed to load_dataset.
DATASET = "stanfordnlp/imdb"
# Seed for the random module so the sampled subset is reproducible.
SEED = 42
# Number of unsupervised reviews to sample and send to the LLM.
NUM_EXAMPLES = 5000
# Model name sent with every chat completion request.
MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
# Sampling temperature sent with every request.
# NOTE: the identifier is a misspelling of "temperature"; it is kept as-is
# because the labeling cell refers to this exact name.
TEMPARATURE = 0
# Value passed as max_tokens with every request.
MAX_TOKENS = 256

# Load Together.ai client
# load_dotenv() loads variables from a .env file into the environment; its
# return value is not needed, hence the throwaway name.
# NOTE(review): Together() is called without arguments, so it presumably
# picks up the API key from the environment — confirm against the SDK docs.
_ = load_dotenv()
client = Together()

In [None]:
# Load IMDb dataset
# Only the "unsupervised" split is requested; the cells below sample reviews
# from it and have the LLM assign sentiment labels to them.
dataset = load_dataset(DATASET, split="unsupervised")

In [None]:
# Draw NUM_EXAMPLES distinct rows at random from the unsupervised split; these
# are the reviews the LLM will label. Seeding first makes the draw repeatable.
random.seed(SEED)
population = range(len(dataset))
random_indices = random.sample(population, k=NUM_EXAMPLES)
selected_examples = [dataset[row_index] for row_index in random_indices]

In [None]:
# Call Together.ai API to get the labels
# Each response is written to its own file under ./results/, keyed by the
# example's position in selected_examples, so an interrupted run can be resumed
# without re-querying reviews that already have a saved answer.
prompt = """Analyze the sentiment of the following movie review and classify them as POSITIVE or NEGATIVE.

Only output the sentiment of the review. Do not include any other information.

Movie Review:
```
{input}
```
Sentiment: """

# Create the output directory up front: open(..., "w") does not create missing
# parent directories and would otherwise fail on the very first review.
os.makedirs("./results", exist_ok=True)

for index, example in enumerate(tqdm(selected_examples)):
    save_path = f"./results/{index}.txt"
    if os.path.exists(save_path):
        # Already answered in a previous run.
        continue
    # Deliberately not named `input`, which would shadow the builtin for the
    # rest of the notebook session.
    review_text = example["text"]
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt.format(input=review_text)}],
        model=MODEL,
        temperature=TEMPARATURE,
        max_tokens=MAX_TOKENS,
    )
    response = chat_completion.choices[0].message.content
    # Save the raw model response; an explicit encoding keeps the write from
    # depending on the platform's default text encoding.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(response)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
# Save result to a CSV file
# Reads the saved response for every selected example and keeps only those
# that are one of the two expected labels, encoded as 0 = NEGATIVE and
# 1 = POSITIVE. Any other response is left out of the CSV.
label_ids = {"NEGATIVE": 0, "POSITIVE": 1}

res = []
for index, example in enumerate(selected_examples):
    save_path = f"./results/{index}.txt"
    # errors="replace" keeps an undecodable byte from aborting the export; such
    # a response simply fails the label check below.
    with open(save_path, "r", encoding="utf-8", errors="replace") as f:
        # Normalize before comparing: a response with surrounding whitespace,
        # a trailing newline or different casing would otherwise be discarded
        # by the exact-match check.
        label = f.read().strip().upper()
    if label in label_ids:
        res.append({"text": example["text"], "label": label_ids[label]})

# to_csv does not create missing parent directories.
os.makedirs("data", exist_ok=True)
# Explicit columns keep the header row present even when no label was usable.
pd.DataFrame(res, columns=["text", "label"]).to_csv("data/extra_data.csv", index=False)