In [6]:
import json
import traceback
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from groq import Groq
from sklearn.model_selection import train_test_split

# Load variables from a local .env file (if any) into the process environment
# before the API client is created.
load_dotenv()
# Groq() is constructed with no explicit arguments; presumably it reads its API
# key from the environment populated above — TODO confirm against the SDK docs.
client = Groq()

In [7]:
# Raw export: each row carries a "Negative_Review" and a "Positive_Review" column.
df = pd.read_csv("../data/Britannia.csv")

# Rows whose text is the placeholder "No Negative" / "No Positive" are dropped
# from the corresponding side.
has_negative_text = df["Negative_Review"] != "No Negative"
has_positive_text = df["Positive_Review"] != "No Positive"
negative_reviews = df.loc[has_negative_text, "Negative_Review"]
positive_reviews = df.loc[has_positive_text, "Positive_Review"]

# Stack both sides into one long frame: all negative texts first, then all
# positive ones, with a label column built in the same order.
all_review_text = pd.concat([negative_reviews, positive_reviews], ignore_index=True)
negative_labels = ["Negative"] * len(negative_reviews)
positive_labels = ["Positive"] * len(positive_reviews)
review_df = pd.DataFrame(
    {"Review": all_review_text, "Sentiment": negative_labels + positive_labels}
)
review_df.describe()

Unnamed: 0,Review,Sentiment
count,8361,8361
unique,7689,2
top,Location,Negative
freq,151,4262


In [8]:
# Hold out 20% of the reviews for evaluation. The split is stratified on the
# sentiment label and seeded, so the same rows land in the test set every run.
review_text = review_df["Review"]
sentiment_labels = review_df["Sentiment"]
split_parts = train_test_split(
    review_text,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape)

(6688,) (1673,)


In [9]:
# Prompt sent to the chat model for every batch of reviews.
# - It is filled in with str.format(), which is why the literal braces of the
#   JSON example are doubled ({{ and }}); {input_json} is the only placeholder.
# - The "//" annotations inside the example are guidance for the model; they
#   are not valid JSON themselves.
# NOTE(review): the prompt asks for a top-level JSON array, while the request in
# the labelling loop sets response_format={"type": "json_object"} — confirm the
# shape of the responses that actually get saved.
template = """Add a "label" field to each object in the JSON below, based on the sentiment of the "text" field. The "label" must be either "Positive" or "Negative". Return the JSON formatted as pretty-printed text.

Desired JSON structure:
[
    {{
        "id": <integer>,           // Unique identifier for the object
        "text": <string>,          // A sentence or phrase to analyze
        "label": <"Positive"|"Negative"> // Sentiment of the text
    }},
    ...
]

Input JSON:
{input_json}
"""

In [10]:
def generate_batches(lst, batch_size):
    """Yield consecutive slices of ``lst`` holding at most ``batch_size`` items.

    Args:
        lst: Any object that supports ``len()`` and slicing (list, tuple,
            string, array, ...).
        batch_size: Maximum number of items per batch; must be positive.

    Yields:
        Slices of ``lst`` in their original order. Every batch holds
        ``batch_size`` items except possibly the last one, which holds the
        remainder. An empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item;
    # zero would fail with an unrelated-looking range() error. Reject both.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(lst), batch_size):
        yield lst[start : start + batch_size]


# Where the labelled batches are written (one JSON file per batch), how many
# reviews go into a single request, and which model answers it.
OUTPUT_DIR = Path("../model/llm")
BATCH_SIZE = 50
MODEL_NAME = "llama-3.1-8b-instant"

# Create the target directory up front: otherwise every batch would pay for an
# API call and then lose the result when the file cannot be opened.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Batches that raised, so the gaps are visible once the loop is done.
failed_batches = []

for batch_id, batch in enumerate(generate_batches(X_test.values, batch_size=BATCH_SIZE)):
    output_path = OUTPUT_DIR / f"{batch_id}.json"
    if output_path.exists():
        # Already labelled in an earlier (possibly interrupted) run; skipping
        # makes the cell resumable. Delete the file to force a re-label.
        continue
    try:
        # Ids restart at 1 inside every batch; the batch number lives in the
        # file name.
        input_json = [{"id": i, "text": text} for i, text in enumerate(batch, start=1)]
        messages = [
            {
                "role": "user",
                "content": template.format(input_json=json.dumps(input_json, indent=4)),
            },
        ]

        chat_completion = client.chat.completions.create(
            messages=messages,
            model=MODEL_NAME,
            temperature=0,
            response_format={"type": "json_object"},
        )
        response_text = chat_completion.choices[0].message.content
        response = json.loads(response_text)

        # Write to a temporary name and rename afterwards, so an interrupted
        # run never leaves a truncated file that the resume check above would
        # mistake for a finished batch.
        tmp_path = output_path.with_name(output_path.name + ".tmp")
        with open(tmp_path, "w") as f:
            json.dump(response, f, indent=4)
        tmp_path.replace(output_path)
    except Exception:
        # Best effort: one failing batch must not stop the remaining ones, but
        # it is recorded so it can be retried.
        failed_batches.append(batch_id)
        print(f"Caught an exception in batch {batch_id}:", traceback.format_exc())

if failed_batches:
    print("Batches without a saved result (re-run this cell to retry):", failed_batches)

KeyboardInterrupt: 