# Synthetic data generation using OpenAI API

## Import required libraries

In [53]:
import csv
import json
from openai import OpenAI
import pandas as pd

## Create the batch input

In [62]:
# Template for a single Batch API request line. Every request shares the same
# endpoint, model and prompts; only "custom_id" differs between requests.
base_structure = {
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are an assistant that generates unique reviews of products in Slovak with maximum of 25 words and different lengths into csv with columns review_text and sentiment (1-positive, 0-negative)"},
            {"role": "user", "content": "Napíš 130 pozitívnych a negatívnych recenzií striedavo"}
        ],
        "max_tokens": 4026
    }
}

# Write one JSON object per line (JSONL). json.dumps escapes non-ASCII
# characters by default, so the output is plain ASCII; the explicit encoding
# simply keeps the file independent of the platform default.
with open('batch_input.jsonl', 'w', encoding='utf-8') as f:
    # 48 requests (request-1 .. request-48); each prompt asks for 130 reviews.
    for i in range(1, 49):
        # Build a per-request copy instead of mutating the shared template.
        request = {**base_structure, "custom_id": f"request-{i}"}
        f.write(json.dumps(request) + '\n')

## Upload the batch input

In [63]:
# Create the API client used by all following cells. No credentials are passed
# explicitly; presumably the key is read from the environment
# (OPENAI_API_KEY) -- confirm for your setup.
client = OpenAI()

In [64]:

# Upload the JSONL request file for batch processing. The context manager
# closes the local file handle as soon as the upload call returns, instead of
# leaving it open for the rest of the session.
with open("batch_input.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )


## Create and send the batch for processing

In [65]:
# The uploaded file's ID is what links the batch job to its input.
batch_input_file_id = batch_input_file.id

# Collect the request parameters first, then submit the batch job. The call
# stays the last expression so the notebook still displays the created batch.
batch_params = {
    "input_file_id": batch_input_file_id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
    "metadata": {"description": "reviews data"},
}
client.batches.create(**batch_params)

## Retrieve the batch info

In [66]:
# Look up a batch job by its ID to inspect its current state.
# NOTE(review): the ID is hard-coded, presumably copied from the output of the
# create call above -- replace it when running a new batch.
client.batches.retrieve("batch_pJtDc5AvpklTVhj37Vlmejun")

## Load the batch output when it's done

In [6]:
# Download the contents of a batch output file by its file ID.
# NOTE(review): the IDs are hard-coded. The two commented-out calls look like
# earlier runs (labelled 10 and 1000 reviews); the second one passes a
# "batch_..." ID and omits `.content` -- verify before re-enabling it.
# content = client.files.content("file-YSIhGaf310zV6RUXt1AnNwsj").content # 10 reviews
# content = client.files.content("batch_qQoYeYrrJQFdjjASb6oRjHUM") # 1000 reviews
content = client.files.content("file-jls9PUyOnFSbQXotF2sTDD0f").content

## Save the batch output

In [7]:
# Persist the downloaded batch output unchanged (binary mode, no re-encoding).
with open('data/gpt_reviews.jsonl', mode='wb') as out_file:
    out_file.write(content)

## Parse the batch output

In [8]:
# Parse the saved batch output: one JSON object per line (JSONL).
# The encoding is explicit because the generated reviews are Slovak text and
# the platform default encoding is not guaranteed to be UTF-8.
with open('data/gpt_reviews.jsonl', 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
    results = [json.loads(line) for line in file if line.strip()]

In [9]:
# results[:5]

## Write the result to csv file

In [86]:
# Convert the model's CSV-like replies into a clean two-column CSV file
# (review_text, sentiment), overwriting any existing file.
# newline='' is what the csv module asks for so that it controls line endings
# itself; without it rows can pick up an extra '\r' on Windows.
with open('data/gpt_3.5_reviews.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)
    file.write('review_text,sentiment\n')

    for res in results:
        result = res['response']['body']['choices'][0]['message']['content']

        # Strip quoting, the echoed header, Markdown fences and the truncation
        # notice. A leftover "csv" fence tag is deliberately NOT removed by
        # substring replacement (that would also mangle review text containing
        # "csv"); such a line has no valid label and is dropped below.
        result = result.replace('"', '').replace('review_text,sentiment\n', '').replace('```', '')
        result = result.replace('**The output has been truncated as it has reached the maximum response length.**', '')

        for row in result.split('\n'):
            # Split on the LAST comma so commas inside the review text survive.
            parts = row.rsplit(',', 1)
            if len(parts) != 2:
                continue
            review_text = parts[0].strip()
            sentiment = parts[1].strip()
            # Normalise whitespace before validating the label; otherwise rows
            # such as "text, 1" or lines ending in '\r' are silently discarded.
            if sentiment in ('0', '1'):
                writer.writerow([review_text, int(sentiment)])
                

## Append the result to csv file

In [10]:
# Append the reviews parsed from `results` to the existing CSV file. The header
# row is not written again because the file is expected to contain it already.
# newline='' is what the csv module asks for so that it controls line endings
# itself; without it rows can pick up an extra '\r' on Windows.
with open('data/gpt_3.5_reviews.csv', 'a', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)

    for res in results:
        result = res['response']['body']['choices'][0]['message']['content']

        # Strip quoting, the echoed header, Markdown fences and the truncation
        # notice. A leftover "csv" fence tag is deliberately NOT removed by
        # substring replacement (that would also mangle review text containing
        # "csv"); such a line has no valid label and is dropped below.
        result = result.replace('"', '').replace('review_text,sentiment\n', '').replace('```', '')
        result = result.replace('**The output has been truncated as it has reached the maximum response length.**', '')

        for row in result.split('\n'):
            # Split on the LAST comma so commas inside the review text survive.
            parts = row.rsplit(',', 1)
            if len(parts) != 2:
                continue
            review_text = parts[0].strip()
            sentiment = parts[1].strip()
            # Normalise whitespace before validating the label; otherwise rows
            # such as "text, 1" or lines ending in '\r' are silently discarded.
            if sentiment in ('0', '1'):
                writer.writerow([review_text, int(sentiment)])

## Data samples

In [92]:
# Load every dataset that the preview cells below display.
# translated_tweets_df is loaded here as well: its head()/info() previews
# reference it, so leaving this line commented out raises NameError on a
# fresh kernel.
translated_tweets_df = pd.read_csv('data/translated_tweets.csv')
gpt4_reviews_df = pd.read_csv('data/GPT4_reviews.csv')
gpt35_reviews_df = pd.read_csv('data/gpt_3.5_reviews.csv')
heureka_reviews_df = pd.read_json('data/reviews.json')

In [121]:
# Preview the first 10 rows of the translated tweets.
# Requires translated_tweets_df to have been defined earlier in the session.
translated_tweets_df.head(10)

In [122]:
# Print a summary of the translated tweets DataFrame.
# Requires translated_tweets_df to have been defined earlier in the session.
translated_tweets_df.info()

In [93]:
# Preview the first 10 rows of the GPT-4 reviews.
# The drop below is disabled; presumably 'review_id' was removed in an earlier
# run or is absent from the current file -- confirm before re-enabling.
# gpt4_reviews_df.drop('review_id', axis=1, inplace=True)
gpt4_reviews_df.head(10)

In [94]:
# Print a summary of the GPT-4 reviews DataFrame.
gpt4_reviews_df.info()

In [95]:
# Preview the first 10 rows of the GPT-3.5 reviews.
gpt35_reviews_df.head(10)

In [96]:
# Print a summary of the GPT-3.5 reviews DataFrame.
gpt35_reviews_df.info()

In [97]:
# Preview the first 10 rows of the reviews loaded from data/reviews.json.
heureka_reviews_df.head(10)

In [98]:
# Print a summary of the DataFrame loaded from data/reviews.json.
heureka_reviews_df.info()