# Synthetic data generation using OpenAI API

## Import required libraries

In [53]:
import csv
import json
from openai import OpenAI
import pandas as pd

## Create the batch input

In [62]:
# Request template shared by every line of the batch input file.
# Only "custom_id" is rewritten per request; everything else is sent unchanged.
base_structure = dict(
    custom_id="request-1",
    method="POST",
    url="/v1/chat/completions",
    body=dict(
        model="gpt-3.5-turbo",
        messages=[
            dict(role="system", content="You are an assistant that generates unique reviews of products in Slovak with maximum of 25 words and different lengths into csv with columns review_text and sentiment (1-positive, 0-negative)"),
            dict(role="user", content="Napíš 130 pozitívnych a negatívnych recenzií striedavo"),
        ],
        max_tokens=4026,
    ),
)

# Serialise the template once per request number (1 through 48), one JSON
# object per line, which is the JSONL layout of the batch input file.
with open('batch_input.jsonl', 'w') as batch_file:
    for request_no in range(1, 49):
        # The template is updated in place, so after the loop it holds the last id
        base_structure["custom_id"] = "request-" + str(request_no)
        batch_file.write("".join((json.dumps(base_structure), '\n')))

## Upload the batch input

In [63]:
# Create the OpenAI API client that the following cells use for the file
# upload, batch creation/retrieval and output download.
# NOTE(review): no credentials are passed here, so they are presumably read
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [64]:

# Upload the JSONL request file so a batch job can reference it by file id.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of being left open for the life of the kernel.
with open("batch_input.jsonl", "rb") as batch_jsonl:
    batch_input_file = client.files.create(
        file=batch_jsonl,
        purpose="batch"
    )


## Create and send the batch for processing

In [65]:
# Id of the uploaded JSONL file; the batch job reads its requests from it.
batch_input_file_id = batch_input_file.id

# Submit the batch job and keep the returned Batch object, so its id and
# status can be read from `batch` instead of being copied by hand from the
# printed output.
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "reviews data"
    }
)

# Last expression of the cell: display the created batch, as before.
batch

Batch(id='batch_8nHJN2XGsgzSY1DF8VnaYavR', completion_window='24h', created_at=1719097769, endpoint='/v1/chat/completions', input_file_id='file-F1GV5TupIC7DEkA6NtrbHba2', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719184169, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'reviews data'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

## Retrieve the batch info

In [66]:
# Fetch the current state of a batch job by id and display it.
# NOTE(review): this id is hard-coded and is not the id shown in the output of
# the batch-creation cell (batch_8nHJ...); judging by created_at it belongs to
# an earlier job. Replace it with the id of the batch you want to inspect.
client.batches.retrieve("batch_pJtDc5AvpklTVhj37Vlmejun")

Batch(id='batch_pJtDc5AvpklTVhj37Vlmejun', completion_window='24h', created_at=1719089251, endpoint='/v1/chat/completions', input_file_id='file-ObsBMajX2e0o0XIEMeckiqav', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719175651, failed_at=None, finalizing_at=None, in_progress_at=1719089252, metadata={'description': 'reviews data'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=48))

## Load the batch output when it's done

In [6]:
# Download the raw bytes of a batch output file into `content`.
# Calls used for other runs, kept disabled for reference:
# content = client.files.content("file-YSIhGaf310zV6RUXt1AnNwsj").content # 10 reviews
# content = client.files.content("batch_qQoYeYrrJQFdjjASb6oRjHUM") # 1000 reviews
# NOTE(review): the "1000 reviews" line passes a batch_... id rather than a
# file-... id and omits .content — verify before re-enabling it.
# NOTE(review): the file id below is hard-coded; presumably it is the
# output_file_id of a completed batch — confirm it matches the intended run.
content = client.files.content("file-jls9PUyOnFSbQXotF2sTDD0f").content

## Save the batch output

In [7]:
# Persist the downloaded batch output exactly as received: the file is opened
# in binary mode, so the bytes in `content` are written without any decoding.
with open('data/gpt_reviews.jsonl', 'wb') as jsonl_out:
    jsonl_out.write(content)

## Open the batch output

In [8]:
# Parse the saved batch output into `results`: one dict per non-empty line.
results = []
# Decode explicitly as UTF-8 (the encoding used for the CSV written later in
# this notebook) instead of relying on the platform's default encoding.
with open('data/gpt_reviews.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline); json.loads would reject them
        if not line:
            continue
        results.append(json.loads(line))

In [9]:
# results[:5]

## Write the result to csv file

In [86]:
# Rebuild the reviews CSV from scratch: take the model's CSV-like text out of
# every batch response and keep only rows of the form "<review text>,<0 or 1>".
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' add an extra '\r' to every row.
with open('data/gpt_3.5_reviews.csv', 'w', encoding='utf-8', newline='') as file:
    # The header is written directly so it stays unquoted
    file.write('review_text,sentiment\n')
    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)

    for res in results:
        result = res['response']['body']['choices'][0]['message']['content']

        # Remove quotes around fields and an echoed header line
        result = result.replace('"', '').replace('review_text,sentiment\n', '')
        # Remove markdown code fences only; a bare 'csv' replacement would also
        # delete those letters from inside review text
        result = result.replace('```csv', '').replace('```', '')
        result = result.replace('**The output has been truncated as it has reached the maximum response length.**', '')

        for row in result.split('\n'):
            # Split on the LAST comma only: the review text may contain commas
            parts = row.rsplit(',', 1)
            if len(parts) != 2:
                continue
            review_text = parts[0].strip()
            sentiment = parts[1].strip()
            # Validate AFTER stripping, so "text, 1" or a trailing '\r' does not
            # cause a valid row to be dropped; skip rows with no review text
            if review_text and sentiment in ('0', '1'):
                writer.writerow([review_text, int(sentiment)])
                

## Append the result to csv file

In [10]:
# Append the reviews parsed from the current `results` to the existing CSV
# (no header is written in append mode).
# NOTE(review): this reads the same `results` and writes to the same file as
# the cell that creates the CSV; running both on one `results` list stores
# every row twice. Presumably `results` is reloaded from another batch output
# before this cell is run — confirm.
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' add an extra '\r' to every row.
with open('data/gpt_3.5_reviews.csv', 'a', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)

    for res in results:
        result = res['response']['body']['choices'][0]['message']['content']

        # Remove quotes around fields and an echoed header line
        result = result.replace('"', '').replace('review_text,sentiment\n', '')
        # Remove markdown code fences only; a bare 'csv' replacement would also
        # delete those letters from inside review text
        result = result.replace('```csv', '').replace('```', '')
        result = result.replace('**The output has been truncated as it has reached the maximum response length.**', '')

        for row in result.split('\n'):
            # Split on the LAST comma only: the review text may contain commas
            parts = row.rsplit(',', 1)
            if len(parts) != 2:
                continue
            review_text = parts[0].strip()
            sentiment = parts[1].strip()
            # Validate AFTER stripping, so "text, 1" or a trailing '\r' does not
            # cause a valid row to be dropped; skip rows with no review text
            if review_text and sentiment in ('0', '1'):
                writer.writerow([review_text, int(sentiment)])

## Data samples

In [92]:
# Load the four datasets that the cells below preview.
# translated_tweets_df is used by the first two preview cells, so it has to be
# loaded here for the notebook to run top to bottom on a fresh kernel.
translated_tweets_df = pd.read_csv('data/translated_tweets.csv')
gpt4_reviews_df = pd.read_csv('data/GPT4_reviews.csv')
gpt35_reviews_df = pd.read_csv('data/gpt_3.5_reviews.csv')
heureka_reviews_df = pd.read_json('data/reviews.json')

In [121]:
# Preview the first 10 rows of the translated tweets dataset.
translated_tweets_df.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,Je to tak smutné pre môjho APL priateľa..........
1,2,0,Sentiment140,Chýbal mi nový trailer...
2,3,1,Sentiment140,Omg je už 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. som bol u toh...
4,5,0,Sentiment140,"Myslím si, že mi BF podvádza na mňa!!!"
5,6,0,Sentiment140,Alebo sa len príliš obávam?
6,7,1,Sentiment140,ZľavyZľavyZľavyZľavyZľavyZľavyZľavyZľavyZľavyZ...
7,8,0,Sentiment140,Sunny Again Work Tomorrow (TV dnes večer)
8,9,1,Sentiment140,"dneska som ťa dostala do uniformy, už mi chýbaš"
9,10,1,Sentiment140,"Hmmmm... som zvedavý, ako sa ona moje číslo @-)"


In [122]:
# Summarise the translated tweets frame: row count, column dtypes, memory usage.
translated_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138103 entries, 0 to 1138102
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   ItemID           1138103 non-null  int64 
 1   Sentiment        1138103 non-null  int64 
 2   SentimentSource  1138103 non-null  object
 3   SentimentText    1138103 non-null  object
dtypes: int64(2), object(2)
memory usage: 34.7+ MB


In [93]:
# Preview the first 10 GPT-4 reviews. The review_id drop below is left
# disabled, so the frame keeps that column.
# gpt4_reviews_df.drop('review_id', axis=1, inplace=True)
gpt4_reviews_df.head(10)

Unnamed: 0,review_id,review_text,sentiment
0,1,Tento produkt je úžasný a veľmi užitočný!,1
1,2,Som sklamaný z kvality tohto produktu.,0
2,3,"Výborne funguje, odporúčam!",1
3,4,"Nevydrží dlho, nie je to stojí za peniaze.",0
4,5,Excelentná kvalita za rozumnú cenu.,1
5,6,Produkt neprišiel včas a bol poškodený.,0
6,7,Veľmi spokojný s nákupom!,1
7,8,Hrozná zákaznícka podpora a produkt nezodpoved...,0
8,9,"Výborný nákup, produkt presne ako je popísaný.",1
9,10,"Úplne nevhodný, vrátil som ho.",0


In [94]:
# Summarise the GPT-4 reviews frame: row count, column dtypes, memory usage.
gpt4_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226 entries, 0 to 1225
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    1226 non-null   int64 
 1   review_text  1226 non-null   object
 2   sentiment    1226 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 28.9+ KB


In [95]:
# Preview the first 10 GPT-3.5 reviews read back from the CSV built in this notebook.
gpt35_reviews_df.head(10)

Unnamed: 0,review_text,sentiment
0,Skvelý výkon a dizajn!,1
1,Slabá batéria,0
2,Veľmi praktický produkt.,1
3,Občasné spomalenia,0
4,Skvelá kvalita za tú cenu.,1
5,Zlá kompatibilita s inými zariadeniami,0
6,Jednoduché ovládanie.,1
7,Príliš veľké rozmery,0
8,Odporúčam každému!,1
9,Problémy s pripojením,0


In [96]:
# Summarise the GPT-3.5 reviews frame: row count, column dtypes, memory usage.
gpt35_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4052 entries, 0 to 4051
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  4052 non-null   object
 1   sentiment    4052 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 63.4+ KB


In [97]:
# Preview the first 10 rows of the reviews loaded from data/reviews.json.
heureka_reviews_df.head(10)

Unnamed: 0,review_text,sentiment
0,Som spokojná s tovarom aj rýchlosťou doručenia...,
1,"Kvalita ,rychlost",
2,Doposiaľ najrýchlejšie dodanie tovaru,1.0
3,Neskora donaśka,0.0
4,Dopravca s nemožnosťou platby kartou,0.0
5,S obchodom som spokojný ale vyjadril som nespo...,
6,Blizko,1.0
7,Rýchle dodanie tovaru. Som veľmi spokojná.,1.0
8,Rýchlosť doručenia,1.0
9,"Tovar mi prišiel otvorený, našťastie funguje v...",


In [98]:
# Summarise the data/reviews.json frame: row count, column dtypes, memory usage.
heureka_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048277 entries, 0 to 3048276
Data columns (total 2 columns):
 #   Column       Dtype  
---  ------       -----  
 0   review_text  object 
 1   sentiment    float64
dtypes: float64(1), object(1)
memory usage: 46.5+ MB
