In [62]:
import csv
import json

import pandas as pd
from openai import OpenAI

## Upload the batch input file

In [3]:
client = OpenAI()

# Upload the JSONL request file for batch processing. The context manager
# closes the local file handle as soon as the upload call returns; a bare
# open(...) passed inline would leave it open for the life of the kernel.
with open("batchinput.jsonl", "rb") as batch_input_fh:
    batch_input_file = client.files.create(
        file=batch_input_fh,
        purpose="batch",
    )


## Create and send the batch for processing

In [4]:
# Keep the uploaded file's id under its own name for the request below.
batch_input_file_id = batch_input_file.id

# Submit the batch against the chat-completions endpoint with a 24h
# completion window; the returned Batch object is the cell's displayed value.
client.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
    metadata={"description": "reviews data"},
)

Batch(id='batch_9Hxo1v3SLghRfA4ktQE3riyb', completion_window='24h', created_at=1718999240, endpoint='/v1/chat/completions', input_file_id='file-aE0QYvxvd9ZC3lLVNhsNfxL6', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719085640, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'reviews data'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

## Retrieve the batch info

In [5]:
# Id of the submitted batch, as shown in the Batch object displayed on creation.
submitted_batch_id = "batch_9Hxo1v3SLghRfA4ktQE3riyb"

# Fetch the batch's current state (status, request counts, output file id).
client.batches.retrieve(submitted_batch_id)

Batch(id='batch_9Hxo1v3SLghRfA4ktQE3riyb', completion_window='24h', created_at=1718999240, endpoint='/v1/chat/completions', input_file_id='file-aE0QYvxvd9ZC3lLVNhsNfxL6', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1718999386, error_file_id=None, errors=None, expired_at=None, expires_at=1719085640, failed_at=None, finalizing_at=1718999385, in_progress_at=1718999241, metadata={'description': 'reviews data'}, output_file_id='file-ORSg5cDjy3JnymNDROusz3cc', request_counts=BatchRequestCounts(completed=7, failed=0, total=7))

## Load the batch output when it's done

In [6]:
# Ids used for earlier runs, kept for reference:
#   "file-YSIhGaf310zV6RUXt1AnNwsj"   -> 10 reviews
#   "batch_qQoYeYrrJQFdjjASb6oRjHUM"  -> 1000 reviews
#   NOTE(review): the second id carries a batch_ prefix rather than file-;
#   confirm it is really a file id before passing it to files.content.
output_file_id = "file-ORSg5cDjy3JnymNDROusz3cc"

# Download the bytes of the output file listed on the completed batch.
content = client.files.content(output_file_id).content

## Save the batch output

In [7]:
# Persist the downloaded bytes unchanged so the batch output can be re-read
# later without another API call.
with open('data/gpt_reviews.jsonl', mode='wb') as jsonl_out:
    jsonl_out.write(content)

## Open the batch output

In [8]:
# Parse the saved batch output (one JSON object per line) into a list of dicts.
results = []
# Read as UTF-8 explicitly: JSON text is UTF-8 by specification, while open()
# without an encoding falls back to the platform default, which may differ.
with open('data/gpt_reviews.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads raise on an empty string.
        if not line:
            continue
        results.append(json.loads(line))

## Write the results to a CSV file

In [63]:
# Turn every batch response into (review_text, sentiment) rows and write them
# to a fresh CSV file, replacing any previous content.
skipped_rows = 0

# newline='' is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate line endings emit a blank line after
# every row.
with open('data/gpt_3.5_reviews.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)
    # The header is written directly so it stays unquoted
    # (QUOTE_NONNUMERIC would quote both column names).
    file.write('review_text,sentiment\n')

    for res in results:
        result = res['response']['body']['choices'][0]['message']['content']

        # Strip everything that is not data: double quotes, an echoed header
        # line, Markdown code fences and the truncation notice. Fences are
        # removed as '```csv' first and bare '```' second, so the letters
        # "csv" inside a review text are left untouched.
        result = (
            result
            .replace('"', '')
            .replace('review_text,sentiment\n', '')
            .replace('```csv', '')
            .replace('```', '')
            .replace('**The output has been truncated as it has reached the maximum response length.**', '')
        )

        for row in result.split('\n'):
            if not row.strip():
                continue
            # Split on the LAST comma: the review text may contain commas,
            # the sentiment label is always the final field.
            parts = row.rsplit(',', 1)
            if len(parts) != 2:
                continue
            review_text, sentiment = (part.strip() for part in parts)
            try:
                label = int(sentiment)
            except ValueError:
                # A row cut off mid-line or a stray prose line containing a
                # comma: skip it rather than abort the export half-written.
                skipped_rows += 1
                continue
            writer.writerow([review_text, label])

if skipped_rows:
    print(f'Skipped {skipped_rows} rows whose sentiment was not an integer')
                

## Append the results to the CSV file

In [61]:
# Append the rows parsed from `results` to the existing CSV file.
# NOTE(review): mode 'a' adds to whatever is already in the file, so running
# this cell with the same `results` that were just written duplicates every
# row — presumably it is meant for a later batch; confirm before Run All.
skipped_rows = 0

# newline='' is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate line endings emit a blank line after
# every row.
with open('data/gpt_3.5_reviews.csv', 'a', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)
    # No header here: the file being appended to already starts with one.
    for res in results:
        result = res['response']['body']['choices'][0]['message']['content']

        # Strip everything that is not data: double quotes, an echoed header
        # line, Markdown code fences and the truncation notice. Fences are
        # removed as '```csv' first and bare '```' second, so the letters
        # "csv" inside a review text are left untouched.
        result = (
            result
            .replace('"', '')
            .replace('review_text,sentiment\n', '')
            .replace('```csv', '')
            .replace('```', '')
            .replace('**The output has been truncated as it has reached the maximum response length.**', '')
        )

        for row in result.split('\n'):
            if not row.strip():
                continue
            # Split on the LAST comma: the review text may contain commas,
            # the sentiment label is always the final field.
            parts = row.rsplit(',', 1)
            if len(parts) != 2:
                continue
            review_text, sentiment = (part.strip() for part in parts)
            try:
                label = int(sentiment)
            except ValueError:
                # A row cut off mid-line or a stray prose line containing a
                # comma: skip it rather than abort the append half-written.
                skipped_rows += 1
                continue
            writer.writerow([review_text, label])

if skipped_rows:
    print(f'Skipped {skipped_rows} rows whose sentiment was not an integer')

## Data samples

In [60]:
# Load the four datasets inspected in the cells below (pandas is imported in
# the notebook's top import cell). The reads run in order, so a missing file
# raises FileNotFoundError and leaves the frames after it undefined.
translated_tweets_df = pd.read_csv('data/translated_tweets.csv')
gpt4_reviews_df = pd.read_csv('data/GPT4_reviews.csv')
gpt35_reviews_df = pd.read_csv('data/gpt_3.5_reviews.csv')
heureka_reviews_df = pd.read_json('data/reviews.json')

FileNotFoundError: [Errno 2] No such file or directory: 'data/translated_tweets.csv'

In [121]:
# Preview the first 10 rows of the translated tweets frame.
translated_tweets_df.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,Je to tak smutné pre môjho APL priateľa..........
1,2,0,Sentiment140,Chýbal mi nový trailer...
2,3,1,Sentiment140,Omg je už 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. som bol u toh...
4,5,0,Sentiment140,"Myslím si, že mi BF podvádza na mňa!!!"
5,6,0,Sentiment140,Alebo sa len príliš obávam?
6,7,1,Sentiment140,ZľavyZľavyZľavyZľavyZľavyZľavyZľavyZľavyZľavyZ...
7,8,0,Sentiment140,Sunny Again Work Tomorrow (TV dnes večer)
8,9,1,Sentiment140,"dneska som ťa dostala do uniformy, už mi chýbaš"
9,10,1,Sentiment140,"Hmmmm... som zvedavý, ako sa ona moje číslo @-)"


In [122]:
# Summarise the frame: row count, columns, non-null counts, dtypes and memory usage.
translated_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138103 entries, 0 to 1138102
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   ItemID           1138103 non-null  int64 
 1   Sentiment        1138103 non-null  int64 
 2   SentimentSource  1138103 non-null  object
 3   SentimentText    1138103 non-null  object
dtypes: int64(2), object(2)
memory usage: 34.7+ MB


In [133]:
# gpt4_reviews_df.drop('review_id', axis=1, inplace=True)
# NOTE(review): the drop above is commented out, yet the recorded preview shows
# no review_id column while the recorded info() output lists one — the saved
# outputs look like they come from different kernel states; re-run to confirm.
# Preview the first 10 rows of the GPT-4 reviews frame.
gpt4_reviews_df.head(10)

Unnamed: 0,review_text,sentiment
0,Tento produkt je úžasný a veľmi užitočný!,1
1,Som sklamaný z kvality tohto produktu.,0
2,"Výborne funguje, odporúčam!",1
3,"Nevydrží dlho, nie je to stojí za peniaze.",0
4,Excelentná kvalita za rozumnú cenu.,1
5,Produkt neprišiel včas a bol poškodený.,0
6,Veľmi spokojný s nákupom!,1
7,Hrozná zákaznícka podpora a produkt nezodpoved...,0
8,"Výborný nákup, produkt presne ako je popísaný.",1
9,"Úplne nevhodný, vrátil som ho.",0


In [123]:
# Summarise the GPT-4 reviews frame: row count, columns, non-null counts and dtypes.
gpt4_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226 entries, 0 to 1225
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    1226 non-null   int64 
 1   review_text  1226 non-null   object
 2   sentiment    1226 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 28.9+ KB


In [None]:
# Preview the first 10 rows of the frame read from data/gpt_3.5_reviews.csv.
gpt35_reviews_df.head(10)

In [None]:
# Summarise the frame read from data/gpt_3.5_reviews.csv (columns, non-null counts, dtypes).
gpt35_reviews_df.info()

In [136]:
# Preview the first 10 rows of the frame read from data/reviews.json.
heureka_reviews_df.head(10)

Unnamed: 0,review_text,sentiment
0,Som spokojná s tovarom aj rýchlosťou doručenia...,
1,"Kvalita ,rychlost",
2,Doposiaľ najrýchlejšie dodanie tovaru,1.0
3,Neskora donaśka,0.0
4,Dopravca s nemožnosťou platby kartou,0.0
5,S obchodom som spokojný ale vyjadril som nespo...,
6,Blizko,1.0
7,Rýchle dodanie tovaru. Som veľmi spokojná.,1.0
8,Rýchlosť doručenia,1.0
9,"Tovar mi prišiel otvorený, našťastie funguje v...",
