In [None]:
!pip install openai

In [None]:
import os
import re
import time

import openai
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff
from tqdm import tqdm

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the placeholder is only used when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", """API KEY HERE""")
model_engine = "gpt-3.5-turbo"

In [None]:
# Load the full claims-with-evidence JSON into a DataFrame.
claims_json_path = '/kaggle/input/5wqa-dataset-github-final/data/5WQA_all_claims_with_evidence.json'
df = pd.read_json(claims_json_path)

In [None]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    # Re-raise the original exception once the attempts are used up. Without
    # this tenacity raises its own RetryError, and callers that catch
    # openai.error.RateLimitError would never see it.
    reraise=True,
)
def get_answers(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The call is retried with random exponential backoff
    (``wait_random_exponential(min=1, max=60)``) for at most 6 attempts.

    Args:
        prompt: Full text of the user message.

    Returns:
        The ``content`` of the first choice in the chat completion.

    Raises:
        Whatever exception the final attempt raised.
    """
    completion = openai.ChatCompletion.create(
        # Use the notebook-level setting instead of a duplicated literal.
        model=model_engine,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0,
    )
    return completion['choices'][0]['message']['content']

In [None]:
# Instruction prefix for the paraphrase request. The claim text is concatenated
# directly after the trailing "The claim is: " when the request is built.
# NOTE(review): "strickly" is a typo for "strictly"; left untouched here because
# editing the prompt changes what is sent to the model.
prompt = '''Generate 5 different paraphrases of the following text and then place all these 5 paraphrases in one list the format of list is defined below. Do not write anything other than just the list.
The format of list is: 
1. a
2. b
3. c
4. d
5. e 
Here a, b, c, d, e are paraphrase generated. Please follow this format strickly.
The claim is: '''


In [None]:
def _find_list_marker(text, number, pos):
    """Return the index of the list marker ``"<number>."`` at or after ``pos``, or -1."""
    # The marker must not be glued to a preceding word character or dot and
    # must not be followed by a digit, so "2012." and "2.5" are not mistaken
    # for the start of item 2.
    marker = re.compile(r"(?<![\w.])" + str(number) + r"\.(?!\d)")
    match = marker.search(text, pos)
    return match.start() if match else -1


def extract_sentences(string):
    """Split a numbered list ("1. a 2. b ...") into its item texts.

    Items 1 through 5 are located in order. Each item runs from just after its
    own marker up to the next marker, or to the end of the text when no next
    marker exists. Scanning stops at the first missing marker.

    Args:
        string: Text expected to contain markers "1." .. "5.".

    Returns:
        A DataFrame with a single column ``"Sentences"`` holding the stripped
        item texts in order (empty when marker "1." is not found).
    """
    sentences = []
    search_from = 0
    for number in range(1, 6):
        start = _find_list_marker(string, number, search_from)
        if start == -1:
            break

        body_start = start + len(str(number)) + 1  # skip past "<number>."
        end = _find_list_marker(string, number + 1, start)
        if end == -1:
            # No following marker: the rest of the text belongs to this item.
            sentences.append(string[body_start:].strip())
            break

        sentences.append(string[body_start:end].strip())
        search_from = end

    return pd.DataFrame({"Sentences": sentences})

In [None]:
# Shard of the claims frame handled by this run: rows 90000-95999 by position.
df_main = df.iloc[90000:96000]

In [None]:
import pandas as pd
import time
import openai

# Mutable state for the paraphrasing run below.
df_new = pd.DataFrame()   # grows by one row of paraphrases per processed claim
i = 0
# NOTE(review): this counts the full `df`, not the `df_main` slice that the
# loop iterates over - confirm which one is intended.
total_rows = df.shape[0]
last_save_time = time.time()  # reference point for the periodic checkpoint

def process_text_value(text_value):
    """Return the model's raw reply for the paraphrase prompt followed by ``text_value``."""
    return get_answers(prompt + text_value)

def handle_rate_limit_error(row, index, df_new, retry_count=0):
    """Retry paraphrasing one row after a backoff and add the result to ``df_new``.

    Sleeps ``2 ** retry_count`` seconds, then requests paraphrases for
    ``row['claim']``. If the request fails with an ``openai.error.APIError``
    whose HTTP status is 502, it waits again and retries, up to 5 retries.

    Args:
        row: Row holding the text under the ``'claim'`` key.
        index: Row label; accepted for interface compatibility, not used here.
        df_new: DataFrame of results collected so far.
        retry_count: Number of retries already performed (drives the backoff).

    Returns:
        ``df_new`` with one additional row containing the paraphrases.

    Raises:
        openai.error.APIError: For non-502 API errors, or when the retry
            budget is exhausted.
    """
    max_retries = 5
    backoff_time = 2 ** retry_count
    try:
        time.sleep(backoff_time)
        paraphrase_generated = extract_sentences(process_text_value(row['claim']))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        df_new = pd.concat([df_new, paraphrase_generated.T], ignore_index=True)
    except openai.error.APIError as e:
        # Check the HTTP status rather than `e.error['code']`: a gateway error
        # may carry no JSON error body at all, which made the old lookup fail.
        is_bad_gateway = getattr(e, "http_status", None) == 502
        if not is_bad_gateway or retry_count >= max_retries:
            raise
        print(f"APIError: Bad gateway. Retrying ({retry_count+1}/{max_retries})...")
        time.sleep(backoff_time)
        # Keep the retried result; previously it was discarded.
        df_new = handle_rate_limit_error(row, index, df_new, retry_count + 1)
    return df_new


# Paraphrase every claim in the shard, checkpointing the results as we go.
for index, row in df_main.iterrows():
    text_value = row['claim']

    try:
        paraphrase_generated = extract_sentences(process_text_value(text_value))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and produces the same frame.
        df_new = pd.concat([df_new, paraphrase_generated.T], ignore_index=True)

    except openai.error.RateLimitError:
        print("Rate limit exceeded. Handling rate limit error...")
        df_new = handle_rate_limit_error(row, index, df_new)

    current_time = time.time()
    time_since_last_save = current_time - last_save_time
    if time_since_last_save >= 10:  # checkpoint at most once every 10 seconds
        # Checkpoint file; note it has a different name from the final output.
        df_new.to_csv('paraphrase_text_v4.csv', index=True)
        print('Data saved to CSV file.')
        print(index)
        last_save_time = current_time  # update the last save time
    time.sleep(0.1)

# Write the complete result set once the loop has finished.
df_new.to_csv('paraphrase_text.csv', index=True)
