<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Phase 3_2 - eyamrog

The aim of this phase is to design a text-revision solution using ChatGPT.

## Required Python packages

- openai
- pandas
- python-dotenv
- tqdm

## Importing the required libraries

In [None]:
from dotenv import load_dotenv
import openai
import pandas as pd
import re
import os
import sys
import logging
from tqdm import tqdm
import time

## Defining input variables

In [None]:
# Directory that holds the input file read by this notebook
input_directory = "cl_st1_ph31_eyamrog"
# Directory that receives the log file and every file exported by this notebook
output_directory = "cl_st1_ph32_eyamrog"

## Creating output directory

In [None]:
# Ensure the output directory exists before anything is written to it.
# `os.path.isdir` is used instead of `os.path.exists` so that a regular file
# occupying the same path is not mistaken for the directory: in that case
# `os.makedirs` raises an OSError and the failure is reported below.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        os.makedirs(output_directory)
        print('Output directory successfully created.')
    except OSError as e:
        # Nothing downstream can work without the directory, so stop here.
        print('Failed to create the directory:', e)
        sys.exit(1)

## Configuring logging

In [None]:
# Route log records of level INFO and above to a text file in the output directory.
# Each record is written as "<timestamp> - <level> - <message>".
logging.basicConfig(
    filename=f"{output_directory}/chatgpt_review_log.txt",
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

## Preparing data for testing

### Importing the data into a DataFrame

In [None]:
# Load the JSON Lines input file (one record per line) into a DataFrame
df_scielo_preprint_preChatGPT_en = pd.read_json(
    f'{input_directory}/test_erpp_pp.jsonl',
    lines=True,
)

In [None]:
# Display the dtype of each column of the imported DataFrame
df_scielo_preprint_preChatGPT_en.dtypes

In [None]:
# Convert the 'Submitted' and 'Posted' columns to datetimes, reading the
# stored values as milliseconds (unit='ms')
for date_column in ('Submitted', 'Posted'):
    df_scielo_preprint_preChatGPT_en[date_column] = pd.to_datetime(
        df_scielo_preprint_preChatGPT_en[date_column],
        unit='ms',
    )

### Replacing unnecessary spaces with a single space

## Loading all environment variables from `.env` into `os.environ`

Create the `.env` file with the required `OPENAI_API_KEY` prior to running the following cell.

```
OPENAI_API_KEY=<YOUR_OPENAI_API_KEY>
```

In [None]:
# Load the variables defined in the `.env` file into the process environment
load_dotenv()

## Importing the required programme variables from the environment

In [None]:
# Read the API key from the environment; the '' default keeps the value a
# string so the check below fails cleanly when the variable is missing.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
# Stop here with an actionable message instead of an empty AssertionError.
assert openai.api_key, (
    'OPENAI_API_KEY is not set. Define it in the .env file or in the '
    'environment before running this notebook.'
)

## Defining a function to query ChatGPT

In [None]:
# Defining a function to query ChatGPT with exponential backoff
def get_completion(prompt, model='gpt-3.5-turbo', max_retries=5):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the only message of the conversation.
        model: Name of the chat model to query.
        max_retries: Maximum number of attempts made when the API reports
            that the rate limit was exceeded.

    Returns:
        The content of the first choice of the response, or None when a
        non-rate-limit error occurs or every attempt hits the rate limit.
        Both failure cases are written to the log.
    """
    client = openai.OpenAI()
    messages = [{'role': 'user', 'content': prompt}]
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0
            )
            return response.choices[0].message.content
        # The client-based API used above exposes its exception classes at the
        # top level of the package; the former `openai.error` module no longer
        # exists, and referencing it here would raise AttributeError instead of
        # triggering the retry.
        except openai.RateLimitError:
            wait_time = 2 ** attempt  # Exponential backoff
            logging.warning(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            # Best effort: any other failure is logged and reported as None so
            # the caller can fall back to the original text.
            logging.error(f"Error querying ChatGPT: {e}")
            return None
    logging.error("Max retries exceeded.")
    return None

## Getting improved paragraphs from ChatGPT

In [None]:
# Defining the ChatGPT prompt template; the passage to revise is appended
# right after the trailing newline
prompt_template = (
    'Dear ChatGPT, would it be possible for you to improve the writing of the '
    'following passage of a research article considering the generally accepted '
    'standards of English for Academic Purposes? '
    'Please keep each improved passage within a single paragraph - '
    'do not split it into multiple paragraphs. OK?\n'
)

# Defining a function to improve text using ChatGPT
def improve_text(text, delay=15):
    """Revise `text` paragraph by paragraph with ChatGPT.

    The text is split on newline characters and each non-blank paragraph is
    appended to `prompt_template` and sent through `get_completion`.

    Args:
        text: Text to revise, with paragraphs separated by '\\n'.
        delay: Seconds to wait after each query.

    Returns:
        The paragraphs joined again with '\\n'. A paragraph is kept as it was
        when it is blank or when the query returns nothing.
    """
    improved_paragraphs = []
    for paragraph in text.split('\n'):  # Split text into paragraphs
        if not paragraph.strip():
            # A blank line carries nothing to revise: keep it so the layout
            # of the text is preserved, and skip the query and the wait.
            improved_paragraphs.append(paragraph)
            continue
        improved_paragraph = get_completion(prompt_template + paragraph)
        # Keep original if there's an error
        improved_paragraphs.append(improved_paragraph if improved_paragraph else paragraph)
        time.sleep(delay)  # Fixed delay between queries
    return '\n'.join(improved_paragraphs)

# Revise every entry of the 'Text' column, showing a progress bar, and store
# the results in a new 'Text ChatGPT' column
improved_texts = [
    improve_text(article_text)
    for article_text in tqdm(df_scielo_preprint_preChatGPT_en['Text'], desc="Processing texts")
]
df_scielo_preprint_preChatGPT_en['Text ChatGPT'] = improved_texts

### Replacing unnecessary spaces with a single space

## Exporting each article processed by ChatGPT to individual files

In [None]:
# Write the original text of each article to "<Text ID>.txt" in the output directory
original_texts_by_id = zip(
    df_scielo_preprint_preChatGPT_en['Text ID'],
    df_scielo_preprint_preChatGPT_en['Text'],
)
for text_id, original_text in original_texts_by_id:
    with open(f"{output_directory}/{text_id}.txt", 'w', encoding='utf-8') as output_file:
        output_file.write(original_text)

In [None]:
# Write the ChatGPT version of each article to "<Text ID>_chatgpt.txt" in the output directory
chatgpt_texts_by_id = zip(
    df_scielo_preprint_preChatGPT_en['Text ID'],
    df_scielo_preprint_preChatGPT_en['Text ChatGPT'],
)
for text_id, chatgpt_text in chatgpt_texts_by_id:
    with open(f"{output_directory}/{text_id}_chatgpt.txt", 'w', encoding='utf-8') as output_file:
        output_file.write(chatgpt_text)

### Exporting to a file

In [None]:
# Save the full DataFrame, including the 'Text ChatGPT' column, as JSON Lines
# (one record per line)
df_scielo_preprint_preChatGPT_en.to_json(
    f"{output_directory}/test_chatgpt_erpp_pp.jsonl",
    orient='records',
    lines=True,
)