<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Using ChatGPT as a writing assistant of English for Research Publication Purposes (ERPP)

## Importing the required libraries

In [1]:
import openai
import pandas as pd
import json
from os import environ
import datetime as dt
from google.cloud import translate
from IPython.display import clear_output

## Setting the required environment variables in the environment

Please refer to [Setting environment variables](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#setting-environment-variables) in [Managing environments](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#).

Note: Do **not** place the value of the variables between ' ' or " ".

In [None]:
(base) C:\Users\eyamr>conda env list
# conda environments:
#
base                  *  C:\Users\eyamr\anaconda3
Env20240401              C:\Users\eyamr\anaconda3\envs\Env20240401


(base) C:\Users\eyamr>conda activate Env20240401

(Env20240401) C:\Users\eyamr>conda env config vars list

(Env20240401) C:\Users\eyamr>conda env config vars set GOOGLE_CLOUD_PROJECT=<omitted>
To make your changes take effect please reactivate your environment

(Env20240401) C:\Users\eyamr>conda env config vars set OPENAI_API_KEY=<omitted>
To make your changes take effect please reactivate your environment

(Env20240401) C:\Users\eyamr>conda activate Env20240401

(Env20240401) C:\Users\eyamr>conda env config vars list
GOOGLE_CLOUD_PROJECT = <omitted>
OPENAI_API_KEY = <omitted>

(Env20240401) C:\Users\eyamr>conda deactivate

(base) C:\Users\eyamr>

## Importing the required programme variables from the environment

In [2]:
# Read the OpenAI key and the Google Cloud project id from the environment
# rather than hard-coding them in the notebook. A missing or empty variable
# stops the notebook here with a message naming the variable to set.
openai.api_key = environ.get('OPENAI_API_KEY', '')
assert openai.api_key, 'OPENAI_API_KEY is not set (or is empty) in the active environment.'
GOOGLE_CLOUD_PROJECT = environ.get('GOOGLE_CLOUD_PROJECT', '')
assert GOOGLE_CLOUD_PROJECT, 'GOOGLE_CLOUD_PROJECT is not set (or is empty) in the active environment.'
# Value passed as `parent` to the Cloud Translation client in translate_text().
PARENT = f'projects/{GOOGLE_CLOUD_PROJECT}'
# NOTE: do not print openai.api_key here - cell outputs are saved with the notebook.

## Defining a function to translate passages with Google Cloud Translation API

Application Default Credentials (ADC) should be configured in order to authenticate the use of Google Cloud Translation API - follow the procedure indicated in [Set up authentication for Cloud Translation](https://cloud.google.com/translate/docs/authentication#authn-how-to).

In [3]:
def translate_text(text: str, target_language_code: str) -> translate.Translation:
    """Translate a single passage with the Google Cloud Translation API.

    Args:
        text: The passage to translate; it is sent as the only item of `contents`.
        target_language_code: Language code of the desired translation.

    Returns:
        The first entry of the `translations` field of the service response.
    """
    translation_client = translate.TranslationServiceClient()
    request_arguments = {
        'parent': PARENT,
        'contents': [text],
        'target_language_code': target_language_code,
    }
    result = translation_client.translate_text(**request_arguments)
    # Only one passage was sent, so only the first translation is returned
    return result.translations[0]

## Defining a function to query ChatGPT

In [4]:
def get_completion(prompt, model = 'gpt-3.5-turbo'):
    """Send `prompt` to the chat completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the content of a single message with role 'user'.
        model: Name of the model to query.

    Returns:
        The `content` of the message of the first choice in the response.
    """
    request = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0,
    }
    completion = openai.OpenAI().chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

## Collecting input

Note: Pandas raises the following data type warning (DtypeWarning) when data is cast to an incompatible data type:

FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'In the field of Corpus Linguistics ...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first

df_text.at[index, 'composed'] = query

Refer to the following references to learn how to handle it:
- [pandas.errors.DtypeWarning](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.errors.DtypeWarning.html)
- [Pandas DtypeWarning: Columns have mixed types](https://www.slingacademy.com/article/pandas-dtypewarning-columns-have-mixed-types/)

In [5]:
# Keep asking until the user names a file that can be opened for reading
end = False
while not end:
    filename = str(input('Enter the input full filename: '))
    if filename == '':
        continue # An empty answer simply repeats the prompt
    try:
        # The file is opened only to check that it exists; nothing is read here
        with open(filename, 'r', encoding = 'utf8', newline='\n') as responses:
            print('The file exists.')
    except FileNotFoundError:
        print('No such file.')
    else:
        input_file = filename
        output_file = f'{input_file}.out.txt'
        output_file_json = f'{input_file}.out.json'
        end = True
        clear_output()

# Load the input into a single column named 'notes' (the separator is the regular expression \n)
df_text = pd.read_table(
    input_file,
    sep = '\\n',
    header = None,
    engine = 'python'
).rename(columns = {0: 'notes'})
# Add the result columns by initialising row 0 with a numeric value in order to avoid DtypeWarning
for result_column in ('ai_composed', 'human_revised', 'ai_translated'):
    df_text.at[0, result_column] = 1
df_text = df_text.astype('object') # Convert every column to the desired data type
print(f'{len(df_text)} set(s) of notes to process.')

1 set(s) of notes to process.


## Composing with ChatGPT

### Prompts

Dear ChatGPT, please write a piece of academic text based on the following notes considering the generally accepted standards of English for Academic Purposes. It is very important that you are as objective, scientific and non-metaphorical as you can be.\n

### Composing with ChatGPT

In [6]:
# Compose one text per set of notes with ChatGPT, appending every exchange to the output text file
prompt = 'Dear ChatGPT, please write a piece of academic text based on the following notes considering the generally accepted standards of English for Academic Purposes. It is very important that you are as objective, scientific and non-metaphorical as you can be.\n'
with open(output_file, 'a', encoding = 'utf8', newline='\n') as responses:
    responses.write('ChatGPT ERPP writing assistant\n\n')
    responses.write('Start time: ' + str(dt.datetime.now()) + '\n\n')
    for index, row in df_text.iterrows():
        notes = row['notes']
        notes_heading = 'Notes ' + str(index) + ':\n'
        responses.write(notes_heading + notes + '\n\n')
        print(notes_heading + notes)
        query = get_completion(prompt + notes)
        df_text.at[index, 'ai_composed'] = query
        composed_heading = 'Composed by AI ' + str(index) + ':\n'
        responses.write(composed_heading + query + '\n\n')
        print('\n' + composed_heading + query)
        clear_output(wait = True) # The displayed exchange is replaced when the next output arrives
    responses.write('End time: ' + str(dt.datetime.now()) + '\n\n')
print('Job completed!')

# Save the dataframe as indented JSON so that the texts can be revised in the output JSON file
df_text_json = df_text.to_json()
df_text_json_parsed = json.loads(df_text_json)
df_text_json_prettified = json.dumps(df_text_json_parsed, indent=4)
with open(output_file_json, 'w', encoding='utf8', newline='\n') as json_file:
    json_file.write(df_text_json_prettified)
df_text

Job completed!


Unnamed: 0,notes,ai_composed,human_revised,ai_translated
0,The Lexical Multi-Dimensional Analysis (LMD An...,The Lexical Multi-Dimensional Analysis (LMD An...,1.0,1.0


## Revising

Edit the column `human_revised` in the output JSON file.

## Translating with Google Cloud Translate

In [7]:
# Update the dataframe with the revised texts.
# Reading the JSON back lets pandas infer the column types again, so a column that still
# holds only the numeric placeholder would presumably become numeric, and assigning a
# translated string to it triggers the incompatible-dtype warning. Casting to 'object'
# mirrors what the input-collection cell does.
df_text = pd.read_json(output_file_json).astype('object')
target_language = 'pt'
with open(output_file, 'a', encoding = 'utf8', newline='\n') as responses:
    responses.write('Translating with Google Cloud Translate' + '\n\n')
    responses.write('Start time: ' + str(dt.datetime.now()) + '\n\n')
    for index, row in df_text.iterrows():
        revised = row['human_revised']
        if not isinstance(revised, str):
            # An unrevised row still holds the numeric placeholder or a null value
            raise TypeError(
                'Row ' + str(index) + ": column 'human_revised' holds " + repr(revised)
                + ', not text. Edit the output JSON file before translating.'
            )
        responses.write('Human Revised ' + str(index) + ':\n' + revised + '\n\n')
        print('Human Revised ' + str(index) + ':\n' + revised)
        translation = translate_text(revised, target_language)
        source_language = translation.detected_language_code
        translated_passage = translation.translated_text
        df_text.at[index, 'ai_translated'] = translated_passage
        responses.write('Translated by AI ' + str(index) + ' from (' + source_language + ')' + ':\n' + translated_passage + '\n\n')
        print('\nTranslated by AI ' + str(index) + ' from (' + source_language + ')' + ':\n' + translated_passage)
        clear_output(wait = True)
    responses.write('End time: ' + str(dt.datetime.now()) + '\n\n')
print('Job completed!')
# Save the dataframe, now including the translations, back to the JSON file as indented JSON
df_text_json = df_text.to_json()
df_text_json_parsed = json.loads(df_text_json)
df_text_json_prettified = json.dumps(df_text_json_parsed, indent=4)
with open(output_file_json, 'w', encoding='utf8', newline='\n') as file:
    file.write(df_text_json_prettified)
df_text

Job completed!


Unnamed: 0,notes,ai_composed,human_revised,ai_translated
0,The Lexical Multi-Dimensional Analysis (LMD An...,The Lexical Multi-Dimensional Analysis (LMD An...,The Lexical Multi-Dimensional Analysis (LMD An...,A Análise Multidimensional Lexical (Análise LM...


## Translating with ChatGPT

### Prompts

Dear ChatGPT, please translate the following text into Brazilian Portuguese.\n

In [None]:
# Update the dataframe with the revised texts.
# Reading the JSON back lets pandas infer the column types again, so a column that still
# holds only the numeric placeholder would presumably become numeric, and assigning a
# translated string to it triggers the incompatible-dtype warning. Casting to 'object'
# mirrors what the input-collection cell does.
df_text = pd.read_json(output_file_json).astype('object')
# The prompt is the same for every row, so it is defined once before the loop
prompt = 'Dear ChatGPT, please translate the following text into Brazilian Portuguese.\n'
with open(output_file, 'a', encoding = 'utf8', newline='\n') as responses:
    responses.write('Translating with ChatGPT' + '\n\n')
    responses.write('Start time: ' + str(dt.datetime.now()) + '\n\n')
    for index, row in df_text.iterrows():
        revised = row['human_revised']
        if not isinstance(revised, str):
            # An unrevised row still holds the numeric placeholder or a null value
            raise TypeError(
                'Row ' + str(index) + ": column 'human_revised' holds " + repr(revised)
                + ', not text. Edit the output JSON file before translating.'
            )
        responses.write('Human Revised ' + str(index) + ':\n' + revised + '\n\n')
        print('Human Revised ' + str(index) + ':\n' + revised)
        query = get_completion(prompt + revised)
        df_text.at[index, 'ai_translated'] = query
        responses.write('Translated by AI ' + str(index) + ':\n' + query + '\n\n')
        print('\nTranslated by AI ' + str(index) + ':\n' + query)
        clear_output(wait = True)
    responses.write('End time: ' + str(dt.datetime.now()) + '\n\n')
print('Job completed!')
# Save the dataframe, now including the translations, back to the JSON file as indented JSON
df_text_json = df_text.to_json()
df_text_json_parsed = json.loads(df_text_json)
df_text_json_prettified = json.dumps(df_text_json_parsed, indent=4)
with open(output_file_json, 'w', encoding='utf8', newline='\n') as file:
    file.write(df_text_json_prettified)
df_text