In [1]:
import os
import re
import csv
import nltk
import openai
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tokenizers and the lemmatizer below.
for _nltk_resource in ('punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# OpenAI API setup
MODEL = "gpt-3.5-turbo"
openai.api_key = 'X'

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Split *text* into sentences and lemmatize the tokens of each one.

    Runs of whitespace are collapsed to single spaces, the text is split
    into sentences, and each sentence is lower-cased and word-tokenized.

    Args:
        text: Raw document text.

    Returns:
        A list of ``(sentence, lemmas)`` tuples, where ``sentence`` is the
        lower-cased sentence string and ``lemmas`` is the list of its tokens
        after ``WordNetLemmatizer.lemmatize`` (called without a part of
        speech).
    """
    normalized = re.sub(r'\s+', ' ', text)
    lemmatizer = WordNetLemmatizer()
    processed_sentences = []

    # Segment BEFORE lower-casing: the Punkt sentence tokenizer uses
    # capitalization as evidence for sentence boundaries, so lower-casing the
    # whole text first degrades the split. Each sentence is lower-cased
    # afterwards, so the returned strings keep the same form as before.
    for raw_sentence in sent_tokenize(normalized):
        sentence = raw_sentence.lower()
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(sentence)]
        processed_sentences.append((sentence, lemmas))

    return processed_sentences

def filter_potential_similes(sentences):
    """Keep the sentences that contain a surface marker of a simile.

    A sentence qualifies when it contains the word "like", or at least two
    occurrences of the word "as" (the "as ... as" construction).

    The markers are looked up both in the lemmatized tokens and in the words
    of the sentence text itself. The lemmas alone are not reliable for "as":
    the WordNet lemmatizer, called without a part of speech, can reduce "as"
    to "a", which would make the "as ... as" test never succeed.

    Args:
        sentences: Iterable of ``(sentence, words)`` pairs, where ``words`` is
            the list of (lemmatized) tokens of ``sentence``.

    Returns:
        List of the sentence strings that passed the filter, in input order.
    """
    simile_sentences = []
    for sentence, words in sentences:
        # Surface forms, independent of whatever the lemmatizer did to them.
        surface_words = re.findall(r"\w+", sentence.lower())

        has_like = "like" in words or "like" in surface_words
        # Count in each view separately so one "as" is never counted twice.
        has_as_pair = words.count("as") >= 2 or surface_words.count("as") >= 2

        if has_like or has_as_pair:
            simile_sentences.append(sentence)
    return simile_sentences

# Setting your OpenAI API key
# SECURITY: a secret key must never be written into source code or a notebook.
# The key that used to be hard-coded here has been exposed and should be
# revoked. The key is now read from the OPENAI_API_KEY environment variable;
# it is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")
MODEL = "gpt-3.5-turbo"  # Choose the model according to your account

# Function to check if given sentences are similes
def check_if_simile(sentences):
    """Ask the chat model which of *sentences* contain a simile.

    Every sentence is sent in its own chat-completion request, together with
    a system prompt asking for a 'Yes' / 'No' answer.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List of the input sentences whose reply contains the standalone word
        "yes" (case-insensitive), in input order.
    """
    prompt = (
        "Act as a computational linguist, Your task is to determine whether a given sentence contains a simile. "
        "A simile is a figure of speech that directly compares two different things, "
        "typically using words such as 'like' or 'as...as...'. Review the sentence provided "
        "and respond with 'Yes' if it contains a simile, or 'No' if it does not."
    )
    # Match "yes" as a whole word: a plain substring test would also accept
    # replies that merely contain words such as "eyes" or "yesterday".
    affirmative = re.compile(r"\byes\b", re.IGNORECASE)

    similes = []
    for sentence in sentences:
        response = openai.ChatCompletion.create(
            model=MODEL,  # module-level model name instead of a repeated literal
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": sentence}
            ],
            max_tokens=10,  # the expected answer is a single word
            temperature=0.5
        )
        reply = response.choices[0].message['content'] or ""
        if affirmative.search(reply):
            similes.append(sentence)
    return similes

# Function to extract elements of similes

def _find_labeled_value(label, text):
    """Return the value that follows ``label:`` in *text*, or "" if absent."""
    match = re.search(re.escape(label) + r"\s*:\s*(\S.*)", text, re.IGNORECASE)
    if not match:
        return ""
    # Drop trailing whitespace and the markup/punctuation models tend to add
    # around a short answer (quotes, markdown asterisks, a final period).
    return match.group(1).strip().strip("*'\"").rstrip(".").strip()


def extract_elements(similes):
    """Extract tenor, vehicle, and shared property from simile sentences.

    Each simile is sent in its own chat-completion request. The model is
    asked to answer with ``Tenor:``, ``Vehicle:`` and ``Shared property:``
    lines, and the value after each label is read from the reply.

    Args:
        similes: Iterable of simile sentence strings.

    Returns:
        List of ``(simile, tenor, vehicle, shared_property)`` tuples in input
        order. An element is "" when its label is missing from the reply or
        has no value.
    """
    elements = []
    # Every fragment ends with a space so the sentences do not run together
    # when the adjacent string literals are concatenated.
    system_prompt = (
        "Act as a computational linguist and identify the core elements of a simile. "
        "The tenor is the primary subject of the simile, which is being described. "
        "It is the part of the simile that is being described or compared to something else to convey meaning. "
        "The vehicle is the image or concept used to make the comparison. "
        "The shared property is the characteristic or quality that is common to both the tenor and the vehicle. "
        "Identify the tenor, the vehicle (object of comparison), and the shared property if explicitly mentioned. "
        "Use one word in the sentence for each element."
    )

    for simile in similes:
        # Ask for exactly the labels that are parsed below; without an explicit
        # format the reply is free-form and the labels may never appear.
        user_prompt = (
            "Given the sentence, identify the tenor, the vehicle and the shared property. "
            "Answer on exactly three lines in the following format, leaving the value after "
            "'Shared property:' empty if the property is not explicitly mentioned:\n"
            "Tenor: <word>\n"
            "Vehicle: <word>\n"
            "Shared property: <word>\n\n"
            f"'{simile}'"
        )

        response = openai.ChatCompletion.create(
            model=MODEL,  # module-level model name instead of a repeated literal
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=60,
            temperature=0.5
        )

        text = response.choices[0].message['content'] or ""

        # Labels are matched case-insensitively ("Shared Property:" is as
        # likely as "Shared property:"); the value runs to the end of its line.
        tenor = _find_labeled_value("Tenor", text)
        vehicle = _find_labeled_value("Vehicle", text)
        shared_property = _find_labeled_value("Shared property", text)

        elements.append((simile, tenor, vehicle, shared_property))

    return elements

def get_core_word(phrase):
    """Ask the chat model for the core word of *phrase*.

    Args:
        phrase: A short phrase.

    Returns:
        The model's reply, stripped of surrounding whitespace, with all single
        and double quote characters removed, and lower-cased. Returns ""
        without making a request when *phrase* is empty or only whitespace.
    """
    # Nothing to extract: avoid a pointless (and billed) API round trip.
    if not phrase or not phrase.strip():
        return ""

    response = openai.ChatCompletion.create(
        model=MODEL,  # module-level model name instead of a repeated literal
        messages=[
            {"role": "system", "content": "Act as a computational linguist. When provided a phrase, respond with only the core word of the phrase and nothing else."},
            {"role": "user", "content": f"Extract the core word from the phrase: '{phrase}'"}
        ]
    )

    core_word = response['choices'][0]['message']['content'].strip()
    # The model sometimes echoes the quotes from the request; remove them.
    core_word = re.sub(r"['\"]", "", core_word).lower()
    return core_word

# Function to process a single book
def process_book(file_path):
    """Run the simile pipeline on one book and return the results as a table.

    The file name must have the form ``<rank>_<book name>.txt``. The text is
    tokenized, filtered for candidate similes, confirmed by the chat model,
    and each confirmed simile is decomposed into its elements.

    Args:
        file_path: Path to a UTF-8 text file named ``<rank>_<book name>.txt``.

    Returns:
        A ``pandas.DataFrame`` with the columns 'Rank', 'Book Name', 'Simile',
        'Tenor', 'Vehicle' and 'Shared Property', one row per confirmed simile.
        Multi-word values in the last three columns are reduced with
        ``get_core_word``.

    Raises:
        ValueError: If the file name contains no '_' separating rank and name.
    """
    file_name = os.path.basename(file_path)
    rank, separator, book_name = file_name.partition('_')
    if not separator:
        raise ValueError(
            f"Expected a file name of the form '<rank>_<book name>.txt', got {file_name!r}"
        )
    # Remove the extension only at the end of the name; str.replace would also
    # delete a '.txt' occurring inside the title.
    if book_name.endswith('.txt'):
        book_name = book_name[:-len('.txt')]
    book_name = book_name.strip()

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The file is closed here; the steps below make network requests and do
    # not need the handle.
    processed_sentences = clean_and_tokenize(content)
    potential_similes = filter_potential_similes(processed_sentences)
    confirmed_similes = check_if_simile(potential_similes)
    simile_elements = extract_elements(confirmed_similes)

    data = [
        [rank, book_name, simile, tenor, vehicle, shared_property]
        for simile, tenor, vehicle, shared_property in simile_elements
    ]

    # Convert data to DataFrame
    df_output = pd.DataFrame(data, columns=['Rank', 'Book Name', 'Simile', 'Tenor', 'Vehicle', 'Shared Property'])

    # Apply get_core_word to ensure each element is a single word
    for column in ['Tenor', 'Vehicle', 'Shared Property']:
        df_output[column] = df_output[column].apply(lambda x: get_core_word(x) if len(x.split()) > 1 else x)

    return df_output

# Function to save DataFrame to CSV
def save_to_csv(df, output_file):
    """Write *df* to *output_file* as UTF-8 encoded CSV without the index column."""
    csv_options = {"index": False, "encoding": "utf-8"}
    df.to_csv(output_file, **csv_options)



[nltk_data] Downloading package punkt to /Users/xumingkai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xumingkai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [96]:
# Run the pipeline on one book of the corpus and write the result to a CSV file.
file_path = "corpus/100_Incidents in the Life of a Slave Girl, Written by Herself.txt"
output_file = "100.csv"
# process_book returns a DataFrame with one row per confirmed simile.
df_output = process_book(file_path)
save_to_csv(df_output, output_file)