In [3]:
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer data used by sent_tokenize below.
# The call reports "already up-to-date" when the package is present locally.
# NOTE(review): newer NLTK releases may look up the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')  # Make sure to download 'punkt' tokenizer models

def preprocess_text(text):
    """Normalise a piece of text for downstream matching.

    The text is lower-cased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed.
    Whitespace itself is left untouched, so removing punctuation between
    two spaces leaves both spaces in place.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with punctuation stripped.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

def find_sentences_and_chapters(text):
    """Break book text into sentences, each tagged with its chapter number.

    Chapter headings are recognised as the literal word ``CHAPTER`` followed
    by whitespace and a run of upper-case Roman-numeral letters. Chapter
    numbers are assigned sequentially (1, 2, 3, ...) in order of appearance;
    the numeral in the heading is not parsed.

    Args:
        text: The full book text.

    Returns:
        A list of ``(sentence, chapter_number)`` tuples, where ``sentence``
        has been passed through ``preprocess_text``.
    """
    # The first piece is whatever precedes the first heading; it belongs to
    # no chapter and is discarded.
    chapter_bodies = re.split(r'CHAPTER\s+[IVXLCDM]+', text)[1:]

    tagged = []
    for number, body in enumerate(chapter_bodies, start=1):
        # Collapse line breaks and runs of spaces before sentence splitting.
        flattened = re.sub(r'\s+', ' ', body).strip()
        for raw_sentence in sent_tokenize(flattened):
            tagged.append((preprocess_text(raw_sentence), number))
    return tagged

def save_sentences_to_csv(sentences, output_file):
    """Write (sentence, chapter) pairs to a CSV file.

    Args:
        sentences: Iterable of two-element records, sentence first and
            chapter second.
        output_file: Destination path for the CSV.

    The file gets a header row ``Sentence,Chapter`` and no index column.
    """
    header = ['Sentence', 'Chapter']
    pd.DataFrame(sentences, columns=header).to_csv(output_file, index=False)

def _extract_gutenberg_body(content):
    """Return the text between the Project Gutenberg start and end banners.

    The banners are matched case-insensitively on the original string, so
    the offsets always line up with ``content``. Missing pieces degrade
    gracefully: without a start banner the body begins at the top of the
    text, and without an end banner it runs to the end of the text.
    """
    start_match = re.search(
        r'\*\*\* start of the project gutenberg ebook', content, re.IGNORECASE
    )
    body_start = start_match.end() if start_match else 0

    # Only look for the end banner after the start banner.
    end_pattern = re.compile(
        r'\*\*\* end of the project gutenberg ebook', re.IGNORECASE
    )
    end_match = end_pattern.search(content, body_start)
    body_end = end_match.start() if end_match else len(content)

    if start_match:
        # The start banner line finishes with a closing "***" after the
        # title; skip past it when it is present before the end banner.
        closing = content.find("***", body_start, body_end)
        if closing != -1:
            body_start = closing + 3

    return content[body_start:body_end].strip()


def process_book_text_to_csv(input_file_path, output_csv_path):
    """Convert a Project Gutenberg text file into a sentence/chapter CSV.

    The file is read as UTF-8, the Gutenberg header and footer are removed,
    the remaining text is split into chapter-tagged sentences, and the
    result is written to ``output_csv_path``.

    Args:
        input_file_path: Path of the plain-text book to read.
        output_csv_path: Path of the CSV file to write.
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    main_content = _extract_gutenberg_body(content)
    sentences_with_chapters = find_sentences_and_chapters(main_content)

    # Write to the path the caller passed in, not to a notebook global.
    save_sentences_to_csv(sentences_with_chapters, output_csv_path)

    print("Process complete. Output saved to:", output_csv_path)



[nltk_data] Downloading package punkt to /Users/xumingkai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import spacy
import pandas as pd

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# extract_simile_components reads this global for every sentence it parses.
nlp = spacy.load("en_core_web_sm")

def is_potential_simile(token):
    """Decide whether a parsed token looks like the pivot of a simile.

    A token qualifies when all three conditions hold:
      * its dependency label is "prep" or "mark",
      * its head is tagged as a verb or adjective,
      * at least one of its children is a noun, adjective or proper noun.

    Args:
        token: A parsed token exposing ``dep_``, ``head.pos_`` and
            ``children`` (tokens of a spaCy ``Doc`` in this notebook).

    Returns:
        True if the token matches the pattern, otherwise False.
    """
    if token.dep_ not in ("prep", "mark"):
        return False
    if token.head.pos_ not in ("VERB", "ADJ"):
        return False
    return any(kid.pos_ in ("NOUN", "ADJ", "PROPN") for kid in token.children)
def extract_simile_components(sentence, chapter):
    """Collect candidate similes from one sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline. Every
    token whose lemma is "like" or "as" and which passes
    ``is_potential_simile`` yields one record, provided it has at least one
    child with the "pobj" dependency label.

    Args:
        sentence: The sentence text. Non-string values produce no results.
        chapter: Chapter identifier copied into every record.

    Returns:
        A list of dicts with keys "Chapter", "Tenor" (text of the token's
        head), "Vehicle" (space-joined text of the "pobj" children) and
        "Sentence". The list is empty when nothing matches.
    """
    if not isinstance(sentence, str):
        return []

    found = []
    for tok in nlp(sentence):
        if tok.lemma_ not in ("like", "as") or not is_potential_simile(tok):
            continue
        vehicle = " ".join(kid.text for kid in tok.children if kid.dep_ == "pobj")
        if not vehicle:
            # No prepositional object, so there is nothing to compare against.
            continue
        found.append({
            "Chapter": chapter,
            "Tenor": tok.head.text,
            "Vehicle": vehicle,
            "Sentence": sentence,
        })
    return found

def process_csv_and_find_similes(input_csv, output_csv):
    """Scan a sentence/chapter CSV for similes and save what is found.

    Args:
        input_csv: Path of a CSV with "Sentence" and "Chapter" columns.
        output_csv: Path of the CSV to write the simile records to.

    Missing cells are replaced with empty strings before processing. The
    output file is written only when at least one simile is found;
    otherwise a message is printed and no file is written.
    """
    sentences_df = pd.read_csv(input_csv).fillna('')

    found = [
        simile
        for _, record in sentences_df.iterrows()
        for simile in extract_simile_components(record['Sentence'], record['Chapter'])
    ]

    if not found:
        print("No similes found in the input data.")
        return

    pd.DataFrame(found).to_csv(output_csv, index=False)
    print(f"Simile analysis results saved to {output_csv}")


In [None]:
# Stage 1: convert the raw book text into a CSV of (sentence, chapter) rows.
# Both paths are relative, so they resolve against the kernel's working directory.
input_file_path = 'Piccadilly Jim.txt'
output_csv_path = 'Piccadilly Jim_sentences_with_chapters.csv'
process_book_text_to_csv(input_file_path, output_csv_path)

In [None]:
# Stage 2: search the sentence CSV written by the previous cell for similes.
# The input file name here must match that cell's output file name.
input_csv_path = 'Piccadilly Jim_sentences_with_chapters.csv'  
# Note: this rebinds output_csv_path, which the previous cell used for the
# sentences CSV; after this cell runs it refers to the simile results file.
output_csv_path = 'Piccadilly Jim_simile.csv'  

process_csv_and_find_similes(input_csv_path, output_csv_path)