### Imports

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter

from ebooklib import epub
from util import ebook_parser

import pandas as pd
import re

### ML Model

In [2]:
# Load the pre-trained "dslim/bert-large-NER" token-classification weights once.
# extract_character_names reads this module-level `bert_model` when it builds its
# NER pipeline, so this cell must run before that function is called.
bert_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Book Pre-Processing

In [3]:
import xml.etree.ElementTree as ET
import os  
import shutil  
import zipfile  
import re  

def get_navpoints(book):
    """Collect the NCX table-of-contents entries that look like front/back matter.

    The book's item with id ``'ncx'`` is parsed as XML and every ``navPoint``
    under ``navMap`` is inspected. An entry is kept when its label contains
    (case-insensitively) one of the search words below.

    Args:
        book: Object exposing ``get_item_with_id('ncx')``; the returned item
            must have a bytes ``content`` attribute holding the NCX document.

    Returns:
        A list of ``(label_text, target_string, start_anchor_id)`` tuples.
        For a ``src`` containing ``'.html'`` the split is made at its first
        occurrence (``'a.htm.html#x'`` -> ``'a.htm'``, ``'#x'``). For any other
        ``src`` (e.g. ``.xhtml``) the fragment is split off at ``'#'`` and the
        file extension is dropped from the path. ``start_anchor_id`` is ``''``
        when the entry has no fragment. An empty list is returned when the
        book has no NCX item or the NCX has no ``navMap``.
    """
    ncx_ns = '{http://www.daisy.org/z3986/2005/ncx/}'

    # All entries are lower-case so they can be compared against a lowered label.
    search_words = ["title", "copyright", "dedication", "epigraph", "table of contents",
                    "contents", "foreword", "preface", "acknowledgements", "introduction", "prologue",
                    "epilogue", "afterword", "footnote", "endnote", "glossary", "index", "appendix",
                    "appendices", "illustration", "bibliography", "references", "blurb", "praise",
                    "biography", "project gutenberg"]

    toc = book.get_item_with_id('ncx')
    if toc is None:
        return []

    root = ET.fromstring(toc.content.decode('utf-8'))
    nav_map = root.find(ncx_ns + 'navMap')
    if nav_map is None:
        return []

    navPoints = []
    for navPoint in nav_map.iter(ncx_ns + 'navPoint'):
        label = navPoint.find(ncx_ns + 'navLabel')
        label_text = label.find(ncx_ns + 'text') if label is not None else None
        content = navPoint.find(ncx_ns + 'content')
        if label_text is None or content is None:
            continue  # malformed entry: nothing to match or nowhere to point

        text = label_text.text or ''  # an empty <text/> element has text None
        src = content.get('src', '')

        lowered_text = text.lower()
        if not any(word in lowered_text for word in search_words):
            continue

        if '.html' in src:
            # Same split as before for the common "<file>.html#<anchor>" form.
            target_string, start_anchor_id = src.split('.html', 1)
        else:
            # Other extensions (.xhtml, .htm, ...): split off the fragment
            # instead of raising IndexError on the missing '.html'.
            path, hash_sign, fragment = src.partition('#')
            target_string = os.path.splitext(path)[0]
            start_anchor_id = hash_sign + fragment
        navPoints.append((text, target_string, start_anchor_id))
    return navPoints

def generate_processed_book(book_path, book_name, epub_file_extension, navPoints):
    """Write a copy of an EPUB with the sections named in ``navPoints`` cut out.

    The EPUB is unzipped into a scratch directory. For every nav point, each
    extracted file whose name contains ``target_string`` has the text from the
    element carrying the start anchor id up to the element carrying the *next*
    anchor id removed. The next id is the start id with its trailing number
    incremented. For the special id ``'pg-footer-heading'`` everything up to
    ``</body>`` is removed instead. The result is re-zipped to
    ``data/books-pre-processed/<book_name>-pre-processed<epub_file_extension>``.

    Args:
        book_path: Path of the source EPUB.
        book_name: File stem used for the output name.
        epub_file_extension: Extension appended to the output name.
        navPoints: Iterable of ``(text, target_string, start_anchor_id)``;
            ``start_anchor_id`` is expected to start with ``'#'``.

    Notes:
        * Nav points with an empty target, an empty anchor, or an anchor that
          does not end in digits are skipped, because no end anchor can be
          derived for them.
        * The scratch directory is cleared before use and always removed
          afterwards, even when processing fails.
        * ``mimetype`` is written first and uncompressed, as the EPUB
          container format requires.
    """
    def get_next_anchor_id(current_id):
        """Return ``current_id`` with its trailing number incremented, or None if it has none."""
        match = re.match(r'(.*?)(\d+)$', current_id)
        if match is None:
            return None
        prefix, num = match.groups()
        # zfill keeps the zero padding, e.g. "pgepubid00009" -> "pgepubid00010".
        return f'{prefix}{str(int(num) + 1).zfill(len(num))}'

    temp_dir = 'temp_epub_extracted'
    new_epub_path = 'data/books-pre-processed/' + book_name + '-pre-processed' + epub_file_extension

    # Leftovers from an interrupted earlier run would otherwise end up in the new EPUB.
    shutil.rmtree(temp_dir, ignore_errors=True)
    try:
        with zipfile.ZipFile(book_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        for text, target_string, start_anchor_id in navPoints:
            start_anchor_id = start_anchor_id[1:]  # drop the leading '#'
            if not target_string or not start_anchor_id:
                continue

            if start_anchor_id == 'pg-footer-heading':
                pattern = fr'(<[^>]*id="{re.escape(start_anchor_id)}".*?>).*?(</body>)'
            else:
                end_anchor_id = get_next_anchor_id(start_anchor_id)
                if end_anchor_id is None:
                    continue
                pattern = fr'(<[^>]*id="{re.escape(start_anchor_id)}".*?>).*?(<[^>]*id="{re.escape(end_anchor_id)}")'

            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    if target_string not in file:
                        continue
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    # Keep only the closing delimiter (group 2); the section before it is dropped.
                    new_content = re.sub(pattern, r'\2', content, flags=re.DOTALL)
                    if new_content != content:
                        with open(file_path, 'w', encoding='utf-8') as f:
                            f.write(new_content)

        # Zip the contents back into a new EPUB file.
        os.makedirs(os.path.dirname(new_epub_path), exist_ok=True)
        mimetype_path = os.path.join(temp_dir, 'mimetype')
        with zipfile.ZipFile(new_epub_path, 'w', zipfile.ZIP_DEFLATED) as zip_ref:
            if os.path.isfile(mimetype_path):
                zip_ref.write(mimetype_path, 'mimetype', compress_type=zipfile.ZIP_STORED)
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, temp_dir)
                    if arcname == 'mimetype':
                        continue  # already written as the first entry
                    zip_ref.write(file_path, arcname)
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

In [4]:
# Candidate books; each entry is the file stem of an EPUB under data/books/.
book_names = ["the-odyssey", "war-and-peace", "animal-farm", "a-christmas-carol"]
# Index into book_names of the book to process (2 -> "animal-farm").
book_index = 2
book_name = book_names[book_index]

books_path = "data/books/"
epub_file_extension = ".epub"
book_path = books_path + book_name + epub_file_extension

# Read the original EPUB, collect the TOC entries whose labels match the
# front/back-matter search words, and write a copy with those sections removed
# to data/books-pre-processed/<book_name>-pre-processed.epub.
book = epub.read_epub(book_path)
navPoints = get_navpoints(book)
generate_processed_book(book_path, book_name, epub_file_extension, navPoints)



### Function Definitions

In [5]:
# Process ePub to dataframe.

def process_ebook(book):
    """Turn a parsed e-book into a table with one joined sentence per row.

    The first element of ``ebook_parser.process_book(book)`` is loaded into a
    DataFrame. Its ``'w'`` values are grouped by ``'sid'`` and joined with
    single spaces. Backslashes and newlines inside the joined text are then
    replaced by a space.

    Args:
        book: Passed through unchanged to ``ebook_parser.process_book``.

    Returns:
        A DataFrame with the columns ``'sid'`` and ``'sentence'``.
    """
    parsed_words, _, _ = ebook_parser.process_book(book)
    word_table = pd.DataFrame(parsed_words)

    sentence_table = (
        word_table.groupby('sid')['w']
        .apply(lambda tokens: ' '.join(tokens))
        .reset_index(name='sentence')
    )

    # Both patterns are regular expressions: a literal backslash and a newline.
    sentence_table['sentence'] = sentence_table['sentence'].replace(
        to_replace=[r"\\", r"\n"], value=' ', regex=True
    )
    return sentence_table

# Returns True if 'character' consists only of Latin letters (including accented ones).

def is_alphabetic(character):
    """Report whether ``character`` is made up solely of Latin letters.

    Accepted letters are ASCII ``A-Z``/``a-z`` plus the accented ranges
    ``À-Ö``, ``Ø-ö`` and ``ø-ÿ``. The empty string is rejected. Because the
    pattern ends in ``$``, a single trailing newline after the letters is
    still accepted.

    Args:
        character: The string to test (callers in this file pass one character).

    Returns:
        ``True`` when the whole string matches, otherwise ``False``.
    """
    latin_letters_only = '^[A-Za-zÀ-ÖØ-öø-ÿ]+$'
    return re.match(latin_letters_only, character) is not None

# Divide book into appropriate token sized chunks.

def chunkify(tokenizer, sentences_df):
    """Split the whole book text into chunks of at most 512 tokenizer tokens.

    All values of ``sentences_df['sentence']`` are joined with single spaces
    and handed to a ``RecursiveCharacterTextSplitter`` built from
    ``tokenizer`` with ``chunk_size=512`` and ``chunk_overlap=0``.

    Args:
        tokenizer: Hugging Face tokenizer used to measure chunk length.
        sentences_df: Mapping/DataFrame whose ``'sentence'`` entry iterates
            over strings.

    Returns:
        Whatever ``split_text`` returns for the joined text.
    """
    # NOTE(review): 512 is presumably the model's input limit; whether special
    # tokens are counted by the splitter is not visible here - confirm.
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, chunk_size=512, chunk_overlap=0
    )
    full_text = ' '.join(sentences_df['sentence'])
    return splitter.split_text(full_text)

# Extract character names.

def extract_character_names(ner_chunks, tokenizer):
    """Run NER over every chunk and collect the person mentions it finds.

    A ``"ner"`` pipeline is built from the module-level ``bert_model`` and the
    given ``tokenizer``. Only entities tagged ``B-PER`` or ``I-PER`` are
    considered, and each is widened to its surrounding word with
    ``extract_full_name``.

    A person token is merged into the previous mention of the same chunk when
    its token index is exactly one away from that mention's stored index and
    it is not a ``##`` sub-word piece. Otherwise it starts a new mention, but
    only if it ends after the previous mention. Tokens that neither merge nor
    start a mention are dropped.

    Args:
        ner_chunks: Iterable of text chunks.
        tokenizer: Tokenizer passed to the pipeline.

    Returns:
        A list of dicts with the keys ``index``, ``word``, ``start``, ``end``,
        ``score`` and ``chunk``. ``chunk`` is the position of the chunk in
        ``ner_chunks``, and ``score`` is that of the first token of the mention.
    """
    nlp = pipeline("ner", model=bert_model, tokenizer=tokenizer)
    person_tags = ('B-PER', 'I-PER')
    final_names = []

    for chunk_index, ner_chunk in enumerate(ner_chunks):
        chunk_mentions = []

        for entity in nlp(ner_chunk):
            if entity['entity'] not in person_tags:
                continue

            clean_name, start, end = extract_full_name(
                entity['word'], entity['start'], entity['end'], ner_chunk
            )
            previous = chunk_mentions[-1] if chunk_mentions else None

            is_next_word_of_previous = (
                previous is not None
                and abs(previous['index'] - entity['index']) == 1
                and "##" not in entity['word']
            )

            if is_next_word_of_previous:
                previous['index'] += 1
                previous['end'] = end
                previous['word'] += " " + clean_name
            elif previous is None or previous['end'] < entity['end']:
                # The comparison uses the token's own end offset, not the widened one.
                chunk_mentions.append({
                    'index': entity['index'],
                    'word': clean_name,
                    'start': start,
                    'end': end,
                    'score': entity['score'],
                    'chunk': chunk_index,
                })

        final_names.extend(chunk_mentions)

    return final_names

# Helper function to extract the full name.

def extract_full_name(word, start, end, ner_chunk):
    """Widen a (sub-)word token to the whole alphabetic word around it.

    Starting from the token's ``[start, end)`` span in ``ner_chunk``, the span
    is grown to the left and to the right for as long as the neighbouring
    character is not a space and ``is_alphabetic`` accepts it.

    Args:
        word: Token text; any ``"##"`` markers are removed.
        start: Offset of the token's first character in ``ner_chunk``.
        end: Offset just past the token's last character.
        ner_chunk: The text the offsets refer to.

    Returns:
        ``(clean_name, start, end)``: the token text with the absorbed
        characters added on both sides, and the widened span.
    """
    def _belongs_to_word(char):
        return char != " " and is_alphabetic(char)

    left = start
    while left > 0 and _belongs_to_word(ner_chunk[left - 1]):
        left -= 1

    right = end
    chunk_length = len(ner_chunk)
    while right < chunk_length and _belongs_to_word(ner_chunk[right]):
        right += 1

    clean_name = ner_chunk[left:start] + word.replace("##", "") + ner_chunk[end:right]
    return clean_name, left, right

# Removes non-character entries in a list.

def filter_names(names):
    """Clean raw NER name strings and drop the ones that are not usable names.

    Each name goes through these steps, in order:

    1. Every entry of the title list from ``get_titles_to_remove`` is removed
       where it occurs as a whole word. A title is never cut out of the middle
       of a longer word, so ``"son "`` leaves ``"Jackson Pollock"`` intact.
    2. All characters other than Latin letters (including the accented ranges
       ``À-Ö``, ``Ø-ö``, ``ø-ÿ``), ``'.'`` and space are removed, and
       surrounding whitespace is stripped.
    3. If every second character (from the second one on) is a space, as in
       ``"B o x e r"``, all spaces are removed.
    4. Empty results and bare honorifics (with or without a trailing ``'.'``)
       are discarded.
    5. The first letter is upper-cased.

    Args:
        names: Iterable of name strings.

    Returns:
        A list of cleaned names, in input order; duplicates are kept.
    """
    titles_to_remove, honorifics_without_punctuation_and_space = get_titles_to_remove()

    # One compiled pattern per title, in list order. Titles ending in a space
    # or '.' delimit themselves on the right; the others need an explicit
    # "no word character follows" check.
    title_patterns = []
    for title in titles_to_remove:
        if not title.strip():
            # A blank line in a list file would otherwise strip every space.
            continue
        right_boundary = r'(?!\w)' if re.match(r'\w', title[-1]) else ''
        title_patterns.append(re.compile(r'(?<!\w)' + re.escape(title) + right_boundary))

    exact_honorifics = set(honorifics_without_punctuation_and_space)
    exact_honorifics.update(
        honorific + "." for honorific in honorifics_without_punctuation_and_space
    )

    disallowed_characters = re.compile('[^A-Za-zÀ-ÖØ-öø-ÿ. ]')

    def remove_titles(name):
        for pattern in title_patterns:
            name = pattern.sub("", name).strip()
        return name

    def keep_name_characters(name):
        # Strip afterwards so "Boxer 2" and "Boxer" end up as the same name.
        return disallowed_characters.sub('', name).strip()

    def join_spaced_out_letters(name):
        if all(ch == ' ' for ch in name[1::2]):
            return name.replace(' ', '')
        return name

    filtered_names = []
    for name in names:
        name = join_spaced_out_letters(keep_name_characters(remove_titles(name)))
        if not name or name in exact_honorifics:
            continue
        filtered_names.append(name[0].upper() + name[1:])

    return filtered_names

def get_titles_to_remove():
    """Load the filter lists used to strip titles and non-character words from names.

    Four text files under ``data/filter-lists/`` are read, one entry per line,
    with surrounding whitespace stripped from every line.

    Returns:
        A 2-tuple ``(titles_to_remove, bare_honorifics)``:

        * ``titles_to_remove`` is the concatenation, in this order, of
          the punctuation-free honorifics followed by ``". "``,
          the same honorifics followed by ``" "``,
          the entries of the honorifics file followed by ``" "``,
          the family relationships followed by ``" "``,
          and the real entities unchanged.
        * ``bare_honorifics`` are the punctuation-free honorifics as read.
    """
    def _stripped_lines(path):
        with open(path, "r") as handle:
            return [line.strip() for line in handle.readlines()]

    filter_lists_dir = "data/filter-lists/"
    honorifics_dir = filter_lists_dir + "salutations-titles-honorifics/"

    bare_honorifics = _stripped_lines(
        honorifics_dir + "list-of-salutations-titles-honorifics-without-punctuation.txt"
    )
    honorifics = _stripped_lines(honorifics_dir + "list-of-salutations-titles-honorifics.txt")
    relationships = _stripped_lines(filter_lists_dir + "list-of-family-relationships.txt")
    real_entities = _stripped_lines(filter_lists_dir + "list-of-real-entities.txt")

    titles_to_remove = (
        [honorific + ". " for honorific in bare_honorifics]
        + [honorific + ' ' for honorific in bare_honorifics]
        + [honorific + ' ' for honorific in honorifics]
        + [relationship + ' ' for relationship in relationships]
        + real_entities
    )
    return titles_to_remove, list(bare_honorifics)

def append_name_frequency(filtered_entities):
    """Count how often each name occurs and keep the ones seen more than once.

    Args:
        filtered_entities: Iterable of name strings (duplicates expected).

    Returns:
        A list of ``{'name': ..., 'frequency': ...}`` dicts for every name that
        occurs at least twice, sorted by descending frequency. Names with the
        same frequency keep the order of their first occurrence in the input,
        so the result is the same on every run.
    """
    from collections import Counter

    # most_common() sorts stably by count, and Counter preserves first-seen order.
    return [
        {'name': name, 'frequency': occurrences}
        for name, occurrences in Counter(filtered_entities).most_common()
        if occurrences > 1
    ]

def filter_entities(filtered_names_and_freq, book=None):
    """Drop name records that duplicate another character or name the book's author.

    The following filters run in order on a list of ``{'name': ..., ...}`` dicts:

    1. *Shared last names*: for multi-word names the part after the first
       space is the "last name". A record whose whole name equals a last name
       carried by two or more different multi-word names is removed.
    2. *Shared first names*: a record whose whole name equals a first name
       carried by two or more different multi-word names is removed.
    3. *Author*: records whose name equals a ``DC``/``creator`` metadata value
       of ``book`` are removed. Nothing happens when the book has no creator.
    4. *Plural last names*: a record ending in ``'s'`` is removed when the
       name without that ``'s'`` is the final word of some multi-word name.

    Args:
        filtered_names_and_freq: List of dicts with at least a ``'name'`` key.
        book: Object exposing ``get_metadata('DC', 'creator')`` that returns a
            sequence of ``(value, attributes)`` pairs. When omitted, the
            module-level ``book`` is used if one exists; otherwise the author
            step is skipped.

    Returns:
        A new list with the surviving records, in input order.
    """
    if book is None:
        # Backward compatibility: earlier callers relied on a notebook-global `book`.
        book = globals().get('book')

    def _multiword_name_parts(entries):
        """Return [first, rest] for every distinct name that contains a space."""
        distinct_names = {entry['name'] for entry in entries if ' ' in entry['name']}
        return [name.split(' ', 1) for name in distinct_names]

    def _values_seen_more_than_once(values):
        seen, repeated = set(), set()
        for value in values:
            (repeated if value in seen else seen).add(value)
        return repeated

    def remove_last_name_if_present_in_multiple_names(entries):
        shared_last_names = _values_seen_more_than_once(
            last for _, last in _multiword_name_parts(entries)
        )
        return [entry for entry in entries if entry['name'] not in shared_last_names]

    def process_entities_with_same_first_name(entries):
        shared_first_names = _values_seen_more_than_once(
            first for first, _ in _multiword_name_parts(entries)
        )
        return [entry for entry in entries if entry['name'] not in shared_first_names]

    def remove_author(book, entries):
        if book is None:
            return entries
        creators = book.get_metadata('DC', 'creator')
        if not creators:
            # None or an empty list: indexing [0][0] would raise here.
            return entries
        authors = {creator[0] for creator in creators}
        return [entry for entry in entries if entry['name'] not in authors]

    def remove_plural_lastnames(entries):
        lastnames = {entry['name'].split(' ')[-1] for entry in entries if ' ' in entry['name']}
        return [
            entry for entry in entries
            if not (entry['name'].endswith('s') and entry['name'][:-1] in lastnames)
        ]

    filtered_names_and_freq = remove_last_name_if_present_in_multiple_names(filtered_names_and_freq)
    filtered_names_and_freq = process_entities_with_same_first_name(filtered_names_and_freq)
    filtered_names_and_freq = remove_author(book, filtered_names_and_freq)
    filtered_names_and_freq = remove_plural_lastnames(filtered_names_and_freq)

    return filtered_names_and_freq

### bert-large-NER

In [6]:
# Re-open the pre-processed EPUB. This rebinds the module-level `book_path` and
# `book`, which the pre-processing cell had set to the original file; the later
# author-removal step in filter_entities picks up this `book`.
book_path = 'data/books-pre-processed/' + book_name + '-pre-processed' + epub_file_extension
book = epub.read_epub(book_path)

# Build the per-sentence table and persist it as CSV (without the index column).
sentences_df = process_ebook(book)
sentences_df.to_csv('data/sentences/' + book_names[book_index] + '_sentences.csv', index=False)

# The tokenizer comes from the same "dslim/bert-large-NER" checkpoint as bert_model.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
ner_chunks = chunkify(tokenizer, sentences_df)
entities = extract_character_names(ner_chunks, tokenizer)




tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Keep only mentions with a score of at least 0.95 and more than one character,
# then reduce them to their text. `names` is first a list of entity dicts and
# afterwards a list of strings.
names = [entity for entity in entities if entity['score'] >= 0.95 and len(entity['word']) > 1]
names = [entity['word'] for entity in names]

# `filtered_names_and_freq` is reused: first the cleaned name strings, then the
# list of {'name', 'frequency'} dicts produced by append_name_frequency.
filtered_names_and_freq = filter_names(names)
filtered_names_and_freq = append_name_frequency(filtered_names_and_freq)
filtered_entities_and_freq = filter_entities(filtered_names_and_freq)

# NOTE: unlike the sentences CSV, this file is written without index=False.
final_names_df = pd.DataFrame(filtered_entities_and_freq)
final_names_df.to_csv(f"data/characters/{book_name}_characters.csv")
