In [2]:
# imports
from bertopic import BERTopic
import pandas as pd
import spacy
import json
import os

In [73]:
# read the book metadata table (every column kept as a string) and collect
# the values of its "id" column into a plain Python list
df_metadata = pd.read_csv(
    "../data/interim/books/books_metadata.txt",
    dtype="str",
)
ids = df_metadata["id"].tolist()

In [74]:
# read the full text of a single book into memory; `fulltext` is what the
# spaCy chunking cell consumes
id = "003"
load_path = "../data/interim/books/fulltexts/"
book_path = f"{load_path}book{id}.txt"
with open(book_path) as file:
    fulltext = file.read()

In [68]:
# split into satisfactory chunks using spaCy
#
# Sentences are packed greedily, in document order, into chunks whose total
# spaCy token count stays at or below `max_chunk_tokens`. A sentence is never
# split, so a single sentence longer than the limit becomes a chunk of its own.
nlp = spacy.load("en_core_web_sm")
doc = nlp(fulltext)

# 400 is an arbitrary value, keeping a large buffer
# to keep BERT's tokens below 512
max_chunk_tokens = 400

chunk_lists = []  # finished chunks, each a list of sentence strings
chunk_list = []   # sentences of the chunk currently being filled
token_count = 0   # spaCy tokens accumulated in `chunk_list`
for sent in doc.sents:
    sent_len = len(sent)
    # Close the current chunk only if it already holds something; otherwise an
    # over-long first sentence would emit an empty chunk.
    if chunk_list and token_count + sent_len > max_chunk_tokens:
        chunk_lists.append(chunk_list)
        chunk_list = []
        token_count = 0
    chunk_list.append(sent.text)
    token_count += sent_len

# Flush the trailing, partially filled chunk; without this the end of the
# text (or the whole text, when it is shorter than the limit) is dropped.
if chunk_list:
    chunk_lists.append(chunk_list)

chunks = [" ".join(sentences) for sentences in chunk_lists]

In [69]:
# load the pre-trained BERTopic model and keep its topic overview table,
# which the topic aggregation cell uses to look up each topic's "Representation"
pretrained_name = "MaartenGr/BERTopic_Wikipedia"
model = BERTopic.load(pretrained_name)
df_fulltopic = model.get_topic_info()

In [70]:
# run topic inference on the text chunks; the returned pair is unpacked into
# `topics` and `probabilities`, which the next cell walks in parallel by index
# (presumably one entry per chunk -- confirm against the BERTopic docs)
topics, probabilities = model.transform(chunks)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches: 100%|██████████| 7/7 [00:15<00:00,  2.22s/it]


In [71]:
# extract topics and probabilities
#
# Build one record per distinct topic: "prob" is the sum of the probabilities
# of every chunk assigned to that topic, "words" is whatever the topic overview
# table holds in "Representation" for that topic id.
topic_dicts = {}

for position, topic in enumerate(topics):
    if topic not in topic_dicts:
        # first time this topic shows up: create its record
        matching_rows = df_fulltopic.loc[df_fulltopic["Topic"] == topic]
        topic_dicts[topic] = {
            "prob": 0,
            "words": list(matching_rows["Representation"]),
        }
    topic_dicts[topic]["prob"] += probabilities[position]

# one row per topic (topic id as index), highest accumulated probability first
df_topic = pd.DataFrame.from_dict(topic_dicts, orient="index").sort_values(
    by="prob", ascending=False
)
df_topic

Unnamed: 0,prob,words
1471,7.567925,"[[legions, heresy, legion, crusade, factions, ..."
98,7.080056,"[[goddesses, stanzas, mythology, stanza, valky..."
2075,3.433458,"[[1605, gunpowder, undercroft, conspirators, 1..."
270,2.674975,"[[brigades, soldiers, reinforcements, troops, ..."
999,2.599750,"[[rescuers, rescuer, survivors, rescue, camped..."
...,...,...
1243,0.352430,"[[heterochromia, pigmentation, pigment, pigmen..."
769,0.351572,"[[verse, poetry, poet, poems, poem, poets, 166..."
273,0.349843,"[[refrigerants, refrigeration, refrigerant, re..."
1987,0.323385,"[[yoon, jin, jung, hye, kyung, ji, han, hyun, ..."


In [76]:
# full pipeline
#
# For every book id: read its full text, split it into sentence-aligned
# chunks, run BERTopic inference on the chunks, sum the probabilities per
# topic and write the resulting table to `<save_path>/<id>.csv`.
# Ids that raise an error are collected in `failed_ids_tm`.


def _chunk_sentences(doc, max_tokens=400):
    """Greedily pack the sentences of a spaCy doc into text chunks.

    Sentences are kept whole and in order. A chunk is closed as soon as adding
    the next sentence would push its spaCy token count above `max_tokens`; a
    single sentence longer than `max_tokens` becomes a chunk of its own.

    Args:
        doc: object exposing `.sents`, each sentence supporting `len()` and
            `.text` (a spaCy `Doc` in this notebook).
        max_tokens: upper bound on spaCy tokens per chunk. 400 is an arbitrary
            value, keeping a large buffer to keep BERT's tokens below 512.

    Returns:
        List of chunk strings (sentences joined by a single space). Empty when
        the doc has no sentences.
    """
    chunks = []
    current = []        # sentences of the chunk being filled
    current_tokens = 0  # spaCy tokens accumulated in `current`
    for sent in doc.sents:
        n_tokens = len(sent)
        # Only close a non-empty chunk, so an over-long first sentence does
        # not produce an empty chunk.
        if current and current_tokens + n_tokens > max_tokens:
            chunks.append(" ".join(current))
            current = []
            current_tokens = 0
        current.append(sent.text)
        current_tokens += n_tokens
    # Flush the trailing chunk; otherwise the end of every text (and all of a
    # text shorter than `max_tokens`) would be lost.
    if current:
        chunks.append(" ".join(current))
    return chunks


def _summarise_topics(topics, probabilities, topic_info):
    """Sum chunk probabilities per topic and attach the topic representation.

    Args:
        topics: topic id per chunk, as returned by `model.transform`.
        probabilities: values indexed in parallel with `topics`.
        topic_info: table with "Topic" and "Representation" columns.

    Returns:
        DataFrame indexed by topic id with columns "prob" and "words", sorted
        by "prob" in descending order.
    """
    topic_dicts = {}
    for i, topic in enumerate(topics):
        if topic not in topic_dicts:
            topic_dicts[topic] = {
                "prob": 0,
                "words": list(
                    topic_info.loc[topic_info["Topic"] == topic]["Representation"]
                ),
            }
        topic_dicts[topic]["prob"] += probabilities[i]

    df = pd.DataFrame.from_dict(topic_dicts, orient="index")
    return df.sort_values(by="prob", ascending=False)


failed_ids_tm = []

# directories
load_path = "../data/interim/books/fulltexts/"
save_path = "../data/interim/topics/"
# make sure the output directory exists, otherwise every write would fail
os.makedirs(save_path, exist_ok=True)

# models
nlp = spacy.load("en_core_web_sm")
model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")
df_fulltopic = model.get_topic_info()

for id in ids:
    try:
        # open the fulltext
        book_path = load_path + f"book{id}.txt"
        with open(book_path) as file:
            fulltext = file.read()

        # split into satisfactory chunks using spaCy
        doc = nlp(fulltext)
        chunks = _chunk_sentences(doc)
        if not chunks:
            raise ValueError("fulltext produced no sentences to chunk")

        # run topic modelling inference on each chunk
        topics, probabilities = model.transform(chunks)

        # extract topics and probabilities
        df_topic = _summarise_topics(topics, probabilities, df_fulltopic)

        # write topic_df to file
        # NOTE(review): index=False drops the topic ids (the frame's index);
        # kept as-is so the existing CSV layout does not change.
        out_file = save_path + f"{id}.csv"
        df_topic.to_csv(out_file, index=False)
        print(f"ID:{id} done.")

    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; the reason is reported instead of being swallowed.
        failed_ids_tm.append(id)
        print(f"ID:{id} failed. ({type(exc).__name__}: {exc})")

Batches: 100%|██████████| 2/2 [00:04<00:00,  2.10s/it]


ID:011 done.


Batches: 100%|██████████| 17/17 [00:50<00:00,  3.00s/it]


ID:012 done.


Batches: 100%|██████████| 2/2 [00:05<00:00,  2.90s/it]


ID:013 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


ID:014 done.


Batches: 100%|██████████| 2/2 [00:03<00:00,  1.79s/it]


ID:015 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.74it/s]


ID:016 done.


Batches: 100%|██████████| 4/4 [00:08<00:00,  2.18s/it]


ID:017 done.


Batches: 100%|██████████| 6/6 [00:16<00:00,  2.77s/it]


ID:018 done.


Batches: 100%|██████████| 8/8 [00:24<00:00,  3.07s/it]


ID:019 done.


Batches: 100%|██████████| 21/21 [00:51<00:00,  2.43s/it]


ID:020 done.


Batches: 100%|██████████| 9/9 [00:24<00:00,  2.68s/it]


ID:021 done.


Batches: 100%|██████████| 3/3 [00:08<00:00,  2.71s/it]


ID:022 done.


Batches: 100%|██████████| 6/6 [00:15<00:00,  2.52s/it]


ID:023 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]


ID:024 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


ID:025 done.


Batches: 100%|██████████| 16/16 [00:46<00:00,  2.93s/it]


ID:026 done.


Batches: 100%|██████████| 2/2 [00:02<00:00,  1.47s/it]


ID:027 done.


Batches: 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]


ID:028 done.


Batches: 100%|██████████| 10/10 [00:27<00:00,  2.77s/it]


ID:029 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it]


ID:030 done.


Batches: 100%|██████████| 7/7 [00:13<00:00,  1.97s/it]


ID:031 done.


Batches: 100%|██████████| 4/4 [00:09<00:00,  2.31s/it]


ID:032 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.90s/it]


ID:033 done.


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]


ID:034 done.


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.31s/it]


ID:035 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


ID:036 done.


Batches: 100%|██████████| 6/6 [00:16<00:00,  2.70s/it]


ID:037 done.


Batches: 100%|██████████| 10/10 [00:26<00:00,  2.60s/it]


ID:038 done.


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


ID:039 done.


Batches: 100%|██████████| 10/10 [00:24<00:00,  2.47s/it]


ID:040 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]


ID:041 done.


Batches: 100%|██████████| 11/11 [00:32<00:00,  2.93s/it]


ID:042 done.


Batches: 100%|██████████| 5/5 [00:10<00:00,  2.14s/it]


ID:043 done.


Batches: 100%|██████████| 7/7 [00:18<00:00,  2.68s/it]


ID:044 done.


Batches: 100%|██████████| 11/11 [00:29<00:00,  2.66s/it]


ID:045 done.


Batches: 100%|██████████| 5/5 [00:12<00:00,  2.42s/it]


ID:046 done.


Batches: 100%|██████████| 4/4 [00:08<00:00,  2.18s/it]


ID:047 done.


Batches: 100%|██████████| 8/8 [00:18<00:00,  2.32s/it]


ID:048 done.


Batches: 100%|██████████| 17/17 [00:46<00:00,  2.71s/it]


ID:049 done.


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]


ID:050 done.


Batches: 100%|██████████| 8/8 [00:23<00:00,  2.89s/it]


ID:051 done.


Batches: 100%|██████████| 5/5 [00:12<00:00,  2.58s/it]


ID:052 done.


Batches: 100%|██████████| 3/3 [00:06<00:00,  2.05s/it]


ID:053 done.


Batches: 100%|██████████| 8/8 [00:22<00:00,  2.78s/it]


ID:054 done.


Batches: 100%|██████████| 8/8 [00:22<00:00,  2.83s/it]


ID:055 done.


Batches: 100%|██████████| 8/8 [00:21<00:00,  2.64s/it]


ID:056 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]


ID:057 done.


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


ID:058 done.


Batches: 100%|██████████| 4/4 [00:07<00:00,  1.92s/it]


ID:059 done.


Batches: 100%|██████████| 8/8 [00:23<00:00,  2.89s/it]


ID:060 done.


Batches: 100%|██████████| 2/2 [00:05<00:00,  2.59s/it]


ID:061 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


ID:062 done.


Batches: 100%|██████████| 9/9 [00:23<00:00,  2.59s/it]


ID:063 done.


Batches: 100%|██████████| 6/6 [00:15<00:00,  2.59s/it]


ID:064 done.


Batches: 100%|██████████| 5/5 [00:13<00:00,  2.68s/it]


ID:065 done.


Batches: 100%|██████████| 10/10 [00:27<00:00,  2.78s/it]


ID:066 done.


Batches: 100%|██████████| 6/6 [00:11<00:00,  1.88s/it]


ID:067 done.


Batches: 100%|██████████| 8/8 [00:18<00:00,  2.31s/it]


ID:068 done.


Batches: 100%|██████████| 8/8 [00:22<00:00,  2.77s/it]


ID:069 done.


Batches: 100%|██████████| 8/8 [00:23<00:00,  2.95s/it]


ID:070 done.


Batches: 100%|██████████| 9/9 [00:22<00:00,  2.55s/it]


ID:071 done.


Batches: 100%|██████████| 7/7 [00:15<00:00,  2.17s/it]


ID:072 done.


Batches: 100%|██████████| 8/8 [00:22<00:00,  2.85s/it]


ID:073 done.


Batches: 100%|██████████| 6/6 [00:15<00:00,  2.58s/it]


ID:074 done.


Batches: 100%|██████████| 11/11 [00:26<00:00,  2.39s/it]


ID:075 done.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


ID:076 done.


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.37s/it]


ID:077 done.


Batches: 100%|██████████| 8/8 [00:34<00:00,  4.27s/it]


ID:078 done.


Batches: 100%|██████████| 6/6 [00:19<00:00,  3.28s/it]


ID:079 done.


Batches: 100%|██████████| 5/5 [00:10<00:00,  2.18s/it]


ID:080 done.


Batches: 100%|██████████| 9/9 [00:28<00:00,  3.16s/it]


ID:081 done.


Batches: 100%|██████████| 4/4 [00:11<00:00,  2.96s/it]


ID:082 done.


Batches: 100%|██████████| 6/6 [00:16<00:00,  2.74s/it]


ID:083 done.


Batches: 100%|██████████| 9/9 [00:27<00:00,  3.00s/it]


ID:084 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


ID:085 done.


Batches: 100%|██████████| 9/9 [00:16<00:00,  1.86s/it]


ID:086 done.


Batches: 100%|██████████| 6/6 [00:10<00:00,  1.78s/it]


ID:087 done.


Batches: 100%|██████████| 4/4 [00:07<00:00,  1.95s/it]


ID:088 done.


Batches: 100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


ID:089 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


ID:090 done.


Batches: 100%|██████████| 7/7 [00:13<00:00,  1.94s/it]


ID:091 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it]


ID:092 done.


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]


ID:093 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]


ID:094 done.


Batches: 100%|██████████| 6/6 [00:14<00:00,  2.39s/it]


ID:095 done.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.76s/it]


ID:096 done.


Batches: 100%|██████████| 9/9 [00:22<00:00,  2.47s/it]


ID:097 done.


Batches: 100%|██████████| 9/9 [00:19<00:00,  2.19s/it]


ID:098 done.


Batches: 100%|██████████| 3/3 [00:04<00:00,  1.52s/it]


ID:099 done.


Batches: 100%|██████████| 9/9 [00:23<00:00,  2.57s/it]


ID:100 done.


Batches:  67%|██████▋   | 6/9 [00:15<00:07,  2.52s/it]
Batches:   0%|          | 0/6 [00:00<?, ?it/s]