In [9]:
import json
import pathlib
import pandas as pd
import matplotlib.pyplot as plt

BASE_DIR = Path.cwd().parent
PROCESSED_DIR = BASE_DIR / "data" / "processed"

FILE = PROCESSED_DIR / "crime_and_punishment_clean.json"

with open(FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

data.keys(), type(data['chapters'])

(dict_keys(['book', 'chapters']), list)

In [10]:
rows = []

for ch_idx, chapter in enumerate(data["chapters"], start = 1):
    for seg_idx, seg_text in enumerate(chapter):
        rows.append({
            "chapter": ch_idx,
            "segment": seg_idx,
            "text": seg_text
        })

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,chapter,segment,text
0,1,0,CHAPTER I
1,1,1,On an exceptionally hot evening early in July ...
2,2,0,CHAPTER II
3,2,1,"Raskolnikov was not used to crowds, and, as we..."
4,3,0,CHAPTER III


In [15]:
import re
sent_rows = []

for _, row in df.iterrows():
    sentences = re.split(r"(?<=[.!?])\s+", row["text"].strip())
    for i, sent in enumerate(sentences):
        sent = sent.strip()
        if not sent:
            continue
        sent_rows.append({
            "chapter": row["chapter"],
            "segment": row["segment"],
            "sent_in_segment": i,
            "sentence": sent
        })

df_sent = pd.DataFrame(sent_rows)
df_sent.head()

Unnamed: 0,chapter,segment,sent_in_segment,sentence
0,1,0,0,CHAPTER I
1,1,1,0,On an exceptionally hot evening early in July ...
2,1,1,1,"Place and walked slowly, as though\nin hesitat..."
3,1,1,2,bridge.
4,1,1,3,He had successfully avoided meeting his landla...


In [None]:
characters = {
    "Raskolnikov": [
        "Raskolnikov", "Rodion", "Rodion Romanovich"
    ],
    "Sonya": [
        "Sonya", "Sonia", "Sofya Semyonovna"
    ],
    "Porfiry": [
        "Porfiry", "Porfiry Petrovich"
    ],
    "Dunya": [
        "Dunya", "Avdotya Romanovna"
    ],
    "Svidrigailov": [
        "SvidrigaÃ¯lov", "Arkady Ivanovich"
    ],
    "Luzhin": [
        "Luzhin", "Pyotr Petrovich"
    ],
    "Marmeladov": [
        "Marmeladov", "Semyon Zakharovich"
    ]
}


In [23]:
def count_mentions(sentence, aliases):
    s = sentence.lower()
    return sum(s.count(alias.lower()) for alias in aliases)

for char, aliases in characters.items():
    df_sent[char] = df_sent['sentence'].apply(lambda s: count_mentions(s, aliases))



In [24]:
mention_counts = df_sent[list(characters.keys())].sum().sort_values(ascending=False)
mention_counts

Raskolnikov     883
Sonya           464
Porfiry         206
Luzhin          113
Dunya           102
Marmeladov       42
Svidrigailov      0
dtype: int64