In [1]:
import nltk
import eda_utils as eda_utils
import pandas as pd

# Load the four plays from the NLTK Shakespeare XML corpus.
dream = nltk.corpus.shakespeare.xml("dream.xml")
hamlet = nltk.corpus.shakespeare.xml("hamlet.xml")
macbeth = nltk.corpus.shakespeare.xml("macbeth.xml")
randj = nltk.corpus.shakespeare.xml("r_and_j.xml")

# One record per play. The loop below attaches derived entries to each record,
# and later cells pass `works` on to other eda_utils helpers.
works = [
    {"work_xml": dream},
    {"work_xml": hamlet},
    {"work_xml": macbeth},
    {"work_xml": randj},
]

# --- Parse every play once and cache the results on its record ---
for w in works:
    # Receives the whole record rather than just the XML tree; presumably it
    # stores the play title on the record -- confirm in eda_utils.
    eda_utils.extract_title_xml(w)

    main, side = eda_utils.extract_charcs_xml(w["work_xml"], print_charcs=False)
    w["main_charcs"] = main
    w["side_charcs"] = side

    w["parsed_play"] = eda_utils.parse_play_xml(w["work_xml"])
    w["merged"] = eda_utils.merge_play_data(
        w["parsed_play"], w["main_charcs"], w["side_charcs"]
    )

# --- Per-character statistics ---
# Reuse the character lists and merged data cached above instead of
# re-extracting, re-parsing and re-merging each play a second time.
# NOTE(review): assumes the eda_utils helpers are deterministic, so the cached
# values equal what a fresh call would return -- confirm in eda_utils.
# Kept as a separate loop so anything the helpers print stays in the same order.
all_data = []
for w in works:
    stats = eda_utils.summarize_play_stats(
        w["merged"], w["main_charcs"], w["side_charcs"]
    )
    df = stats["character_df"]  # the per-character DataFrame inside the stats dict
    all_data.append(df)

# --- Combine and export ---
combined_df = pd.concat(all_data, ignore_index=True)
combined_df.to_csv("../csv/all_plays_char_stats.csv", index=False)






A Midsummer Night's Dream
Acts: 5 | Scenes: 9 | Total Speeches: 500 | Total Lines: 2159
Main: 21 | Side: 2 | Ratio: 21:2

Act-level speech totals:
  Act 1: 90 speeches
  Act 2: 62 speeches
  Act 3: 180 speeches
  Act 4: 64 speeches
  Act 5: 104 speeches

Top 10 characters by line count:
                     Play Character  Total Speeches  Total Lines  Scenes Appeared  Acts Appeared  Speech Share (%)  Line Share (%)  Avg Speeches/Scene  Avg Lines/Speech  Verbosity  Talkativeness  Dominance  Focus (Lines/Act)  Breadth (Scene Ratio) Role Type
A Midsummer Night's Dream   THESEUS              48          233                3              3               9.6           10.79               16.00              4.85       4.85          16.00      10.79              77.67                   0.33      main
A Midsummer Night's Dream    HELENA              36          229                5              4               7.2           10.61                7.20              6.36       6.36           7.20 

In [5]:
def print_header(header_title: str):
    """Print ``header_title`` framed by two 50-character ``=`` rules.

    Args:
        header_title: Text shown on the line between the two rules.

    Returns:
        None. The three lines are written to standard output.
    """
    rule = "=" * 50
    # A single print with a newline separator emits rule / title / rule.
    print(rule, f"{header_title}", rule, sep="\n")

In [6]:
import eda_utils as eda_utils

# Run each eda_utils export stage over all plays, announcing it with a header.
for section_title, run_stage in (
    ("Character Networks", eda_utils.build_networks_for_all),
    ("Character Speeches", eda_utils.extract_all_speeches),
    ("Story Stats", eda_utils.create_story_stats),
):
    print_header(section_title)
    run_stage(works)


Character Networks
Saved cleaned network for A Midsummer Night's Dream: ../csv/a_midsummer_night's_dream_network.csv
Saved cleaned network for The Tragedy of Hamlet, Prince of Denmark: ../csv/the_tragedy_of_hamlet,_prince_of_denmark_network.csv
Saved cleaned network for The Tragedy of Macbeth: ../csv/the_tragedy_of_macbeth_network.csv
Saved cleaned network for The Tragedy of Romeo and Juliet: ../csv/the_tragedy_of_romeo_and_juliet_network.csv
Character Speeches
Extracting speeches for A Midsummer Night's Dream...
Saved ../csv/a_midsummer_nights_dream_speeches.csv (498 speeches)
Extracting speeches for The Tragedy of Hamlet, Prince of Denmark...
Saved ../csv/the_tragedy_of_hamlet,_prince_of_denmark_speeches.csv (1126 speeches)
Extracting speeches for The Tragedy of Macbeth...
Saved ../csv/the_tragedy_of_macbeth_speeches.csv (648 speeches)
Extracting speeches for The Tragedy of Romeo and Juliet...
Saved ../csv/the_tragedy_of_romeo_and_juliet_speeches.csv (834 speeches)
Story Stats
Saved 

Unnamed: 0,Play,Acts,Scenes,Speeches,Dialogue Lines,Main Characters,Side Characters,Total Characters,Avg Lines/Scene,Avg Speeches/Scene,Avg Lines/Speech
0,A Midsummer Night's Dream,5,9,498,2149,21,2,23,238.78,55.33,4.32
1,"The Tragedy of Hamlet, Prince of Denmark",5,20,1116,3978,21,5,26,198.9,55.8,3.56
2,The Tragedy of Macbeth,5,28,647,2373,21,5,26,84.75,23.11,3.67
3,The Tragedy of Romeo and Juliet,5,24,836,3080,18,7,25,128.33,34.83,3.68
