In [1]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import re

nltk.download('punkt')
nltk.download('stopwords')

# Define characters and their associated genders
genders = {
    'ALGERNON': 'male',
    'JACK': 'male',
    'GWENDOLEN': 'female',
    'CECILY': 'female',
    'LADY BRACKNELL': 'female',
    'MISS PRISM': 'female',
    'LANE': 'male',
    'MERRIMAN': 'male'
}

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Reading the play text
file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to parse the play into dialogues by characters, tracking punctuation
def parse_play(text):
    pattern = r'^([A-Z ]+):\s*(.*)$'
    character_dialogue = defaultdict(list)
    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            character, dialogue = match.groups()
            character = character.strip()
            if character in genders:
                character_dialogue[character].append(dialogue.strip())
    return character_dialogue

dialogues = parse_play(text)

# Initialize counters
gender_punctuation_usage = defaultdict(Counter)

# Punctuation of interest
punctuations = ['?', '.'] 

# Count punctuation by gender
for character, speeches in dialogues.items():
    gender = genders[character]
    for speech in speeches:
        # Using a simplistic method to count punctuations
        punctuation_counts = {punct: speech.count(punct) for punct in punctuations}
        gender_punctuation_usage[gender] += Counter(punctuation_counts)

print("Punctuation usage by gender:")
for gender, counts in gender_punctuation_usage.items():
    print(f" - {gender.capitalize()}: {dict(counts)}")


Punctuation usage by gender:
 - Male: {'.': 183, '?': 29}
 - Female: {'.': 266, '?': 32}


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
