# Q&A Part II

### Q: How can I track gender-based discourse? For example, a comparison of how often a female vs. male character speaks.


A: To do this we need to: 
- Collect all words spoken by male and female characters separately
- Calculate Word Frequencies by Gender
- Identify Distinctive Words

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import re

nltk.download('punkt')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Define characters and their associated genders
genders = {
    'ALGERNON': 'male',
    'JACK': 'male',
    'GWENDOLEN': 'female',
    'CECILY': 'female',
    'LADY BRACKNELL': 'female',
    'MISS PRISM': 'female',
    'LANE': 'male',
    'MERRIMAN': 'male'
}

# Reading the play text
file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to parse the play into dialogues by characters
def parse_play(text):
    pattern = r'^([A-Z ]+):\s*(.*)$'
    character_dialogue = defaultdict(list)
    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            character, dialogue = match.groups()
            character = character.strip()
            if character in genders:
                character_dialogue[character].append(dialogue.strip())
    return character_dialogue

dialogues = parse_play(text)

# Initialize counters
gendered_words = defaultdict(list)
word_frequencies_by_gender = defaultdict(Counter)
gender_speech_count = defaultdict(int)

# Aggregate words by gender, count frequencies, and measure speech volume
for character, speeches in dialogues.items():
    gender = genders[character]
    for speech in speeches:
        words = [word.lower() for word in word_tokenize(speech) if word.lower() not in stop_words]
        gendered_words[gender].extend(words)
        gender_speech_count[gender] += len(words)

for gender, words in gendered_words.items():
    word_frequencies_by_gender[gender] = Counter(words)

# Function to find distinctive words
def find_distinctive_words(male_freq, female_freq, threshold=1.5):
    distinctive_male_words = {word: count for word, count in male_freq.items()
                              if count > threshold * female_freq.get(word, 0) and count > 5}
    distinctive_female_words = {word: count for word, count in female_freq.items()
                                if count > threshold * male_freq.get(word, 0) and count > 5}
    return distinctive_male_words, distinctive_female_words

distinctive_male_words, distinctive_female_words = find_distinctive_words(
    word_frequencies_by_gender['male'], word_frequencies_by_gender['female'])

print("Speech counts by gender:")
for gender, count in gender_speech_count.items():
    print(f" - {gender.capitalize()}: {count} words")

print("\nDistinctive Male Words:", distinctive_male_words)
print("Distinctive Female Words:", distinctive_female_words)


Speech counts by gender:
 - Male: 1168 words
 - Female: 1758 words

Distinctive Male Words: {'...': 8, 'well': 9, 'christened': 12, '--': 7, 'love': 6, 'miss': 14, 'fairfax': 7, 'muffins': 13, 'eat': 7, 'ever': 6}
Distinctive Female Words: {'.': 234, 'little': 7, 'man': 6, 'would': 11, "'s": 10, 'ernest': 20, 'mr.': 20, 'worthing': 19, 'think': 12, 'quite': 15, 'course': 7, 'always': 8, 'engaged': 7, 'seems': 7, 'oh': 10, 'fact': 6, 'us': 6, 'diary': 6, 'pray': 6, 'could': 6, 'much': 7, 'dear': 6}


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Q: Can I specifically segment punctuation (question marks, exclamation points, dashes, and ellipses) by gendered speaker to understand differences in gender?

A: To do this we adjust the script to track punctuation marks within each character's dialogue. We can focus on specific punctuation marks: question marks (?), exclamation points (!), dashes (- or —), and ellipses (...)

In [9]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import re

nltk.download('punkt')
nltk.download('stopwords')

# Define characters and their associated genders
genders = {
    'ALGERNON': 'male',
    'JACK': 'male',
    'GWENDOLEN': 'female',
    'CECILY': 'female',
    'LADY BRACKNELL': 'female',
    'MISS PRISM': 'female',
    'LANE': 'male',
    'MERRIMAN': 'male'
}

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Reading the play text
file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to parse the play into dialogues by characters, tracking punctuation
def parse_play(text):
    pattern = r'^([A-Z ]+):\s*(.*)$'
    character_dialogue = defaultdict(list)
    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            character, dialogue = match.groups()
            character = character.strip()
            if character in genders:
                character_dialogue[character].append(dialogue.strip())
    return character_dialogue

dialogues = parse_play(text)

# Initialize counters
gender_punctuation_usage = defaultdict(Counter)

# Punctuation of interest
punctuations = ['?', '!', '-', '...']  # ellipses are tricky; this assumes they're well-formatted

# Count punctuation by gender
for character, speeches in dialogues.items():
    gender = genders[character]
    for speech in speeches:
        # Using a simplistic method to count punctuations
        punctuation_counts = {punct: speech.count(punct) for punct in punctuations}
        gender_punctuation_usage[gender] += Counter(punctuation_counts)

print("Punctuation usage by gender:")
for gender, counts in gender_punctuation_usage.items():
    print(f" - {gender.capitalize()}: {dict(counts)}")


Punctuation usage by gender:
 - Male: {'...': 8, '-': 27, '?': 29, '!': 21}
 - Female: {'?': 32, '!': 24, '...': 3, '-': 14}


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Other Questions

### Q: Is there a way to expand collocation to include words that are frequently used in conjunction, but aren’t exactly next to each other? 


In [27]:
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

# Path to the text file of "The Importance of Being Earnest"
file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'

# Reading the play text
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Initialize tokenizer and stop words
tokenizer = RegexpTokenizer(r'\w+')  # tokenizer to remove punctuation
stop_words = set(stopwords.words('english'))

# Tokenize the text and remove stopwords
tokens = [word.lower() for word in tokenizer.tokenize(text) if word.lower() not in stop_words]

# Define the window size for the collocation analysis
window_size = 6

# Setup Bigram Association Measures & Finder
measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens, window_size=window_size)

# Filter out collocations that occur less than three times
finder.apply_freq_filter(3)

# Get the top 20 collocations based on their frequency
collocations = finder.nbest(measures.raw_freq, 20)

# Print out the collocations found
print(f"Top 20 collocations in 'The Importance of Being Earnest' within window size {window_size}")
for collocation in collocations:
    print(f" - {' '.join(collocation)}")

Top 20 collocations in 'The Importance of Being Earnest' within window size 6
 - lady bracknell
 - miss prism
 - mr worthing
 - jack algernon
 - algernon cecily
 - cecily gwendolen
 - jack gwendolen
 - cecily cecily
 - gwendolen jack
 - algernon jack
 - gwendolen cecily
 - gwendolen gwendolen
 - jack well
 - algernon well
 - cecily jack
 - algernon yes
 - lane sir
 - ernest jack
 - jack lady
 - jack miss


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Character

### Q: Is there a way I can only focus on lines where three specific characters are in direct conversation with one another? How can I exclude the rest of the text?

A: This method involves analyzing word pairs (or tuples) that appear within a specified distance of each other more often than would be expected by chance. This concept is known as "window-based collocation". It can be implemented with NLTK.

In [21]:
import re
from collections import defaultdict, deque

file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'

def filter_dialogues(text, characters):
    # Regex to match character lines
    pattern = r'^([A-Z]{2,}):(.*)$'
    dialogue = defaultdict(list)
    active_dialogue = []
    last_speaker = None

    # Use a deque to check the rolling set of speakers
    speaker_window = deque(maxlen=3)

    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            speaker, speech = match.groups()
            if speaker in characters:
                # Check the continuity of the conversation among the three
                if last_speaker and last_speaker != speaker:
                    speaker_window.append(speaker)
                    # When we have three unique speakers in the window
                    if len(speaker_window) == 3 and len(set(speaker_window)) == 3:
                        # Confirm all targeted characters are part of the conversation
                        if all(char in speaker_window for char in characters):
                            active_dialogue.append((speaker, speech.strip()))
                else:
                    # Reset if the speaker repeats before cycling through the three
                    active_dialogue = [(speaker, speech.strip())]
                last_speaker = speaker
            else:
                # Reset everything if a non-target speaks
                last_speaker = None
                active_dialogue = []
                speaker_window.clear()
        elif active_dialogue:
            # If there's a break in dialogue (e.g., a direction line or empty line), commit the active dialogue
            for spkr, spch in active_dialogue:
                dialogue[spkr].append(spch)
            active_dialogue = []
            speaker_window.clear()

    return dialogue

# Specifying the target characters
target_characters = {'ALGERNON', 'JACK', 'GWENDOLEN'}

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_dialogues = filter_dialogues(text, target_characters)

for character, speeches in filtered_dialogues.items():
    print(f"{character} said:")
    for speech in speeches:
        print(f"  {speech}")
    print("\n")


JACK said:
  Personally, darling, to speak quite candidly, I don't much care about the name of Ernest ...I don't think the name suits me at all.
  [Picking up the muffin-dish.] Oh, that is nonsense; you are always talking nonsense.
  [In a pathetic voice.] Miss Prism, more is restored to you than this hand-bag. I was the baby you placed in it.
  [Embracing her.] Yes ...mother!
  Unmarried! I do not deny that is a serious blow. But after all, who has the right to cast a stone against one who has suffered? Cannot repentance wipe out an act of folly? Why should there be one law for men, and another for women? Mother, I forgive you. [Tries to embrace her again.]
  [After a pause.] Lady Bracknell, I hate to seem inquisitive, but would you kindly inform me who I am?
  Algy's elder brother! Then I have a brother after all. I knew I had a brother! I always said I had a brother! Cecily,--how could you have ever doubted that I had a brother? [Seizes hold of Algernon.] Dr. Chasuble, my unfortunat

### Q: How can I calculate the amount of speech in a play per character (either number of words or percentage of the whole text that a certain character speaks)?  And is there a way to analyze speech spoken by a single character or to compare the speech of specific characters?

A: To do this we need to retrieve all characters and their associated lines, then  count those lines.

In [13]:
import re
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('punkt')
nltk.download('stopwords')

# Initialize tokenizer and stopwords
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'

def parse_play(text):
    # Regex to match lines indicating a speaker, such as 'ALGERNON:'
    pattern = r'^([A-Z]{2,}):(.*)$'
    character_speech = defaultdict(list)
    
    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            character, speech = match.groups()
            character_speech[character].append(speech.strip())
    
    return character_speech

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

character_speech = parse_play(text)

# Calculating the number of words spoken by each character minus stopwords and punctuation
character_word_counts = {
    char: sum(len([word for word in tokenizer.tokenize(speech.lower()) if word not in stop_words])
              for speech in speeches) 
    for char, speeches in character_speech.items()
}

total_words = sum(character_word_counts.values())
character_word_percentage = {char: (count / total_words * 100) for char, count in character_word_counts.items()}

print("Word Counts Per Character:", character_word_counts)
print("Word Percentages Per Character:", character_word_percentage)


Word Counts Per Character: {'JACK': 356, 'GWENDOLEN': 508, 'CECILY': 562, 'ALGERNON': 368, 'MERRIMAN': 35}
Word Percentages Per Character: {'JACK': 19.464188080918536, 'GWENDOLEN': 27.7747402952433, 'CECILY': 30.727173318753415, 'ALGERNON': 20.12028430836523, 'MERRIMAN': 1.913613996719519}


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Analysis of Algernon's speech
algernon_speech = ' '.join(character_speech['ALGERNON'])
algernon_words = [word for word in tokenizer.tokenize(algernon_speech.lower()) if word not in stop_words]
algernon_word_freq = Counter(algernon_words)

algernon_common_words = algernon_word_freq.most_common(10)
print("Most common words used by Algernon:", algernon_common_words)

Most common words used by Algernon: [('cecily', 11), ('muffins', 9), ('name', 7), ('jack', 5), ('like', 5), ('eat', 5), ('one', 5), ('tea', 5), ('love', 4), ('yes', 4)]


In [16]:
# Comparison of Algernon's and Lane's speech
lane_speech = ' '.join(character_speech['LANE'])
lane_words = [word for word in tokenizer.tokenize(lane_speech.lower()) if word not in stop_words]
lane_word_freq = Counter(lane_words)

# Very basic way to count "distinctive" is by checking if a word is used more by one character compared to another
distinctive_algernon = {word for word in algernon_word_freq if algernon_word_freq[word] > lane_word_freq.get(word, 0)}
distinctive_lane = {word for word in lane_word_freq if lane_word_freq[word] > algernon_word_freq.get(word, 0)}

print("Distinctive words used by Algernon:", distinctive_algernon)
print("Distinctive words used by Lane:", distinctive_lane)

Distinctive words used by Algernon: {'thing', 'till', 'dish', 'must', 'passionately', 'away', 'taken', 'practice', 'garden', 'beauty', 'perfectly', 'food', 'dear', 'muffins', 'eating', 'possible', 'settled', 'cake', 'oh', 'christening', 'talk', 'shakes', 'devotedly', 'chasuble', 'deceiving', 'takes', 'butter', 'bad', 'experienced', 'written', 'three', 'hopelessly', 'chair', 'jack', 'best', 'court', 'old', 'lady', 'next', 'though', 'marry', 'probably', 'hat', 'intimately', 'read', 'could', 'church', 'stock', 'moving', 'heartless', 'speaking', 'week', 'round', 'particularly', 'rites', 'simply', 'finished', 'everything', 'possibly', 'back', 'hereditary', 'actually', 'continues', 'broken', 'present', 'keep', 'agitated', 'bankruptcy', 'charming', 'give', 'drink', 'vulgar', 'instance', 'done', 'anybody', 'served', 'kisses', 'go', 'looked', 'angel', 'engaged', 'really', 'world', 'way', 'absurd', 'christened', 'fairfax', 'hospitality', 'miss', 'dared', 'except', 'yet', 'likelihood', 'broke', '

### Q: How can I track the mirroring of language in a text (ex: when characters are in conversation and repeat similar words back to each other vs. when specific discussions have a large variance of words)?

A: For Lexical Similarity, we can use Jaccard Similarity, which measures the similarity between two sets; or cosine Similarity, which uses the count of words (vector space model) to measure the cosine of the angle between two vectors -- this could be useful for longer texts.

For Lexical Diversity (Variance), we can use Type-Token Ratio (TTR): The ratio of unique words (types) to the total number of words (tokens) in the text.

In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, deque
import re

nltk.download('punkt')
nltk.download('stopwords')

# Define characters
characters = ['ALGERNON', 'JACK', 'GWENDOLEN', 'CECILY']

# Reading the play text
file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to parse the play into dialogues by characters
def parse_dialogues(text):
    pattern = r'^([A-Z]{2,}):(.*)$'
    dialogues = defaultdict(deque)  # Using deque for efficient pop and append operations
    current_speaker = None
    current_dialogue = []

    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            speaker, words = match.groups()
            if speaker in characters:
                if speaker != current_speaker and current_speaker is not None:
                    dialogues[current_speaker].append(' '.join(current_dialogue))
                    current_dialogue = []
                current_speaker = speaker
            words = word_tokenize(words)
            current_dialogue.extend(words)
        elif current_speaker:
            dialogues[current_speaker].append(' '.join(current_dialogue))
            current_dialogue = []
            current_speaker = None

    if current_dialogue:
        dialogues[current_speaker].append(' '.join(current_dialogue))

    return dialogues

dialogues = parse_dialogues(text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def jaccard_similarity(doc1, doc2):
    words_doc1 = set(doc1)
    words_doc2 = set(doc2)
    intersection = words_doc1.intersection(words_doc2)
    union = words_doc1.union(words_doc2)
    return float(len(intersection)) / len(union) if union else 0.0

def type_token_ratio(text):
    words = word_tokenize(text)
    return len(set(words)) / len(words) if words else 0

# Calculate similarity and diversity for each pair of consecutive dialogues
for character, speeches in dialogues.items():
    print(f"\nCharacter: {character}")
    for i in range(len(speeches) - 1):
        sim = jaccard_similarity(speeches[i], speeches[i+1])
        ttr = type_token_ratio(speeches[i])
        print(f"Dialogue {i+1} & {i+2} Jaccard Similarity: {sim:.2f}")
        print(f"Dialogue {i+1} Type-Token Ratio: {ttr:.2f}")



Character: JACK
Dialogue 1 & 2 Jaccard Similarity: 0.69
Dialogue 1 Type-Token Ratio: 0.77
Dialogue 2 & 3 Jaccard Similarity: 0.72
Dialogue 2 Type-Token Ratio: 0.75
Dialogue 3 & 4 Jaccard Similarity: 0.53
Dialogue 3 Type-Token Ratio: 0.77
Dialogue 4 & 5 Jaccard Similarity: 0.48
Dialogue 4 Type-Token Ratio: 0.74
Dialogue 5 & 6 Jaccard Similarity: 0.29
Dialogue 5 Type-Token Ratio: 1.00
Dialogue 6 & 7 Jaccard Similarity: 0.24
Dialogue 6 Type-Token Ratio: 1.00
Dialogue 7 & 8 Jaccard Similarity: 0.44
Dialogue 7 Type-Token Ratio: 0.91
Dialogue 8 & 9 Jaccard Similarity: 0.56
Dialogue 8 Type-Token Ratio: 1.00
Dialogue 9 & 10 Jaccard Similarity: 0.50
Dialogue 9 Type-Token Ratio: 0.94
Dialogue 10 & 11 Jaccard Similarity: 0.56
Dialogue 10 Type-Token Ratio: 1.00
Dialogue 11 & 12 Jaccard Similarity: 0.54
Dialogue 11 Type-Token Ratio: 0.81
Dialogue 12 & 13 Jaccard Similarity: 0.58
Dialogue 12 Type-Token Ratio: 1.00
Dialogue 13 & 14 Jaccard Similarity: 0.61
Dialogue 13 Type-Token Ratio: 1.00
Dialogue

In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
import re

nltk.download('punkt')
nltk.download('stopwords')

# Define characters
characters = ['ALGERNON', 'JACK', 'GWENDOLEN', 'CECILY', 'LADY BRACKNELL', 'MISS PRISM', 'LANE', 'MERRIMAN']

# Reading the play text
file_path = '../data/1895_wilde-oscar_the-importance-of-being-earnest.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to parse the play into dialogues by characters
def parse_dialogues(text):
    pattern = r'^([A-Z ]+):\s*(.*)$'
    dialogues = defaultdict(list)  # Stores dialogues along with subsequent speaker
    last_speaker = None

    for line in text.split('\n'):
        match = re.match(pattern, line.strip())
        if match:
            speaker, words = match.groups()
            speaker = speaker.strip()
            if speaker in characters:
                if last_speaker and last_speaker != speaker:
                    dialogues[(last_speaker, speaker)].append(words)
                last_speaker = speaker
            else:
                last_speaker = None  # Reset on non-character lines or scene changes
        else:
            last_speaker = None  # Ensures dialogues are between character changes

    return dialogues

dialogues = parse_dialogues(text)

# Functions to calculate Jaccard similarity and Type-Token Ratio
def jaccard_similarity(doc1, doc2):
    words_doc1 = set(word_tokenize(doc1.lower()))
    words_doc2 = set(word_tokenize(doc2.lower()))
    intersection = words_doc1.intersection(words_doc2)
    union = words_doc1.union(words_doc2)
    return float(len(intersection)) / len(union) if union else 0.0

def type_token_ratio(text):
    words = word_tokenize(text.lower())
    unique_words = set(words)
    return len(unique_words) / len(words) if words else 0

# Analyzing dialogues for similarity and diversity
for pair, convo in dialogues.items():
    if len(convo) > 1:  # Ensures there are at least two dialogues to compare
        print(f"\nDialogue between {pair[0]} and {pair[1]}:")
        for i in range(len(convo) - 1):
            sim = jaccard_similarity(convo[i], convo[i+1])
            ttr1 = type_token_ratio(convo[i])
            ttr2 = type_token_ratio(convo[i+1])
            print(f" - {pair[0]} to {pair[1]} similarity: {sim:.3f}")
            print(f" - {pair[0]} TTR: {ttr1:.3f}, {pair[1]} TTR: {ttr2:.3f}")



Dialogue between CECILY and ALGERNON:
 - CECILY to ALGERNON similarity: 0.042
 - CECILY TTR: 0.895, ALGERNON TTR: 0.800
 - CECILY to ALGERNON similarity: 0.091
 - CECILY TTR: 0.800, ALGERNON TTR: 0.824
 - CECILY to ALGERNON similarity: 0.034
 - CECILY TTR: 0.824, ALGERNON TTR: 1.000
 - CECILY to ALGERNON similarity: 0.040
 - CECILY TTR: 1.000, ALGERNON TTR: 0.649
 - CECILY to ALGERNON similarity: 0.111
 - CECILY TTR: 0.649, ALGERNON TTR: 1.000
 - CECILY to ALGERNON similarity: 0.083
 - CECILY TTR: 1.000, ALGERNON TTR: 1.000
 - CECILY to ALGERNON similarity: 0.062
 - CECILY TTR: 1.000, ALGERNON TTR: 1.000
 - CECILY to ALGERNON similarity: 0.045
 - CECILY TTR: 1.000, ALGERNON TTR: 0.867
 - CECILY to ALGERNON similarity: 0.120
 - CECILY TTR: 0.867, ALGERNON TTR: 0.833

Dialogue between ALGERNON and CECILY:
 - ALGERNON to CECILY similarity: 0.165
 - ALGERNON TTR: 0.733, CECILY TTR: 0.744
 - ALGERNON to CECILY similarity: 0.152
 - ALGERNON TTR: 0.744, CECILY TTR: 0.727
 - ALGERNON to CECIL

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomvannuenen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
