# TAHLR Supplement: Conditional Frequency Distribution tutorial

This TAHLR supplement shows how to build a conditional frequency distribution on your own set of files, using the first three books of Homer's *Odyssey* (as found in the `data/texts/lyoc` folder from Week 2). The specific example shows which is more frequent—καί or καὶ, i.e. the word for 'and' with either an acute or grave accent—in each of the first three books.

In [1]:
# Imports

import nltk
from nltk import word_tokenize

In [2]:
# Get list of files

from glob import glob

# glob() returns paths in arbitrary, OS-dependent order. The cells below rely
# on position (tokenized_texts[0], [1], [2]) to attach the labels 'odyssey-1',
# 'odyssey-2' and 'odyssey-3', so sort the paths to make the order stable.
# NOTE: assumes the file names sort in book order -- confirm against the folder.
files = sorted(glob('../data/texts/lyoc/*.txt'))

print(len(files))

3


In [3]:
# Read every file into memory; texts keeps the same order as files

texts = []

for path in files:
    with open(path, encoding='utf-8') as handle:
        texts.append(handle.read())

print(len(texts))

3


In [4]:
# Tokenize each text (using list comprehension); the result holds one list of
# tokens per text, in the same order as texts

tokenized_texts = [word_tokenize(book_text) for book_text in texts]
print(len(tokenized_texts))

3


In [5]:
# # Make list of tokenized texts; nb: same as above without list comprehension

# tokenized_texts = []

# for text in texts:
#     tokenized_texts.append(word_tokenize(text))

# print(len(tokenized_texts))

In [6]:
# Make the labels; i.e. these will be the 'conditions' in the cfd

label_1, label_2, label_3 = 'odyssey-1', 'odyssey-2', 'odyssey-3'

# Repeat each label once per token, so that every list of labels is exactly
# as long as the tokenized text it belongs to
labels_1 = [label_1 for _ in tokenized_texts[0]]
labels_2 = [label_2 for _ in tokenized_texts[1]]
labels_3 = [label_3 for _ in tokenized_texts[2]]

# The two printed lengths should match
print(len(tokenized_texts[0]))
print(len(labels_1))

4102
4102


In [7]:
# Pair every token with the label of its text, giving (label, token) tuples

labelled_tokens_1 = [(label, token) for label, token in zip(labels_1, tokenized_texts[0])]
labelled_tokens_2 = [(label, token) for label, token in zip(labels_2, tokenized_texts[1])]
labelled_tokens_3 = [(label, token) for label, token in zip(labels_3, tokenized_texts[2])]

# Peek at the first ten pairs
print(labelled_tokens_1[:10])

[('odyssey-1', 'ἦμος'), ('odyssey-1', 'δ'), ('odyssey-1', "'"), ('odyssey-1', 'ἠριγένεια'), ('odyssey-1', 'φάνη'), ('odyssey-1', 'ῥοδοδάκτυλος'), ('odyssey-1', '̓Ηώς'), ('odyssey-1', ','), ('odyssey-1', 'ὤρνυτ'), ('odyssey-1', "'")]


In [8]:
# Flatten the three per-text lists into one long list of (label, token) tuples

labelled_tokens = [*labelled_tokens_1, *labelled_tokens_2, *labelled_tokens_3]

In [9]:
# Create a conditional frequency distribution from the labelled tokens:
# in each (label, token) tuple the label is the condition and the token is the
# sample, so the cfd holds one frequency distribution of tokens per label

cfd = nltk.ConditionalFreqDist(labelled_tokens)

In [10]:
# Look up the frequency distribution stored under the first label and show
# its ten most common tokens

first_label_counts = cfd['odyssey-1']
first_label_counts.most_common(10)

[("'", 350),
 (',', 299),
 ('δ', 113),
 ('.', 111),
 (':', 106),
 ('καὶ', 83),
 ('δὲ', 36),
 ('τε', 31),
 ('οἱ', 28),
 ('δέ', 25)]

In [11]:
# Tabulate how often each chosen token (i.e. sample) occurs under each label
# (i.e. condition): one row per label, one column per token

conditions = [label_1, label_2, label_3]
samples = ['καί', 'καὶ']

cfd.tabulate(conditions=conditions, samples=samples)

          καί καὶ 
odyssey-1   5  83 
odyssey-2   2  88 
odyssey-3   5  69 
