In [33]:
# Install lxml into the kernel's environment; its `etree` module is used
# later in this notebook to parse the TEI XML files.
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [34]:
# Install nltk into the kernel's environment; its `word_tokenize` function is
# used later in this notebook to split the extracted texts into tokens.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [35]:
from lxml import etree
from pathlib import Path

In [36]:
# Namespace URIs used by the XML files in this notebook.
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

# Prefix -> URI mapping handed to `xpath()` so expressions can use the short
# `tei:` / `xml:` prefixes instead of spelling out the full URIs.
NAMESPACES = dict(tei=TEI_NS, xml=XML_NS)

In [37]:
import nltk

# Download the data files needed for tokenization.
# The punkt tokenizer should be installed already, but we request it anyway:
# when a package is already present, nltk simply reports it as up-to-date.
# The return value of the final call is what the cell displays.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [38]:
# Collect every English Perseus translation under ./xml/tlg0012.
# `Path.glob()` returns a one-shot generator: once a loop has consumed it,
# re-running that loop silently does nothing. Materializing the matches into a
# sorted list makes `files` re-iterable and gives a stable processing order.
files = sorted(Path("./xml/tlg0012").glob("**/*perseus-eng*.xml"))

In [None]:
from collections import Counter

# The document-frequency and TF-IDF cells further down read each text as a
# dictionary with two keys:
#
#   "tokens" - the list of tokens for that text
#   "counts" - a `Counter` built from that list (token -> number of occurrences)
#
# This cell builds that structure for a small, hand-written sample so the
# shape is easy to inspect.
#
# The sample lives in its own dictionary, `updated_tokenized_texts`. It is
# deliberately NOT assigned to `tokenized_texts`: doing so would replace any
# real tokenized data if this cell were re-run after the tokenization cell.

filename = "tlg0012.tlg001.perseus-eng3.txt"
tokens = ["odysseus", "zeus", "odysseus", "athena"]  # Example tokenized words

updated_tokenized_texts = {
    filename: {"tokens": tokens, "counts": Counter(tokens)},
}

# Check that the file is now included
if filename in updated_tokenized_texts:
    print("File successfully included!")



File successfully included!


In [40]:
for file in files:
    # print the name of the file as a sanity check
    print(file)

    # etree.parse() reads the file and turns the raw XML into an object that we can use in Python
    tree = etree.parse(file)

    # xpath() is a method that applies **xpath expressions** to search through the XML.
    # This xpath expression says, "Find any `tei:div` element with a `subtype` of `'card'`.
    # Under that element, get any text." The second argument, `namespaces=`, tells the
    # method to use the supplied namespaces as shortcuts, so we don't have to type out
    # "http://www.tei-c.org/ns/1.0" every time we want an element in the TEI namespace.
    text = tree.xpath("//tei:div[@subtype='card']//text()", namespaces=NAMESPACES)

    # xpath() returns an array of matches, so we initialize an empty array to store the
    # results. We could use a list comprehension, but for now rewriting these
    # lines as a list comprehension is left as an exercise for the reader.
    cleaned_text = []

    # Now we iterate through each string returned by `xpath()`
    for t in text:
        # `strip()` removes leading and trailing whitespace; if all that's left is an empty
        # string, we don't care about it.
        if t.strip() != "":
            cleaned_text.append(t.strip())

    # We make sure that we actually *have* text before writing just the text, without
    # TEI elements, to a separate file. No need to write an empty file, right?
    if len(cleaned_text) > 0:
        # Build the output filename with `pathlib` rather than by splitting the
        # path string on "/", which would not strip the directories on systems
        # whose path separator is a backslash:
        #
        # 1. `file.name` is the last component of the path (no directories)
        # 2. `with_suffix(".txt")` swaps the final extension for ".txt"
        #
        # So something like "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml"
        # is transformed into "tlg0012.tlg001.perseus-eng3.txt", which is
        # written to the current working directory.
        output_name = Path(file.name).with_suffix(".txt")

        # Plain write mode is enough: the file is only written, never read back here.
        with open(output_name, "w") as f:
            # We write the text to the file, `join`-ing each element in
            # `cleaned_text` with a newline ("\n")
            f.write("\n".join(cleaned_text))

In [41]:
# Bring in NLTK's word-level tokenizer
from nltk.tokenize import word_tokenize

# Maps each filename to the list of tokens found in that file.
tokenized_texts = {}

# Path.glob() iterator over the .txt files created in this directory.
# Can you figure out what the new `[1-4]` segment is doing?
text_files = Path(".").glob("tlg0012.tlg00*.perseus-eng[1-4].txt")

# Read and tokenize the text files one at a time, collecting a
# dictionary of lists keyed by filename.
for file in text_files:
    name = str(file)

    # Lowercasing is optional, but it removes some noise for our
    # purposes (e.g. "The" and "the" become the same token).
    with open(file) as handle:
        text = handle.read().lower()

    tokens = word_tokenize(text)

    # Sanity check: report how many tokens we got, using string
    # interpolation to say which text the number belongs to.
    token_total = len(tokens)
    print(f"There are {token_total} tokens in {name}.")

    # File this text's `tokens` list under its filename.
    tokenized_texts[name] = tokens


In [42]:
from collections import Counter
from math import log10

# The tokenization cell stores each text as a plain list of tokens, but the
# code below (and the TF-IDF cell after it) reads `values["tokens"]` and
# `values["counts"]`. Convert any entry that is still a plain list into that
# dictionary form. Entries that are already dictionaries are left alone, so
# re-running this cell is safe.
for filename, values in list(tokenized_texts.items()):
    if not isinstance(values, dict):
        tokenized_texts[filename] = {"tokens": values, "counts": Counter(values)}

# Document frequency (DF): the number of documents that contain the term.
df_achilles = 0
df_odysseus = 0

# Iterate through the dictionary to count DF values
for filename, values in tokenized_texts.items():
    counts = values["counts"]

    if counts.get("odysseus", 0) > 0:
        df_odysseus += 1

    if counts.get("achilles", 0) > 0:
        df_achilles += 1

n_docs = len(tokenized_texts)

# Inverse document frequency: log10(number of documents / DF).
# Prevent division by zero: a term that appears in no document gets an IDF of 0.
idf_achilles = log10(n_docs / df_achilles) if df_achilles > 0 else 0
idf_odysseus = log10(n_docs / df_odysseus) if df_odysseus > 0 else 0

print(idf_achilles)
print(idf_odysseus)


0
0


In [43]:
# Now let's calculate the TF-IDF "score" for each term in each document.

# Once again, iterate through the dictionary.
for filename, values in tokenized_texts.items():
    # Get the total number of terms in each file — we'll
    # use this to calculate the relative frequency as our
    # TF.
    total_terms = len(values['tokens'])

    # Get the TF for each term in this file. A file with no tokens
    # would make us divide by zero, so its TF is reported as 0
    # (the IDF calculation guards against zero in the same way).
    if total_terms > 0:
        tf_achilles = values['counts']['achilles'] / total_terms
        tf_odysseus = values['counts']['odysseus'] / total_terms
    else:
        tf_achilles = 0.0
        tf_odysseus = 0.0

    # TF-IDF is the term frequency multiplied by the inverse document
    # frequency (`idf_achilles` / `idf_odysseus`, computed earlier as
    # log10(n_docs / DF)).
    tf_idf_achilles = tf_achilles * idf_achilles
    tf_idf_odysseus = tf_odysseus * idf_odysseus

    # Now we can report on the statistics for this file
    print(f"""In {filename}:
TF of achilles: {tf_achilles}
TF of odysseus: {tf_odysseus}
TF-IDF of achilles: {tf_idf_achilles}
TF-IDF of odysseus: {tf_idf_odysseus}
""")