# TAHLR Supplement: Document Term Matrices 

This TAHLR supplement shows how to build a document-term matrix (DTM) from your own set of files, using the first three books of Homer's *Odyssey* (as found in the `data/texts/lyoc` folder from Week 2) and the `CountVectorizer` and `TfidfTransformer` classes from scikit-learn.

In [None]:
# Imports

import nltk
from nltk import word_tokenize

In [None]:
# Collect the paths of the corpus files in a stable, alphabetical order

from glob import glob

files = glob('../data/texts/lyoc/*.txt')
files.sort()

print(len(files))

In [None]:
# Read every file into memory: one string per document, in the same order as `files`

texts = []

for file in files:
    with open(file, encoding='utf-8') as handle:
        texts.append(handle.read())

print(len(texts))

In [None]:
# Sanity check: each text was read from a file opened in text mode, so this should display `str`
type(texts[0])

In [None]:
## NB: Need to install scikit-learn first
# pip install -U scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Vectorizer settings, gathered here so they can be changed in one place.
MIN_DF = 2         # keep terms found in at least 2 documents
MAX_DF = 2         # ...and in at most 2 documents
LOWERCASE = True   # lowercase the text before tokenizing
STOPWORDS = None   # no stop-word list; set to a list of words to filter them out
TOKEN_PATTERN = r'\b[^\W\d_]+\b'   # runs of word characters, excluding digits and underscores

# cf. https://stackoverflow.com/a/35615151
# min_df is used for removing terms that appear too infrequently. For example:
#     min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
#     min_df = 5 means "ignore terms that appear in less than 5 documents".
# max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:
#     max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
#     max_df = 25 means "ignore terms that appear in more than 25 documents".
#
# NB: Both values here are integers, i.e. absolute document counts. With
# MIN_DF = MAX_DF = 2, only terms that occur in exactly two documents are kept.

# Every option is passed through its constant (including STOPWORDS), so editing
# a constant above is enough to change the vectorizer.
CV = CountVectorizer(
    input='content',
    min_df=MIN_DF,
    max_df=MAX_DF,
    stop_words=STOPWORDS,
    lowercase=LOWERCASE,
    token_pattern=TOKEN_PATTERN,
)

In [None]:
# Display the class of the vectorizer object created above
type(CV)

In [None]:
# Display all parameters of the vectorizer, including defaults that were not set explicitly
CV.get_params()

In [None]:
# Learn the vocabulary from the texts and count the terms in one step;
# the result has one row per text and one column per retained term
dtm = CV.fit_transform(texts)

In [None]:
# Show the container type returned by fit_transform (scikit-learn documents it
# as a sparse matrix, which is why .toarray() is called before building a DataFrame)
print(type(dtm))

In [None]:
# Terms retained by the vectorizer, in the same order as the columns of the DTM
vocab = CV.get_feature_names_out()

In [None]:
# Preview the first ten terms of the vocabulary
print(vocab[:10])

In [None]:
def filename2label(filename: str) -> str:
    """Return a short document label derived from a file path.

    The label is the base name of the file up to (not including) its first
    dot, e.g. '../data/texts/lyoc/homer-odyssey-1.txt' -> 'homer-odyssey-1'.

    Both forward slashes and backslashes are treated as directory separators,
    so paths returned by glob on Windows produce the same labels as on
    Linux/macOS.
    """
    # Normalize Windows separators before taking the last path component.
    basename = filename.replace('\\', '/').rsplit('/', 1)[-1]
    # Keep everything before the first dot (drops '.txt' and any further suffix).
    return basename.split('.', 1)[0]

# One label per file, in the same order as `files`
labels = list(map(filename2label, files))
print(labels)

In [None]:
## NB: Need to install pandas first
# pip install -U pandas

In [None]:
import pandas as pd

In [None]:
# Densify the DTM and wrap it in a labelled table: rows are documents, columns are terms
df = pd.DataFrame(
    data=dtm.toarray(),
    index=labels,
    columns=vocab,
)

In [None]:
# Display the document-term table (a notebook renders the last expression of a cell)
df

In [None]:
# Select a single column: the count of this term in each document.
# The term must be in the vocabulary (i.e. have passed the min_df/max_df filter),
# otherwise pandas raises a KeyError.
df['ῥοδοδάκτυλος']

In [None]:
# Select a single row by its label: the counts of every term in this document
df.loc['homer-odyssey-1']

In [None]:
# The ten terms with the highest counts in this document
df.loc['homer-odyssey-1'].sort_values(ascending=False).head(10)

In [None]:
# Convert the row for this document into a plain {term: count} dictionary
od1_dict = df.loc['homer-odyssey-1'].to_dict()

In [None]:
# Preview the first ten (term, count) pairs of the dictionary
print(dict(list(od1_dict.items())[:10]))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the idf weights on the raw counts and reweight those same counts in a single call
TT = TfidfTransformer(use_idf=True)
tfidf_dtm = TT.fit_transform(dtm)

In [None]:
# Same labelled layout as the count table, but holding tf-idf weights
tfidf_df = pd.DataFrame(
    data=tfidf_dtm.toarray(),
    index=labels,
    columns=CV.get_feature_names_out(),
)

In [None]:
# The ten highest-weighted terms in the first row (the first file in sorted order)
tfidf_df.iloc[0].sort_values(ascending=False).head(10)

In [None]:
# The ten highest-weighted terms in the second row (the second file in sorted order)
tfidf_df.iloc[1].sort_values(ascending=False).head(10)

In [None]:
# The ten highest-weighted terms in the third row (the third file in sorted order)
tfidf_df.iloc[2].sort_values(ascending=False).head(10)