# Term Frequency - Inverse Document Frequency

In [167]:
import os
from math import log10

import numpy as np
import pandas as pd
import regex
from nltk.corpus import PlaintextCorpusReader, stopwords

## Preprocess

First, let's decide which corpus we'd like to work with (pick the name of the folder!)

In [168]:
corpus_name = 'ChiLit'

### Load the corpus

In [169]:
# Root folder of the chosen corpus: data/<corpus_name>
r = os.path.join('data', corpus_name)

# Check that the corpus directory exists: the assert raises an AssertionError
# (with the offending path in its message) if it does not;
# otherwise, 'All set!' is printed.
assert os.path.isdir(r), f'You did not specify a valid path ({r} does not exist)'
print('All set!')

All set!


In [170]:
corpus = PlaintextCorpusReader(r, r'.*\.txt')
len(corpus.fileids())

71

### Tokenize and clean

In [171]:
stops = stopwords.words('english')
stops[:5]

['i', 'me', 'my', 'myself', 'we']

In [172]:
def clean(tok):
    """Return `tok` with every character that is not a Unicode letter removed.

    Each run of non-letter characters is replaced by the empty string, so a
    token that contains no letter at all comes back as ''.
    """
    non_letter_run = r'[^\p{L}]+'
    return regex.sub(non_letter_run, '', tok)

In [173]:
files = corpus.fileids()

# Membership tests against a set are O(1); `stops` is a list and would be
# scanned linearly for every single token of the corpus.
stop_set = set(stops)

# One string per file: lower-cased, stop-words removed, non-letters stripped,
# tokens joined by single spaces (the format expected by the vectorizers).
docs = []
for f in files:
    lowered = (w.lower() for w in corpus.words(f))
    # The stop-word test is applied to the lower-cased token *before* clean().
    # NOTE(review): a token that only becomes a stop-word after cleaning is
    # therefore kept — confirm this is intended.
    words = [clean(w) for w in lowered if w not in stop_set]
    # clean() reduces letter-free tokens to '', so drop the empties.
    docs.append(' '.join([w for w in words if w != '']))


## Calculate TF-IDF

We will build 2 dataframes:
* `raw_df`: raw frequencies obtained with sklearn's vectorizer
* `idf_df`: the tf-idf normalized dataframe created following Jurafsky and Martin

In the Appendix, we will add a third:
* `dt_df`: tf-idf normalized frequencies using `scikit-learn`


Before we start, we define a couple of functions to get the top-n words and texts from the dataframes

In [218]:
def get_top_from_df(df, text, n=20):
    """Return the `n` highest-valued words of one text.

    `df` is indexed by text label with one column per word; `text` is the
    label of the row to inspect. The row is sorted in descending order and
    cut to its first `n` entries (20 by default).
    """
    row = df.loc[text]
    ranked = row.sort_values(ascending=False)
    return ranked.head(n)

def get_top_text_word(df, word, n=20):
    """Return the `n` texts in which `word` has the highest value.

    `df` is indexed by text label with one column per word; `word` is the
    column to inspect. The column is sorted in descending order and cut to
    its first `n` entries (20 by default).
    """
    column = df[word]
    ranked = column.sort_values(ascending=False)
    return ranked.head(n)

### Document-Term Matrix with raw frequencies

In [184]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer yields plain term counts: no tf-idf weighting and no vector
# normalization (the `norm='l2'` default belongs to TfidfVectorizer, not to this class).
# - lowercase=False / stop_words=None: `docs` was already lower-cased and
#   stop-word filtered in the preprocessing loop
# - binary=False: keep the actual counts instead of 0/1 presence flags
# NOTE: with scikit-learn's default token_pattern, one-character tokens are ignored.
raw_vectorizer = CountVectorizer(analyzer='word',
                                 # max_df / min_df are intentionally left unset here:
                                 # the document-frequency filter is applied by hand
                                 # in the "Filter using DF" section
                                 lowercase=False, 
                                 stop_words=None, 
                                 binary=False,
                                 )

# Learn the vocabulary and build the (sparse) document-term matrix in one step
dtm = raw_vectorizer.fit_transform(docs)

# Convert the DTM to a dense array for easier manipulation
dtm_array = dtm.toarray()

# Vocabulary terms, in the same order as the columns of the matrix
feature_names = raw_vectorizer.get_feature_names_out()

# One row per file, one column per term
raw_df = pd.DataFrame(dtm_array, columns=feature_names, index=files)

raw_df.head()

Unnamed: 0,aaa,aaarh,aah,aall,aamash,aaron,ab,aback,abaft,abana,...,zéphyrine,zōōt,zōōts,æneas,æons,æschylus,æsthetic,æsthetical,éperon,ôsso
alice.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alone.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
amulet.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
beauty.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brass.txt,0,0,0,0,8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [185]:
raw_df['boy']

alice.txt          3
alone.txt         53
amulet.txt        20
beauty.txt        78
brass.txt         10
                ... 
water.txt         36
willows.txt        4
wind.txt         119
winning.txt       17
woodmagic.txt      6
Name: boy, Length: 71, dtype: int64

These are the top-5 documents with the highest number of attestations for the target word

In [219]:
get_top_text_word(raw_df, 'boy', n=5)

eric.txt        288
dominics.txt    286
tombrown.txt    215
vice.txt        209
daisy.txt       207
Name: boy, dtype: int64

And these are the top-20 most frequent words in R.L. Stevenson's *Treasure Island*

In [220]:
get_top_from_df(raw_df, 'treasure.txt', n=20)

said       323
one        278
man        260
captain    234
silver     222
like       207
could      174
doctor     172
would      169
us         166
well       156
upon       152
see        142
time       131
good       129
hand       122
still      120
old        119
ship       116
long       115
Name: treasure.txt, dtype: int64

### DF, IDF, Tf-Idf

We define a function to calculate idf

In [None]:
def get_idf(col):
    """Return the inverse document frequency of one term.

    `col` is a column of the document-term dataframe, i.e. one count per
    document. The result is log10(N / df), where N is the number of documents
    (`len(col)`) and df is the number of documents whose count is non-zero.
    """
    n_docs = len(col)
    # document frequency: how many documents attest the term at least once
    docf = col[col != 0].count()
    return log10(n_docs / docf)

then we create a vector of all idf's

In [None]:
idfs = raw_df.apply(get_idf)

here we scale all term freq's with log10

In [None]:
# Swap every zero count for 1 before taking the logarithm: log10(0) is -inf,
# whereas log10(1) = 0, so absent terms end up with a scaled frequency of 0.
# NOTE(review): a term that occurs exactly once also gets log10(1) = 0, i.e. it
# becomes indistinguishable from an absent term — confirm this is the intended
# tf variant.
norm_raw_df = raw_df.replace(0, 1)
tf_norms = np.log10(norm_raw_df)

`tf_norms` now holds all term freq scaled with log10

In [None]:
tf_norms.head()

Unnamed: 0,aaa,aaarh,aah,aall,aamash,aaron,ab,aback,abaft,abana,...,zéphyrine,zōōt,zōōts,æneas,æons,æschylus,æsthetic,æsthetical,éperon,ôsso
alice.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alone.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amulet.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beauty.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brass.txt,0.0,0.0,0.0,0.0,0.90309,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
raw_df.abandon['brass.txt']

3

Now we multiply the scaled df x the idf vector

In [None]:
idf_df = tf_norms * idfs
idf_df.head()

Unnamed: 0,aaa,aaarh,aah,aall,aamash,aaron,ab,aback,abaft,abana,...,zéphyrine,zōōt,zōōts,æneas,æons,æschylus,æsthetic,æsthetical,éperon,ôsso
alice.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alone.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amulet.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beauty.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brass.txt,0.0,0.0,0.0,0.0,1.671853,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Abandon in `brass.txt` should be...

In [None]:
log10(3) * log10((71 / 20))

0.2625256421919799

In [None]:
idf_df.abandon['brass.txt']

0.2625256421919799

Captain in `treasure.txt` should be...

In [None]:
log10(234) * log10((71 / 47))

0.42446987578531514

In [None]:
idf_df.captain['treasure.txt']

0.42446987578531514

Zoo in `amulet.txt` should be...

In [None]:
# 0.896655
log10(raw_df.zoo['amulet.txt']) * log10((71 / 5))

0.896654615984174

### Inspect the idf list

In [242]:
idfs.sort_values(ascending=True).head(100)

one       0.000000
back      0.000000
head      0.000000
came      0.000000
went      0.000000
            ...   
tell      0.025184
used      0.025184
many      0.025184
night     0.025184
things    0.025184
Length: 100, dtype: float64

Now let us define a function to get the ranking in the idf list of a given word

In [None]:
def get_rank(word, series):
    """Return the rank of `word` within `series`, smallest value first.

    Rank 1 goes to the lowest value. Entries with equal values share one rank
    (pandas' default tie handling for `Series.rank`: the mean of the positions
    they span), which is why the result is a float.
    """
    # rank() depends only on the values and the lookup is by label, so sorting
    # the series beforehand is not needed.
    return series.rank().loc[word]


In [243]:
get_rank('wild', idfs)

440.0

### Filter using DF

We may be interested to filter out words that are:
* too frequent: they occur in all, or nearly all documents ("said", "went", "boy"...)
* too specific: they occur in only one text (mostly proper nouns, like Irene or Curdie in *The Princess and the Goblin*)

As the list of words corresponds to the list of the columns, we can use `pandas`' column filter to save a copy of our dataframes.

First, let's recalculate the document frequency.

In [224]:
# Document frequency of every word: number of texts in which its count is non-zero.
# A single vectorized pass over the frame replaces a Python lambda called once per column.
docfs = (raw_df != 0).sum()

In [253]:
idfs['listen']

0.06592851370830824

In [225]:
docfs.head()

aaa       2
aaarh     1
aah       1
aall      1
aamash    1
dtype: int64

We keep only the words that have:
* document frequency > 1
* document frequency / tot doc < 0.9 (i.e. they are attested in less than 90% of the documents); this means in at most 63 docs

In [234]:
# Keep the words attested in more than one text and in less than 90% of the texts.
# The upper bound is expressed as a share of the corpus instead of a hard-coded
# document count, so it stays correct when `corpus_name` points to another corpus
# (for the 71 texts of ChiLit, "< 0.9" still means "at most 63 texts").
max_doc_share = 0.9
keep = docfs[(docfs > 1) & (docfs / len(raw_df) < max_doc_share)].index

Out of 43,936, we keep only...

In [235]:
len(keep)

25706

We make the dataframe `pruned_raw` and `pruned_idf` to hold the filtered versions

In [236]:
pruned_raw = raw_df[keep]
pruned_raw.shape

(71, 25706)

In [237]:
pruned_raw.head()

Unnamed: 0,aaa,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abase,abasement,...,zone,zones,zoo,zoological,zu,zulu,zululand,zulus,æneas,æsthetic
alice.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alone.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
amulet.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,6,1,0,0,0,0,0,0
beauty.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brass.txt,0,0,0,0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [239]:
pruned_idf = idf_df[keep]
pruned_idf.shape

(71, 25706)

In [240]:
get_top_from_df(pruned_raw, 'treasure.txt')

captain     234
silver      222
doctor      172
ship        116
sea         110
squire      106
sir         102
jim          97
john         84
island       81
mr           64
cap          61
treasure     59
flint        58
rum          57
hawkins      52
ben          50
dr           46
already      45
cut          45
Name: treasure.txt, dtype: int64

## Concordances using `nltk`

In [18]:
from nltk import Text

In [19]:
princess_text = Text(corpus.words('princess.txt'))
treasure_text = Text(corpus.words('treasure.txt'))

In [20]:
princess_text.concordance('cobs', width=100)

Displaying 19 of 19 matches:
n ' t like it ." " Who don ' t like it ?" " The cobs , as we call them ." " Don ' t !" said the nurs
r lack of time to wind it up as he " dodged the cobs ," would be in what seemed the most hopeless en
e goblin miners were about . CHAPTER XIII . THE COBS ' CREATURES ABOUT this time , the gentlemen who
wing mass , which he knew must be a knot of the cobs ' creatures . Before he could recover his feet 
nd still stood thinking . It was clear that the cobs ' creatures had found his axe , had between the
nly surrounded by about half - a - dozen of the cobs , the first I had ever seen , although I had he
 or another moon or anything of that sort . The cobs dropped persecuting me , and looked dazed , and
a white pigeon . But whatever it was , when the cobs caught sight of it coming straight down upon th
it no more . But I had no more trouble with the cobs that night , or at any time afterward ." " How 
 I have nearly discovered in what direction the cobs are minin

In [21]:
treasure_text.concordance('stockade', width=100)

Displaying 25 of 39 matches:
the ship we could see nothing of the house or stockade , for they were quite buried among trees ; an
lutter in the air above a wood . PART 4 . The Stockade CHAPTER 16 . Narrative Continued by the Docto
 pulled straight in , in the direction of the stockade upon the chart . The two who were left guardi
d not gone a hundred yards when I reached the stockade . This was how it was : a spring of clear wat
r out of the ship , but not yet ashore in our stockade . CHAPTER 17 . Narrative Continued by the Doc
any moment . " I cannot keep her head for the stockade , sir ," said I to the captain . I was steeri
not only the danger of being cut off from the stockade in our half - crippled state but the fear bef
he strip of wood that now divided us from the stockade , and at every step we took the voices of the
r we came to the edge of the wood and saw the stockade in front of us . We struck the enclosure abou
 get the poor old gamekeeper hoisted over the stockade and car

## Appendix: Tf-Idf normalization with the `TfidfVectorizer` from `scikit-learn`

The `TfidfVectorizer` from `scikit-learn`, Python's library for data analysis, implements a different calculation of the metric than the one discussed by Jurafsky and Martin. Most notably, it doesn't scale the term frequencies with the log10. As a result, it gives higher relevance to more frequent words.

See:
* the official [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) for the `TfidfVectorizer`
* this [discussion](https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf) in the popular (and very useful) blog *The Programming Historian*

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=.9 / min_df=2 let the vectorizer prune by document frequency itself,
# in the same spirit as the manual "Filter using DF" step;
# norm=None leaves the tf-idf weights un-normalized (no per-document vector scaling).
vectorizer = TfidfVectorizer(max_df=.9, 
                             min_df=2, stop_words=None, use_idf=True, norm=None)
# Learn the vocabulary and idf weights, and build the tf-idf document-term matrix
dt_matrix_idf = vectorizer.fit_transform(docs)

In [None]:
# Vocabulary retained by the TfidfVectorizer, in matrix column order
feature_names = vectorizer.get_feature_names_out()

# Densify straight to a NumPy array, as done for the raw counts; going through
# todense() and nested Python lists only adds intermediate copies.
dt_df = pd.DataFrame(dt_matrix_idf.toarray(), columns=feature_names, index=files)

In [None]:
dt_df.head()

Unnamed: 0,aaa,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abase,abasement,...,zone,zones,zoo,zoological,zu,zulu,zululand,zulus,æneas,æsthetic
alice.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alone.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amulet.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20.90944,2.791759,0.0,0.0,0.0,0.0,0.0,0.0
beauty.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brass.txt,0.0,0.0,0.0,0.0,6.696431,1.980829,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
dt_df.loc['treasure.txt'].sort_values(ascending=False).head(20)

captain      328.878835
jim          298.705830
silver       266.206566
doctor       256.705955
squire       236.607230
hawkins      217.258799
ship         165.476148
rum          146.411107
john         137.682717
ben          135.585838
stockade     129.899483
sea          124.688453
flint        124.187970
island       120.890595
sir          117.252237
morgan       116.711153
anchorage    101.062293
mutineers     96.095238
aboard        94.910087
dr            94.658354
Name: treasure.txt, dtype: float64