# Experimenting with Tf-Idf

In [108]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import PlaintextCorpusReader, stopwords
import regex
import pandas as pd

In [109]:
corpus = PlaintextCorpusReader('../data/ChiLit', r'.*\.txt')
len(corpus.fileids())

71

In [110]:
stops = stopwords.words('english')
stops[:5]

['i', 'me', 'my', 'myself', 'we']

In [111]:
def clean(tok):
    import regex
    return regex.sub(r'[^\p{L}]+', '', tok)

In [112]:
files = [f for f  in corpus.fileids()]

docs = []
for f in files:
    words = [clean(w.lower()) for w in corpus.words(f)]
    docs.append([w for w in words if w != '' and w not in stops])

I want to add The Hobbit to it:

In [113]:
with open('hobbit.txt') as f:
    fname = 'hobbit.txt'
    hbt_txt = f.read()

tks = corpus._word_tokenizer
hbt_toks = tks.tokenize(hbt_txt)

hbt_words = [clean(h.lower()) for h in hbt_toks]
docs.append([w for w in hbt_words if w != '' and w not in stops])

In [114]:
docs[-1][:20]

['unexpected',
 'party',
 'hole',
 'ground',
 'lived',
 'hobbit',
 'nasty',
 'dirty',
 'wet',
 'hole',
 'filled',
 'ends',
 'worms',
 'oozy',
 'smell',
 'yet',
 'dry',
 'bare',
 'sandy',
 'hole']

In [115]:
files.append(fname)
len(files)

72

In [116]:
len(docs)

72

### Raw counts (`CountVectorizer`)

In [117]:
raw_vectorizer = CountVectorizer(analyzer=lambda x: x,
                                 max_df=.90,
                                 min_df=2,
                                #  max_features=10000,
                                 lowercase=False, 
                                 stop_words=None, 
                                 binary=False,
                                 )

dtm = raw_vectorizer.fit_transform(docs)

dtm_array = dtm.toarray()

# Get the feature (word) names
feature_names = raw_vectorizer.get_feature_names_out()

raw_df = pd.DataFrame(dtm_array, columns=feature_names, index=files)

raw_df.head()


Unnamed: 0,aaa,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abase,abasement,...,zones,zoo,zoological,zu,zulu,zululand,zulus,à,æneas,æsthetic
alice.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alone.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
amulet.txt,0,0,0,0,0,0,0,0,0,0,...,0,6,1,0,0,0,0,0,0,0
beauty.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brass.txt,0,0,0,0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [118]:
raw_df.tail()

Unnamed: 0,aaa,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abase,abasement,...,zones,zoo,zoological,zu,zulu,zululand,zulus,à,æneas,æsthetic
willows.txt,0,0,2,0,1,2,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
wind.txt,0,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
winning.txt,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
woodmagic.txt,0,0,1,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hobbit.txt,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculate Tf-Idf

In [119]:
df = raw_df

### Using Jurafski and Martin's formula

#### Term frequency (df)

This is the formula they use:

$
\text{tf}_{t,d} =
\begin{cases}
1 + \log_{10}(\text{count}(t, d)) & \text{if } \text{count}(t, d) > 0 \\
0 & \text{otherwise}
\end{cases}$

In [120]:
import numpy as np

def calculate_tf(count):
    return 1 + np.log10(count) if count > 0 else 0

In [121]:
tf_matrix = df.applymap(calculate_tf)

In [122]:
tf_matrix.head()

Unnamed: 0,aaa,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abase,abasement,...,zones,zoo,zoological,zu,zulu,zululand,zulus,à,æneas,æsthetic
alice.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alone.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amulet.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.778151,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beauty.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brass.txt,0.0,0.0,0.0,0.0,1.477121,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.30103,0.0,0.0


#### Inverse Document frequency (idf)

Here's the formula they use:

$\text{idf}_t = \log_{10} \left( \frac{N}{\text{df}_t} \right)$

In [123]:
idf_df = tf_matrix * idfs
idf_df.tail()

Unnamed: 0,aaa,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abase,abasement,...,zones,zoo,zoological,zu,zulu,zululand,zulus,à,æneas,æsthetic
willows.txt,0.0,0.0,0.917396,0.0,0.550228,0.56762,1.233595,1.00616,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wind.txt,0.0,0.0,0.0,0.0,0.550228,0.436285,0.0,0.0,0.0,0.0,...,0.0,0.0,0.809866,0.0,0.0,0.0,0.0,0.0,0.0,0.0
winning.txt,0.0,0.0,0.0,0.0,0.0,0.436285,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
woodmagic.txt,0.0,0.0,0.70513,0.0,0.715864,0.436285,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hobbit.txt,0.0,0.0,0.70513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


"Abandon" in `brass.txt` should be:

In [124]:
txt = 'brass.txt'
term = 'abandon'

t_tf = 1 + log10(raw_df.loc[txt][term])
t_idf = log10(len(docs) / raw_df[raw_df[term] != 0][term].count())
print(t_tf * t_idf)

0.8217262479370613


"Captain" in *Treasure Island* is:

In [125]:
txt = 'treasure.txt'
term = 'captain'

t_tf = 1 + log10(raw_df.loc[txt][term])
t_idf = log10(len(docs) / raw_df[raw_df[term] != 0][term].count())
print(t_tf * t_idf)

0.5932894623617186


In [126]:
idf_df.loc['hobbit.txt'].sort_values(ascending=False).head(20)

elves          4.116413
trolls         3.883555
goblins        3.672943
wizard         3.217640
goblin         3.209503
hoods          2.685397
carven         2.633791
elf            2.609114
dale           2.607026
ravens         2.535418
beards         2.507485
burglar        2.492381
necromancer    2.483559
ponies         2.451000
splintered     2.304893
baa            2.289875
waterfalls     2.289875
scuttling      2.201450
dragons        2.166269
thrush         2.159349
Name: hobbit.txt, dtype: float64

In [127]:
idf_df.loc['treasure.txt'].sort_values(ascending=False).head(20)

hawkins       4.210425
mutineers     3.661217
joyce         3.532588
buccaneers    3.457705
morgan        3.403904
dooty         3.277095
bristol       3.064943
shipmate      2.950224
gigs          2.949074
barbecue      2.860323
anchorage     2.837393
jim           2.831962
stockade      2.780490
davy          2.756540
anderson      2.685397
israel        2.625781
duff          2.483559
jine          2.483559
scuppers      2.483559
swab          2.483559
Name: treasure.txt, dtype: float64

In [128]:
term = 'goblins'

s = raw_df[term]

print("Term frequency", s.sum())
print("Document frequency",  s[s!= 0].count())
print("Inverse document frequency",  get_idf(s))

Term frequency 269
Document frequency 6
Inverse document frequency 1.0791812460476249


In [131]:
idf_df.loc['treasure.txt']['jim']

2.8319624621321293

In [132]:
for v in raw_vectorizer.vocabulary_:
    if v.startswith('treas'):
        print(v)

treasure
treasures
treasured
treasury
treason
treasurer


In [133]:
idf_df[idf_df['beards'] != 0]['beards'].sort_values(ascending=False)

hobbit.txt        2.507485
amulet.txt        1.702070
mulgars.txt       1.499162
coral.txt         1.152288
jungle.txt        1.152288
quatermain.txt    1.152288
Name: beards, dtype: float64

### Using `scikitlearn`'s Tf-Idf Vectorizer 

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [174]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x,
                            max_df=.90,
                            min_df=3,
                            norm='l2',
                            #  max_features=10000,
                            lowercase=False, 
                            stop_words=None, 
                            binary=False)

In [175]:
vect_matrix = vectorizer.fit_transform(docs)

# Get the feature (word) names
feature_names = vectorizer.get_feature_names_out()

tfidf_df = pd.DataFrame(vect_matrix.toarray(), columns=feature_names, index=files)

tfidf_df.head()

Unnamed: 0,ab,aback,abandon,abandoned,abandoning,abandonment,abasement,abashed,abate,abated,...,zig,zigzag,zone,zones,zoo,zoological,zu,zulu,à,æsthetic
alice.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alone.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amulet.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018632,0.00249,0.0,0.0,0.0,0.0
beauty.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brass.txt,0.0,0.0,0.003209,0.00095,0.0,0.0,0.0,0.001336,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002846,0.0


How is the score of the vectorizer computed? Let's take a word like "mountains"

In [176]:
term = 'mountain'
txt = 'hobbit.txt'

In [177]:
df = raw_df[raw_df[term] != 0][term].count()
tf = raw_df.loc[txt][term]
print(tf, df)

173 43


In [178]:
tfidf_df.loc[txt][term]

0.21970729409810297

### Comparing

We can appreciate the differences by comparing the results of:
- tf-idf, J&M's formula
- tf-idf, scikitlearn vectorizer's implementation
- raw frequency

for any given text

In [179]:
txt = 'hobbit.txt'

In [180]:
vectorizer.vocabulary_

{'alice': 446,
 'adventures': 282,
 'wonderland': 20145,
 'chapter': 2770,
 'rabbit': 14091,
 'hole': 8664,
 'beginning': 1498,
 'sister': 16200,
 'bank': 1279,
 'twice': 18761,
 'peeped': 12741,
 'reading': 14253,
 'pictures': 12989,
 'conversations': 3797,
 'considering': 3649,
 'sleepy': 16310,
 'stupid': 17339,
 'whether': 19904,
 'pleasure': 13183,
 'daisy': 4351,
 'chain': 2726,
 'worth': 20204,
 'picking': 12979,
 'daisies': 4350,
 'pink': 13046,
 'remarkable': 14577,
 'afterwards': 348,
 'occurred': 12077,
 'ought': 12257,
 'wondered': 20140,
 'natural': 11688,
 'actually': 186,
 'watch': 19734,
 'waistcoat': 19633,
 'pocket': 13242,
 'hurried': 8902,
 'flashed': 6932,
 'burning': 2321,
 'curiosity': 4277,
 'field': 6787,
 'fortunately': 7242,
 'pop': 13319,
 'hedge': 8462,
 'straight': 17189,
 'tunnel': 18712,
 'dipped': 4981,
 'stopping': 17158,
 'falling': 6549,
 'deep': 4544,
 'plenty': 13192,
 'happen': 8269,
 'sides': 16094,
 'noticed': 11934,
 'cupboards': 4261,
 'shelve

In [181]:
tfidf_df.loc[txt].sort_values(ascending=False).head(20)

goblins      0.434264
elves        0.307255
mountain     0.219707
dragon       0.188354
wizard       0.174834
goblin       0.172014
ponies       0.131696
lake         0.118012
mountains    0.112937
forest       0.109969
river        0.108892
trolls       0.105335
dale         0.101712
dwarf        0.100597
king         0.095806
valley       0.090629
tunnel       0.085054
eagles       0.082713
spiders      0.078728
burglar      0.078240
Name: hobbit.txt, dtype: float64

In [164]:
idf_df.loc[txt].sort_values(ascending=False).head(20)

elves          4.116413
trolls         3.883555
goblins        3.672943
wizard         3.217640
goblin         3.209503
hoods          2.685397
carven         2.633791
elf            2.609114
dale           2.607026
ravens         2.535418
beards         2.507485
burglar        2.492381
necromancer    2.483559
ponies         2.451000
splintered     2.304893
baa            2.289875
waterfalls     2.289875
scuttling      2.201450
dragons        2.166269
thrush         2.159349
Name: hobbit.txt, dtype: float64

In [165]:
raw_df.loc[txt].sort_values(ascending=False).head(20)

mountain     173
goblins      154
dragon       112
river        105
elves         99
forest        96
king          95
mountains     93
lake          82
gold          73
ring          72
balin         69
path          68
mr            68
valley        64
stone         64
town          63
wizard        62
goblin        61
gate          59
Name: hobbit.txt, dtype: int64