# How to calculate TF-IDF

TF-IDF is a method for generating features from textual documents. It's result of multiplying two methods:
* Term Frequency: $TF_{i,j} = \frac{n_{i,j}}{\sum_k n_{i,j}}$
* Inverse Document Frequency: $IDF(w)=log(\frac{N}{df_t})$

In [2]:
import pandas as pd
import numpy as np
import collections
import math
from numpy import linalg as LA
import re

In [None]:
doc1 = "The car is driven on the road"
doc2 = "The truck is driven on the highway"

In [3]:
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'

In [4]:
l_doc1 = re.sub(r"[^a-zA-Z0-9]", " ", doc1.lower()).split()
l_doc2 = re.sub(r"[^a-zA-Z0-9]", " ", doc2.lower()).split()
l_doc3 = re.sub(r"[^a-zA-Z0-9]", " ", doc3.lower()).split()

In [5]:
print(len(l_doc1))
print(len(l_doc2))
print(len(l_doc3))

8
8
6


In [6]:
wordset12 = np.union1d(l_doc1,l_doc2)
wordset =  np.union1d(wordset12,l_doc3)
print(wordset)

['amazing' 'an' 'best' 'game' 'great' 'is' 'of' 'series' 'so' 'the'
 'thrones' 'tv']


In [7]:
len(wordset)

12

## 1. TF

In [8]:
def calculateTF(wordset,bow):
    termfreq_diz = dict.fromkeys(wordset,0)
    counter1 =  dict(collections.Counter(bow))
    for w in bow:
       termfreq_diz[w]=counter1[w]/len(bow)
    return termfreq_diz   

In [9]:
termfreq1_diz = calculateTF(wordset,l_doc1)
print(termfreq1_diz)
termfreq2_diz = calculateTF(wordset,l_doc2)
print(termfreq2_diz)
termfreq3_diz = calculateTF(wordset,l_doc3)
print(termfreq3_diz)

{'amazing': 0.125, 'an': 0.125, 'best': 0, 'game': 0.125, 'great': 0, 'is': 0.125, 'of': 0.125, 'series': 0.125, 'so': 0, 'the': 0, 'thrones': 0.125, 'tv': 0.125}
{'amazing': 0, 'an': 0, 'best': 0.125, 'game': 0.125, 'great': 0, 'is': 0.125, 'of': 0.125, 'series': 0.125, 'so': 0, 'the': 0.125, 'thrones': 0.125, 'tv': 0.125}
{'amazing': 0, 'an': 0, 'best': 0, 'game': 0.16666666666666666, 'great': 0.16666666666666666, 'is': 0.16666666666666666, 'of': 0.16666666666666666, 'series': 0, 'so': 0.16666666666666666, 'the': 0, 'thrones': 0.16666666666666666, 'tv': 0}


In [10]:
df = pd.DataFrame([termfreq1_diz,termfreq2_diz,termfreq3_diz])
df.head()

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,0.125,0.125,0.0,0.125,0.0,0.125,0.125,0.125,0.0,0.0,0.125,0.125
1,0.0,0.0,0.125,0.125,0.0,0.125,0.125,0.125,0.0,0.125,0.125,0.125
2,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.166667,0.0,0.166667,0.0,0.166667,0.0


## 2. IDF

### classical approach

In [11]:
def calculate_IDF(wordset,bow):
    d_bow = {'bow_{}'.format(i):list(set(b)) for i,b in enumerate(bow)}
    N=len(d_bow.keys())
    l_bow = []
    for b in d_bow.values():
      l_bow+=b
    counter = dict(collections.Counter(l_bow))
    idf_diz=dict.fromkeys(wordset,0)
    for w in wordset:
      idf_diz[w]=round(math.log10(N/counter[w]),3)
    return idf_diz

In [12]:
idf_diz = calculate_IDF(wordset,[l_doc1,l_doc2,l_doc3])
print(idf_diz)

{'amazing': 0.477, 'an': 0.477, 'best': 0.477, 'game': 0.0, 'great': 0.477, 'is': 0.0, 'of': 0.0, 'series': 0.176, 'so': 0.477, 'the': 0.477, 'thrones': 0.0, 'tv': 0.176}


### sklearn approach

In [13]:
def calculate_IDF(wordset,bow):
    d_bow = {'bow_{}'.format(i):list(set(b)) for i,b in enumerate(bow)}
    N=len(d_bow.keys())
    l_bow = []
    for b in d_bow.values():
      l_bow+=b
    counter = dict(collections.Counter(l_bow))
    idf_diz=dict.fromkeys(wordset,0)
    for w in wordset:
      idf_diz[w] = np.log((1+N)/(1+counter[w]))+1
    return idf_diz

In [82]:
idf_diz = calculate_IDF(wordset,[l_doc1,l_doc2,l_doc3])
print(idf_diz)

{'amazing': 1.6931471805599454, 'an': 1.6931471805599454, 'best': 1.6931471805599454, 'game': 1.0, 'great': 1.6931471805599454, 'is': 1.0, 'of': 1.0, 'series': 1.2876820724517808, 'so': 1.6931471805599454, 'the': 1.6931471805599454, 'thrones': 1.0, 'tv': 1.2876820724517808}


In [14]:
df_idf = pd.DataFrame([idf_diz])
df_idf

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,0.477,0.477,0.477,0.0,0.477,0.0,0.0,0.176,0.477,0.477,0.0,0.176


## 3. TF-IDF

Combining these two we come up with the TF-IDF score (w) for a word in a document in the corpus. It is the product of tf and idf:
$w_{i,j}=TF_{i,j}\times IDF(w)$

In [15]:
def calculate_TF_IDF(wordset,tf_diz,idf_diz):
    tf_idf_diz = dict.fromkeys(wordset,0)
    for w in wordset:
       tf_idf_diz[w]=tf_diz[w]*idf_diz[w]
    tdidf_values = list(tf_idf_diz.values())
    l2_norm = LA.norm(tdidf_values)   
    tf_idf_norm = {w:tf_idf_diz[w]/l2_norm for w in wordset}
    return tf_idf_norm

In [16]:
tf_idf_1 = calculate_TF_IDF(wordset,termfreq1_diz,idf_diz)
print(tf_idf_1)
tf_idf_2 = calculate_TF_IDF(wordset,termfreq2_diz,idf_diz)
print(tf_idf_2)
tf_idf_3 = calculate_TF_IDF(wordset,termfreq3_diz,idf_diz)
print(tf_idf_3)

{'amazing': 0.6633899797675505, 'an': 0.6633899797675505, 'best': 0.0, 'game': 0.0, 'great': 0.0, 'is': 0.0, 'of': 0.0, 'series': 0.24477282272345682, 'so': 0.0, 'the': 0.0, 'thrones': 0.0, 'tv': 0.24477282272345682}
{'amazing': 0.0, 'an': 0.0, 'best': 0.6633899797675505, 'game': 0.0, 'great': 0.0, 'is': 0.0, 'of': 0.0, 'series': 0.24477282272345682, 'so': 0.0, 'the': 0.6633899797675505, 'thrones': 0.0, 'tv': 0.24477282272345682}
{'amazing': 0.0, 'an': 0.0, 'best': 0.0, 'game': 0.0, 'great': 0.7071067811865476, 'is': 0.0, 'of': 0.0, 'series': 0.0, 'so': 0.7071067811865476, 'the': 0.0, 'thrones': 0.0, 'tv': 0.0}


In [17]:
df_tfidf = pd.DataFrame([tf_idf_1,tf_idf_2,tf_idf_3])
df_tfidf.head()

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,0.66339,0.66339,0.0,0.0,0.0,0.0,0.0,0.244773,0.0,0.0,0.0,0.244773
1,0.0,0.0,0.66339,0.0,0.0,0.0,0.0,0.244773,0.0,0.66339,0.0,0.244773
2,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0,0.0


In [18]:
termfreq1_diz

{'amazing': 0.125,
 'an': 0.125,
 'best': 0,
 'game': 0.125,
 'great': 0,
 'is': 0.125,
 'of': 0.125,
 'series': 0.125,
 'so': 0,
 'the': 0,
 'thrones': 0.125,
 'tv': 0.125}

In [19]:
idf_diz

{'amazing': 0.477,
 'an': 0.477,
 'best': 0.477,
 'game': 0.0,
 'great': 0.477,
 'is': 0.0,
 'of': 0.0,
 'series': 0.176,
 'so': 0.477,
 'the': 0.477,
 'thrones': 0.0,
 'tv': 0.176}

In [21]:
tf_idf_diz = dict.fromkeys(wordset,0)
for w in wordset:
       tf_idf_diz[w]=termfreq1_diz[w]*idf_diz[w]
tdidf_values = list(tf_idf_diz.values())
print(tdidf_values)
l2_norm = LA.norm(tdidf_values)   
print(l2_norm)
tf_idf_norm = {w:tf_idf_diz[w]/l2_norm for w in wordset}
print(tf_idf_norm)

[0.059625, 0.059625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.022, 0.0, 0.0, 0.0, 0.022]
0.0898792592871125
{'amazing': 0.6633899797675505, 'an': 0.6633899797675505, 'best': 0.0, 'game': 0.0, 'great': 0.0, 'is': 0.0, 'of': 0.0, 'series': 0.24477282272345682, 'so': 0.0, 'the': 0.0, 'thrones': 0.0, 'tv': 0.24477282272345682}


In [22]:
LA.norm([0.06, 0.06,0.022,0.022]) 

0.09037698822156003

In [20]:
LA.norm(tdidf_values)

NameError: ignored

## Compare with sklearn

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
corpus = [doc1,doc2,doc3]
vectorizer = TfidfVectorizer()
response = vectorizer.fit_transform(corpus)

In [88]:
print(response)

  (0, 7)	0.35645740147620697
  (0, 11)	0.35645740147620697
  (0, 0)	0.4686986463592042
  (0, 1)	0.4686986463592042
  (0, 5)	0.27682097087637686
  (0, 10)	0.27682097087637686
  (0, 6)	0.27682097087637686
  (0, 3)	0.27682097087637686
  (1, 2)	0.4686986463592043
  (1, 9)	0.4686986463592043
  (1, 7)	0.356457401476207
  (1, 11)	0.356457401476207
  (1, 5)	0.27682097087637686
  (1, 10)	0.27682097087637686
  (1, 6)	0.27682097087637686
  (1, 3)	0.27682097087637686
  (2, 4)	0.5427006131762078
  (2, 8)	0.5427006131762078
  (2, 5)	0.32052772458725637
  (2, 10)	0.32052772458725637
  (2, 6)	0.32052772458725637
  (2, 3)	0.32052772458725637


In [89]:
vectorizer.get_feature_names()

['amazing',
 'an',
 'best',
 'game',
 'great',
 'is',
 'of',
 'series',
 'so',
 'the',
 'thrones',
 'tv']

In [90]:
feature_names = vectorizer.get_feature_names()
for col in response.nonzero()[1]:
     print('{}: {}'.format(feature_names[col],response[0,col]))

series: 0.35645740147620697
tv: 0.35645740147620697
amazing: 0.4686986463592042
an: 0.4686986463592042
is: 0.27682097087637686
thrones: 0.27682097087637686
of: 0.27682097087637686
game: 0.27682097087637686
best: 0.0
the: 0.0
series: 0.35645740147620697
tv: 0.35645740147620697
is: 0.27682097087637686
thrones: 0.27682097087637686
of: 0.27682097087637686
game: 0.27682097087637686
great: 0.0
so: 0.0
is: 0.27682097087637686
thrones: 0.27682097087637686
of: 0.27682097087637686
game: 0.27682097087637686


In [91]:
feature_names = vectorizer.get_feature_names()
for col in response.nonzero()[1]:
     print('{}:{}'.format(feature_names[col],response[1,col]))

series:0.356457401476207
tv:0.356457401476207
amazing:0.0
an:0.0
is:0.27682097087637686
thrones:0.27682097087637686
of:0.27682097087637686
game:0.27682097087637686
best:0.4686986463592043
the:0.4686986463592043
series:0.356457401476207
tv:0.356457401476207
is:0.27682097087637686
thrones:0.27682097087637686
of:0.27682097087637686
game:0.27682097087637686
great:0.0
so:0.0
is:0.27682097087637686
thrones:0.27682097087637686
of:0.27682097087637686
game:0.27682097087637686


In [92]:
df_tfidf_sklearn = pd.DataFrame(response.toarray(),columns=vectorizer.get_feature_names())
df_tfidf_sklearn

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,0.468699,0.468699,0.0,0.276821,0.0,0.276821,0.276821,0.356457,0.0,0.0,0.276821,0.356457
1,0.0,0.0,0.468699,0.276821,0.0,0.276821,0.276821,0.356457,0.0,0.468699,0.276821,0.356457
2,0.0,0.0,0.0,0.320528,0.542701,0.320528,0.320528,0.0,0.542701,0.0,0.320528,0.0


In [94]:
np.log(4/3)+1

1.2876820724517808

In [None]:
np.log(4/3)+1

In [93]:
vectorizer.idf_

array([1.69314718, 1.69314718, 1.69314718, 1.        , 1.69314718,
       1.        , 1.        , 1.28768207, 1.69314718, 1.69314718,
       1.        , 1.28768207])

# How to calculate BOW

In [23]:
def calculateBOW(wordset,bow):
    termfreq_diz = dict.fromkeys(wordset,0)
    counter1 =  dict(collections.Counter(bow))
    for w in bow:
       termfreq_diz[w]=counter1[w]
    return termfreq_diz   

In [24]:
bow1 = calculateBOW(wordset,l_doc1)
bow2 = calculateBOW(wordset,l_doc2)
bow3 = calculateBOW(wordset,l_doc3)

In [25]:
df_bow = pd.DataFrame([bow1,bow2,bow3])
df_bow.head()

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,1,1,0,1,0,1,1,1,0,0,1,1
1,0,0,1,1,0,1,1,1,0,1,1,1
2,0,0,0,1,1,1,1,0,1,0,1,0


## Compare with sklearn

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([doc1,doc2,doc3])
print(X.toarray())

[[1 1 0 1 0 1 1 1 0 0 1 1]
 [0 0 1 1 0 1 1 1 0 1 1 1]
 [0 0 0 1 1 1 1 0 1 0 1 0]]


In [34]:
X.toarray()[0]

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1])

In [35]:
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

In [36]:
df_bow_sklearn.head()

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,1,1,0,1,0,1,1,1,0,0,1,1
1,0,0,1,1,0,1,1,1,0,1,1,1
2,0,0,0,1,1,1,1,0,1,0,1,0
