Reference: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

In [6]:
# Download NLTK's 'stopwords' corpus into the local nltk_data directory.
# Presumably needed before nltk.corpus.stopwords can be read - confirm if the
# stop-word list is actually used later in the notebook.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dkaithav\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
# Uncomment to inspect NLTK's English stop-word list:
# stopwords.words('english')

In [10]:
# Two toy documents that form the entire corpus for the TF-IDF walk-through
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

In [11]:
# Tokenise on runs of whitespace. Unlike split(" "), split() with no argument
# never produces empty-string tokens for repeated, leading or trailing spaces,
# so stray whitespace cannot inflate the token count used for term frequency.
bagofwordsA = documentA.split()
bagofwordsB = documentB.split()

In [12]:
# Vocabulary: every distinct token that occurs in either document
uniqueWords = set(bagofwordsA) | set(bagofwordsB)
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

In [13]:
# Build one count dictionary per document, keyed by the full vocabulary so that
# both dictionaries share the same keys (words absent from a document map to 0).
numofwordsA = {word: bagofwordsA.count(word) for word in uniqueWords}

numofwordsB = {word: bagofwordsB.count(word) for word in uniqueWords}

In [14]:
# Raw counts for document A; words that occur only in document B stay at 0
numofwordsA

{'for': 1,
 'sat': 0,
 'a': 1,
 'walk': 1,
 'the': 1,
 'man': 1,
 'children': 0,
 'out': 1,
 'fire': 0,
 'went': 1,
 'around': 0}

In [20]:
# Raw counts for document B; 'the' occurs twice, words only in document A stay at 0
numofwordsB

{'for': 0,
 'sat': 1,
 'a': 0,
 'walk': 0,
 'the': 2,
 'man': 0,
 'children': 1,
 'out': 0,
 'fire': 1,
 'went': 0,
 'around': 1}

### Term frequency

In [16]:
def computeTF(bagofwords, numofwords):
    """Compute the term frequency of every vocabulary word in one document.

    Parameters
    ----------
    bagofwords : sequence
        The document's tokens. Only its length is used, as the normaliser.
    numofwords : dict
        Maps each vocabulary word to its raw count in the document.

    Returns
    -------
    dict
        Maps each word of ``numofwords`` to ``count / len(bagofwords)``, in the
        same key order. For an empty document every word maps to 0.0.
    """
    length = len(bagofwords)
    if length == 0:
        # A document without tokens has no meaningful frequencies; report 0.0
        # for every word instead of raising ZeroDivisionError.
        return {word: 0.0 for word in numofwords}
    return {word: count / length for word, count in numofwords.items()}

In [31]:
# Term frequencies for document A: each raw count divided by the document's token count
tfA = computeTF(bagofwords=bagofwordsA, numofwords=numofwordsA)
tfA

{'for': 0.14285714285714285,
 'sat': 0.0,
 'a': 0.14285714285714285,
 'walk': 0.14285714285714285,
 'the': 0.14285714285714285,
 'man': 0.14285714285714285,
 'children': 0.0,
 'out': 0.14285714285714285,
 'fire': 0.0,
 'went': 0.14285714285714285,
 'around': 0.0}

In [32]:
# Term frequencies for document B: each raw count divided by the document's token count
tfB = computeTF(bagofwords=bagofwordsB, numofwords=numofwordsB)
tfB

{'for': 0.0,
 'sat': 0.16666666666666666,
 'a': 0.0,
 'walk': 0.0,
 'the': 0.3333333333333333,
 'man': 0.0,
 'children': 0.16666666666666666,
 'out': 0.0,
 'fire': 0.16666666666666666,
 'went': 0.0,
 'around': 0.16666666666666666}

### Inverse document frequency

In [29]:
import math

def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Parameters
    ----------
    documents : list of dict
        One dictionary per document, mapping word -> raw count. A word counts
        as present in a document when its count is greater than zero.

    Returns
    -------
    dict
        Maps each word seen in any dictionary to ``log(N / df)``, where ``N``
        is the number of documents and ``df`` the number of documents that
        contain the word. Keys appear in order of first occurrence. A word
        contained in no document gets 0.0; an empty corpus yields ``{}``.
    """
    # number of documents
    N = len(documents)

    # Document frequency per word. Keys are collected from every document, not
    # only the first, so dictionaries with differing vocabularies are handled.
    docFrequency = {}
    for document in documents:
        for word, count in document.items():
            docFrequency.setdefault(word, 0)
            if count > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, frequency in docFrequency.items():
        # df == 0 would divide by zero; such a word never occurs, so its
        # weight is irrelevant and is reported as 0.0.
        idfDict[word] = math.log(N / frequency) if frequency > 0 else 0.0
    return idfDict

In [33]:
# IDF over the two-document corpus; a word found in both documents gets log(2 / 2) = 0
idfs = computeIDF(documents=[numofwordsA, numofwordsB])
idfs

{'for': 0.6931471805599453,
 'sat': 0.6931471805599453,
 'a': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'the': 0.0,
 'man': 0.6931471805599453,
 'children': 0.6931471805599453,
 'out': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'went': 0.6931471805599453,
 'around': 0.6931471805599453}

### TF-IDF (term frequency–inverse document frequency)

In [34]:
# Tf-Idf = termfrequency * Idf

def computeTFIDF(tf, idfs):
    """Combine term frequencies with inverse document frequencies.

    Parameters
    ----------
    tf : dict
        Maps word -> term frequency for a single document.
    idfs : dict
        Maps word -> inverse document frequency for the corpus. It must
        contain every word of ``tf``; a missing word raises ``KeyError``.

    Returns
    -------
    dict
        Maps each word of ``tf`` to ``tf[word] * idfs[word]``, in the same
        key order as ``tf``.
    """
    # Built directly as a dict: the previous dict.fromkeys() call had no
    # arguments and raised TypeError before any score was computed.
    return {word: frequency * idfs[word] for word, frequency in tf.items()}


In [38]:
# Weight each document's term frequencies by the corpus-wide IDF values
tfidfA = computeTFIDF(tf=tfA, idfs=idfs)
tfidfB = computeTFIDF(tf=tfB, idfs=idfs)

In [39]:
# TF-IDF scores for document A; 'the' scores 0 because its IDF is 0
tfidfA

{'for': 0.09902102579427789,
 'sat': 0.0,
 'a': 0.09902102579427789,
 'walk': 0.09902102579427789,
 'the': 0.0,
 'man': 0.09902102579427789,
 'children': 0.0,
 'out': 0.09902102579427789,
 'fire': 0.0,
 'went': 0.09902102579427789,
 'around': 0.0}

In [40]:
# TF-IDF scores for document B; 'the' scores 0 because its IDF is 0
tfidfB

{'for': 0.0,
 'sat': 0.11552453009332421,
 'a': 0.0,
 'walk': 0.0,
 'the': 0.0,
 'man': 0.0,
 'children': 0.11552453009332421,
 'out': 0.0,
 'fire': 0.11552453009332421,
 'went': 0.0,
 'around': 0.11552453009332421}

In [44]:
# One row per document (0 = A, 1 = B), one column per vocabulary word
pd.DataFrame([tfidfA, tfidfB])

Unnamed: 0,for,sat,a,walk,the,man,children,out,fire,went,around
0,0.099021,0.0,0.099021,0.099021,0.0,0.099021,0.0,0.099021,0.0,0.099021,0.0
1,0.0,0.115525,0.0,0.0,0.0,0.0,0.115525,0.0,0.115525,0.0,0.115525


### Using the TF-IDF in sklearn

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit scikit-learn's TF-IDF vectoriser on the same two documents.
# With default settings its tokeniser ignores single-character tokens, which
# is why 'a' is missing from the vocabulary shown below.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps the plain-list display.
list(vectorizer.get_feature_names_out())

['around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went']

In [56]:
# One row per document, one column per vocabulary term.
# The values differ from the hand-computed table because TfidfVectorizer, by
# default, smooths the IDF and L2-normalises every row.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(vectors.todense().tolist(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
