Source:
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: the two short documents used by every step below.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

In [3]:
# Bag of words: tokenize each document on whitespace.
# split() with no argument also splits on tabs/newlines and collapses repeated
# spaces, whereas split(' ') would emit empty-string tokens in those cases.
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()

In [4]:
bagOfWordsA

['the', 'man', 'went', 'out', 'for', 'a', 'walk']

In [5]:
bagOfWordsB

['the', 'children', 'sat', 'around', 'the', 'fire']

In [7]:
# Vocabulary: every distinct word that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [8]:
# Per-document word counts over the shared vocabulary: each dictionary has one
# entry per vocabulary word, starting at 0, so words that are absent from a
# document keep a count of 0.
numOfWordsA = {vocabWord: 0 for vocabWord in uniqueWords}
numOfWordsB = {vocabWord: 0 for vocabWord in uniqueWords}

# Tally every occurrence of each token in its own document.
for word in bagOfWordsA:
    numOfWordsA[word] = numOfWordsA[word] + 1
for word in bagOfWordsB:
    numOfWordsB[word] = numOfWordsB[word] + 1

In [9]:
numOfWordsA

{'a': 1,
 'around': 0,
 'children': 0,
 'fire': 0,
 'for': 1,
 'man': 1,
 'out': 1,
 'sat': 0,
 'the': 1,
 'walk': 1,
 'went': 1}

In [10]:
numOfWordsB

{'a': 0,
 'around': 1,
 'children': 1,
 'fire': 1,
 'for': 0,
 'man': 0,
 'out': 0,
 'sat': 1,
 'the': 2,
 'walk': 0,
 'went': 0}

In [12]:
#stop words: very common words (e.g. 'the', 'a') that carry little meaning on their own
#a list of english stop words:
# NOTE: this raises LookupError (see the traceback below) when the NLTK
# 'stopwords' corpus has not been downloaded yet.
from nltk.corpus import stopwords
stopwords.words('english')

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/faeze/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [13]:
# One-time download of the NLTK 'stopwords' corpus that stopwords.words() needs.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/faeze/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
#stop words: very common words (e.g. 'the', 'a') that carry little meaning on their own
#a list of english stop words:
# NOTE(review): the list is only displayed here; none of the cells shown
# actually filters the bags of words with it.
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

**Term Frequency (TF)**

The number of times a word appears in a document divided by the total number of words in the document: $\mathrm{tf}(w, d) = \dfrac{\text{count of } w \text{ in } d}{\text{number of words in } d}$. Every document has its own term frequency.

In [15]:
#TF
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency (TF) of every word for a single document.

    Args:
        wordDict: dict mapping word -> number of occurrences in the document.
        bagOfWords: the document's tokens; only its length is used, as the
            denominator of the frequency.

    Returns:
        dict mapping every key of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty ``bagOfWords`` every frequency is 0.0 rather than a
        ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)

    # An empty document has no terms, so every frequency is defined as 0.
    if bagOfWordsCount == 0:
        return {word: 0.0 for word in wordDict}

    return {
        word: count / float(bagOfWordsCount)
        for word, count in wordDict.items()
    }

In [16]:
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [17]:
tfA

{'a': 0.14285714285714285,
 'around': 0.0,
 'children': 0.0,
 'fire': 0.0,
 'for': 0.14285714285714285,
 'man': 0.14285714285714285,
 'out': 0.14285714285714285,
 'sat': 0.0,
 'the': 0.14285714285714285,
 'walk': 0.14285714285714285,
 'went': 0.14285714285714285}

In [18]:
tfB

{'a': 0.0,
 'around': 0.16666666666666666,
 'children': 0.16666666666666666,
 'fire': 0.16666666666666666,
 'for': 0.0,
 'man': 0.0,
 'out': 0.0,
 'sat': 0.16666666666666666,
 'the': 0.3333333333333333,
 'walk': 0.0,
 'went': 0.0}

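**Inverse Document Frequency (IDF)**

The log of the number of documents divided by the number of documents that contain the word w: $\mathrm{idf}(w) = \log\dfrac{N}{\mathrm{df}(w)}$, where $N$ is the number of documents and $\mathrm{df}(w)$ is the number of documents containing $w$. 
Inverse document frequency determines the weight of rare words across all documents in the corpus: the fewer documents contain a word, the higher its weight.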

In [19]:
def computeIDF(documents):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    Args:
        documents: list of dicts, one per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        dict mapping each word found in any of the dicts to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents in which the word's count is > 0.
        A word whose count is 0 in every document gets 0.0 (its IDF is
        undefined; 0.0 keeps the resulting TF-IDF at 0 instead of raising
        ZeroDivisionError). An empty ``documents`` list yields ``{}``.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are gathered from every document (not
    # only the first one) so dictionaries with differing vocabularies do not
    # raise KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            containsWord = 1 if val > 0 else 0
            docFrequency[word] = docFrequency.get(word, 0) + containsWord

    idfDict = {}
    for word, docCount in docFrequency.items():
        if docCount > 0:
            idfDict[word] = math.log(N / float(docCount))
        else:
            # Word never occurs in the corpus: avoid dividing by zero.
            idfDict[word] = 0.0
    return idfDict

In [20]:
#The IDF is computed once for all documents: one value per word, shared by the whole corpus.
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [21]:
idfs

{'a': 0.6931471805599453,
 'around': 0.6931471805599453,
 'children': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'for': 0.6931471805599453,
 'man': 0.6931471805599453,
 'out': 0.6931471805599453,
 'sat': 0.6931471805599453,
 'the': 0.0,
 'walk': 0.6931471805599453,
 'went': 0.6931471805599453}

Lastly, the TF-IDF is simply the TF multiplied by IDF.

In [22]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight one document's term frequencies by the corpus-wide IDF values.

    Args:
        tfBagOfWords: dict mapping word -> term frequency for one document.
        idfs: dict mapping word -> IDF; it is indexed with every word of
            ``tfBagOfWords``, so a missing word raises KeyError.

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [23]:
#compute the TF-IDF scores for all the words in the corpus.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

# One row per document, one column per word.
df = pd.DataFrame([tfidfA, tfidfB])

In [24]:
df

Unnamed: 0,a,around,children,fire,for,man,out,sat,the,walk,went
0,0.099021,0.0,0.0,0.0,0.099021,0.099021,0.099021,0.0,0.0,0.099021,0.099021
1,0.0,0.115525,0.115525,0.115525,0.0,0.0,0.0,0.115525,0.0,0.0,0.0


In [25]:
#implementing TF-IDF using the class provided by sklearn
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
feature_names = vectorizer.get_feature_names_out()
# toarray() returns a plain ndarray; todense() returns the legacy np.matrix type.
dense = vectors.toarray()
denselist = dense.tolist()
# One row per document, one column per vocabulary term found by the vectorizer.
df = pd.DataFrame(denselist, columns=feature_names)

In [26]:
df

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0


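The values differ from the manual calculation because sklearn's `TfidfVectorizer` uses different defaults: it computes a smoothed version of the IDF, $\ln\dfrac{1+N}{1+\mathrm{df}(w)} + 1$, so a word that appears in every document (such as *the*) still gets a non-zero weight; it uses raw counts as TF and then L2-normalizes each document's vector; and its default tokenizer ignores single-character tokens, which is why *a* has no column. In an example with more text, the score for the word *the* would be greatly reduced.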
Towards Data Science
Sharing concepts, ideas, and codes.
Your journey starts here.
Data Science
Sorry, Projects Don’t Get You Jobs
Machine Learning
Recent Advancements in NLP
Programming
Introducing Bamboolib — a GUI for Pandas
Data Science
Top 3 Pandas Functions You Don't Know About (Probably)
)