In [7]:
import pandas as pd
import math

In [22]:
doc1 = ["The sky is blue.", "The sun is bright today."]

doc2 = ["The sun in the sky is bright.", "We can see the shining sun, the bright sun."]
        


**Create tokens for each document**

In [9]:
doc1_tokens = sum([doc.lower().replace(',', '').replace('.', '').split() for doc in doc1], [])
doc2_tokens = sum([doc.lower().replace(',', '').replace('.', '').split() for doc in doc2], [])

print(doc1_tokens)
print('\n')
print(doc2_tokens)


['the', 'sky', 'is', 'blue', 'the', 'sun', 'is', 'bright', 'today']


['the', 'sun', 'in', 'the', 'sky', 'is', 'bright', 'we', 'can', 'see', 'the', 'shining', 'sun', 'the', 'bright', 'sun']


**Remove stop words**

In [23]:
stopwords = ['a', 'the', 'i', 'me',  'is', 'to', 'then', 'what', 'are', 'for', 'my', 'as', 'can', 'and', 'in', 'of', 'am', 'it']

doc1_tokens = set(doc1_tokens) - set(stopwords)
doc2_tokens = set(doc2_tokens) - set(stopwords)


print(doc1_tokens)
print(doc2_tokens)


{'sky', 'sun', 'today', 'blue', 'bright'}
{'shining', 'sky', 'sun', 'see', 'we', 'bright'}


**Find the unique set of tokens.**

In [24]:
unique_tokens = set(doc1_tokens).union(set(doc2_tokens))

print(unique_tokens)

{'shining', 'sky', 'sun', 'see', 'we', 'today', 'blue', 'bright'}



**Maintain a dict to keep track of count of the unique words in the individual doc**

In [25]:
count_doc1 = dict.fromkeys(unique_tokens, 0)
for token in doc1_tokens:
    count_doc1[token] += 1
    
count_doc2 = dict.fromkeys(unique_tokens, 0)
for token in doc2_tokens:
    count_doc2[token] += 1
    
  
print(count_doc1)
print('\n')
print(count_doc2)


{'shining': 0, 'sky': 1, 'sun': 1, 'see': 0, 'we': 0, 'today': 1, 'blue': 1, 'bright': 1}


{'shining': 1, 'sky': 1, 'sun': 1, 'see': 1, 'we': 1, 'today': 0, 'blue': 0, 'bright': 1}


**Calculate term frequency (TF):**

In [27]:
def calculate_tf(count_doc, doc_tokens):
    tf = dict()
    for token, count in count_doc.items():
        tf[token] = count / float(len(doc_tokens))
    return tf
    
tf1 = calculate_tf(count_doc=count_doc1, doc_tokens=doc1_tokens)
tf2 = calculate_tf(count_doc=count_doc2, doc_tokens=doc2_tokens)

print(tf1)
print('\n', tf2)


{'shining': 0.0, 'sky': 0.2, 'sun': 0.2, 'see': 0.0, 'we': 0.0, 'today': 0.2, 'blue': 0.2, 'bright': 0.2}

 {'shining': 0.16666666666666666, 'sky': 0.16666666666666666, 'sun': 0.16666666666666666, 'see': 0.16666666666666666, 'we': 0.16666666666666666, 'today': 0.0, 'blue': 0.0, 'bright': 0.16666666666666666}


**Calculate inverse document frequency IDF**


In [28]:
def calculate_idf(doc_counts):
    idf = dict.fromkeys(doc_counts[0].keys(), 0)
    for doc in doc_counts: 
        for token, count in doc.items():
            if count!=0:
                idf[token] += 1

             
    for token, count in idf.items():
        idf[token] = math.log(len(doc_counts) / float(count))
       
    return idf

idf = calculate_idf([count_doc1, count_doc2])
print(idf)


{'shining': 0.6931471805599453, 'sky': 0.0, 'sun': 0.0, 'see': 0.6931471805599453, 'we': 0.6931471805599453, 'today': 0.6931471805599453, 'blue': 0.6931471805599453, 'bright': 0.0}


**Calculate TF-IDF**

In [29]:
def calculate_tfidf(tf, idf):
    tfidf = dict()
    for token, count in tf.items():
        tfidf[token] = count * idf[token]
    return tfidf

In [30]:

tfidf1 = calculate_tfidf(tf1, idf)
tfidf2 = calculate_tfidf(tf2, idf)


**Create a dataframe for all the calculated values**

In [31]:
tfidf_df = pd.DataFrame([tfidf1, tfidf2])
tfidf_df

Unnamed: 0,shining,sky,sun,see,we,today,blue,bright
0,0.0,0.0,0.0,0.0,0.0,0.138629,0.138629,0.0
1,0.115525,0.0,0.0,0.115525,0.115525,0.0,0.0,0.0
