In [1]:
""" 25 minutes
implement a TF-IDF
2. # In this test, you need to compute the TF-IDF score for each word in two documents.  
Recall from NLP that Term Frequency (tf) gives us the frequency of the word in each document in the corpus.

Inverse Document Frequency (idf) is used to calculate the weight of rare words across all documents in the corpus.
TF-IDF is the combination of these two, with words that occur rarely in the corpus having a high IDF score.  
tfIdf = tf * idf;    TFij = number of occurrences of word i in document j, divided by the total number of words in document j;  
DFi = number of documents containing word i;    N = number of documents.    IDF = log(N/DFi)

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).
IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

example input/output:

doc1 = ['Do', 'you', 'have', 'a' , 'cat' ,'or', 'dog', 'dog']
doc2 = ['Do', 'you', 'have', 'a' , 'car', 'or', 'bike']

def tfidf(doc1, doc2):


    return [{}, {}]

"""

# test cases
# tf_idf(dog,doc1) = tf(dog,doc1) * idf(dog,corpus) = 2/8 * log(2/1)
# tf_idf(you,doc1) = 1/8 * log(2/2)

from collections import defaultdict
import math


def count_words(doc):
    """Tally how many times each token occurs in ``doc``.

    Args:
        doc: an iterable of hashable tokens (e.g. a list of words).

    Returns:
        A ``defaultdict(int)`` mapping each distinct token to its number of
        occurrences, with keys in order of first appearance.
    """
    tally = defaultdict(int)
    for token in doc:
        # Unseen tokens read as int() == 0, so no membership test is needed.
        tally[token] = tally[token] + 1
    return tally
    
def test_count_words(f):
    assert f(['a','b']) == {'a':1, 'b':1}
    assert f(['a','a']) == {'a':2}

# Sanity-check count_words as soon as the cell runs; an AssertionError here
# means the counter is broken.
test_count_words(count_words)




In [2]:

def tfidf(doc1, doc2, *more_docs):
    """Compute the TF-IDF score of every word in each document of a corpus.

    For a word ``w`` in document ``d``:

        tf(w, d)  = (occurrences of w in d) / (total number of tokens in d)
        idf(w)    = ln(N / df(w))
        tfidf     = tf(w, d) * idf(w)

    where ``N`` is the number of documents passed in and ``df(w)`` is the
    number of those documents that contain ``w``. A word present in every
    document therefore scores 0.0.

    Args:
        doc1: first document, an iterable of hashable tokens.
        doc2: second document, same form.
        *more_docs: optional additional documents; omitting them gives the
            original two-document behaviour.

    Returns:
        A list with one dict per document, in argument order, mapping each
        word of that document to its TF-IDF score. An empty document yields
        an empty dict.
    """
    docs = (doc1, doc2) + more_docs

    # Per-document word counts (each document is iterated exactly once).
    counts_per_doc = [count_words(doc) for doc in docs]

    # Document frequency: in how many documents does each word appear?
    df = defaultdict(int)
    for counts in counts_per_doc:
        for w in counts:
            df[w] += 1

    # Inverse document frequency: log(N / DFi), natural logarithm.
    n_docs = len(docs)
    idf = {w: math.log(n_docs / word_df) for w, word_df in df.items()}

    # Combine term frequency with idf, one result dict per document.
    scores = []
    for counts in counts_per_doc:
        n_words = sum(counts.values())
        # An empty document has no items, so the division below never runs
        # with n_words == 0.
        scores.append({w: (c / n_words) * idf[w] for w, c in counts.items()})

    return scores
        
    
    
# Example corpus from the exercise statement: two tokenised sentences that
# differ only in their last nouns.
doc1 = 'Do you have a cat or dog dog'.split()
doc2 = 'Do you have a car or bike'.split()

# Words shared by both documents score 0.0; only the distinguishing nouns
# receive a positive weight.
print(tfidf(doc1, doc2))

[{'Do': 0.0, 'you': 0.0, 'have': 0.0, 'a': 0.0, 'cat': 0.08664339756999316, 'or': 0.0, 'dog': 0.17328679513998632}, {'Do': 0.0, 'you': 0.0, 'have': 0.0, 'a': 0.0, 'car': 0.09902102579427789, 'or': 0.0, 'bike': 0.09902102579427789}]
