In [1]:
import math
import pandas as pd

In [2]:
def tokenize(sentence):
    """Lower-case ``sentence`` and split it into tokens on whitespace.

    Punctuation is not stripped; it stays attached to the neighbouring word.
    Returns a list of strings (empty when the sentence has no non-space text).
    """
    lowered = sentence.lower()
    return lowered.split()

In [3]:
def calculate_tf(tokens):
    """Compute the term frequency of each token in one tokenized document.

    Args:
        tokens: list of token strings.

    Returns:
        dict mapping each distinct token to ``occurrences / len(tokens)``,
        keyed in order of first appearance. An empty list yields ``{}``.
    """
    n_tokens = len(tokens)

    # First pass: raw occurrence counts.
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1

    # Second pass: normalise each count by the document length.
    return {tok: occurrences / n_tokens for tok, occurrences in counts.items()}

In [4]:
def calculate_idf(sentences):
    """Compute the inverse document frequency of every token in a corpus.

    Each sentence is treated as one document and tokenized with ``tokenize``.

    Args:
        sentences: list of sentence strings.

    Returns:
        dict mapping each token to ``log(N / df)`` (natural log), where ``N``
        is the number of sentences and ``df`` is the number of sentences that
        contain the token. A token present in every sentence therefore gets 0.
    """
    n_docs = len(sentences)

    # Document frequency: a token counts once per sentence, hence the set.
    doc_freq = {}
    for sentence in sentences:
        for token in set(tokenize(sentence)):
            doc_freq[token] = doc_freq.get(token, 0) + 1

    return {token: math.log(n_docs / freq) for token, freq in doc_freq.items()}

In [5]:
def calculate_tfidf(tf, idf):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: dict of token -> term frequency for a single document.
        idf: dict of token -> inverse document frequency for the corpus.

    Returns:
        dict of token -> ``tf * idf`` for every token in ``tf``. Tokens that
        are missing from ``idf`` are weighted with 0.
    """
    return {term: freq * idf.get(term, 0) for term, freq in tf.items()}

In [6]:
# Demo corpus: three short movie reviews, each treated as one document.
sent_one = "This movie is very scary and long"
sent_two = "This movie is not scary and is slow"
sent_three = "This movie is spooky and good"

sentences = [sent_one, sent_two, sent_three]

# IDF is a corpus-level statistic, so compute it once up front.
idf_dict = calculate_idf(sentences)

# TF-IDF per sentence: one {token: score} dict per document.
tfidf_list = []
for sentence in sentences:
    tokens = tokenize(sentence)
    tf = calculate_tf(tokens)
    tfidf = calculate_tfidf(tf, idf_dict)
    tfidf_list.append(tfidf)

# Build the display table. Row labels are derived from the number of rows so
# the cell keeps working when sentences are added or removed (a hard-coded
# label list raises a length-mismatch error as soon as the corpus changes).
row_labels = [f"Sentence {i}" for i in range(1, len(tfidf_list) + 1)]

# Tokens absent from a sentence come out as NaN; show them as 0 instead.
# Chaining fillna avoids mutating the frame in place.
df = pd.DataFrame(tfidf_list, index=row_labels).fillna(0)
df.head()

Unnamed: 0,this,movie,is,very,scary,and,long,not,slow,spooky,good
Sentence 1,0.0,0.0,0.0,0.156945,0.057924,0.0,0.156945,0.0,0.0,0.0,0.0
Sentence 2,0.0,0.0,0.0,0.0,0.050683,0.0,0.0,0.137327,0.137327,0.0,0.0
Sentence 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183102,0.183102
