In [11]:
import math
from collections import defaultdict, Counter

# Sample corpus: three short documents, one string per document.
# NOTE: tokenize() splits on whitespace only, so "isn’t" (typographic
# apostrophe included) is kept as a single token distinct from "is".
corpus = [
    "He is Walter",
    "He is William",
    "He isn’t Peter or September"
]

def tokenize(sentence):
    """Split *sentence* into tokens on runs of whitespace.

    No lowercasing or punctuation stripping is applied, so tokens keep
    their original case and any attached punctuation.

    Args:
        sentence: The text to split.

    Returns:
        A list of the whitespace-separated substrings of ``sentence``;
        an empty list when ``sentence`` is empty or only whitespace.
    """
    tokens = sentence.split()
    return tokens

def compute_tf(corpus):
    """Compute the term frequency (TF) of every token in every document.

    TF of a term is its occurrence count in the document divided by the
    document's total token count. A document with no tokens yields an
    empty mapping.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list with one ``{term: frequency}`` dict per document, in
        corpus order.

    Side effects:
        Prints the resulting list to stdout.
    """
    tf = []
    for document in corpus:
        words = tokenize(document)
        n_words = len(words)
        frequencies = {}
        # Counter keeps first-occurrence order, so terms appear in the
        # order they are first seen in the document.
        for term, occurrences in Counter(words).items():
            frequencies[term] = occurrences / n_words
        tf.append(frequencies)
    print(tf)
    return tf

def compute_df(corpus):
    """Count, for each term, how many documents contain it (DF).

    Args:
        corpus: Iterable of document strings.

    Returns:
        A ``defaultdict(int)`` mapping each term to the number of
        documents in which it occurs at least once.
    """
    doc_counts = defaultdict(int)
    for text in corpus:
        # De-duplicate so a term counts once per document, however many
        # times it is repeated inside that document.
        for term in set(tokenize(text)):
            doc_counts[term] += 1
    return doc_counts

def compute_idf(corpus, df):
    """Compute the inverse document frequency (IDF) of each term.

    IDF is ``log10(N / df[term])`` with ``N = len(corpus)``, so a term
    that occurs in every document gets an IDF of ``0.0``.

    Args:
        corpus: Sized collection of documents; only its length is used.
        df: Mapping of term to the number of documents containing it.

    Returns:
        A dict mapping each term in ``df`` to its IDF value.

    Raises:
        ZeroDivisionError: If any document count in ``df`` is zero.

    Side effects:
        Prints the resulting dict to stdout.
    """
    n_docs = len(corpus)
    idf = {
        term: math.log10(n_docs / doc_count)
        for term, doc_count in df.items()
    }
    print(idf)
    return idf

def compute_tf_idf(tf, idf):
    """Weight every term frequency by the term's IDF.

    Args:
        tf: List of per-document ``{term: frequency}`` dicts.
        idf: Mapping of term to its inverse document frequency.

    Returns:
        A list, parallel to ``tf``, of ``{term: tf * idf}`` dicts.

    Raises:
        KeyError: If a term in ``tf`` has no entry in ``idf``.
    """
    weighted = []
    for doc_tf in tf:
        doc_scores = {}
        for term, frequency in doc_tf.items():
            doc_scores[term] = frequency * idf[term]
        weighted.append(doc_scores)
    return weighted

def get_vocabulary(corpus):
    """Collect the distinct tokens that occur anywhere in the corpus.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A set of every token produced by ``tokenize`` over all documents.
    """
    return {token for document in corpus for token in tokenize(document)}

def main(corpus):
    """Build the TF-IDF matrix for *corpus*.

    Runs the TF, DF, IDF and TF-IDF steps and lays the scores out as one
    row per document and one column per vocabulary term. Terms absent
    from a document are filled with ``0``.

    NOTE: the vocabulary is a set and the columns follow its iteration
    order. Iterating the returned set yields the matching column order
    (as long as it is not modified), but that order is not sorted and may
    differ between interpreter runs.

    Args:
        corpus: Sized collection of document strings.

    Returns:
        A ``(vocab, tf_idf_matrix)`` tuple: the vocabulary set and a list
        of rows (lists of scores), one row per document in corpus order.

    Side effects:
        ``compute_tf`` and ``compute_idf`` print their results to stdout.
    """
    term_freqs = compute_tf(corpus)
    inverse_doc_freqs = compute_idf(corpus, compute_df(corpus))
    scores_per_doc = compute_tf_idf(term_freqs, inverse_doc_freqs)

    vocab = get_vocabulary(corpus)
    tf_idf_matrix = [
        [doc_scores.get(term, 0) for term in vocab]
        for doc_scores in scores_per_doc
    ]

    return vocab, tf_idf_matrix

# Run the pipeline on the sample corpus. main() returns the vocabulary set
# and the TF-IDF matrix (one row per document); only the vocabulary is shown.
vocab, tf_idf_matrix = main(corpus)
print(vocab)
#print(tf_idf_matrix)

# Currently disabled: labelled dump of the vocabulary and each matrix row.
#print("Vocabulary:", vocab)
#print("TF-IDF Matrix:")
#for row in tf_idf_matrix:
    #print(row)


[{'He': 0.3333333333333333, 'is': 0.3333333333333333, 'Walter': 0.3333333333333333}, {'He': 0.3333333333333333, 'is': 0.3333333333333333, 'William': 0.3333333333333333}, {'He': 0.2, 'isn’t': 0.2, 'Peter': 0.2, 'or': 0.2, 'September': 0.2}]
{'is': 0.17609125905568124, 'He': 0.0, 'Walter': 0.47712125471966244, 'William': 0.47712125471966244, 'September': 0.47712125471966244, 'Peter': 0.47712125471966244, 'isn’t': 0.47712125471966244, 'or': 0.47712125471966244}
{'Walter', 'September', 'Peter', 'is', 'isn’t', 'He', 'or', 'William'}
