In [None]:
from __future__ import division

import math
import string
from collections import Counter

In [None]:
def tokenize(doc):
    """Lower-case ``doc`` and split it on single space characters.

    The split uses the literal ``" "`` separator, so punctuation stays
    attached to the neighbouring word and consecutive spaces yield
    empty-string tokens.
    """
    lowered = doc.lower()
    return lowered.split(" ")

In [None]:
# Toy corpus: seven short English text snippets, one per variable.
# They are collected into `all_documents` in the next cell.
document_0  = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin was found to be riding a horse, again, without a shirt on while hunting deer. Vladimir Putin always seems so serious about things - even riding horses."

In [None]:
# Ordered corpus: position i holds document_i.
all_documents = [
    document_0,
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
]

In [None]:
# Display the raw corpus.
all_documents

In [None]:
# Tokenize every document, keeping the corpus order.
tokenized_documents = list(map(tokenize, all_documents))

In [None]:
# Display the token list of each document.
tokenized_documents

In [None]:
# Vocabulary: every distinct token that occurs anywhere in the corpus.
all_tokens_set = {token for tokens in tokenized_documents for token in tokens}

In [None]:
# Display the vocabulary (a set, so the display order carries no meaning).
all_tokens_set

In [None]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity of two token collections.

    Both arguments are reduced to sets of distinct tokens, so repeated
    tokens and token order have no influence on the score, which is
    ``len(intersection) / len(union)``.

    Args:
        query: Iterable of hashable tokens.
        document: Iterable of hashable tokens.

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty the union is
        empty and the ratio is undefined; ``0.0`` is returned in that case
        instead of raising ``ZeroDivisionError``.
    """
    query_tokens = set(query)
    document_tokens = set(document)
    union = query_tokens | document_tokens
    if not union:
        # Nothing to compare: treat two empty inputs as having no overlap.
        return 0.0
    return len(query_tokens & document_tokens) / len(union)

In [None]:
# Compare document_2 and document_4 (both mention Shinzo Abe).
jaccard_similarity(tokenized_documents[2],tokenized_documents[4])

In [None]:
# Tokens shared by document_2 and document_4 (the intersection used by the Jaccard score).
set(tokenized_documents[2]).intersection(set(tokenized_documents[4]))

In [None]:
# Compare document_1 and document_6.
jaccard_similarity(tokenized_documents[1],tokenized_documents[6])

In [None]:
# Tokens shared by document_1 and document_6 (the intersection used by the Jaccard score).
set(tokenized_documents[1]).intersection(set(tokenized_documents[6]))

In [None]:
def term_frequency(term, tokenized_document):
    """Return the raw number of occurrences of ``term`` in ``tokenized_document``."""
    occurrences = tokenized_document.count(term)
    return occurrences

In [None]:
def sublinear_term_frequency(term, tokenizeddocument):
    """Return the log-dampened term frequency ``1 + ln(count)``.

    ``count`` is how often ``term`` occurs in ``tokenizeddocument``. A term
    that does not occur at all scores ``0`` (the logarithm is never taken
    of zero).
    """
    occurrences = tokenizeddocument.count(term)
    return (1 + math.log(occurrences)) if occurrences else 0

In [None]:
def augmented_term_frequency(term, tokenized_document):
    """Return the augmented (max-normalised) term frequency of ``term``.

    The score is ``0.5 + 0.5 * count(term) / max_count`` where ``max_count``
    is the count of the most frequent token in ``tokenized_document``.

    Args:
        term: Token to score.
        tokenized_document: Iterable of hashable tokens (strings in this
            notebook).

    Returns:
        A float in ``[0.5, 1.0]``. A term that is absent scores ``0.5``; an
        empty document also yields ``0.5`` instead of raising ``ValueError``
        from ``max()`` on an empty sequence.
    """
    # Count every token once up front instead of re-scanning the document
    # for each of its tokens.
    token_counts = Counter(tokenized_document)
    if not token_counts:
        return 0.5
    max_count = max(token_counts.values())
    # Counter returns 0 for tokens it has never seen.
    return 0.5 + (0.5 * token_counts[term]) / max_count


In [None]:
def inverse_document_frequencies(tokenized_documents):
    """Compute the inverse document frequency of every token in the corpus.

    The IDF of a token is ``1 + ln(N / df)`` where ``N`` is the number of
    documents and ``df`` is the number of documents that contain the token
    at least once. Every token comes from some document, so ``df >= 1`` and
    no zero-division guard is needed.

    Args:
        tokenized_documents: Sequence of documents, each an iterable of
            hashable, mutually comparable tokens (strings in this notebook).

    Returns:
        A dict mapping each distinct token to its IDF. Keys are inserted in
        sorted order so that iterating the dict is reproducible across
        interpreter runs (iteration order of a set of strings is not).
        An empty corpus yields an empty dict.
    """
    document_total = len(tokenized_documents)
    # set(document) makes a token count once per document, however often it
    # repeats inside that document.
    document_frequency = Counter(
        token for document in tokenized_documents for token in set(document)
    )
    return {
        token: 1 + math.log(document_total / document_frequency[token])
        for token in sorted(document_frequency)
    }

In [None]:
# IDF weight of every token in the corpus, keyed by token.
idf_values = inverse_document_frequencies(tokenized_documents)

In [None]:
def tfidf(documents):
    """Build one TF-IDF vector per raw document string.

    Each document is tokenized with ``tokenize``; term weights are
    ``sublinear_term_frequency(term, document) * idf[term]`` with the IDF
    computed over the given documents. No length normalisation is applied.

    Args:
        documents: Sequence of raw document strings.

    Returns:
        A list with one inner list per document, in input order. Every inner
        list has one weight per distinct corpus token, and the columns follow
        the lexicographically sorted vocabulary so that the layout is the
        same on every run.
    """
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    # Fix the column order explicitly rather than inheriting whatever order
    # the IDF dict happens to have.
    vocabulary = sorted(idf)
    return [
        [sublinear_term_frequency(term, document) * idf[term] for term in vocabulary]
        for document in tokenized_documents
    ]

In [None]:
# Hand-rolled TF-IDF vectors, one per document in corpus order.
tfidf_representation = tfidf(all_documents)

In [None]:
# TF-IDF vector of the first document (one weight per vocabulary token).
tfidf_representation[0]

In [None]:
# Raw text of the first document, for comparison with its vector.
document_0

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# scikit-learn vectorizer using the same tokenizer, sublinear TF and unsmoothed
# IDF as the hand-written tfidf(); unlike tfidf() it also L2-normalises each row.
sklearn_tfidf = TfidfVectorizer(
    norm='l2',
    min_df=0.0,
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
    tokenizer=tokenize,
    # token_pattern is ignored once a custom tokenizer is supplied; passing
    # None avoids scikit-learn's "parameter will not be used" UserWarning.
    token_pattern=None,
)

In [None]:
# Fit the vectorizer on the corpus and transform the corpus in one step.
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)

In [None]:
# Display the fitted document-term matrix object.
sklearn_representation

In [None]:
# Get feature names (terms); position i names column i of the TF-IDF matrix.
feature_names = sklearn_tfidf.get_feature_names_out()

In [None]:
# Display the vocabulary learned by scikit-learn.
feature_names

In [None]:
# Convert the TF-IDF matrix to a dense format.
# NOTE(review): per the SciPy docs, todense() returns a numpy.matrix whose rows stay
# 2-D; the printing loop's `doc.tolist()[0]` relies on that. Switching to toarray()
# would require changing that loop as well.
dense_tfidf = sklearn_representation.todense()

In [None]:
# Display the dense TF-IDF matrix (one row per document).
dense_tfidf

In [None]:
# Print, per document, every term whose TF-IDF weight is positive.
for doc_idx, matrix_row in enumerate(dense_tfidf):
    print(f"Document {doc_idx}:")
    # Each row is still 2-D (1 x n_terms), so take its single inner list.
    row_scores = matrix_row.tolist()[0]
    for term_idx, score in enumerate(row_scores):
        if not score > 0:
            continue  # skip terms that do not contribute to this document
        print(f"  {feature_names[term_idx]}: {score:.4f}")