In [6]:
import re
import math
import numpy as np
def preprocess_document(doc):
    """Lowercase ``doc`` and split it into a list of word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore) delimited by word boundaries, so punctuation such as a
    hyphen separates tokens: "Covid-19" yields "covid" and "19".

    Args:
        doc: The raw document text.

    Returns:
        The tokens in order of appearance; an empty list if none are found.
    """
    lowered = doc.lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]

# Toy corpus: two sentence templates, each written once about Covid-19 and
# once about dengue.
documents = [
    "Covid-19 affects different people in different ways.",
    "Most people infected with the Covid-19",
    "Dengue affects different people in different ways.",
    "Most people infected with the dengue.",
]

# One token list per document, in the same order as `documents`.
preprocessed_documents = list(map(preprocess_document, documents))

In [7]:
def calculate_tf(document):
    """Compute the relative term frequency of every token in a document.

    Args:
        document: A sequence of tokens.

    Returns:
        A dict mapping each distinct token to
        ``occurrences / len(document)``. An empty document yields an
        empty dict.
    """
    total_words = len(document)
    counts = {}
    for word in document:
        counts[word] = counts.get(word, 0) + 1
    # Divide once per term instead of adding 1 / total_words per occurrence:
    # repeated float addition accumulates rounding error (three tenths would
    # come out as 0.30000000000000004 rather than 0.3).
    return {word: count / total_words for word, count in counts.items()}

# Term-frequency mapping for each tokenised document, in corpus order.
tf_documents = list(map(calculate_tf, preprocessed_documents))

In [8]:
def calculate_idf(corpus, term):
    """Return the inverse document frequency of ``term`` within ``corpus``.

    Args:
        corpus: A sized collection of documents, each supporting ``in``
            membership tests for terms.
        term: The term to score.

    Returns:
        ``log(number of documents / number of documents containing term)``
        using the natural logarithm, or ``0`` when no document contains
        the term.
    """
    n_docs = len(corpus)
    n_hits = sum(term in doc for doc in corpus)
    if n_hits:
        return math.log(n_docs / n_hits)
    # Term never occurs: return 0 rather than dividing by zero.
    return 0

# Inverse document frequency of every distinct term in the corpus.
idf_values = {}
for doc in preprocessed_documents:
    for term in doc:
        # A term's IDF depends only on the corpus, so score it the first time
        # it is seen instead of rescanning the corpus for every occurrence.
        if term not in idf_values:
            idf_values[term] = calculate_idf(preprocessed_documents, term)

In [9]:
def calculate_tfidf(tf, idf):
    """Weight each term frequency by the term's inverse document frequency.

    Args:
        tf: Mapping of term -> term frequency for one document.
        idf: Mapping of term -> inverse document frequency.

    Returns:
        A dict with the same terms as ``tf`` whose values are
        ``tf[term] * idf[term]``; terms absent from ``idf`` are weighted
        with an IDF of 0.
    """
    return {term: weight * idf.get(term, 0) for term, weight in tf.items()}

# TF-IDF mapping for each document, using the corpus-wide IDF table.
tfidf_documents = [calculate_tfidf(doc_tf, idf_values) for doc_tf in tf_documents]

In [10]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity ``|set1 & set2| / |set1 | set2|``.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The ratio of shared elements to all elements, or ``0`` when both
        sets are empty (the union has no elements to divide by).
    """
    shared = set1 & set2
    combined = set1 | set2
    if not combined:
        return 0
    return len(shared) / len(combined)

# Only each document's vocabulary (the TF-IDF keys) feeds the comparison;
# the weights themselves are not used by the Jaccard measure.
document_sets = [set(weights) for weights in tfidf_documents]

num_documents = len(documents)
jaccard_distances = np.zeros(shape=(num_documents, num_documents))

# Distance = 1 - similarity. The measure is symmetric, so each pair is scored
# once (upper triangle, diagonal included) and mirrored into the lower triangle.
for i in range(num_documents):
    for j in range(i, num_documents):
        jaccard_sim = jaccard_similarity(document_sets[i], document_sets[j])
        jaccard_distances[i, j] = jaccard_distances[j, i] = 1 - jaccard_sim

In [11]:
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering on the precomputed Jaccard
# distance matrix.
num_clusters = 2
try:
    # scikit-learn >= 1.2 names the distance argument `metric`; the older
    # `affinity` keyword was deprecated in 1.2 and removed in 1.4.
    agnes = AgglomerativeClustering(n_clusters=num_clusters, linkage="complete", metric="precomputed")
except TypeError:
    # Older scikit-learn releases reject `metric` and only accept `affinity`.
    agnes = AgglomerativeClustering(n_clusters=num_clusters, linkage="complete", affinity="precomputed")
agnes.fit(jaccard_distances)
labels = agnes.labels_

# Group the original documents by assigned cluster label; dict order follows
# the first appearance of each label.
document_clusters = {}
for i, label in enumerate(labels):
    document_clusters.setdefault(label, []).append(documents[i])

# Labels are zero-based, so they are shifted by one for display.
for cluster, docs in document_clusters.items():
    print(f"Cluster {cluster + 1}:")
    for doc in docs:
        print(doc)
    print("\n")

Cluster 2:
Covid-19 affects different people in different ways.
Dengue affects different people in different ways.


Cluster 1:
Most people infected with the Covid-19
Most people infected with the dengue.




