In [None]:
#NAME: DEVASHISH MAYUR POTNIS
#CLASS: BE-AIML
#ROLL NO: 43557
#PRACTICAL:A.2
#PROBLEM STATEMENT: Implement the single-pass algorithm for clustering files (consider 4 to 5 files).

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def single_pass_clustering(files, threshold=0.8):
    """
    Cluster text files with the single-pass algorithm.

    Every file is turned into a TF-IDF vector (the vectorizer is fitted on
    all files at once). Files are then visited in the given order: each one
    is compared, by cosine similarity, with the mean vector of every
    existing cluster and joins the most similar cluster when that
    similarity is strictly greater than ``threshold``; otherwise it starts
    a new cluster. The first file always starts the first cluster.

    Args:
        files (list): List of file paths to be clustered.
        threshold (float): Similarity a file must exceed to join an
            existing cluster.

    Returns:
        list: List of clusters, where each cluster is a list of file paths.
            An empty ``files`` yields an empty list.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    # Nothing to cluster; bail out before touching the vectorizer.
    if not files:
        return []

    # Read every file up front, closing each handle as soon as it is read.
    file_contents = []
    for file_path in files:
        with open(file_path, 'r') as handle:
            file_contents.append(handle.read())

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(file_contents).toarray()

    clusters = []        # clusters[k] -> file paths in cluster k
    member_rows = []     # member_rows[k] -> rows of `vectors` for cluster k

    for i, file_path in enumerate(files):
        file_vector = vectors[i].reshape(1, -1)

        # Start below any possible similarity so that a best cluster is
        # always identified whenever at least one cluster exists; this
        # keeps the index valid even for thresholds below zero.
        max_similarity = float("-inf")
        cluster_to_merge = None

        # Find the most similar cluster (first one wins on ties).
        for j, rows in enumerate(member_rows):
            centroid = np.mean(vectors[rows], axis=0).reshape(1, -1)
            similarity = cosine_similarity(file_vector, centroid)[0][0]
            if similarity > max_similarity:
                max_similarity = similarity
                cluster_to_merge = j

        # Add to the best existing cluster or open a new one.
        if cluster_to_merge is not None and max_similarity > threshold:
            clusters[cluster_to_merge].append(file_path)
            member_rows[cluster_to_merge].append(i)
        else:
            clusters.append([file_path])
            member_rows.append([i])

    return clusters

# Example usage with a small number of files
if __name__ == "__main__":
    # Demo run: the five text files listed here must already exist in the
    # working directory and contain some text.
    files = [
        'file1.txt',
        'file2.txt',
        'file3.txt',
        'file4.txt',
        'file5.txt',
    ]

    # Group the sample files, using a similarity cut-off of 0.5.
    clusters = single_pass_clustering(
        files,
        threshold=0.5,
    )

    # Report each cluster on its own line, numbered from 1.
    for i, cluster in enumerate(clusters):
        print(f"Cluster {i + 1}: {cluster}")


Cluster 1: ['file1.txt', 'file5.txt']
Cluster 2: ['file2.txt']
Cluster 3: ['file3.txt']
Cluster 4: ['file4.txt']
