In [1]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gokul\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
documents = [
    "Text of document 1 with some punctuation!",
    "Text of document 2 with numbers like 12345.",
    "Text of document 3 with stopwords and stemming",]

In [4]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Stemming (you can also use lemmatization)
    stemmer = PorterStemmer()
    text = ' '.join(stemmer.stem(word) for word in text.split())
    return text

In [5]:
# Apply text preprocessing to each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]


In [6]:
# Convert the preprocessed documents to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_documents)

In [7]:
# Determine the optimal number of clusters using the silhouette score
best_score = -1
best_k = 2  # default to 2 clusters
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_
    score = silhouette_score(X, labels)
    if score > best_score:
        best_score = score
        best_k = k


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


ValueError: Number of labels is 3. Valid values are 2 to n_samples - 1 (inclusive)

In [8]:
# Apply KMeans clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=best_k, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Create a DataFrame with the original text, preprocessed text, and corresponding cluster labels
df = pd.DataFrame({'Original Text': documents, 'Preprocessed Text': preprocessed_documents, 'Cluster': labels})

# Display the results
print("Optimal number of clusters:", best_k)
print(df)

  super()._check_params_vs_input(X, default_n_init=10)


Optimal number of clusters: 2
                                    Original Text  \
0       Text of document 1 with some punctuation!   
1     Text of document 2 with numbers like 12345.   
2  Text of document 3 with stopwords and stemming   

             Preprocessed Text  Cluster  
0       text document punctuat        0  
1    text document number like        0  
2  text document stopword stem        1  
