In [44]:
#Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import movie_reviews, stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Download required NLTK resources
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/doreenquisido/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/doreenquisido/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/doreenquisido/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
#Step 2: Load and Preprocess Data

# 1. Load the IMDB Movie Reviews dataset
documents = []
for fileid in movie_reviews.fileids():
    review_text = movie_reviews.raw(fileid)
    documents.append(review_text)

# 2. Preprocess Text
stop_words = set(stopwords.words('english'))
punct = set(string.punctuation)
stop_words = set(stopwords.words('english'))
punct = set(string.punctuation)

def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    # tokenize
    tokens = word_tokenize(text)
    # remove punctuation and stopwords
    cleaned_tokens = [
        word for word in tokens 
        if word not in punct and word not in stop_words
    ]
    return " ".join(cleaned_tokens)

cleaned_reviews = [preprocess_text(review) for review in documents]

# 3. Store cleaned text in a DataFrame
df = pd.DataFrame({
    "original_review": documents,
    "cleaned_review": cleaned_reviews
})

df.head()

Unnamed: 0,original_review,cleaned_review
0,"plot : two teen couples go to a church party ,...",plot two teen couples go church party drink dr...
1,the happy bastard's quick movie review \ndamn ...,happy bastard 's quick movie review damn y2k b...
2,it is movies like these that make a jaded movi...,movies like make jaded movie viewer thankful i...
3,""" quest for camelot "" is warner bros . ' firs...",`` quest camelot `` warner bros first feature-...
4,synopsis : a mentally unstable man undergoing ...,synopsis mentally unstable man undergoing psyc...


In [46]:
#Step 3: Convert Text to Numerical Features

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the cleaned reviews and transform the data
X = tfidf_vectorizer.fit_transform(df['cleaned_review'])

print("Numerical Features Created")
print(f"Shape of the feature matrix (X): {X.shape}")
print(f"This means we have {X.shape[0]} reviews and {X.shape[1]} unique features/words.")

Numerical Features Created
Shape of the feature matrix (X): (2000, 5000)
This means we have 2000 reviews and 5000 unique features/words.


In [47]:
#Step 4: Apply Clustering Algorithms
# 1. K-Means Clustering

print("K-Means Clustering Results")
k = 5 # Set the desired number of clusters

# Initialize and fit K-Means
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X)

# Assign the cluster labels
df['kmeans_cluster'] = kmeans.labels_

# Display the desired result
kmeans_result = df[['cleaned_review', 'kmeans_cluster']]
print(kmeans_result.head())
print(f"\nCluster distribution:\n{df['kmeans_cluster'].value_counts().sort_index()}")

K-Means Clustering Results
                                      cleaned_review  kmeans_cluster
0  plot two teen couples go church party drink dr...               2
1  happy bastard 's quick movie review damn y2k b...               1
2  movies like make jaded movie viewer thankful i...               2
3  `` quest camelot `` warner bros first feature-...               3
4  synopsis mentally unstable man undergoing psyc...               3

Cluster distribution:
kmeans_cluster
0     15
1    243
2    808
3    889
4     45
Name: count, dtype: int64


In [48]:
#Step 4: Apply Clustering Algorithms
# 2. Hierarchical Clustering- Ward Linkage

print("Hierarchical Clustering (Ward Linkage) Results")
k = 5 # Set the desired number of clusters

# Initialize and fit Agglomerative Clustering (Ward Linkage)
agg_ward = AgglomerativeClustering(n_clusters=k, linkage='ward')

# Fit and assign cluster labels
df['agg_cluster_ward'] = agg_ward.fit_predict(X.toarray())

# Display the desired result
agg_ward_result = df[['cleaned_review', 'agg_cluster_ward']]
print(agg_ward_result.head())
print(f"\nCluster distribution:\n{df['agg_cluster_ward'].value_counts().sort_index()}")

Hierarchical Clustering (Ward Linkage) Results
                                      cleaned_review  agg_cluster_ward
0  plot two teen couples go church party drink dr...                 0
1  happy bastard 's quick movie review damn y2k b...                 0
2  movies like make jaded movie viewer thankful i...                 0
3  `` quest camelot `` warner bros first feature-...                 0
4  synopsis mentally unstable man undergoing psyc...                 0

Cluster distribution:
agg_cluster_ward
0    1645
1     305
2      17
3      10
4      23
Name: count, dtype: int64


In [49]:
#Step 4: Apply Clustering Algorithms
# 2.1 Hierarchical Clustering- Average Linkage

print("Hierarchical Clustering (Average Linkage) Results")
k = 5 # Set the desired number of clusters

# Initialize and fit Agglomerative Clustering (Average Linkage)
agg_average = AgglomerativeClustering(n_clusters=k, linkage='average')

# Fit and assign cluster labels
df['agg_cluster_average'] = agg_average.fit_predict(X.toarray())

# Display the desired result
agg_average_result = df[['cleaned_review', 'agg_cluster_average']]
print(agg_average_result.head())
print(f"\nCluster distribution:\n{df['agg_cluster_average'].value_counts().sort_index()}")

Hierarchical Clustering (Average Linkage) Results
                                      cleaned_review  agg_cluster_average
0  plot two teen couples go church party drink dr...                    0
1  happy bastard 's quick movie review damn y2k b...                    0
2  movies like make jaded movie viewer thankful i...                    0
3  `` quest camelot `` warner bros first feature-...                    0
4  synopsis mentally unstable man undergoing psyc...                    0

Cluster distribution:
agg_cluster_average
0    1996
1       1
2       1
3       1
4       1
Name: count, dtype: int64


In [50]:
#Step 4: Apply Clustering Algorithms
# 2.2 Hierarchical Clustering- Complete Linkage

print("Hierarchical Clustering (Complete Linkage) Results")
k = 5 # Set the desired number of clusters

# Initialize and fit Agglomerative Clustering (Complete Linkage)
agg_complete = AgglomerativeClustering(n_clusters=k, linkage='complete')

# Fit and assign cluster labels
df['agg_cluster_complete'] = agg_complete.fit_predict(X.toarray())

# Display the desired result
agg_complete_result = df[['cleaned_review', 'agg_cluster_complete']]
print(agg_complete_result.head())
print(f"\nCluster distribution:\n{df['agg_cluster_complete'].value_counts().sort_index()}")

Hierarchical Clustering (Complete Linkage) Results
                                      cleaned_review  agg_cluster_complete
0  plot two teen couples go church party drink dr...                     4
1  happy bastard 's quick movie review damn y2k b...                     4
2  movies like make jaded movie viewer thankful i...                     4
3  `` quest camelot `` warner bros first feature-...                     1
4  synopsis mentally unstable man undergoing psyc...                     4

Cluster distribution:
agg_cluster_complete
0    812
1    496
2      1
3     13
4    678
Name: count, dtype: int64


In [51]:
#Step 4: Apply Clustering Algorithms
# 3. DBSCAN

print("DBSCAN Clustering Results")

# Parameters
dbscan = DBSCAN(eps=0.8, min_samples=3, metric='cosine')

# Fit and assign cluster labels
df['dbscan_cluster'] = dbscan.fit_predict(X)

# Display the desired result
dbscan_result = df[['cleaned_review', 'dbscan_cluster']]
print(dbscan_result.head())
print(f"\nCluster distribution:\n{df['dbscan_cluster'].value_counts().sort_index()}")

DBSCAN Clustering Results
                                      cleaned_review  dbscan_cluster
0  plot two teen couples go church party drink dr...               0
1  happy bastard 's quick movie review damn y2k b...               0
2  movies like make jaded movie viewer thankful i...               0
3  `` quest camelot `` warner bros first feature-...              -1
4  synopsis mentally unstable man undergoing psyc...              -1

Cluster distribution:
dbscan_cluster
-1      346
 0     1513
 1        4
 2        5
 3        5
 4        3
 5        4
 6        5
 7        3
 8        5
 9        8
 10       3
 11       4
 12       6
 13       7
 14       6
 15       3
 16       4
 17       3
 18       3
 19       6
 20       4
 21       4
 22       3
 23       5
 24       4
 25       3
 26       3
 27       3
 28       6
 29       3
 30       3
 31       3
 32       6
 33       4
Name: count, dtype: int64


In [66]:
#Step 5 Evaluate and Compare Clusters

def evaluate_clustering(X, labels, name):
    labels = np.array(labels)

    # For DBSCAN: ignore noise points (-1) when evaluating
    mask = labels != -1
    if mask.sum() == 0 or len(set(labels[mask])) < 2:
        print(f"{name}: not enough valid clusters for evaluation.")
        return

    X_eval = X[mask]
    labels_eval = labels[mask]

    sil = silhouette_score(X_eval, labels_eval, metric='cosine')
    ch  = calinski_harabasz_score(
        X_eval.toarray() if hasattr(X_eval, "toarray") else X_eval,
        labels_eval
    )

    print("\nCompare the Clustering Results")
    print("Scores are based on the cluster labels assigned in Step 4.")
    print(f"{name} -> Silhouette Score: {sil:.4f}")
    print(f"{name} -> Calinski-Harabasz Index: {ch:.1f}\n")

# Evaluate each algorithm
evaluate_clustering(X, df['kmeans_cluster'], "KMeans")
evaluate_clustering(X, df['agg_cluster_ward'], "Hierarchical (Agglomerative)")
evaluate_clustering(X, df['dbscan_cluster'], "DBSCAN")

print("\nSummary Interpretation")
print("Silhouette Score:Higher is better (range -1 to +1). Values near +1 indicate good, distinct clusters.")
print("Calinski-Harabasz Index:** Higher is better (no upper bound), indicating more distinct and dense clusters.")


Compare the Clustering Results
Scores are based on the cluster labels assigned in Step 4.
KMeans -> Silhouette Score: 0.0065
KMeans -> Calinski-Harabasz Index: 5.7


Compare the Clustering Results
Scores are based on the cluster labels assigned in Step 4.
Hierarchical (Agglomerative) -> Silhouette Score: 0.0021
Hierarchical (Agglomerative) -> Calinski-Harabasz Index: 6.1


Compare the Clustering Results
Scores are based on the cluster labels assigned in Step 4.
DBSCAN -> Silhouette Score: -0.0031
DBSCAN -> Calinski-Harabasz Index: 2.2


Summary Interpretation
Silhouette Score:Higher is better (range -1 to +1). Values near +1 indicate good, distinct clusters.
Calinski-Harabasz Index:** Higher is better (no upper bound), indicating more distinct and dense clusters.
