<a href="https://colab.research.google.com/github/chaewoncutie/Algorithm-Tests/blob/main/HDBscan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Packages


In [None]:
# Install necessary packages
!pip install -U scikit-learn nltk pandas matplotlib seaborn scipy wordcloud ipywidgets umap-learn hdbscan


# Import Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, precision_score, accuracy_score
import hdbscan
from wordcloud import WordCloud
import numpy as np
import umap
from google.colab import files
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Download the NLTK data packages this notebook relies on, in a fixed order.
for resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Data Cleaning


In [None]:
# Upload the JSON file
uploaded = files.upload()

# The first key of `uploaded` is used as the file name below. If nothing was
# uploaded the mapping is empty and next() would raise a bare StopIteration,
# so fail early with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a JSON Lines file.")

# Read the first uploaded file as JSON Lines (one JSON record per line)
file_name = next(iter(uploaded))
df = pd.read_json(file_name, lines=True)

# Display the original dataset information
print("Original Dataset Shape:", df.shape)
print("Original Dataset Preview:")
print(df.head())

In [None]:
# Strip stray leading/trailing whitespace from every column label
df.columns = [label.strip() for label in df.columns]

# Report how many missing values each column contains
null_counts = df.isna().sum()
print("\nNull Values per Column:")
print(null_counts)

In [None]:
# Drop rows with null values in 'headline' or 'short_description'
df = df.dropna(subset=['headline', 'short_description'])

# Drop duplicate rows
df = df.drop_duplicates()

# Drop rows where the date is from 2012 to 2016.
# .copy() makes the filtered frame independent of its parent, so the column
# assignments below (and in later cells) do not trigger SettingWithCopyWarning.
excluded_years = ('2012', '2013', '2014', '2015', '2016')
df = df[~df['date'].astype(str).str.startswith(excluded_years)].copy()

# Display dataset shape after cleaning
print("\nDataset Shape After Cleaning:", df.shape)

# Combine relevant text columns. The vectorized concatenation produces the same
# "<headline> <short_description>" strings as a row-wise join, without calling
# a Python function once per row.
df['text'] = df['headline'].astype(str) + ' ' + df['short_description'].astype(str)

# Data Pre-Processing

In [None]:
# Preprocessing Function
def preprocess_text(text):
    """Normalize raw text ahead of tokenization.

    Lower-cases the input, removes URLs, drops every character that is not an
    ASCII letter or whitespace, and collapses whitespace runs into single
    spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string with no leading or trailing whitespace.
    """
    text = text.lower()
    # In a raw string a single backslash already forms the regex escape. The
    # doubled form (r'\\S', r'\\s') matches a literal backslash instead, which
    # left URLs in place and deleted every space between words.
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Clean every combined text entry element-wise
df['processed_text'] = df['text'].map(preprocess_text)

In [None]:
# Tokenize, remove stopwords, and apply lemmatization.
# The stop-word set and the lemmatizer are built once and shared by every call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Return `text` with stop words removed and remaining tokens lemmatized.

    Args:
        text: A cleaned string to tokenize.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

df['filtered_text'] = df['processed_text'].map(tokenize_and_lemmatize)

In [None]:
# TF-IDF Vectorization over unigrams and bigrams of the filtered text
tfidf_params = dict(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.7,
    max_features=5000,
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_params)
X = vectorizer.fit_transform(df['filtered_text'])

In [None]:
# Apply TruncatedSVD for Dimensionality Reduction
# TruncatedSVD raises if n_components exceeds the number of input features, and
# the min_df / max_df / max_features pruning of the TF-IDF step can leave fewer
# than 3800 terms on a small corpus, so cap the request at the actual width.
n_svd_components = min(3800, X.shape[1])
svd = TruncatedSVD(n_components=n_svd_components, random_state=42)
X_svd = svd.fit_transform(X)
explained_variance_svd = svd.explained_variance_ratio_.sum()
print(f"Explained Variance (SVD): {explained_variance_svd:.4f}")

In [None]:
# Apply UMAP for Non-Linear Dimensionality Reduction
umap_model = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, random_state=42)
X_umap = umap_model.fit_transform(X_svd)
# UMAP has no explained-variance ratio. The value below is just the summed
# per-axis variance of the 2-D embedding, so it is reported under that label.
# The variable name is kept so any cell that already reads it keeps working.
explained_variance_umap = np.var(X_umap, axis=0).sum()
print(f"Total Variance of UMAP Embedding: {explained_variance_umap:.4f}")

In [None]:
# Scale every row of the UMAP embedding to unit L2 norm
# NOTE(review): no later cell visible here reads X_normalized (clustering and
# the metrics use X_svd) — confirm whether this step is still needed.
normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit(X_umap).transform(X_umap)

# HDBSCAN


In [None]:
# ----------------- Applying HDBSCAN -----------------
# NOTE(review): clustering runs on the SVD features, not on X_umap or
# X_normalized computed earlier — confirm this is the intended input.
hdbscan_clusterer = hdbscan.HDBSCAN(
    metric='euclidean',
    min_cluster_size=50,
    min_samples=5,
)
hdbscan_clusterer.fit(X_svd)
hdbscan_labels = hdbscan_clusterer.labels_

# Store the cluster id of every document next to its text
df['cluster_hdbscan'] = hdbscan_labels

# Clustering Evaluation

In [None]:
# ----------------- Evaluate Clustering Quality -----------------
# HDBSCAN labels noise points -1. They do not form a cluster, so they are left
# out of the internal validity indices; scoring them as one extra "cluster"
# would distort both values.
labels_array = np.asarray(hdbscan_labels)
clustered_mask = labels_array != -1
clustered_labels = labels_array[clustered_mask]

if len(set(clustered_labels)) > 1:
    X_clustered = X_svd[clustered_mask]
    silhouette = silhouette_score(X_clustered, clustered_labels)
    calinski_harabasz = calinski_harabasz_score(X_clustered, clustered_labels)
else:
    # Fewer than two real clusters: both indices are undefined, so keep the
    # -1 sentinel used throughout this cell.
    silhouette = -1
    calinski_harabasz = -1

# NOTE(review): precision and accuracy compare the predicted labels with
# themselves, so they are 1.0 whenever they are computed and say nothing about
# cluster quality. They need ground-truth labels to be meaningful — TODO supply
# a reference labelling or drop these two metrics.
has_multiple_labels = len(set(hdbscan_labels)) > 1
precision = precision_score(hdbscan_labels, hdbscan_labels, average='macro') if has_multiple_labels else -1
accuracy = accuracy_score(hdbscan_labels, hdbscan_labels) if has_multiple_labels else -1

# Mean of the full pairwise cosine-similarity matrix, computed without building
# the n x n matrix: with unit-length rows u_i, sum_ij(u_i . u_j) = ||sum_i u_i||^2,
# so the mean is that squared norm divided by n^2.
unit_rows = Normalizer().fit_transform(X_svd)
direction_sum = unit_rows.sum(axis=0)
similarity = float(direction_sum @ direction_sum) / (unit_rows.shape[0] ** 2)

print(f'Silhouette Score: {silhouette:.4f}')
print(f'Calinski-Harabasz Score: {calinski_harabasz:.4f}')
print(f'Precision Score: {precision:.4f}')
print(f'Accuracy Score: {accuracy:.4f}')
print(f'Average Similarity Score: {similarity:.4f}')

# Word Cloud

In [None]:
# Generate word cloud for each cluster; the noise label (-1) is skipped
for cluster in (label for label in set(hdbscan_labels) if label != -1):
    # Concatenate the filtered text of every document assigned to this cluster
    in_cluster = df['cluster_hdbscan'] == cluster
    cluster_corpus = ' '.join(df.loc[in_cluster, 'filtered_text'])
    cloud = WordCloud(width=800, height=400, background_color='white').generate(cluster_corpus)

    # One figure per cluster, drawn through the explicit Axes interface
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Cluster {cluster}')
    plt.show()