<a href="https://colab.research.google.com/github/chaewoncutie/ADV-ML-tests/blob/main/fixed_format.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Packages


In [None]:
!pip install -U scikit-learn nltk pandas matplotlib seaborn scipy wordcloud ipywidgets umap-learn hdbscan

# Import Libraries

In [None]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
import hdbscan
from wordcloud import WordCloud
import numpy as np
import umap
from google.colab import files
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Download the NLTK resources this notebook relies on, in the original order:
# the stop-word list, the 'punkt' and 'punkt_tab' tokenizer packages, and WordNet.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# TODO: Add the first part of the data cleaning here (the steps that load the data and define `df`, which the cells below expect)

# Data Pre-Processing

In [None]:
# Limit dataset size for faster processing.
# Cap the request at the number of available rows: DataFrame.sample draws
# without replacement by default and raises ValueError when n exceeds len(df).
df = df.sample(n=min(50000, len(df)), random_state=42)

# Preprocessing Function
def preprocess_text(text):
    """Normalize one raw document into lowercase, letters-only text.

    Steps, in order: lowercase the text, strip ``http...`` URLs, delete every
    character that is not an ASCII letter or whitespace, then collapse runs of
    whitespace to a single space and trim both ends.

    Args:
        text: The raw document. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for missing cells) are treated as an
            empty document.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string or nothing
        survives the cleaning.
    """
    # Missing cells are not strings and have no .lower(); return an empty
    # document instead of raising AttributeError halfway through an apply().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Clean every raw document in the 'text' column and store the result next to it.
df['processed_text'] = df['text'].map(preprocess_text)

In [None]:
# Tokenize, remove stopwords, and apply lemmatization.
# The lemmatizer and the stop-word set are built once here and shared by every
# call of the helper below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def tokenize_and_lemmatize(text):
    """Return ``text`` with stop words removed and remaining tokens lemmatized.

    The text is split with ``word_tokenize``; tokens found in ``stop_words``
    are dropped, every other token is passed through ``lemmatizer.lemmatize``,
    and the results are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Run the tokenize / stop-word / lemmatize step over every cleaned document.
df['filtered_text'] = df['processed_text'].map(tokenize_and_lemmatize)

In [None]:
# TF-IDF Vectorization of the lemmatized text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),    # unigrams and bigrams
    min_df=5,              # lower document-frequency cut-off
    max_df=0.7,            # upper document-frequency cut-off
    max_features=5000,     # cap on vocabulary size
    stop_words='english',
)
X = vectorizer.fit_transform(df['filtered_text'])

In [None]:
# Apply TruncatedSVD for Dimensionality Reduction
# TruncatedSVD's default randomized solver rejects n_components larger than
# the number of features, and on a small corpus the TF-IDF vocabulary can end
# up with fewer than 3600 terms, so cap the request at the actual feature count.
svd = TruncatedSVD(n_components=min(3600, X.shape[1]), random_state=42)
X_svd = svd.fit_transform(X)

In [None]:
# Apply UMAP for Non-Linear Dimensionality Reduction:
# project the SVD features down to two dimensions.
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=30,
    min_dist=0.3,
    random_state=42,
)
X_umap = umap_model.fit_transform(X_svd)

In [None]:
# Normalize data: rescale each row of the UMAP embedding with sklearn's Normalizer.
# NOTE(review): X_normalized is not referenced by any later cell visible here —
# confirm whether the clustering steps were meant to consume it.
normalizer = Normalizer()
X_normalized = normalizer.fit(X_umap).transform(X_umap)

# K-Means

In [None]:
# ----------------- Finding the Best K for K-Means -----------------
# Fit K-Means for every candidate k, recording the inertia (elbow method) and
# the silhouette score. The k with the highest silhouette score is then used
# to fit the final model, whose labels are stored in df['cluster_kmeans'].
k_values = range(2, 20)

# Silhouette scoring is the slowest step of this cell. None scores every row
# (exact); set an integer (e.g. 10_000) to score a random subsample instead.
silhouette_sample_size = None

inertia_values = []
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    # fit_predict returns the training labels directly, so no extra predict pass is needed.
    labels_kmeans = kmeans.fit_predict(X_svd)
    inertia_values.append(kmeans.inertia_)
    silhouette_scores.append(
        silhouette_score(
            X_svd,
            labels_kmeans,
            sample_size=silhouette_sample_size,
            random_state=42,  # only used when a sample size is set
        )
    )

# Inertia and silhouette live on very different scales, so each gets its own
# y-axis; on a shared axis the silhouette curve collapses into a flat line.
fig, ax_inertia = plt.subplots(figsize=(10, 5))
ax_silhouette = ax_inertia.twinx()

inertia_line, = ax_inertia.plot(
    k_values, inertia_values, marker='o', color='tab:blue', label='Inertia'
)
silhouette_line, = ax_silhouette.plot(
    k_values, silhouette_scores, marker='s', color='tab:orange', label='Silhouette Score'
)

ax_inertia.set_xlabel('Number of Clusters (k)')
ax_inertia.set_ylabel('Inertia')
ax_silhouette.set_ylabel('Silhouette Score')
ax_inertia.set_title('Elbow Method & Silhouette Scores for Optimal K')
# Both lines belong to different axes, so build one combined legend explicitly.
ax_inertia.legend(handles=[inertia_line, silhouette_line])
ax_inertia.grid()
plt.show()

# Refit with the best-scoring k so `kmeans` ends up holding the final model.
best_k = k_values[np.argmax(silhouette_scores)]
kmeans = KMeans(n_clusters=best_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_svd)
df['cluster_kmeans'] = kmeans_labels

# DBSCAN

In [None]:
# ----------------- Apply DBSCAN -----------------
# Density-based clustering on the SVD features, using Euclidean distance.
dbscan = DBSCAN(
    metric='euclidean',
    eps=0.5,
    min_samples=10,
)
# Keep the labels both as a standalone array and as a column of df.
df['cluster_dbscan'] = dbscan_labels = dbscan.fit_predict(X_svd)

# GMM

In [None]:
# ----------------- Apply GMM (Gaussian Mixture Model) -----------------
# Fit a mixture with as many components as the k chosen for K-Means (best_k).
gmm = GaussianMixture(
    n_components=best_k,
    random_state=42,
)
# Keep the labels both as a standalone array and as a column of df.
df['cluster_gmm'] = gmm_labels = gmm.fit_predict(X_svd)

# Plots

In [None]:
# Show each algorithm's cluster assignments on the shared 2-D UMAP embedding.
# Each entry is (label column in df, panel title); panels fill the grid row by row.
cluster_panels = [
    ('cluster_kmeans', f'K-Means Clustering (k={best_k})'),
    ('cluster_dbscan', 'DBSCAN Clustering'),
    ('cluster_gmm', 'GMM Clustering'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

for ax, (label_col, panel_title) in zip(axes.flat, cluster_panels):
    sns.scatterplot(x=X_umap[:, 0], y=X_umap[:, 1], hue=df[label_col], palette='tab20', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')

# Only three of the four grid cells are used; hide the rest so no empty frame is drawn.
for unused_ax in axes.flat[len(cluster_panels):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("Clustering process completed.")