<a href="https://colab.research.google.com/github/chaewoncutie/ADV-ML-tests/blob/main/Hypertuning_TF_IDF_and_SVD_(draft).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing Packages**



In [2]:
# Install necessary packages
!pip install -U scikit-learn nltk pandas matplotlib seaborn scipy wordcloud ipywidgets umap-learn hdbscan

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jedi>=0.16 (from ipython>=6.1.0->ipywidgets)
  Downloading

# **Importing Libraries**

In [3]:
# Import necessary libraries
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, adjusted_rand_score, normalized_mutual_info_score, accuracy_score, precision_score
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
import hdbscan
from wordcloud import WordCloud
import numpy as np
import umap
from google.colab import files
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# Download the NLTK resources needed by the stop-word, tokenization and
# lemmatization steps further down.
# nltk.download() reports a failed download through its boolean return value
# rather than by raising, so check it and stop here instead of failing later
# with a less obvious lookup error.
for resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    if not nltk.download(resource):
        raise RuntimeError(f'Could not download NLTK resource: {resource!r}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Reading the Data

In [5]:
# Ask for the dataset through Colab's file-upload helper
uploaded = files.upload()

# Parse the first uploaded file as JSON Lines, keep only rows where both text
# fields are present, and build the combined `text` column in a single chain.
# NOTE(review): only `headline` and `short_description` are selected, so every
# other column in the file (e.g. a `category` label) is dropped at this point.
df = (
    pd.read_json(next(iter(uploaded)), lines=True)[['headline', 'short_description']]
    .dropna()
    .assign(text=lambda frame: frame['headline'] + " " + frame['short_description'])
)

df.head()

Saving News_Category_Dataset_v3.json to News_Category_Dataset_v3.json


Unnamed: 0,headline,short_description,text
0,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...,"""Until you have a dog you don't understand wha...",23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...,"""Accidentally put grown-up toothpaste on my to...",The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...,Woman Who Called Cops On Black Bird-Watcher Lo...


# Data Pre-Processing

In [6]:
# Limit dataset size for faster processing.
# DataFrame.sample raises if asked for more rows than exist, so cap the request
# at the number of rows available (a smaller upload is then used in full).
SAMPLE_SIZE = 50000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Text clean-up helper
def preprocess_text(text):
    """Return a lower-cased copy of ``text`` reduced to ASCII letters and single spaces.

    The substitutions run in order: URLs starting with ``http`` are removed,
    then every character that is neither an ASCII letter nor whitespace, and
    finally runs of whitespace are collapsed to one space. Leading and
    trailing whitespace is stripped from the result.
    """
    cleaned = text.lower()
    substitutions = (
        (r'http\S+', ''),        # URLs
        (r'[^a-zA-Z\s]', ''),    # punctuation, digits and any other non-letters
        (r'\s+', ' '),           # whitespace runs -> single space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every combined headline/description string element by element
df['processed_text'] = df['text'].map(preprocess_text)

In [7]:
# Shared resources for tokenization: English stop-word set and WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Tokenize ``text``, drop English stop words and lemmatize the remaining tokens.

    Returns the surviving lemmas joined back into one space-separated string.
    """
    lemmas = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue  # stop words are filtered before lemmatization
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

df['filtered_text'] = df['processed_text'].apply(tokenize_and_lemmatize)

# Hyperparameter Tuning

In [None]:
# ----------------- Grid Search for TF-IDF & SVD -----------------
def explained_variance_scorer(estimator, X, y=None):
    """Score a fitted TF-IDF + SVD pipeline by the variance it retains on ``X``.

    scikit-learn's built-in ``'explained_variance'`` scorer is a regression
    metric: it needs target values and calls ``estimator.predict``. This
    pipeline is unsupervised and ends in a transformer, so that scorer cannot
    evaluate it. This callable follows the ``scorer(estimator, X, y)``
    protocol instead.

    Parameters
    ----------
    estimator : fitted Pipeline with steps named ``'tfidf'`` and ``'svd'``.
    X : iterable of raw documents (the validation fold).
    y : ignored; present only to satisfy the scorer protocol.

    Returns
    -------
    float
        Sum of the per-component variances of the reduced data divided by the
        total per-feature variance of the TF-IDF matrix (0.0 if that total is
        not positive).
    """
    tfidf_matrix = estimator.named_steps['tfidf'].transform(X)
    reduced = estimator.named_steps['svd'].transform(tfidf_matrix)

    # Per-feature variance as E[x^2] - E[x]^2, so the sparse matrix is never densified.
    feature_mean = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    feature_mean_sq = np.asarray(tfidf_matrix.multiply(tfidf_matrix).mean(axis=0)).ravel()
    total_variance = (feature_mean_sq - feature_mean ** 2).sum()
    if total_variance <= 0:
        return 0.0
    return float(np.var(reduced, axis=0).sum() / total_variance)


# 6 x 6 x 3 x 11 = 1188 candidates, each fitted on 3 folds (3564 fits plus the final refit).
# NOTE(review): retained variance generally grows with n_components, so this
# criterion will tend to favour the largest value in the grid - confirm that
# this is the intended selection rule.
# NOTE(review): n_components values this large may exceed the vocabulary size
# for the stricter min_df settings; such candidates would fail to fit - verify.
param_grid = {
    'tfidf__max_df': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'tfidf__min_df': [1, 2, 5, 10, 20, 50],
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'svd__n_components': list(range(3600, 3701, 10))
}

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svd', TruncatedSVD(random_state=42))
])

# No targets are passed to fit(); the custom scorer only needs the documents.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring=explained_variance_scorer)
grid_search.fit(df['filtered_text'])

best_params = grid_search.best_params_
print(f'Best TF-IDF & SVD Parameters: {best_params}')

# Project the full corpus with the refitted best pipeline
best_pipeline = grid_search.best_estimator_
X_svd = best_pipeline.transform(df['filtered_text'])

In [None]:
# Build the TF-IDF matrix for the lemmatized corpus.
# The settings are written out literally here; they are not read from `best_params`.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    max_df=0.7,
    min_df=5,
    ngram_range=(1,2),
)
X = vectorizer.fit_transform(df['filtered_text'])

In [None]:
# ----------------- SVD COMPONENT SELECTION -----------------
# Increase the number of SVD components in fixed steps until the summed
# explained-variance ratio reaches the target or the feature count is exceeded.
target_variance = 0.95
component_step = 10
n_components = component_step
selected_components = None  # size of the last SVD that was actually fitted
explained_variance = 0

while explained_variance < target_variance and n_components <= X.shape[1]:
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_svd = svd.fit_transform(X)
    explained_variance = svd.explained_variance_ratio_.sum()
    print(f'n_components={n_components}, explained variance={explained_variance:.4f}')
    selected_components = n_components
    n_components += component_step

if selected_components is None:
    # The loop never ran: X has fewer columns than the smallest size tried.
    raise ValueError(
        f'X has only {X.shape[1]} features; at least {component_step} are needed for SVD selection'
    )

# `svd` and `X_svd` already hold the fit for the selected size (same
# n_components, same random_state), so the model is not fitted a second time.
print(f'Selected n_components for SVD: {selected_components}')

# 2-D UMAP embedding of the SVD output with fixed settings
umap_model = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, random_state=42)
X_umap = umap_model.fit_transform(X_svd)

# Normalize data
# NOTE(review): Normalizer rescales every row to unit norm, which for a 2-D
# embedding places all points on the unit circle - confirm this is intended.
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X_umap)

In [None]:
# ----------------- Reuse the TF-IDF & SVD Grid Search -----------------
# The identical grid search (same pipeline, parameter grid, folds and seeds)
# has already been fitted in the first cell of this section, so repeating it
# would only redo every fit. Reuse the fitted `grid_search` object and restore
# `X_svd` to the projection produced by its best pipeline.
best_params = grid_search.best_params_
print(f'Best TF-IDF & SVD Parameters: {best_params}')

best_pipeline = grid_search.best_estimator_
X_svd = best_pipeline.transform(df['filtered_text'])

# Finding the Best K (K-Means)

In [None]:
# ----------------- Finding the Best K for K-Means -----------------
# Fit K-Means for every candidate k and record the inertia
# (within-cluster sum of squared distances) for the elbow plot.
inertia_values = []
k_values = range(2, 20)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_svd)
    inertia_values.append(kmeans.inertia_)

plt.figure(figsize=(10,5))
plt.plot(k_values, inertia_values, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.grid()
plt.show()

# Locate the elbow as the point of maximum curvature, i.e. the largest second
# difference of the inertia curve. The steepest slope (argmin of the gradient)
# is not the elbow: it typically sits at the smallest k, where inertia drops fastest.
# np.diff(..., n=2)[i] is centred on inertia_values[i + 1], hence the +1 offset.
curvature = np.diff(inertia_values, n=2)
best_k = k_values[int(np.argmax(curvature)) + 1]

# Final clustering with the selected k
kmeans = KMeans(n_clusters=best_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_svd)

In [None]:
# ----------------- UMAP Dimensionality Reduction -----------------
# Try each (n_neighbors, min_dist) combination, cluster the resulting 2-D
# embedding with K-Means (k = best_k) and keep the combination with the
# highest silhouette score.
best_umap_score = -np.inf  # any real silhouette value beats this start value
best_umap_params = {}
best_umap_model = None
best_umap_embedding = None

for n_neighbors in [10, 30, 50]:
    for min_dist in [0.1, 0.3, 0.5]:
        umap_model = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, random_state=42)
        X_umap_temp = umap_model.fit_transform(X_svd)
        silhouette = silhouette_score(X_umap_temp, KMeans(n_clusters=best_k, random_state=42).fit_predict(X_umap_temp))

        if silhouette > best_umap_score:
            best_umap_score = silhouette
            best_umap_params = {'n_neighbors': n_neighbors, 'min_dist': min_dist}
            # Keep the winning model and embedding so they need not be recomputed.
            best_umap_model = umap_model
            best_umap_embedding = X_umap_temp

print(f'Best UMAP Parameters: {best_umap_params}')

# The search already fitted this configuration with the same random_state,
# so reuse its result instead of fitting UMAP once more.
umap_model = best_umap_model
X_umap = best_umap_embedding

In [None]:
# Evaluate Clustering Quality
# Internal metrics: computed from the SVD features and the K-Means labels only.
silhouette = silhouette_score(X_svd, kmeans_labels)
calinski_harabasz = calinski_harabasz_score(X_svd, kmeans_labels)
print(f'Optimal K: {best_k}')
print(f'Silhouette Score: {silhouette:.4f}')
print(f'Calinski-Harabasz Score: {calinski_harabasz:.4f}')

# External metrics: only possible when ground-truth categories are available.
# NOTE(review): the loading cell keeps only `headline` and `short_description`,
# so this branch runs only if `category` is also retained there.
if 'category' in df.columns:
    true_categories = df['category'].to_numpy()

    # ARI and NMI are invariant to how cluster ids are numbered.
    ari = adjusted_rand_score(true_categories, kmeans_labels)
    nmi = normalized_mutual_info_score(true_categories, kmeans_labels)

    # Precision and accuracy compare label values directly, and cluster ids are
    # arbitrary integers. Map each cluster to its most frequent true category
    # first (ties resolved by taking the first mode).
    majority_category = (
        pd.Series(true_categories)
        .groupby(kmeans_labels)
        .agg(lambda members: members.mode().iloc[0])
    )
    predicted_categories = pd.Series(kmeans_labels).map(majority_category).to_numpy()
    # zero_division=0 scores categories that no cluster maps to as 0 without a warning.
    precision = precision_score(true_categories, predicted_categories, average='macro', zero_division=0)
    accuracy = accuracy_score(true_categories, predicted_categories)

    # Mean pairwise cosine similarity over all (i, j) pairs, diagonal included.
    # It equals the squared norm of the mean of the L2-normalized rows, which
    # avoids materializing the dense n x n similarity matrix.
    unit_rows = Normalizer().fit_transform(X_svd)
    similarity = float(np.square(unit_rows.mean(axis=0)).sum())

    print(f'Adjusted Rand Index: {ari:.4f}')
    print(f'Normalized Mutual Information: {nmi:.4f}')
    print(f'Precision Score: {precision:.4f}')
    print(f'Accuracy Score: {accuracy:.4f}')
    print(f'Average Similarity Score: {similarity:.4f}')