<a href="https://colab.research.google.com/github/chaewoncutie/Algorithm-Tests/blob/main/KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Packages


In [1]:
!pip install -U scikit-learn nltk pandas matplotlib seaborn scipy wordcloud ipywidgets umap-learn hdbscan



# Import Libraries

In [2]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, precision_score, accuracy_score
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import numpy as np
import umap
from google.colab import files
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Fetch the NLTK resources used by the stopword, tokenizer and lemmatizer steps
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
# Left as the cell's final expression so its return value is still displayed
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Data Cleaning


In [4]:
# Ask for the dataset through Colab's upload widget
uploaded = files.upload()

# The first key of the upload result is used as the file name;
# the file is parsed as JSON lines (one record per line)
file_name = next(iter(uploaded))
df = pd.read_json(file_name, lines=True)

# Report the size and the first rows of the raw data
print("Original Dataset Shape:", df.shape)
print("Original Dataset Preview:", df.head(), sep="\n")

Saving News_Category_Dataset_v3.json to News_Category_Dataset_v3 (2).json
Original Dataset Shape: (209527, 6)
Original Dataset Preview:
                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to pred

In [5]:
# Strip leading/trailing whitespace from every column name
stripped_names = df.columns.str.strip()
df.columns = stripped_names

# Report how many null values each column holds
null_counts = df.isnull().sum()
print("\nNull Values per Column:", null_counts, sep="\n")


Null Values per Column:
link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64


In [6]:
# Drop rows missing either text field, then drop exact duplicate rows
df = df.dropna(subset=['headline', 'short_description']).drop_duplicates()

# Exclude articles dated 2012-2016 by matching the year prefix of the
# stringified date
excluded_years = ('2012', '2013', '2014', '2015', '2016')
is_excluded = df['date'].astype(str).str.startswith(excluded_years)

# .copy() yields an independent frame, so adding columns to it in later cells
# does not trigger pandas' SettingWithCopyWarning
df = df.loc[~is_excluded].copy()

# Display dataset shape after cleaning
print("\nDataset Shape After Cleaning:", df.shape)


Dataset Shape After Cleaning: (47146, 6)


In [None]:
# Combine headline and short description into a single text field.
# Vectorised concatenation produces the same strings as a row-wise join
# but avoids one Python-level call per row.
df['text'] = df['headline'].astype(str) + ' ' + df['short_description'].astype(str)

# Data Pre-Processing

In [8]:
# Text normalisation helper
def preprocess_text(text):
    """Return a lower-cased copy of `text` reduced to letters and single spaces.

    The substitutions run in a fixed order: URLs (anything starting with
    ``http`` up to the next whitespace) are removed first, then every
    character that is neither a letter nor whitespace, and finally runs of
    whitespace are collapsed to one space. Leading and trailing whitespace
    is stripped from the result.
    """
    cleaned = text.lower()
    substitutions = (
        (r'http\S+', ''),        # URLs
        (r'[^a-zA-Z\s]', ''),    # punctuation & numbers
        (r'\s+', ' '),           # runs of whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every combined text entry element-wise
df['processed_text'] = df['text'].map(preprocess_text)

In [9]:
# Shared resources for tokenisation: English stopword set and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def tokenize_and_lemmatize(text):
    """Tokenize `text`, drop stopwords, lemmatize the rest and re-join with spaces.

    Tokens are tested against `stop_words` before lemmatization, so only the
    raw token form decides whether a word is discarded.
    """
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)


df['filtered_text'] = df['processed_text'].map(tokenize_and_lemmatize)

In [10]:
# Build TF-IDF features over unigrams and bigrams of the filtered text
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=5,
    max_df=0.7,
    stop_words='english',
)
X = vectorizer.fit_transform(df['filtered_text'])

In [11]:
# Reduce the TF-IDF matrix with truncated SVD
svd = TruncatedSVD(random_state=42, n_components=3800)
X_svd = svd.fit_transform(X)

# Summing the per-component ratios gives the share of variance retained
explained_variance_svd = svd.explained_variance_ratio_.sum()
print(f"Explained Variance (SVD): {explained_variance_svd:.4f}")

Explained Variance (SVD): 0.9295


In [12]:
# Apply UMAP for non-linear reduction of the SVD features to 2 dimensions
umap_model = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, random_state=42)
X_umap = umap_model.fit_transform(X_svd)

# This value is the summed per-axis variance of the embedding coordinates,
# not an explained-variance ratio, so the printed label says so.
# The variable name is kept unchanged for any cell that may reference it.
explained_variance_umap = np.var(X_umap, axis=0).sum()
print(f"Total Variance of UMAP Embedding: {explained_variance_umap:.4f}")

  warn(


Explained Variance (UMAP): 15.3273


In [13]:
# Rescale each row of the UMAP embedding with sklearn's Normalizer
# (default settings)
normalizer = Normalizer()
X_normalized = normalizer.fit(X_umap).transform(X_umap)

# K-Means

In [None]:
# ----------------- Finding the Best K for K-Means -----------------
# For each candidate k, record the inertia (for the elbow plot) and the
# silhouette score (used to choose k).
inertia_values = []
silhouette_scores = []
k_values = range(2, 20)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    # fit_predict returns the training labels directly, so no separate
    # predict pass over X_svd is needed
    labels_kmeans = kmeans.fit_predict(X_svd)
    inertia_values.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_svd, labels_kmeans))

# Inertia and silhouette live on very different scales, so each gets its own
# y-axis; on a shared axis the silhouette curve is squashed flat.
fig, ax_inertia = plt.subplots(figsize=(10, 5))
ax_silhouette = ax_inertia.twinx()

inertia_line, = ax_inertia.plot(
    k_values, inertia_values, marker='o', color='tab:blue', label='Inertia'
)
silhouette_line, = ax_silhouette.plot(
    k_values, silhouette_scores, marker='s', color='tab:orange', label='Silhouette Score'
)

ax_inertia.set_xlabel('Number of Clusters (k)')
ax_inertia.set_ylabel('Inertia')
ax_silhouette.set_ylabel('Silhouette Score')
ax_inertia.set_title('Elbow Method & Silhouette Scores for Optimal K')
# Both lines go into one legend even though they belong to different axes
ax_inertia.legend(handles=[inertia_line, silhouette_line])
ax_inertia.grid()
plt.show()

# Choose the k with the highest silhouette score and fit the final model
best_k = k_values[np.argmax(silhouette_scores)]
kmeans = KMeans(n_clusters=best_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_svd)
df['cluster_kmeans'] = kmeans_labels

# Evaluation and Plots

In [None]:
# ----------------- Evaluate Clustering Quality -----------------
silhouette = silhouette_score(X_svd, kmeans_labels)
calinski_harabasz = calinski_harabasz_score(X_svd, kmeans_labels)

# NOTE(review): both calls below compare the cluster labels with themselves,
# so they always return 1.0 and carry no information about clustering quality.
# They are kept so the printed report is unchanged; a meaningful score needs
# reference labels to compare against.
precision = precision_score(kmeans_labels, kmeans_labels, average='macro')
accuracy = accuracy_score(kmeans_labels, kmeans_labels)

# Mean cosine similarity over all ordered row pairs (self-pairs included).
# This equals ||sum of unit-normalised rows||^2 / n^2, so it is computed
# without materialising the n x n similarity matrix.
row_norms = np.sqrt(np.einsum('ij,ij->i', X_svd, X_svd))
row_norms[row_norms == 0] = 1.0  # all-zero rows then contribute zero similarity
unit_row_sum = (1.0 / row_norms) @ X_svd
similarity = (unit_row_sum @ unit_row_sum) / X_svd.shape[0] ** 2

print(f'Optimal K: {best_k}')
print(f'Silhouette Score: {silhouette:.4f}')
print(f'Calinski-Harabasz Score: {calinski_harabasz:.4f}')
print(f'Precision Score: {precision:.4f}')
print(f'Accuracy Score: {accuracy:.4f}')
print(f'Average Similarity Score: {similarity:.4f}')

In [None]:
# Generate word cloud for each cluster, in ascending cluster order
for cluster in sorted(set(kmeans_labels)):
    # Cluster assignments are stored in the 'cluster_kmeans' column
    in_cluster = df['cluster_kmeans'] == cluster
    cluster_texts = ' '.join(df.loc[in_cluster, 'filtered_text'])

    # WordCloud cannot be generated from text without words; skip such clusters
    if not cluster_texts.strip():
        print(f'Cluster {cluster}: no words to plot, skipping')
        continue

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cluster_texts)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Cluster {cluster}')
    plt.show()