In [None]:
import pandas as pd
import os
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from gensim.models import Word2Vec
import numpy as np

# Fetch the NLTK resources used further down: 'punkt_tab' for word_tokenize
# and 'stopwords' for stopwords.words('english').
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read csv file and get the training data
def get_training_data(data_dir='data', nrows=50000):
    """Load the 'headline_text' column from the first data file in ``data_dir``.

    ``os.listdir`` returns entries in arbitrary order and also lists hidden
    entries and sub-directories (for example ``.ipynb_checkpoints``), so the
    candidates are restricted to non-hidden regular files and sorted by name.
    This makes the selected file deterministic.

    Args:
        data_dir: Directory that holds the CSV file. Defaults to 'data'.
        nrows: Maximum number of rows to read. Defaults to 50000.

    Returns:
        A DataFrame containing only the 'headline_text' column.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist or holds no
            non-hidden regular file.
    """
    candidates = sorted(
        name for name in os.listdir(data_dir)
        if not name.startswith('.') and os.path.isfile(os.path.join(data_dir, name))
    )
    if not candidates:
        raise FileNotFoundError(f"No data file found in '{data_dir}'")

    path = os.path.join(data_dir, candidates[0])

    # Only the headline column is needed downstream; cap the row count to keep
    # the later clustering steps tractable.
    return pd.read_csv(path, usecols=['headline_text'], nrows=nrows)

In [None]:
def data_cleaning(headline):
  """Return ``headline`` with NLTK's English stopwords removed.

  The headline is split on whitespace, every word found in the stopword list
  is dropped, and the remaining words are joined back with single spaces.
  Words are compared as-is (no lower-casing happens here), and the stopword
  list is loaded again on every call.
  """
  english_stops = set(stopwords.words('english'))
  kept_words = filter(lambda token: token not in english_stops, headline.split())
  return ' '.join(kept_words)

In [None]:
def preprocess_headlines(headlines):
  """Turn each headline into a list of cleaned tokens.

  Every headline is lower-cased and split with ``word_tokenize``. A token is
  kept only when ``str.isalnum()`` is true for it (which drops punctuation)
  and it is not one of NLTK's English stopwords.

  Returns a list holding one token list per input headline, in input order.
  """
  english_stops = set(stopwords.words('english'))

  def keep(token):
    return token.isalnum() and token not in english_stops

  return [
    [token for token in word_tokenize(text.lower()) if keep(token)]
    for text in headlines
  ]

In [None]:
def clustering(data):
    """Build the TF-IDF matrix for ``data['headline_text']``.

    Despite its name this function does not cluster anything; it only
    vectorizes the headlines so that callers can reduce and cluster the result.

    Args:
        data: DataFrame with a 'headline_text' column of strings.

    Returns:
        A tuple ``(x, vectorizer)``: the matrix returned by ``fit_transform``
        and the fitted ``TfidfVectorizer`` that produced it.
    """
    vectorizer = TfidfVectorizer()
    # Fit the instance that is returned. Previously a second, anonymous
    # vectorizer was fitted, so the returned one was unfitted and unusable
    # for transform() / vocabulary lookups.
    x = vectorizer.fit_transform(data['headline_text'].values)

    return x, vectorizer

In [None]:
def clustering_with_stem_and_token(data, max_features=1000):
    """Build a TF-IDF matrix over stemmed tokens of ``data['headline_text']``.

    Each headline is lower-cased, split into runs of letters/apostrophes and
    every token is reduced to its Snowball (English) stem before weighting.

    Args:
        data: DataFrame with a 'headline_text' column of strings.
        max_features: Value forwarded to ``TfidfVectorizer(max_features=...)``.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        A tuple ``(x, vectorizer)``: the matrix returned by ``fit_transform``
        and the fitted ``TfidfVectorizer``.
    """
    stemmer = SnowballStemmer('english')
    tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

    def tokenize(text):
        return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

    # token_pattern is ignored once a custom tokenizer is supplied; passing
    # None says so explicitly and avoids scikit-learn's "token_pattern will
    # not be used" warning.
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        token_pattern=None,
        max_features=max_features,
    )
    x = vectorizer.fit_transform(data['headline_text'].values)

    return x, vectorizer


In [None]:
def k_mean(x, k, random_state=42):
    """Fit KMeans with ``k`` clusters on ``x`` and return the fitted estimator.

    Args:
        x: Feature matrix passed to ``KMeans.fit``.
        k: Number of clusters.
        random_state: Seed forwarded to ``KMeans``. Defaults to 42, matching
            the seed used by the other KMeans runs in this notebook, so that
            repeated runs give the same labels. Pass ``None`` to get the
            previous unseeded behaviour.

    Returns:
        The fitted ``KMeans`` instance (labels are in ``.labels_``).
    """
    kmeans = KMeans(n_clusters=k, max_iter=1000, n_init=10, random_state=random_state)
    kmeans.fit(x)

    return kmeans

In [None]:
def plotting_PCA(data, x, k):
    """Scatter-plot the clusters on the first two PCA components of ``x``.

    ``x`` is projected to two dimensions with ``PCA(2)``. For each cluster id
    in ``range(k)`` the rows where ``data['cluster']`` equals that id are
    drawn as one labelled scatter series. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    projected = PCA(2).fit_transform(x)

    plt.figure(figsize=(8, 6))
    for cluster_id in range(k):
        # Boolean row mask for the members of this cluster.
        members = data['cluster'] == cluster_id
        plt.scatter(projected[members, 0], projected[members, 1],
                    label=f'Cluster {cluster_id}')

    plt.title('K_means clustering')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
def calculate_silhouette(x, clusters):
    """Return the silhouette score of the labelling ``clusters`` over ``x``.

    Thin wrapper: both arguments are forwarded unchanged to
    ``silhouette_score`` and its result is returned as-is.
    """
    score = silhouette_score(x, clusters)
    return score

In [None]:
# Load the headline DataFrame (single 'headline_text' column) via get_training_data().
training_data = get_training_data()

In [None]:
# Data preprocess
# Token lists, one per headline: the input for the Word2Vec pipelines.
processed_data = preprocess_headlines(training_data['headline_text'].tolist())

# Stopword-free headline strings: the input for the TF-IDF pipelines.
# assign() returns a new frame, so training_data itself stays untouched.
cleaned_data = training_data.assign(
    headline_text=training_data['headline_text'].apply(data_cleaning)
)

# Preview only: printing every token list floods the cell output and bloats
# the saved notebook.
print(processed_data[:5])
cleaned_data.head()

                                      headline_text
0        aba decides community broadcasting licence
1          act fire witnesses must aware defamation
2          g calls infrastructure protection summit
3                 air nz staff aust strike pay rise
4        air nz strike affect australian travellers
...                                             ...
49995   two dead israel targets hamas militants car
49996      un agencies pledge boost maternal health
49997           uncommon gene may cause poor memory
49998    united approach sought fix pacific highway
49999  us australia need compromise free trade deal

[50000 rows x 1 columns]


TfIdf + Stemming + PCA + KMeans(11)

In [None]:
# Stemmed TF-IDF -> 10 principal components -> KMeans with 11 clusters.
cluster_x, vectorizer = clustering_with_stem_and_token(cleaned_data)
# Seed the reduction as well as KMeans: PCA's randomized/ARPACK solvers depend
# on random_state, so without it the silhouette score is not reproducible.
pca = PCA(n_components=10, random_state=42)
pca_vectors = pca.fit_transform(cluster_x)
kmean = KMeans(n_clusters=11, random_state=42)
kmean.fit(pca_vectors)

tfidf_labels = kmean.labels_



In [None]:
# Silhouette score of the KMeans labels, measured on the PCA-reduced vectors
# that KMeans was fitted on.
calculate_silhouette(pca_vectors, tfidf_labels)

0.5213145900076315

TfIdf + Stemming + LSA + KMeans(10)

In [None]:
# Stemmed TF-IDF -> 10 LSA components (TruncatedSVD) -> KMeans with 10 clusters.
cluster_x, vectorizer = clustering_with_stem_and_token(cleaned_data)
# TruncatedSVD uses a randomized solver by default; seed it like KMeans so the
# silhouette score can be reproduced.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_x = lsa.fit_transform(cluster_x)
kmean = KMeans(n_clusters=10, random_state=42)
kmean.fit(lsa_x)

tfidf_labels = kmean.labels_




In [None]:
# Silhouette score of the KMeans labels, measured on the LSA-reduced matrix
# that KMeans was fitted on.
calculate_silhouette(lsa_x, tfidf_labels)

0.6036103171856646

TfIdf + PCA + KMeans(11)

In [None]:
# Plain TF-IDF (no stemming) -> 10 principal components -> KMeans with 11 clusters.
cluster_x, vectorizer = clustering(cleaned_data)
# Seed the reduction as well as KMeans: PCA's randomized/ARPACK solvers depend
# on random_state, so without it the silhouette score is not reproducible.
pca = PCA(n_components=10, random_state=42)
pca_vectors = pca.fit_transform(cluster_x)
kmean = KMeans(n_clusters=11, random_state=42)
kmean.fit(pca_vectors)

tfidf_labels = kmean.labels_

In [None]:
# Silhouette score of the KMeans labels, measured on the PCA-reduced vectors
# that KMeans was fitted on.
calculate_silhouette(pca_vectors, tfidf_labels)

0.7301145824525351

TfIdf + LSA + KMeans(10)

In [None]:
# Plain TF-IDF (no stemming) -> 10 LSA components (TruncatedSVD) -> KMeans with 10 clusters.
cluster_x, vectorizer = clustering(cleaned_data)
# TruncatedSVD uses a randomized solver by default; seed it like KMeans so the
# silhouette score can be reproduced.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_x = lsa.fit_transform(cluster_x)
kmean = KMeans(n_clusters=10, random_state=42)
kmean.fit(lsa_x)

tfidf_labels = kmean.labels_

In [None]:
# Silhouette score of the KMeans labels, measured on the LSA-reduced matrix
# that KMeans was fitted on.
calculate_silhouette(lsa_x, tfidf_labels)

0.6946195827967254

TfIdf + KMeans(3)

In [None]:
# Baseline: KMeans (3 clusters) directly on the TF-IDF matrix, with no
# dimensionality reduction in between.
cluster_x, vectorizer = clustering(cleaned_data)

# fit() returns the estimator itself, so construction and fitting can be chained.
kmean = KMeans(n_clusters=3, random_state=42).fit(cluster_x)
labels = kmean.labels_

In [None]:
# Silhouette score of the KMeans labels on the unreduced TF-IDF matrix.
calculate_silhouette(cluster_x, labels)

0.001335958296614811

Word2Vec + PCA + KMeans(3)

In [None]:
# Train Word2Vec on the tokenized headlines.
# NOTE(review): gensim documents multi-worker training as not fully
# deterministic -- confirm whether workers=1 is needed for reproducible scores.
model = Word2Vec(sentences=processed_data, vector_size=100, window=5, min_count=1, workers=4)

# Get word vectors
def get_sentence_vector(sentence):
    """Return the mean word vector of the tokens in ``sentence``.

    Tokens that are not in the model vocabulary are skipped; when no token is
    left, a zero vector of length ``model.vector_size`` is returned.
    """
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

sentence_vectors = np.array([get_sentence_vector(sentence) for sentence in processed_data])

# Seed the reduction as well as KMeans so the silhouette score is reproducible
# for a given set of sentence vectors.
pca = PCA(n_components=10, random_state=42)
pca_vectors = pca.fit_transform(sentence_vectors)


kmean = KMeans(n_clusters=3, random_state=42)
kmean.fit(pca_vectors)
pca_labels = kmean.labels_

In [None]:
# Silhouette score of the KMeans labels, measured on the PCA-reduced sentence
# vectors that KMeans was fitted on.
calculate_silhouette(pca_vectors, pca_labels)

0.3792002

Word2Vec + LSA + KMeans(3)

In [None]:
# Train Word2Vec on the tokenized headlines.
# NOTE(review): gensim documents multi-worker training as not fully
# deterministic -- confirm whether workers=1 is needed for reproducible scores.
model = Word2Vec(sentences=processed_data, vector_size=100, window=5, min_count=1, workers=4)

# Get word vectors
def get_sentence_vector(sentence):
    """Return the mean word vector of the tokens in ``sentence``.

    Tokens that are not in the model vocabulary are skipped; when no token is
    left, a zero vector of length ``model.vector_size`` is returned.
    """
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

sentence_vectors = np.array([get_sentence_vector(sentence) for sentence in processed_data])

# TruncatedSVD uses a randomized solver by default; seed it like KMeans so the
# silhouette score is reproducible for a given set of sentence vectors.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_x = lsa.fit_transform(sentence_vectors)

# The labels keep the name pca_labels (used by the scoring cell that follows)
# even though the reduction here is LSA.
kmean = KMeans(n_clusters=3, random_state=42)
kmean.fit(lsa_x)
pca_labels = kmean.labels_

In [None]:
# Silhouette score of the KMeans labels on the LSA-reduced sentence vectors.
# The labels are stored in pca_labels even though this pipeline uses LSA.
calculate_silhouette(lsa_x, pca_labels)

0.38110477

Word2Vec + KMeans(3)

In [None]:
# Word2Vec embeddings trained on the tokenized headlines.
model = Word2Vec(sentences=processed_data, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(sentence):
    """Return the mean word vector of the in-vocabulary tokens of ``sentence``.

    Falls back to a zero vector of length ``model.vector_size`` when none of
    the tokens is known to the model.
    """
    known = [model.wv[token] for token in sentence if token in model.wv]
    return np.mean(known, axis=0) if known else np.zeros(model.vector_size)

# One averaged vector per headline.
sentence_vectors = np.array(list(map(get_sentence_vector, processed_data)))

# Cluster the unreduced sentence vectors; fit() returns the estimator itself.
kmean = KMeans(n_clusters=3, random_state=42).fit(sentence_vectors)
labels = kmean.labels_

In [None]:
# Silhouette score of the KMeans labels on the unreduced sentence vectors.
calculate_silhouette(sentence_vectors, labels)

0.3760345

In [None]:
# Sweep k for the TF-IDF + LSA pipeline and report the silhouette score per k.
cluster_x, vectorizer = clustering(cleaned_data)
# Seed the reduction and every KMeans run; otherwise both the scores and the
# selected k can change between runs.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_x = lsa.fit_transform(cluster_x)

# Best (k, score) pair seen so far.
max_score = {'k': -1, 'SilhouetteScore': -1}

for k in range(2, 20):
  kmean = KMeans(n_clusters=k, random_state=42)
  kmean.fit(lsa_x)

  # Calculate silhouette score
  # NOTE(review): this is recomputed on the full matrix for every k and is the
  # slow step; silhouette_score's sample_size argument could trade accuracy
  # for speed if the sweep takes too long.
  silhouette_avg = silhouette_score(lsa_x, kmean.labels_)
  if silhouette_avg > max_score['SilhouetteScore']:
    max_score['k'] = k
    max_score['SilhouetteScore'] = silhouette_avg
  print(f'K: {k}\tSilhouette Score: {silhouette_avg:.5f}')

# Report the winner that max_score has been tracking.
print(f'Highest: {max_score}')

K: 2	Silhouette Score: 0.56500
K: 3	Silhouette Score: 0.59142
K: 4	Silhouette Score: 0.60452
K: 5	Silhouette Score: 0.62269
K: 6	Silhouette Score: 0.62700
K: 7	Silhouette Score: 0.66993
K: 8	Silhouette Score: 0.67924
K: 9	Silhouette Score: 0.69429
K: 10	Silhouette Score: 0.69617
K: 11	Silhouette Score: 0.67219
K: 12	Silhouette Score: 0.66984


KeyboardInterrupt: 

Hierarchical Clustering (Agglomerative)

In [None]:
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

Word2Vec + LSA + Hierarchical Clustering

In [None]:
# Train Word2Vec on the tokenized headlines.
# NOTE(review): gensim documents multi-worker training as not fully
# deterministic -- confirm whether workers=1 is needed for reproducible scores.
model = Word2Vec(sentences=processed_data, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(sentence):
    """Return the mean word vector of the tokens in ``sentence``.

    Tokens that are not in the model vocabulary are skipped; when no token is
    left, a zero vector of length ``model.vector_size`` is returned.
    """
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

sentence_vectors = np.array([get_sentence_vector(sentence) for sentence in processed_data])

# TruncatedSVD uses a randomized solver by default; seed it so the reduced
# matrix (and therefore the clustering) is reproducible for given embeddings.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_x = lsa.fit_transform(sentence_vectors)

# NOTE(review): agglomerative clustering over all headlines at once may need a
# lot of memory -- confirm it fits before running on the full data.
n_clusters = 10
hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters)
labels = hierarchical_clustering.fit_predict(lsa_x)

In [None]:
# Score the labels in the space they were clustered in (the LSA-reduced
# vectors), as the KMeans sections do, so the scores are comparable.
calculate_silhouette(lsa_x, labels)

NameError: name 'calculate_silhouette' is not defined

Word2Vec + PCA + Hierarchical Clustering

In [None]:
# Train Word2Vec on the tokenized headlines.
# NOTE(review): gensim documents multi-worker training as not fully
# deterministic -- confirm whether workers=1 is needed for reproducible scores.
model = Word2Vec(sentences=processed_data, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(sentence):
    """Return the mean word vector of the tokens in ``sentence``.

    Tokens that are not in the model vocabulary are skipped; when no token is
    left, a zero vector of length ``model.vector_size`` is returned.
    """
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

sentence_vectors = np.array([get_sentence_vector(sentence) for sentence in processed_data])

# Seed the reduction so the reduced vectors (and therefore the clustering)
# are reproducible for given embeddings.
pca = PCA(n_components=10, random_state=42)
pca_x = pca.fit_transform(sentence_vectors)

# NOTE(review): agglomerative clustering over all headlines at once may need a
# lot of memory -- confirm it fits before running on the full data.
n_clusters = 10
hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters)
labels = hierarchical_clustering.fit_predict(pca_x)

In [None]:
# Score the labels in the space they were clustered in (the PCA-reduced
# vectors), as the KMeans sections do, so the scores are comparable.
calculate_silhouette(pca_x, labels)

TfIdf + LSA + Hierarchical Clustering

In [None]:
# Plain TF-IDF -> 10 LSA components (TruncatedSVD) -> agglomerative clustering.
tfidf_x, tfidf = clustering(cleaned_data)
# TruncatedSVD uses a randomized solver by default; seed it so the reduced
# matrix (and therefore the clustering) is reproducible.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_x = lsa.fit_transform(tfidf_x)

# NOTE(review): agglomerative clustering over all headlines at once may need a
# lot of memory -- confirm it fits before running on the full data.
n_clusters = 10
hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters)
labels = hierarchical_clustering.fit_predict(lsa_x)

In [None]:
# Score the labels in the space they were clustered in (the LSA-reduced
# matrix), as the KMeans sections do, so the scores are comparable.
calculate_silhouette(lsa_x, labels)

TfIdf + PCA + Hierarchical Clustering

In [None]:
# Plain TF-IDF -> 10 principal components -> agglomerative clustering.
tfidf_x, tfidf = clustering(cleaned_data)
# Seed the reduction: PCA's randomized/ARPACK solvers depend on random_state,
# so without it the reduced vectors can differ between runs.
pca = PCA(n_components=10, random_state=42)
pca_x = pca.fit_transform(tfidf_x)

# NOTE(review): agglomerative clustering over all headlines at once may need a
# lot of memory -- confirm it fits before running on the full data.
n_clusters = 10
hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters)
labels = hierarchical_clustering.fit_predict(pca_x)

In [None]:
# Score the labels in the space they were clustered in (the PCA-reduced
# vectors), as the KMeans sections do, so the scores are comparable.
calculate_silhouette(pca_x, labels)