In [5]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import nltk
nltk.download('punkt')
import string

[nltk_data] Downloading package punkt to C:\Users\Nur
[nltk_data]     Adilah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [6]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [7]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
#defining the function to remove punctuations in the documents
def remove_punctuation(text):
    # Initialize an empty string to store the result
    punctuation_free = ""
    
    # Iterate over each character in the text
    for i in text:
        # Check if the character is not in the string.punctuation set
        if i not in string.punctuation:
            # If not, add the character to the result string
            punctuation_free += i
    return punctuation_free

dataset = list(map(remove_punctuation, dataset))

dataset

['I love playing football on the weekends',
 'I enjoy hiking and camping in the mountains',
 'I like to read books and watch movies',
 'I prefer playing video games over sports',
 'I love listening to music and going to concerts']

In [9]:
#to standardize the cases in the documents into lower case
dataset = list(map(str.lower,dataset))

dataset

['i love playing football on the weekends',
 'i enjoy hiking and camping in the mountains',
 'i like to read books and watch movies',
 'i prefer playing video games over sports',
 'i love listening to music and going to concerts']

In [11]:
tokenized_dataset = [doc.split() for doc in dataset]
tokenized_dataset

[['i', 'love', 'playing', 'football', 'on', 'the', 'weekends'],
 ['i', 'enjoy', 'hiking', 'and', 'camping', 'in', 'the', 'mountains'],
 ['i', 'like', 'to', 'read', 'books', 'and', 'watch', 'movies'],
 ['i', 'prefer', 'playing', 'video', 'games', 'over', 'sports'],
 ['i', 'love', 'listening', 'to', 'music', 'and', 'going', 'to', 'concerts']]

In [12]:
#download stopwords
nltk.download('stopwords')

#Get the list of English stop words present in the library 
stopwords = nltk.corpus.stopwords.words('english')

stopwords

[nltk_data] Downloading package stopwords to C:\Users\Nur
[nltk_data]     Adilah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output = []
    for i in text:
        if i not in stopwords:
            output.append(i)
    return output

#Applying the remove_stopwords function to the 'token_data' column and storing the result in a new column 'clean_xstopwords'
tokenized_dataset = list(map(remove_stopwords,tokenized_dataset))
tokenized_dataset

[['love', 'playing', 'football', 'weekends'],
 ['enjoy', 'hiking', 'camping', 'mountains'],
 ['like', 'read', 'books', 'watch', 'movies'],
 ['prefer', 'playing', 'video', 'games', 'sports'],
 ['love', 'listening', 'music', 'going', 'concerts']]

In [21]:
## Perform word stemming using Lancaster Stemmer in nltk librarY

#importing the Stemming function from nltk library
from nltk.stem.lancaster import LancasterStemmer

#defining the object for stemming
lancaster_stemmer = LancasterStemmer()

#defining a function for stemming
def stemming(text):
    stem_text = []
    for word in text:
        stemmed_word = lancaster_stemmer.stem(word)
        stem_text.append(stemmed_word)
    return stem_text

#applying the stemming function to the 'clean_xstopwords' column and storing the result in a new column 'clean_stemmed' 
stemmed_dataset = list(map(stemming,tokenized_dataset))
stemmed_dataset


# In[11]:


## Perform word lemmatization using WordNetLemmatizer( ) in nltk library

nltk.download('wordnet')

#importing the Lemmatizer function from nltk library
from nltk.stem import WordNetLemmatizer

#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = []
    for word in text:
        lemmatized_word = wordnet_lemmatizer.lemmatize(word)
        lemm_text.append(lemmatized_word)
    return lemm_text

# #applying the lemmatizer function to the 'clean_xstopwords' column and storing the result in a new column 'clean_lemmatized1'
dataset_lemm_stem = list(map(lemmatizer,stemmed_dataset))

# #applying the lemmatizer function to the 'clean_stemmed' column and storing the result in a new column 'clean_lemmatized2'
dataset_lemm = list(map(lemmatizer,tokenized_dataset))

print(dataset_lemm_stem)

print(dataset_lemm)


[['lov', 'play', 'footbal', 'weekend'], ['enjoy', 'hik', 'camp', 'mountain'], ['lik', 'read', 'book', 'watch', 'movy'], ['pref', 'play', 'video', 'gam', 'sport'], ['lov', 'list', 'mu', 'going', 'concert']]
[['love', 'playing', 'football', 'weekend'], ['enjoy', 'hiking', 'camping', 'mountain'], ['like', 'read', 'book', 'watch', 'movie'], ['prefer', 'playing', 'video', 'game', 'sport'], ['love', 'listening', 'music', 'going', 'concert']]


[nltk_data] Downloading package wordnet to C:\Users\Nur
[nltk_data]     Adilah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Assuming dataset_lemm_stem is a list of lists
dataset_lemm_concatenated = [' '.join(words) for words in dataset_lemm]

# Now, use the TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset_lemm_concatenated)

In [24]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))
# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
i love playing football on the weekends                            1
i enjoy hiking and camping in the mountains                        0
i like to read books and watch movies                              0
i prefer playing video games over sports                           0
i love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 camping
 mountain
 hiking
 enjoy
 video
 sport
 prefer
 game
 book
 read

Cluster 1:
 love
 football
 weekend
 going
 music
 concert
 listening
 playing
 sport
 camping



In [25]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6
