In [1]:
#importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("max_colwidth", 400)

In [2]:
#Load data
df = pd.read_excel(r'C:\Users\Dhruv Kumar Jha\Downloads\Graphene AI\Unsupervised-Text-Clustering using NLP\interns-dataset.xlsx')
df

Unnamed: 0,cluster_id,id,phrase
0,0,0334a0d055104e9a931c079e338be9a1,Would use the product again if needed Joe .
1,0,796d6c25ab8849cbba427f1f3e250d80,Have been using the product for a week now
2,0,661f5299cd8944a8a3841fd4f049dee9,Will continue to use this product when I have a issue .
3,0,da831e4bc58d4505aec3c583f0248f8b,Have always had good luck with this product .
4,0,0ea997675e7344419d1540d3e0bc26c3,Will continue to use This product as This product gets the job done .
...,...,...,...
599,10,93f874167d11473f8d36d1cda0a0081c,Spray has no strong odor
600,10,d50fe37fab064408a891aa9ef45dcd70,Spray is nice to keep out on the porch on a summer day
601,10,3e1e8901d5ab4fc9b602ecfdca1220cb,Spray does not leave any oily stinky stains
602,10,c84e1b1196a242d18938af6c60403afc,Love that the scent of this spray is not chemical smelling .


In [3]:
df.drop(df.columns[[1]], axis=1, inplace=True)
df

Unnamed: 0,cluster_id,phrase
0,0,Would use the product again if needed Joe .
1,0,Have been using the product for a week now
2,0,Will continue to use this product when I have a issue .
3,0,Have always had good luck with this product .
4,0,Will continue to use This product as This product gets the job done .
...,...,...
599,10,Spray has no strong odor
600,10,Spray is nice to keep out on the porch on a summer day
601,10,Spray does not leave any oily stinky stains
602,10,Love that the scent of this spray is not chemical smelling .


In [4]:
%%time

# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
df["phrase_clean"] = df["phrase"].apply(lambda x: clean_text(x))

df

Wall time: 2.5 s


Unnamed: 0,cluster_id,phrase,phrase_clean
0,0,Would use the product again if needed Joe .,would use product need joe
1,0,Have been using the product for a week now,use product week
2,0,Will continue to use this product when I have a issue .,continue use product issue
3,0,Have always had good luck with this product .,always good luck product
4,0,Will continue to use This product as This product gets the job done .,continue use product product get job do
...,...,...,...
599,10,Spray has no strong odor,spray strong odor
600,10,Spray is nice to keep out on the porch on a summer day,spray nice keep porch summer day
601,10,Spray does not leave any oily stinky stains,spray leave oily stinky stain
602,10,Love that the scent of this spray is not chemical smelling .,love scent spray chemical smelling


## Clustering

### K means using CountVectorizer

In [5]:
count_vect = CountVectorizer()

bow = count_vect.fit_transform(df['phrase_clean'].values)
bow.shape

(604, 504)

In [6]:
terms = count_vect.get_feature_names()
terms[0:20]

['able',
 'absolutely',
 'action',
 'active',
 'actually',
 'advertise',
 'advertised',
 'aerosol',
 'affordable',
 'afterwards',
 'ago',
 'air',
 'almost',
 'along',
 'already',
 'also',
 'alternative',
 'always',
 'amaze',
 'amazing']

In [7]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters = 10, init='k-means++', random_state=99)
model.fit(bow)

KMeans(n_clusters=10, random_state=99)

In [8]:
labels = model.labels_
cluster_center=model.cluster_centers_

In [9]:
cluster_center

array([[0.        , 0.01388889, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01639344,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02564103, 0.        , ..., 0.02564103, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [10]:
df['CountVec Label'] = model.labels_ # the last column you can see the label
df.head(100)

Unnamed: 0,cluster_id,phrase,phrase_clean,CountVec Label
0,0,Would use the product again if needed Joe .,would use product need joe,0
1,0,Have been using the product for a week now,use product week,0
2,0,Will continue to use this product when I have a issue .,continue use product issue,0
3,0,Have always had good luck with this product .,always good luck product,0
4,0,Will continue to use This product as This product gets the job done .,continue use product product get job do,0
5,0,Have continued to use both products .,continue use product,0
6,0,Will use this product as often as necessary,use product often necessary,0
7,0,Would definitely use this product again,would definitely use product,0
8,0,For Short Term Use the product is a good choice .,short term use product good choice,0
9,0,Great to use a product,great use product,0


In [11]:
df.groupby(['CountVec Label'])['phrase'].count()

CountVec Label
0     72
1     61
2     33
3    245
4     63
5     14
6     35
7      8
8     39
9     34
Name: phrase, dtype: int64

### K means using TF-IDF

In [12]:
#tfidf vector initililization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(df['phrase_clean'].values)
tfidf.shape

(604, 504)

In [13]:
from sklearn.cluster import KMeans
model_tf = KMeans(n_clusters = 10, random_state=99)
model_tf.fit(tfidf)

KMeans(n_clusters=10, random_state=99)

In [14]:
labels_tf = model_tf.labels_
cluster_center_tf=model_tf.cluster_centers_

In [15]:
cluster_center_tf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02328771, 0.        , ..., 0.01274955, 0.        ,
        0.        ],
       [0.00317939, 0.        , 0.        , ..., 0.0036752 , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01210953, 0.01527081, ..., 0.        , 0.        ,
        0.        ]])

In [16]:
# to understand what kind of words generated
terms1 = tfidf_vect.get_feature_names()
terms1[0:20]

['able',
 'absolutely',
 'action',
 'active',
 'actually',
 'advertise',
 'advertised',
 'aerosol',
 'affordable',
 'afterwards',
 'ago',
 'air',
 'almost',
 'along',
 'already',
 'also',
 'alternative',
 'always',
 'amaze',
 'amazing']

In [17]:
df['TF-IDF Label'] = model_tf.labels_
df.head(100)

Unnamed: 0,cluster_id,phrase,phrase_clean,CountVec Label,TF-IDF Label
0,0,Would use the product again if needed Joe .,would use product need joe,0,5
1,0,Have been using the product for a week now,use product week,0,5
2,0,Will continue to use this product when I have a issue .,continue use product issue,0,5
3,0,Have always had good luck with this product .,always good luck product,0,2
4,0,Will continue to use This product as This product gets the job done .,continue use product product get job do,0,5
5,0,Have continued to use both products .,continue use product,0,5
6,0,Will use this product as often as necessary,use product often necessary,0,5
7,0,Would definitely use this product again,would definitely use product,0,5
8,0,For Short Term Use the product is a good choice .,short term use product good choice,0,5
9,0,Great to use a product,great use product,0,5


In [18]:
df.groupby(['TF-IDF Label'])['phrase'].count()

TF-IDF Label
0     48
1     51
2    167
3     76
4     46
5     46
6     66
7     35
8     35
9     34
Name: phrase, dtype: int64

### K Means using Word2Vec

In [19]:
import gensim
# Training the wor2vec model using train dataset
w2v_model=gensim.models.Word2Vec(df['phrase_clean'], workers=4)

In [20]:
import numpy as np
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this train
for sent in df['phrase_clean']: # for each review/sentence
    sent_vec = np.zeros(100) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
sent_vectors = np.array(sent_vectors)
sent_vectors = np.nan_to_num(sent_vectors)
sent_vectors.shape

(604, 100)

In [21]:
from sklearn.cluster import KMeans
model2 = KMeans(n_clusters = 11)
model2.fit(sent_vectors)

KMeans(n_clusters=11)

In [22]:
word_cluster_pred=model2.predict(sent_vectors)
word_cluster_pred_2=model2.labels_
word_cluster_center=model2.cluster_centers_

In [23]:
word_cluster_center[1:2]

array([[-0.09260895,  0.16572437,  0.13977677,  0.15598832,  0.08511004,
        -0.12989071,  0.12747005,  0.40739085, -0.20954402, -0.29586509,
        -0.01243362, -0.24831662, -0.02546505,  0.05638389,  0.10088732,
        -0.05968428,  0.25352442,  0.06658701, -0.11024402, -0.46888449,
         0.09471703,  0.0423522 ,  0.21518838, -0.04867775, -0.04661389,
         0.167281  , -0.22502339,  0.20293523, -0.08936943,  0.1591153 ,
         0.09809003, -0.15991559,  0.01369779, -0.2401232 , -0.12418306,
         0.1399931 ,  0.08974744, -0.01320858, -0.15226001, -0.02090183,
         0.16343019, -0.05859762, -0.15757992,  0.20271529,  0.0663956 ,
        -0.0948647 , -0.11028021, -0.15760459,  0.09485988,  0.11635858,
         0.07767166, -0.18719044, -0.07381982, -0.10791028, -0.10529752,
        -0.00601392,  0.03803943, -0.07909078,  0.06283338,  0.08445688,
        -0.09991269, -0.0671043 ,  0.31768153,  0.07428109, -0.08669293,
         0.30712624,  0.10863828,  0.29208305, -0.2

In [24]:
df['Word2Vec Label'] = model2.labels_
df.head(100)

Unnamed: 0,cluster_id,phrase,phrase_clean,CountVec Label,TF-IDF Label,Word2Vec Label
0,0,Would use the product again if needed Joe .,would use product need joe,0,5,2
1,0,Have been using the product for a week now,use product week,0,5,10
2,0,Will continue to use this product when I have a issue .,continue use product issue,0,5,9
3,0,Have always had good luck with this product .,always good luck product,0,2,0
4,0,Will continue to use This product as This product gets the job done .,continue use product product get job do,0,5,2
5,0,Have continued to use both products .,continue use product,0,5,0
6,0,Will use this product as often as necessary,use product often necessary,0,5,0
7,0,Would definitely use this product again,would definitely use product,0,5,9
8,0,For Short Term Use the product is a good choice .,short term use product good choice,0,5,6
9,0,Great to use a product,great use product,0,5,0


In [25]:
df.groupby(['Word2Vec Label'])['phrase'].count()

Word2Vec Label
0     94
1     20
2     18
3     57
4     76
5     57
6     73
7      2
8     24
9     84
10    99
Name: phrase, dtype: int64

## Clustering using DBSCAN

Density-based spatial clustering of applications with noise

In [26]:
from sklearn.cluster import DBSCAN

In [27]:
# Training DBSCAN :
model = DBSCAN(eps = 1, min_samples = 5)
model.fit(tfidf)

DBSCAN(eps=1)

In [28]:
df['DBSCAN Label'] = model.labels_
df.head(100)

Unnamed: 0,cluster_id,phrase,phrase_clean,CountVec Label,TF-IDF Label,Word2Vec Label,DBSCAN Label
0,0,Would use the product again if needed Joe .,would use product need joe,0,5,2,-1
1,0,Have been using the product for a week now,use product week,0,5,10,0
2,0,Will continue to use this product when I have a issue .,continue use product issue,0,5,9,0
3,0,Have always had good luck with this product .,always good luck product,0,2,0,-1
4,0,Will continue to use This product as This product gets the job done .,continue use product product get job do,0,5,2,0
5,0,Have continued to use both products .,continue use product,0,5,0,0
6,0,Will use this product as often as necessary,use product often necessary,0,5,0,-1
7,0,Would definitely use this product again,would definitely use product,0,5,9,0
8,0,For Short Term Use the product is a good choice .,short term use product good choice,0,5,6,-1
9,0,Great to use a product,great use product,0,5,0,0


In [29]:
df.tail(100)

Unnamed: 0,cluster_id,phrase,phrase_clean,CountVec Label,TF-IDF Label,Word2Vec Label,DBSCAN Label
504,8,Keeps the pests away with regular preventative use,keep pests away regular preventative use,3,4,10,-1
505,8,Great pest control company seems to have mitigated all of our issues,great pest control company seem mitigate issue,8,9,10,0
506,8,Nice to have a pest free home,nice pest free home,3,9,10,-1
507,9,Was so helpful in getting rid of them without having to call an exterminator,helpful get rid without call exterminator,7,2,4,-1
508,9,Gallon refill 2 pack saves me money compared to an exterminator,gallon refill pack save money compare exterminator,3,2,4,-1
509,9,Works better than what exterminator used .,work well exterminator use,3,2,4,0
510,9,Came in handy until the exterminator could come,come handy exterminator could come,3,2,4,0
511,9,No need to call an exterminator again .,need call exterminator,3,2,10,-1
512,9,Saved me a lot of money from getting an exterminator,save lot money get exterminator,3,2,6,-1
513,9,Extra cleaning saved me hundreds on an exterminator !,extra clean save hundred exterminator,3,2,6,-1


In [30]:
df.groupby(['DBSCAN Label'])['phrase'].count()

DBSCAN Label
-1    250
 0    321
 1      5
 2     11
 3      5
 4      6
 5      6
Name: phrase, dtype: int64

In [31]:
df.to_excel("output v2.xlsx")

## Final Observations

**K Means using TF-IDF** embedding is best for small dataset like this, followed by **K Means using CountVectorizer**, and **K Means using Word2Vec**. 

All the clusters are clearly reflecting that they were grouped based on different aspects. However, in few places it is not correlating with the given cluster labels.

**DBSCAN** is performing **poorly on small dataset** as it is grouping all reviews in **one cluster**.