In [20]:
'''
This script outlines the procedure for LDA clustering (unsupervised) using a TFIDF Vectorizer as embeddings
'''


import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.cluster import KMeans
import pandas as pd


In [2]:
df = pd.read_csv('dataset.csv')
df1 = df['description'].copy()


In [14]:
# preprocessing text

def preprocess_text(texts):
    ans = []
    for text in texts:
        text = str(text).lower().strip()
        cities = ['pune', 'mumbai', 'bangalore', 'bangalor', 'new delhi' ,'new delh', 'delhi ncr', 'delhi', 'hyderabad', 'trivandrum', 'ahmedabad', 
               'gurgaon', 'jaipur', 'raigarh', 'chennai', 'prague', 'hyderaba', 'vizag', 'noida', 'mysore', 
               'thane', 'bengaluru', 'kolkatta', 'kolkata', 'dubai', 'varanasi', 'london', 'bhubaneshwar', 'bhubaneswar',
               'bengaluru', 'faridabad', 'chandigarh', 'lucknow', 'bhopal', 'ghaziabad', 'kanchipuram', 'indore', 
               'gwalior', 'udaipur', 'kanyakumari', 'amsterdam', 'andheri', 'jodhpur', 'jamnagar', 'faridabad', 'cochin',
               'nasik', 'tirupati', 'san francisco', 'mumba', 'singapore', 'singapor', 'powai', 'surat', 'jodhpur',
               'chandigarh', 'gurgaon', 'dhanbad', 'puducherry', 'thiruvanantha']
        for city in cities:
            text = text.replace(city, '') #'city')
        text = re.sub("(?i)[.@#]", "", text)
        text = re.sub('[^a-zA-Zа-яА-Я0-9]+', ' ', text)
        #text = text.translate(str.maketrans('',''), string.punctuation)
        text = re.sub(r'\d+', '', text)
        text = text.strip()
        ans += [text]
    return ans
  

In [15]:
df1.values

array(['Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.',
       'dropped the car off at the time stated on the order and 30 mins later ready to drive away simple great job.',
       'Very easy to use and good value for money.', ...,
       'I ordered the tyre I needed on line, booked a specified time at a local garage and I had the tyre fitted. All worked very well, to time, and I would use [REDACTED] again. Good price for the tyre, too, as I did a quick search on-line.',
       'Excellent service from point of order to fitting. No complaints at all. Thank You.',
       'Seamless, well managed at both ends. I would recommend'],
      dtype=object)

In [16]:
text = preprocess_text(df1.values)


In [25]:
topic_list = list()
# print topics based on LDA model
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        topic_list.append(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [22]:
vectorizer = TfidfVectorizer(stop_words = 'english')
X = vectorizer.fit_transform(text)

In [28]:
number_topics = 10
number_words = 2
lda = LDA(n_components = number_topics,n_jobs = -1)
lda.fit(X)


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [29]:
print_topics(lda,vectorizer,number_words)
 



Topic #0:
value money

Topic #1:
easy tyres

Topic #2:
thank friends

Topic #3:
forward straight

Topic #4:
tyres redacted

Topic #5:
service cheap

Topic #6:
service good

Topic #7:
tyres time

Topic #8:
easy simple

Topic #9:
friendly service


In [30]:
# the visualization of the clusters can be viewed through the html files in the repo
# need to install the pyLDAvis package (pip install pyLDAvis)
from pyLDAvis import sklearn as sklearn_lda
import pickle
import pyLDAvis
import os
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
LDAvis_prepared = sklearn_lda.prepare(lda, X, vectorizer)
#with open(LDAvis_data_filepath, 'w') as f:
#        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
#with open(LDAvis_data_filepath) as f:
#    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
