# LDA

In [5]:
!pip install gensim
!pip install spacy
!pip install pyLDAvis
!pip install datatable
!pip install filelock
!pip install nltk

Collecting spacy
  Downloading spacy-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
     |████████████████████████████████| 6.3 MB 20.4 MB/s            
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     |████████████████████████████████| 181 kB 126.4 MB/s            
[?25hCollecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
     |████████████████████████████████| 457 kB 111.8 MB/s            
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.7-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
     |█████████████

In [6]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

from pprint import pprint

import spacy

import pickle
import re 
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models.ldamodel import LdaModel

import matplotlib.pyplot as plt 
import pandas as pd 

import warnings
warnings.filterwarnings('ignore')

Download the data from [here](https://datapane.com/u/khuyentran1401/reports/processed_tweets/), then put the CSV file in your current working directory.

In [13]:
# Load the CSV from the current working directory.
# Change the file name below if the CSV you downloaded is named differently.
text_data = pd.read_csv('./sample10K_customercare_category.csv')



In [14]:
def remove_special_characters(dataframe):
    """Return a new DataFrame with special characters deleted from its text.

    Every run of characters that is not an ASCII letter, a digit or a space
    is replaced by the empty string (regex replacement over the whole frame).

    Args:
        dataframe: anything accepted by the ``pd.DataFrame`` constructor.

    Returns:
        pd.DataFrame: the cleaned frame; the input object is not modified.
    """
    frame = pd.DataFrame(dataframe)
    cleaned = frame.replace(to_replace=r'[^A-Za-z0-9 ]+', value='', regex=True)
    return cleaned

# Strip special characters from every cell, then preview the first rows.
# NOTE: this rebinds `text_data`, so the frame as loaded from the CSV is no
# longer reachable under this name afterwards.
text_data = remove_special_characters(text_data)
text_data.head()

Unnamed: 0,SUPPORTCASENUMBER,COMMENT_DATE,COMMENT,SUPPORTCASECATEGORYNAME,SUPPORTCASECATEGORYNAME_CLEAN,CATEGORY,SUBCATEGORY
0,3539912,20200101,CX asking for order status Was not informed th...,IT Above Store Refund,Other,Other,Other
1,3539912,20200101,CX asking for order status Was not informed th...,Delivery DoorDash,Other,Other,Other
2,3539912,20200101,CX asking for order status Was not informed th...,Delivery Canceled by DoorDash,Other,Other,Other
3,3539917,20200101,This was a disappointing experience First the ...,Out of Store Cold Food,Other,Other,Other
4,3539917,20200101,This was a disappointing experience First the ...,IT Above Store Refund,Other,Other,Other


In [15]:
"""Preprocess the text column by stemming, lemmatization and stop word removal"""
import datatable as dt
import numpy as np
import shutil
import os
from zipfile import ZipFile

import filelock
# from h2oaicore.transformer_utils import CustomTransformer
# from h2oaicore.systemutils import config, remove, user_dir
# from h2oaicore.systemutils_more import download


class TextPreprocessingTransformer():
    """Transformer to preprocess the text of a single column.

    Depending on the ``do_stemming``, ``do_lemmatization`` and
    ``remove_stopwords`` flags set in ``__init__``, each text is stemmed with
    NLTK's Porter stemmer, lemmatized with the WordNet lemmatizer and/or
    stripped of NLTK's English stop words.

    The NLTK resources are downloaded into ``/tmp/nltk_data`` when an instance
    is created. The h2oaicore helpers ``download`` and ``remove`` are not
    imported in this notebook, so small standard-library equivalents
    (``_download`` / ``_remove_if_exists``) are used instead.
    """
    _numeric_output = False
    _is_reproducible = True
    _modules_needed_by_name = ["nltk==3.4.3"]
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail

    # Root of the raw NLTK data packages, used only by the fallback download
    # path when a resource is still missing after nltk.download().
    _NLTK_PACKAGES_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages"

    # Packages fetched up front with nltk.download(). 'sonoritysequencing' is
    # intentionally absent: it is not in the NLTK index and only produced a
    # "not found in index" error.
    _NLTK_PACKAGES = ('stopwords',
                      'punkt',
                      'averaged_perceptron_tagger',
                      'maxent_treebank_pos_tagger',
                      'wordnet')

    def __init__(self, **kwargs):
        """Set the preprocessing flags and make sure the NLTK resources exist.

        Raises:
            TypeError: if any keyword argument is passed, because they are
                forwarded unchanged to ``object.__init__``.
        """
        super().__init__(**kwargs)
        self.do_stemming = True  # turn off as needed
        self.do_lemmatization = False  # turn off as needed
        self.remove_stopwords = True  # turn off as needed

        import nltk
        nltk_data_path = os.path.join("/tmp", "nltk_data")
        nltk_temp_path = os.path.join("/tmp", "nltk_data")
        os.makedirs(nltk_data_path, exist_ok=True)
        # Register the data directory once; re-instantiating must not keep
        # growing nltk.data.path with duplicates.
        if nltk_data_path not in nltk.data.path:
            nltk.data.path.append(nltk_data_path)

        # Serialize the downloads through a lock file in the data directory.
        nltk_download_lock_file = os.path.join(nltk_data_path, "nltk.lock")
        with filelock.FileLock(nltk_download_lock_file):
            for package in self._NLTK_PACKAGES:
                nltk.download(package, download_dir=nltk_data_path)

        # download resources for stemming if needed
        if self.do_stemming:
            try:
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")
            except LookupError:
                tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
                self._install_nltk_package("tokenizers/punkt.zip", tokenizer_path, nltk_temp_path)
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")

        # download resources for lemmatization if needed
        if self.do_lemmatization:
            try:
                wordnet = self._load_lemmatizer(nltk)
            except LookupError:
                tagger_path = os.path.join(nltk_data_path, "taggers")
                corpora_path = os.path.join(nltk_data_path, "corpora")
                self._install_nltk_package("taggers/averaged_perceptron_tagger.zip",
                                           tagger_path, nltk_temp_path)
                self._install_nltk_package("taggers/maxent_treebank_pos_tagger.zip",
                                           tagger_path, nltk_temp_path)
                self._install_nltk_package("corpora/wordnet.zip", corpora_path, nltk_temp_path)
                wordnet = self._load_lemmatizer(nltk)
            # Keyed by the first letter of the POS tag; "O" is the fallback
            # used by preprocess() for every other tag.
            self.wordnet_map = {"N": wordnet.NOUN,
                                "V": wordnet.VERB,
                                "J": wordnet.ADJ,
                                "R": wordnet.ADV,
                                "O": wordnet.NOUN}

        # download resources for stopwords if needed
        if self.remove_stopwords:
            try:
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                corpora_path = os.path.join(nltk_data_path, "corpora")
                self._install_nltk_package("corpora/stopwords.zip", corpora_path, nltk_temp_path)
                self.stopwords = set(nltk.corpus.stopwords.words('english'))

    def _load_lemmatizer(self, nltk):
        """Create the lemmatizer and POS tagger, smoke-test both, return the wordnet corpus.

        A ``LookupError`` propagates when the required NLTK data is missing.
        """
        from nltk.corpus import wordnet
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.pos_tagger = nltk.pos_tag
        self.lemmatizer.lemmatize("test", wordnet.NOUN)
        # pos_tag is called with a token list in preprocess(), so test it the same way.
        self.pos_tagger(["test"])
        return wordnet

    def _install_nltk_package(self, relative_zip, target_dir, temp_dir):
        """Download one NLTK data zip into temp_dir and unpack it into target_dir."""
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(target_dir, exist_ok=True)
        archive = self._download(self._NLTK_PACKAGES_URL + "/" + relative_zip, dest_path=temp_dir)
        self.unzip_file(archive, target_dir)
        self.atomic_copy(archive, target_dir)

    @staticmethod
    def _download(url, dest_path):
        """Fetch ``url`` into the directory ``dest_path`` and return the local file path."""
        from urllib.request import urlopen
        os.makedirs(dest_path, exist_ok=True)
        local_file = os.path.join(dest_path, os.path.basename(url))
        with urlopen(url) as response, open(local_file, "wb") as sink:
            shutil.copyfileobj(response, sink)
        return local_file

    @staticmethod
    def _remove_if_exists(path):
        """Delete the file at ``path``; a file that is already gone is not an error."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def unzip_file(self, src, dst_dir):
        """Extract every member of the zip archive ``src`` into ``dst_dir``."""
        with ZipFile(src, 'r') as zip_ref:
            zip_ref.extractall(dst_dir)

    def atomic_move(self, src, dst):
        """Move ``src`` to ``dst``; a ``shutil.Error`` is ignored and ``src`` is cleaned up."""
        try:
            shutil.move(src, dst)
        except shutil.Error:
            # Best effort, as before: a failed move is ignored and the source
            # is simply discarded below.
            pass
        # After a successful move src no longer exists, hence the tolerant delete.
        self._remove_if_exists(src)

    def atomic_copy(self, src=None, dst=None):
        """Copy ``src`` to ``dst`` via a uniquely named temporary copy next to ``src``."""
        import uuid
        my_uuid = uuid.uuid4()
        src_tmp = src + str(my_uuid)
        shutil.copy(src, src_tmp)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        self.atomic_move(src_tmp, dst)
        self._remove_if_exists(src_tmp)

    @staticmethod
    def is_enabled():
        """Always report the transformer as enabled."""
        return True

    @staticmethod
    def get_default_properties():
        """Return the default property dict: one text column in, relative importance 1."""
        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)

    @property
    def display_name(self):
        """Name under which the transformer's output is displayed."""
        return "PreprocessedText"

    def preprocess(self, text):
        """Apply the enabled steps (stem, lemmatize, drop stop words) to one string.

        The text is split on whitespace and re-joined with single spaces by
        every enabled step.
        """
        if self.do_stemming:
            text = " ".join([self.stemmer.stem(word) for word in text.split()])
        if self.do_lemmatization:
            pos_tagged_text = self.pos_tagger(text.split())
            text = " ".join([self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[0], self.wordnet_map["O"]))
                             for word, pos in pos_tagged_text])
        if self.remove_stopwords:
            text = " ".join([word for word in str(text).split()
                             if word.lower() not in self.stopwords])
        return text

    def fit_transform(self, X: "dt.Frame", y: np.array = None):
        """No fitting is required; equivalent to ``transform(X)``."""
        return self.transform(X)

    def transform(self, X: "dt.Frame"):
        """Preprocess the first column of ``X`` and return it as a pandas Series.

        Missing values become the literal string "NA" before preprocessing.
        """
        column = X.to_pandas().iloc[:, 0]
        # Fill missing values BEFORE casting to str: casting first turns them
        # into the strings "None"/"nan", which fillna() can no longer see.
        column = column.astype(object).fillna("NA").astype(str)
        return column.apply(self.preprocess)



In [16]:
import datatable as dt

# Wrap the COMMENT column in a datatable Frame, the input type transform() is annotated with.
text_data_dt = dt.Frame(text_data['COMMENT'])

# Instantiating the transformer downloads the NLTK resources; transform()
# then preprocesses every comment and returns a pandas Series.
tpt = TextPreprocessingTransformer()
test_data_clean = tpt.transform(text_data_dt)

test_data_clean.head()


[nltk_data] Downloading package stopwords to /tmp/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /tmp/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /tmp/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /tmp/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.
[nltk_data] Downloading package wordnet to /tmp/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Error loading sonoritysequencing: Package
[nltk_data]     'sonoritysequencing' not found in index


0    cx ask order statu wa inform order got cancell...
1    cx ask order statu wa inform order got cancell...
2    cx ask order statu wa inform order got cancell...
3    thi wa disappoint experi first food arriv late...
4    thi wa disappoint experi first food arriv late...
Name: COMMENT, dtype: object

In [17]:
# Preview the frame again; `text_data` is still the special-character-free
# DataFrame here (the preprocessed comments live in `test_data_clean`).
pd.DataFrame(text_data).head()

Unnamed: 0,SUPPORTCASENUMBER,COMMENT_DATE,COMMENT,SUPPORTCASECATEGORYNAME,SUPPORTCASECATEGORYNAME_CLEAN,CATEGORY,SUBCATEGORY
0,3539912,20200101,CX asking for order status Was not informed th...,IT Above Store Refund,Other,Other,Other
1,3539912,20200101,CX asking for order status Was not informed th...,Delivery DoorDash,Other,Other,Other
2,3539912,20200101,CX asking for order status Was not informed th...,Delivery Canceled by DoorDash,Other,Other,Other
3,3539917,20200101,This was a disappointing experience First the ...,Out of Store Cold Food,Other,Other,Other
4,3539917,20200101,This was a disappointing experience First the ...,IT Above Store Refund,Other,Other,Other


In [18]:
# From here on `text_data` holds the preprocessed comments, not the original frame.
text_data = test_data_clean

# Convert the preprocessed COMMENT column into a plain Python list of strings.
text_data = pd.DataFrame(text_data).COMMENT.astype(str).values.tolist()



In [19]:
# Tokenize each preprocessed comment on whitespace. split() without an
# argument never yields empty strings, so an empty comment becomes [] rather
# than [''] and no '' token ends up in the dictionary.
text_data = [t.split() for t in text_data]

In [25]:
# Inspect the tokens of the first document.
text_data[0]

['cx',
 'ask',
 'order',
 'statu',
 'wa',
 'inform',
 'order',
 'got',
 'cancelledask',
 'compens',
 'free',
 'deliveri',
 'next',
 'orderhttpsinternaldoordashcomcustomersupportdelivery400886301']

In [26]:
# Build the gensim Dictionary from the tokenized documents; it is indexed by
# integer token id further below.
id2word = Dictionary(text_data)

# Term Document Frequency: encode every document with doc2bow
# (a list of (token_id, count) pairs, as the printed sample shows).
corpus = [id2word.doc2bow(text) for text in text_data]

# View the encoding of the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1)]]


In [27]:
# Human-readable view of the first document: (token, count) instead of (token_id, count).
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

[[('ask', 1),
  ('cancelledask', 1),
  ('compens', 1),
  ('cx', 1),
  ('deliveri', 1),
  ('free', 1),
  ('got', 1),
  ('inform', 1),
  ('next', 1),
  ('order', 2),
  ('orderhttpsinternaldoordashcomcustomersupportdelivery400886301', 1),
  ('statu', 1),
  ('wa', 1)]]

In [28]:
# Build LDA model with 10 topics on the bag-of-words corpus.
# random_state is fixed so repeated runs start from the same seed.
# NOTE(review): update_every / chunksize / alpha='auto' / per_word_topics are
# passed straight to gensim's LdaModel — check its documentation before tuning.
lda_model = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=10, 
                   random_state=100,
                   update_every=1,
                   chunksize=100,
                   alpha='auto',
                   per_word_topics=True)

In [29]:
# Print the weighted keywords of each topic
pprint(lda_model.print_topics())
# Apply the model to the corpus; `doc_lda` is not used in the cells visible here.
doc_lda = lda_model[corpus]

[(0,
  '0.057*"cream" + 0.056*"sour" + 0.052*"lettuc" + 0.046*"doubl" + '
  '0.044*"correct" + 0.042*"half" + 0.042*"put" + 0.041*"drink" + 0.038*"ask" '
  '+ 0.037*"corn"'),
 (1,
  '0.103*"wait" + 0.061*"suppos" + 0.056*"employe" + 0.028*"around" + '
  '0.027*"start" + 0.024*"notic" + 0.022*"walk" + 0.021*"serv" + 0.020*"wast" '
  '+ 0.018*"cook"'),
 (2,
  '0.029*"even" + 0.021*"becaus" + 0.021*"receipt" + 0.021*"ingredi" + '
  '0.017*"brown" + 0.017*"ask" + 0.017*"chipotl" + 0.017*"complet" + '
  '0.016*"realli" + 0.016*"last"'),
 (3,
  '0.074*"order" + 0.038*"refund" + 0.031*"app" + 0.028*"call" + 0.026*"store" '
  '+ 0.024*"would" + 0.023*"deliveri" + 0.022*"cancel" + 0.020*"locat" + '
  '0.020*"place"'),
 (4,
  '0.067*"custom" + 0.041*"mexican" + 0.040*"guest" + 0.040*"team" + '
  '0.039*"servic" + 0.032*"gener" + 0.021*"2020" + 0.019*"januari" + '
  '0.019*"ladi" + 0.019*"number"'),
 (5,
  '0.166*"meal" + 0.105*"kid" + 0.080*"request" + 0.078*"phone" + 0.043*"taco" '
  '+ 0.035*"

In [30]:
# Compute Coherence Score ('c_v' measure) of the LDA model against the tokenized texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=text_data, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Recorded result for k = 10, presumably from an earlier run (it differs from
# the output shown below): Coherence Score 0.4659113056026286


Coherence Score:  0.4009293139960075


# pyLDAvis

In [31]:
# Create the topic distance visualization and display it inline in the notebook
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [38]:


# Prepare the visualization data again and save it to an HTML file.
prep_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(prep_data, 'output_test.html')

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
