Reference: https://www.kaggle.com/code/venkatasubramanian/automatic-ticket-classification-notebook#Topic-Modelling-using-NMF

In [None]:
import pandas as pd
import json
import numpy as np

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger')
import spacy
nlp = spacy.load("en_core_web_sm")
from textblob import TextBlob
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Data Loading

In [None]:
# Load the complaints export and flatten the nested JSON records into columns.
# The context manager closes the file handle as soon as parsing is finished
# instead of leaving it open for the lifetime of the kernel.
with open("complaints-2021-05-14_08_16_.json") as complaints_file:
    df = pd.json_normalize(json.load(complaints_file))

# Basic Data Understanding

- getting basic information such as number of variables, observations, sample data, and value frequencies
- identifying data quality issues

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Assign new column names.
# The names are applied positionally and the list holds 22 entries, so this
# assumes the flattened JSON has exactly 22 columns in this order
# (pandas raises ValueError on a length mismatch) -- TODO confirm against the
# df.info() output above.
df.columns = ['index', 'type', 'id', 'score', 'tags', 'zip_code','complaint_id', 'issue', 'date_received',
       'state', 'consumer_disputed', 'product','company_response', 'company', 'submitted_via',
       'date_sent_to_company', 'company_public_response','sub_product', 'timely',
       'complaint_what_happened', 'sub_issue','consumer_consent_provided']

In [None]:
df.value_counts('type')

In [None]:
df.value_counts('tags')

In [None]:
df.value_counts('issue')

In [None]:
df.value_counts('consumer_disputed')

In [None]:
df.value_counts('product')

In [None]:
df.value_counts('sub_product')

In [None]:
df.value_counts('timely')

In [None]:
df.value_counts('complaint_what_happened')

In [None]:
df.value_counts('sub_issue')

In [None]:
df.value_counts('consumer_consent_provided')

In [None]:
# Null or empty values, counted per column.
# NaN never compares equal to anything (not even itself), so `== np.nan` is
# False for every cell and would always report zero; `isna()` is the correct
# missing-value test and also covers None.
null_count_df = pd.DataFrame({
    'columns': df.columns,
    'empty_string_count': [(df[column] == '').sum() for column in df.columns],
    'null_value_count': [df[column].isna().sum() for column in df.columns],
})
null_count_df

In [None]:
# Masked values: runs of 2-20 "x"/"X" characters that start at a word boundary.
# Inside a character class "|" is a literal pipe, not alternation, so the
# class must be [xX]; with [x|X] a run such as "||" would count as masked text.
masked_text_pattern = r"\b[xX]{2,20}"

# Run the regex once per complaint and derive the count from the match list.
masked_matches = df['complaint_what_happened'].apply(
    lambda text: re.findall(masked_text_pattern, text))
masked_df = pd.DataFrame({
    'number_of_masked': masked_matches.apply(len),
    'masked': masked_matches})

# Keep only the complaints that contain at least one masked run.
df_with_masked = masked_df[masked_df['number_of_masked'] != 0]
df_with_masked

In [None]:
df_with_masked.value_counts('number_of_masked')

In [None]:
import matplotlib.pyplot as plt

plt.hist(df_with_masked['number_of_masked'],bins=200)
plt.show()

# Initial Data Preprocessing

In [None]:
# Patterns are compiled once here rather than on every call; clean_text is
# applied to each complaint in the corpus.
_URL_PATTERN = re.compile(r'(https?://\S+)|(www\.\S+)|(\S+\.\S+/\S+)')
# NOTE(review): the last range (U+24C2..U+1F251) is far wider than "enclosed
# characters" -- it also spans e.g. CJK ideographs (U+4E00 onwards). Kept as in
# the original; narrow it if non-Latin text must survive this step.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # other miscellaneous symbols
                            u"\U000024C2-\U0001F251"  # enclosed characters
                            "]+", flags=re.UNICODE)
# Raw strings: "\w", "\s" and "\d" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_WORD_WITH_NUMBER_PATTERN = re.compile(r'\w*\d\w*')
# English stop words, loaded from NLTK on first use and then reused.
_STOP_WORDS = None


def clean_text(text):
  """Normalise one complaint and return its remaining tokens.

  Steps, in order: lowercase, strip URLs, strip emoji, strip punctuation,
  strip every word that contains a digit, tokenise with NLTK's
  ``word_tokenize`` and drop English stop words.

  Args:
    text: the raw complaint string.

  Returns:
    A list of lowercase tokens.
  """
  global _STOP_WORDS
  if _STOP_WORDS is None:
    # Loading the stop-word list once avoids re-reading it for every document.
    _STOP_WORDS = frozenset(stopwords.words('english'))

  # Case-folding (lowercase)
  text = text.lower()

  # Remove URLs, then emoji
  text = _URL_PATTERN.sub('', text)
  text = _EMOJI_PATTERN.sub('', text)

  # Remove punctuation first, then words containing numbers
  text = _PUNCTUATION_PATTERN.sub('', text)
  text = _WORD_WITH_NUMBER_PATTERN.sub('', text)

  # Tokenise and remove stop words
  return [word for word in word_tokenize(text) if word not in _STOP_WORDS]

def lemmatise_with_pos_tagged(tokens):
  """Lemmatise tokens using the part of speech assigned by ``pos_tag``.

  Args:
    tokens: list of word strings.

  Returns:
    A list of the same length. Tokens tagged as adjective, adverb, noun or
    verb are replaced by their WordNet lemma; all others are kept unchanged.
  """
  # Penn Treebank tag prefix -> WordNet POS code. Adjective tags start with
  # "J" (JJ, JJR, JJS) while WordNet names adjectives "a", so comparing the
  # lowercased first tag letter against "a" never matched any adjective.
  treebank_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
  lemmatizer = WordNetLemmatizer()
  lemmas = []
  for word, tag in pos_tag(tokens):
    # tag[:1] (not tag[0]) so an empty tag cannot raise IndexError.
    wntag = treebank_to_wordnet.get(tag[:1])
    lemmas.append(lemmatizer.lemmatize(word, wntag) if wntag else word)
  return lemmas

def further_clean(tokens):
  """Filter lemmatised tokens and join the survivors into one string.

  Any character repeated three or more times in a row is collapsed to a
  single occurrence. A word is then kept only if it consists solely of ASCII
  letters (no digits, underscores or other characters) and is at least three
  characters long.

  Args:
    tokens: iterable of word strings.

  Returns:
    The kept words joined by single spaces ("" when nothing survives).
  """
  # Collapse runs such as "ooo" -> "o" inside each token.
  collapsed = (re.sub(r'(\w)\1{2,}', r'\1', token) for token in tokens)
  # Join and re-split on whitespace so a token that itself contains spaces is
  # handled as several separate words.
  candidates = " ".join(collapsed).split()
  # Digits, underscores and every other non-letter disqualify a word, and
  # words of one or two characters are dropped: only 3+ ASCII letters pass.
  is_kept = re.compile(r'[a-zA-Z]{3,}').fullmatch
  return " ".join(word for word in candidates if is_kept(word))

In [None]:
def preprocess_text(text):
    """Run one raw complaint through the full preprocessing pipeline.

    The text is cleaned and tokenised by ``clean_text``, lemmatised by
    ``lemmatise_with_pos_tagged`` and finally filtered and re-joined by
    ``further_clean``.

    Returns the value produced by ``further_clean``.
    """
    return further_clean(lemmatise_with_pos_tagged(clean_text(text)))

In [None]:
# Drop every column except the free-text complaint.
df.drop(['index', 'type', 'id', 'score', 'tags', 'zip_code', 'complaint_id', 'issue', 'date_received',
         'state', 'consumer_disputed', 'product', 'company_response', 'company', 'submitted_via',
         'date_sent_to_company', 'company_public_response', 'sub_product', 'timely',
         'sub_issue', 'consumer_consent_provided'], axis=1, inplace=True)

# Turn empty or whitespace-only complaints into NaN, then drop those rows.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df[col] mutates an intermediate object and is not guaranteed to update df
# (pandas warns about this chained pattern, and under copy-on-write it is a no-op).
df['complaint_what_happened'] = df['complaint_what_happened'].replace(r'^\s*$', np.nan, regex=True)
df.dropna(inplace=True)

# Apply text preprocessing to the 'complaint_what_happened' column.
df['preprocessed_text'] = df['complaint_what_happened'].apply(preprocess_text)

# Display the preprocessed text.
df.head()

In [None]:
# df_complain_text['complaint_clean'] = df_complain_text['complaint_what_happened'].apply(clean_text)
# print("Done clean")
# df_complain_text['complaint_pos_tagged'] = df_complain_text['complaint_clean'].apply(pos_tagger)
# print("Done lemmatisation")
# df_complain_text['complaint_lemmatise'] = df_complain_text['complaint_pos_tagged'].apply(lemmatise)
# print("POS Tagged")
# df_complain_processed = df_complain_text[['complaint_clean','complaint_what_happened','complaint_pos_tagged','complaint_lemmatise']]
# df_complain_processed

In [None]:
# remove individual "i", "I", "s", "xxxx", "xxxxxxxx"
# def clean_specific_unigram(sentence):
#     unigram_to_remove = ["i", "I", "s", "xxxxxxxx", "xxxx"]

#     # Remove Punctuation, and words containing numbers
#     for ug in unigram_to_remove:
#         unigram_pattern = f'(\A{ug} )|( {ug} )|( {ug}\Z)'
#         sentence = re.sub(unigram_pattern, ' ', sentence)
#     return sentence

# df_complain_processed['complaint_remove_specific_unigram'] = df_complain_processed['complaint_lemmatise'].apply(clean_specific_unigram)
# df_complain_processed


# Exploratory data analysis

In [None]:
# Unigram frequency table over the whole preprocessed corpus.
from nltk.tokenize import word_tokenize

# Concatenate all documents into one string and tokenise it in a single pass.
text = " ".join(df['preprocessed_text'])
uni_tokens = word_tokenize(text)
unigram_df = pd.DataFrame({'unigram': uni_tokens})

# One row per distinct token with its occurrence count, most frequent first.
unigram_counts = unigram_df.groupby('unigram').size()
unigram_freq_df = unigram_counts.reset_index(name='count').sort_values(by='count', ascending=False)
# Show the first 49 rows.
unigram_freq_df.head(49)

In [None]:
# Bigram frequency table; bigrams are built per document so that no pair
# spans two different complaints.
import itertools
from nltk import bigrams
from nltk.tokenize import word_tokenize

text = df['preprocessed_text'].tolist()
# One list of (word, word) tuples per document ...
bigrams_2dlist = [list(bigrams(word_tokenize(txt))) for txt in text]
# ... flattened into a single list of tuples.
bigrams_flattenlist = [pair for doc_bigrams in bigrams_2dlist for pair in doc_bigrams]

bigram_df = pd.DataFrame({'bigram': bigrams_flattenlist})

# One row per distinct bigram with its occurrence count, most frequent first.
bigram_counts = bigram_df.groupby('bigram').size()
bigram_freq_df = bigram_counts.reset_index(name='count').sort_values(by='count', ascending=False)
# Show the first 49 rows.
bigram_freq_df.head(49)

In [None]:
# Trigram frequency table; trigrams are built per document so that no triple
# spans two different complaints.
import itertools
from nltk import trigrams
from nltk.tokenize import word_tokenize

text = df['preprocessed_text'].tolist()
# One list of (word, word, word) tuples per document ...
trigrams_2dlist = [list(trigrams(word_tokenize(txt))) for txt in text]
# ... flattened into a single list of tuples.
trigrams_flattenlist = [triple for doc_trigrams in trigrams_2dlist for triple in doc_trigrams]

trigram_df = pd.DataFrame({'trigram': trigrams_flattenlist})
# One row per distinct trigram with its occurrence count, most frequent first.
trigram_counts = trigram_df.groupby('trigram').size()
trigram_freq_df = trigram_counts.reset_index(name='count').sort_values(by='count', ascending=False)
# Show the first 49 rows.
trigram_freq_df.head(49)

In [None]:
# Word cloud of unigrams, sized by how often each token occurs.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Map each token to its count; every row of the frequency table is a
# (unigram, count) pair.
d = {token: count for token, count in unigram_freq_df.values}

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)

# Render the cloud as an image without axes.
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Feature Extraction

- TF-IDF

In [None]:
# Build the TF-IDF document-term matrix from the preprocessed complaints.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# min_df=2 ignores terms found in fewer than 2 documents; max_df=0.95 ignores
# terms found in more than 95% of documents; English stop words are removed
# by the vectoriser as well.
tfidf = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
# Fit the vocabulary on the corpus and transform it in one step.
dtm = tfidf.fit_transform(df['preprocessed_text'])
dtm

# Feature Engineering

## Ticket Category

### Topic Modelling

#### Non-Negative Matrix Factorization (NMF)

In [None]:
# from sklearn.decomposition import NMF
# from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from collections import Counter
from operator import itemgetter
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Tokenise each preprocessed document on whitespace.
# split() with no separator returns [] for an empty document, whereas
# split(' ') returns [''] and would add the empty string to the vocabulary.
texts = df['preprocessed_text'].str.split()

dictionary = Dictionary(texts)

# Prune the vocabulary: keep tokens that occur in at least 3 documents and in
# at most 85% of them, capped at the 5000 most frequent.
dictionary.filter_extremes(
    no_below=3,
    no_above=0.85,
    keep_n=5000
)

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(text) for text in texts]

# Candidate topic counts: 5, 10, ..., 40.
topic_nums = list(np.arange(5, 40 + 1, 5))

coherence_scores = []

# Train one NMF model per candidate and record its c_v coherence.
for num in topic_nums:
    nmf = Nmf(
        corpus=corpus,
        num_topics=num,
        id2word=dictionary,
        chunksize=2000,
        passes=5,
        kappa=.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42
    )
    cm = CoherenceModel(
        model=nmf,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    coherence_scores.append(round(cm.get_coherence(), 5))
    print(f'topic modelling done for iteration {num}')

# Pick the topic count with the highest coherence (first one wins on ties).
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = max(scores, key=itemgetter(1))[0]

fig = plt.figure(figsize=(15, 7))

plt.plot(
    topic_nums,
    coherence_scores,
    linewidth=3,
    color='#4287f5'
)

plt.xlabel("Topic Num", fontsize=14)
plt.ylabel("Coherence Score", fontsize=14)
plt.title('Coherence Score by Topic Number - Best Number of Topics: {}'.format(best_num_topics), fontsize=18)
plt.xticks(np.arange(5, max(topic_nums) + 1, 5), fontsize=12)
plt.yticks(fontsize=12)

plt.show()

#### Latent Dirichlet Allocation (LDA)

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from collections import Counter
from operator import itemgetter
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Tokenise each preprocessed document on whitespace.
# split() with no separator returns [] for an empty document, whereas
# split(' ') returns [''] and would add the empty string to the vocabulary.
texts = df['preprocessed_text'].str.split()

dictionary = Dictionary(texts)

# Prune the vocabulary: keep tokens that occur in at least 3 documents and in
# at most 85% of them, capped at the 5000 most frequent.
dictionary.filter_extremes(
    no_below=3,
    no_above=0.85,
    keep_n=5000
)

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(text) for text in texts]

# Candidate topic counts: 5, 10, ..., 40.
topic_nums = list(np.arange(5, 40 + 1, 5))

coherence_scores = []

# Train one LDA model per candidate and record its c_v coherence.
for num in topic_nums:
    lda = LdaModel(
        corpus=corpus,
        num_topics=num,
        id2word=dictionary,
        chunksize=2000,
        passes=5,
        minimum_probability=0.01,
        alpha='symmetric',
        per_word_topics=True,
        eta=0.6,
        eval_every=10,
        random_state=42
    )
    cm = CoherenceModel(
        model=lda,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    coherence_scores.append(round(cm.get_coherence(), 5))
    print(f'topic modelling done for iteration {num}')

# Pick the topic count with the highest coherence (first one wins on ties).
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = max(scores, key=itemgetter(1))[0]

fig = plt.figure(figsize=(15, 7))

plt.plot(
    topic_nums,
    coherence_scores,
    linewidth=3,
    color='#4287f5'
)

plt.xlabel("Topic Num", fontsize=14)
plt.ylabel("Coherence Score", fontsize=14)
plt.title('Coherence Score by Topic Number - Best Number of Topics: {}'.format(best_num_topics), fontsize=18)
plt.xticks(np.arange(5, max(topic_nums) + 1, 5), fontsize=12)
plt.yticks(fontsize=12)

plt.show()

#### Top2Vec

In [None]:
from top2vec import Top2Vec

# Keyword arguments that Top2Vec forwards to UMAP.
umap_args = {'n_neighbors': 10,
             'n_components': 5,
             'metric': 'cosine',
             "random_state": 42}
# Keyword arguments that Top2Vec forwards to HDBSCAN.
hdbscan_args = {'min_cluster_size': 10,
                'min_samples': 5,
                'metric': 'euclidean',
                'cluster_selection_method': 'eom'}

# Top2Vec validates that `documents` is a list (or NumPy array) of strings and
# rejects other containers, so the pandas Series is converted explicitly.
# The raw complaint text is used here, not the preprocessed column.
top2vec = Top2Vec(
    documents=df['complaint_what_happened'].tolist(),
    speed='deep-learn',
    workers=8,
    min_count=0,
    embedding_model='distiluse-base-multilingual-cased',
    umap_args=umap_args,
    hdbscan_args=hdbscan_args)

top2vec.get_num_topics()

#### BERTopic

In [None]:
# generate ticket category
# LDA
# NMF
# Top2Vec
# BERTopic

# evaluate which is the best


## Ticket Priority

In [None]:
# generate ticket priority
# based on overall sentiment score, polarity, and topic frequency
# then abc ranking (20-30-50)



# Model building
- data partition
- machine learning models
    - Support Vector Machine (SVM)
    - K-Nearest Neighbours (KNN)
    - Logistic Regression
    - Naive Bayes
    - Decision Tree
    - Random Forest
- evaluations
    - accuracy, precision, recall, f1-score