# Topic Modeling

## Pre-processing helpers

In [1]:
import json
import re
from IPython.display import clear_output

import pandas as pd
import stanza
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel
from gensim.models.coherencemodel import CoherenceModel
from markdown import markdown

In [None]:
# Extend gensim's stop-word set with extra terms that should be excluded
# from the topic models for this corpus.
STOPWORDS = STOPWORDS.union({
    "var", "variable", "computed", "costa",
    "botocore", "version", "step", "support",
    "source", "hashicorp", "service", "branch",
    "pull", "merge", "issue", "pr",
    "galoy-pay", "bumped", "add", "payload",
    "boto", "accurics", "hana", "bump",
    "added", "latest", "update", "tf",
    "github", "test", "sourced", "instead",
    "use", "plan", "updates", "diff",
    "bump-galoy-pay-image", "draft", "iam", "i'm",
    "v1", "apply", "fix", "fixes",
    "kvo", "needed", "tco", "create",
    "run", "code", "feat", "lambda",
    "need", "link", "project", "new",
    "change", "they're",
})

# Universal POS tags that prepare_document() keeps; every other tag is dropped.
UPOS = ("PROPN", "NOUN", "VERB", "ADJ", "ADV")

# Shared stanza pipeline, built once because prepare_document() calls it per document.
nlp_pipeline = stanza.Pipeline(lang="en", processors="tokenize,mwt,pos,lemma")

def load_dataset(filepath):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever the JSON document decodes to (list, dict, ...).
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def clean_markup(doc):
    """Reduce a Markdown document to plain prose.

    The text is rendered to HTML, code (inline and blocks) and block quotes
    are dropped, the remaining markup is stripped and URLs are removed.

    Parameters
    ----------
    doc : str
        Markdown source text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Convert Markdown to HTML markup
    html = markdown(doc, extensions=['fenced_code'])
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(html, 'html.parser')
    # Remove unwanted content. The HTML tag for quotes is <blockquote>
    # (singular); the former 'blockquotes' selector never matched anything.
    for selector in ('code', 'pre', 'blockquote'):
        for element in soup.select(selector):
            element.extract()
    # Remove HTML markup (string=True is the current spelling of text=True)
    clean_doc = ''.join(soup.find_all(string=True))
    # Remove URLs together with any characters glued to them
    clean_doc = re.sub(r'\S*https?:\S*', '', clean_doc)

    return clean_doc
    
def prepare_document(doc):
    """Turn one raw document into a list of lemmas for topic modeling.

    The document is cleaned with clean_markup(), run through the stanza
    pipeline, and a token's lemma is kept only when its universal POS tag is
    listed in UPOS and its surface text is not a stop word.

    Parameters
    ----------
    doc : str
        Raw (Markdown) text of the document.

    Returns
    -------
    list of str
        Lemmas of the retained tokens, in document order.
    """
    clean_doc = clean_markup(doc)

    tokens = []
    for token in nlp_pipeline(clean_doc).iter_tokens():
        # to_dict() builds a fresh structure on every call; read it once.
        info = token.to_dict()[0]
        # .get(): skip entries that carry no POS tag instead of raising
        # (presumably the case for multi-word token ranges — TODO confirm).
        if info.get("upos") not in UPOS:
            continue
        # The custom stop words are all lower case, so compare the lower-cased
        # text; otherwise "IAM", "PR", "GitHub" or a leading "Add" slip through.
        if info["text"].lower() in STOPWORDS:
            continue
        lemma = info.get("lemma")
        if lemma is not None:
            tokens.append(lemma)
    return tokens

def prepare_corpus(documents):
    """Run prepare_document() over every document, reporting progress.

    For each document the percentage done and the raw document are printed,
    then the cell output is cleared once the document has been processed.

    Parameters
    ----------
    documents : sequence of str
        Raw documents to process.

    Returns
    -------
    list
        One token list per input document, in the same order.
    """
    total_docs = len(documents)
    corpus = []

    for position, document in enumerate(documents, start=1):
        percent_done = position / total_docs * 100
        print(f'{total_docs} documents: {percent_done:.2f}% parsed')
        print(document)
        corpus.append(prepare_document(document))
        clear_output(wait=False)

    return corpus

def build_tfidf_model(corpus):
    """Build the dictionary, bag-of-words corpus and TF-IDF model.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenised documents.

    Returns
    -------
    tuple
        ``(corpus_dict, corpus_bow, tfidf_model)``: the gensim Dictionary,
        a tuple with one ``doc2bow`` result per document, and the normalised
        TfidfModel fitted on those bags of words.
    """
    vocabulary = Dictionary(corpus)
    bags_of_words = tuple([vocabulary.doc2bow(tokens) for tokens in corpus])
    model = TfidfModel(bags_of_words, normalize=True)

    return vocabulary, bags_of_words, model

def get_keywords(model, num_topics=-1, num_words=5):
    """Return the alphabetically sorted, de-duplicated top words of a model.

    Parameters
    ----------
    model : object
        Anything exposing ``show_topics(num_topics, num_words, formatted)``.
    num_topics : int, optional
        Forwarded to ``show_topics`` (default -1).
    num_words : int, optional
        Number of words requested per topic (default 5).

    Returns
    -------
    list
        Sorted list of the distinct words across the returned topics.
    """
    topics = model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    unique_words = set()
    for topic in topics:
        # topic[1] holds the word entries; the word is the first item of each entry
        for entry in topic[1]:
            unique_words.add(entry[0])
    return sorted(unique_words)

## Commits

In [3]:
# Load the full dataset; the next cell keeps only the units of type 'commit'.
data = load_dataset('data/dataset.json')

In [4]:
# One document per commit: the commit message of every unit typed 'commit'.
commit_units = (unit for unit in data if unit['type'] == 'commit')
documents = [commit['content']['message'] for commit in commit_units]

In [5]:
# Clean and lemmatise every commit message; progress is printed per document.
corpus = prepare_corpus(documents)

In [7]:
# Dictionary and bag-of-words corpus feed the LDA cells below.
corpus_dict, corpus_bow, tfidf_model = build_tfidf_model(corpus)

Explore hyperparameters
- K = {5,6,...,34,35}
- alpha = {0.01,50/K}
- beta = {0.01,50/K}
- chunksize = {1,2,4,8,...,1024}

In [None]:
# Grid search over K, alpha, beta (gensim's `eta`) and chunksize.
# 31 values of K x 2 alphas x 2 betas x 11 chunksizes = 1364 models, each
# trained with 100 passes, so this cell takes a long time to run.
RANDOM_SEED = 42  # LDA training is stochastic; pin the seed so scores are reproducible
grid_results = []  # one record per configuration, kept so runs can be ranked afterwards

for num_topics in range(5,36):
    for alpha in (0.01, 50/num_topics):
        for beta in (0.01, 50/num_topics):
            for chunksize in (1,2,4,8,16,32,64,128,256,512,1024):
                lda_model = LdaModel(corpus=corpus_bow,id2word=corpus_dict,num_topics=num_topics,alpha=alpha,eta=beta,chunksize=chunksize,passes=100,random_state=RANDOM_SEED)
                perplexity = lda_model.log_perplexity(corpus_bow)
                coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=corpus_dict, coherence='c_v')
                coherence_lda = coherence_model_lda.get_coherence()
                grid_results.append({
                    'num_topics': num_topics,
                    'alpha': alpha,
                    'eta': beta,
                    'chunksize': chunksize,
                    'perplexity': perplexity,
                    'coherence': coherence_lda,
                })
                # Columns: K, alpha, beta, chunksize | perplexity, c_v coherence
                print(f'{num_topics: <{2}} {alpha:.{2}} {beta:.{2}} {chunksize: <{4}} | {perplexity: >+{5}.{2}f} {coherence_lda: >{4}.{2}f}')

Train models for manual inspection

In [None]:
# Candidate configurations retrained for manual inspection. The position in
# this list is what `model_num` refers to, so the order must not change.
# NOTE(review): chunksize 10 in the first entry is not one of the grid-search
# values (1, 2, 4, ..., 1024) — confirm it is intended.
configs = [
    {'num_topics':5, 'alpha':50/5, 'eta':0.01, 'chunksize':10},
    {'num_topics':9, 'alpha':50/9, 'eta':0.01, 'chunksize':16},
    {'num_topics':11, 'alpha':50/11, 'eta':0.01, 'chunksize':32},
    {'num_topics':12, 'alpha':50/12, 'eta':0.01, 'chunksize':32}, # SELECTED MODEL
    {'num_topics':14, 'alpha':50/14, 'eta':0.01, 'chunksize':32},
    {'num_topics':13, 'alpha':50/13, 'eta':0.01, 'chunksize':32},
    {'num_topics':16, 'alpha':50/16, 'eta':0.01, 'chunksize':32}
]

RANDOM_SEED = 42  # LDA training is stochastic; pin the seed so the inspected models are reproducible

lda_models = []

for conf in configs:
    model = LdaModel(corpus=corpus_bow,id2word=corpus_dict,passes=100,random_state=RANDOM_SEED,**conf)
    lda_models.append(model)
    perplexity = model.log_perplexity(corpus_bow)
    coherence_model_lda = CoherenceModel(model=model, texts=corpus, dictionary=corpus_dict, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # Columns: K, alpha, eta, chunksize | perplexity, c_v coherence
    print(f"{conf['num_topics']: <{2}} {conf['alpha']:.{2}} {conf['eta']:.{2}} {conf['chunksize']: <{4}} | {perplexity: >+{5}.{2}f} {coherence_lda: >{4}.{2}f}")

Inspect models

In [None]:
# Pick model for inspection
# `model_num` is a 0-based index into `configs` / `lda_models`;
# index 3 is the 12-topic configuration flagged "SELECTED MODEL".
model_num = 3 # selected for the study

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Prepare the pyLDAvis view for the model chosen via `model_num`.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_models[model_num], corpus_bow, corpus_dict)
# Bare last expression so the notebook displays the visualisation.
vis

Print top words in vectors

In [None]:
# Print the ten highest-weighted words of every topic of the inspected model.
# Read the model from `lda_models[model_num]`: the bare `lda_model` name is
# only the last model left over from the grid-search loop (and is undefined
# when that cell was skipped), so it is not the model selected above.
selected_model = lda_models[model_num]
topic_vectors = selected_model.show_topics(num_topics=configs[model_num]['num_topics'], num_words=10, formatted=False)
for v in topic_vectors:
    # v is (topic id, [(word, weight), ...]); keep only the words
    print([w[0] for w in v[1]])

## Issues

In [8]:
# Re-read the same dataset file; the next cell keeps only the units of type 'issue'.
data = load_dataset('data/dataset.json')

In [9]:
# Build one text document per issue: title, body and every comment, each on
# its own line.
documents = []
for unit in data:
    if unit['type'] != 'issue':
        continue
    content = unit['content']
    parts = []
    if content['title']:
        parts.append(content['title'])
    if content['body']:
        parts.append(content['body'])
    # assumes `comments` is a list of strings (possibly empty or None) — TODO confirm schema
    parts.extend(comment for comment in (content['comments'] or []) if comment)
    # Join everything with newlines. Previously the comments were appended
    # without a separator, fusing the last word of the body with the first
    # word of the first comment into a single bogus token.
    documents.append('\n'.join(parts))

In [10]:
# Clean and lemmatise every issue document; progress is printed per document.
corpus = prepare_corpus(documents)

In [11]:
# Dictionary and bag-of-words corpus feed the LDA cells below.
corpus_dict, corpus_bow, tfidf_model = build_tfidf_model(corpus)

Explore hyperparameters
- K = {5,6,...,34,35}
- alpha = {0.01,50/K}
- beta = {0.01,50/K}
- chunksize = {1,2,4,8,...,1024}

In [None]:
# Grid search over K, alpha, beta (gensim's `eta`) and chunksize.
# 31 values of K x 2 alphas x 2 betas x 11 chunksizes = 1364 models, each
# trained with 100 passes, so this cell takes a long time to run.
RANDOM_SEED = 42  # LDA training is stochastic; pin the seed so scores are reproducible
grid_results = []  # one record per configuration, kept so runs can be ranked afterwards

for num_topics in range(5,36):
    for alpha in (0.01, 50/num_topics):
        for beta in (0.01, 50/num_topics):
            for chunksize in (1,2,4,8,16,32,64,128,256,512,1024):
                lda_model = LdaModel(corpus=corpus_bow,id2word=corpus_dict,num_topics=num_topics,alpha=alpha,eta=beta,chunksize=chunksize,passes=100,random_state=RANDOM_SEED)
                perplexity = lda_model.log_perplexity(corpus_bow)
                coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=corpus_dict, coherence='c_v')
                coherence_lda = coherence_model_lda.get_coherence()
                grid_results.append({
                    'num_topics': num_topics,
                    'alpha': alpha,
                    'eta': beta,
                    'chunksize': chunksize,
                    'perplexity': perplexity,
                    'coherence': coherence_lda,
                })
                # Columns: K, alpha, beta, chunksize | perplexity, c_v coherence
                print(f'{num_topics: <{2}} {alpha:.{2}} {beta:.{2}} {chunksize: <{4}} | {perplexity: >+{5}.{2}f} {coherence_lda: >{4}.{2}f}')

Train models for manual inspection

In [None]:
# Candidate configurations retrained for manual inspection. The position in
# this list is what `model_num` refers to, so the order must not change.
configs = [
    {'num_topics':5, 'alpha':0.01, 'eta':50/5, 'chunksize':2}, # SELECTED MODEL
    {'num_topics':12, 'alpha':50/12, 'eta':0.01, 'chunksize':2},
    {'num_topics':20, 'alpha':0.01, 'eta':50/20, 'chunksize':256},
    {'num_topics':21, 'alpha':50/21, 'eta':0.01, 'chunksize':2},
    {'num_topics':27, 'alpha':50/27, 'eta':0.01, 'chunksize':2},
    {'num_topics':33, 'alpha':50/33, 'eta':0.01, 'chunksize':4}
]

RANDOM_SEED = 42  # LDA training is stochastic; pin the seed so the inspected models are reproducible

lda_models = []

for conf in configs:
    model = LdaModel(corpus=corpus_bow,id2word=corpus_dict,passes=100,random_state=RANDOM_SEED,**conf)
    lda_models.append(model)
    perplexity = model.log_perplexity(corpus_bow)
    coherence_model_lda = CoherenceModel(model=model, texts=corpus, dictionary=corpus_dict, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # Columns: K, alpha, eta, chunksize | perplexity, c_v coherence
    print(f"{conf['num_topics']: <{2}} {conf['alpha']:.{2}} {conf['eta']:.{2}} {conf['chunksize']: <{4}} | {perplexity: >+{5}.{2}f} {coherence_lda: >{4}.{2}f}")

Inspect models

In [None]:
# Pick model for inspection
# `model_num` is a 0-based index into `configs` / `lda_models`;
# index 0 is the 5-topic configuration flagged "SELECTED MODEL".
model_num = 0 # selected for the study

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Prepare the pyLDAvis view for the model chosen via `model_num`.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_models[model_num], corpus_bow, corpus_dict)
# Bare last expression so the notebook displays the visualisation.
vis

Print top words in vectors

In [None]:
# Print the ten highest-weighted words of every topic of the inspected model.
# Read the model from `lda_models[model_num]`: the bare `lda_model` name is
# only the last model left over from the grid-search loop (and is undefined
# when that cell was skipped), so it is not the model selected above.
selected_model = lda_models[model_num]
topic_vectors = selected_model.show_topics(num_topics=configs[model_num]['num_topics'], num_words=10, formatted=False)
for v in topic_vectors:
    # v is (topic id, [(word, weight), ...]); keep only the words
    print([w[0] for w in v[1]])