<a href="https://colab.research.google.com/github/eriksali/Text-Analytics_LDA/blob/master/Top15_Documents_Per_Topic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import numpy for matrix operation
import numpy as np

# Importing Gensim
import gensim
from gensim import corpora
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive/')

dirpath = '/content/drive/MyDrive/Names_2000/'

import os 

files = [os.path.join(dirpath, fname) for fname in os.listdir(dirpath)]
corpus = []

for filename in sorted(files):
    with open(filename, 'r') as f:
        corpus += [f.read()]
        
from pathlib import Path

obit_titles = [Path(file).stem for file in sorted(files)]
## obit_titles = [Path(file).stem for file in files]

obit_titles

# Apply Preprocessing on the Corpus

import nltk

nltk.download('wordnet')
nltk.download('stopwords')

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
data_lemmatized = [clean(doc).split() for doc in corpus] 

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Build LDA model
## lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=20)

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
                                           

!pip install pyLDAvis

from pprint import pprint
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Converting list of documents (corpus) into Document Term Matrix using the dictionary 
doc_term_matrix = [id2word.doc2bow(i) for i in data_lemmatized]


def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        Lda = gensim.models.ldamodel.LdaModel
        model = Lda(doc_term_matrix, num_topics=num_topics, id2word = id2word, passes=20, random_state=30, eval_every=None)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

# Select the model and print the topics
optimal_model = model_list[3]


# Finding the dominant topic in each document 

def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Get the Dominant topic, Percentage Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    ## names = pd.Series(obit_titles)

    ## sent_topics_df = pd.concat([sent_topics_df, contents, names], axis=1)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data_lemmatized)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Contrib', 'Keywords', 'Text']
## df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Contrib', 'Keywords', 'Text', 'Name']

# Display
df_dominant_topic.head(2000)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Unnamed: 0,Document_No,Dominant_Topic,Topic_Contrib,Keywords,Text
0,0,16.0,0.3606,"loss, var, train, pred, iter, score, backend, ...","[rmse, correct, pred, torch, tensor, repeat, t..."
1,1,17.0,0.6655,"cach, bar, foo, offens, tvm, observ, stack, al...","[diagon, add, scalar, torch, randn, plu, diag,..."
2,2,1.0,0.9926,"respons, output, json, type, user, content, ge...","[creat, token, first, first, tri, creat, datas..."
3,3,18.0,0.7095,"mock, call, obj, tabl, job, return, equal, con...","[dont, restart, download, job, still, run, job..."
4,4,11.0,0.9886,"sampl, job, percent, run, save, config, zero, ...","[first, set, mock, network, call, experi, comp..."
...,...,...,...,...,...
1994,1994,14.0,0.5917,"config, dtype, array, imag, matrix, shape, equ...","[matrix, matrix, array, noisi, random, rand, m..."
1995,1995,9.0,0.5179,"color, track, predict, encod, name, case, loca...","[max, cluster, max, cluster, array, result, ma..."
1996,1996,13.0,0.8579,"shape, tensor, size, torch, layer, data, input...","[trace, doe, contain, hard, code, constant, si..."
1997,1997,13.0,0.9736,"shape, tensor, size, torch, layer, data, input...","[output, box, convert, tensor, dtype, torch, f..."


In [17]:

# Finding the dominant topic in each document 

def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Get the Dominant topic, Percentage Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    ## names = pd.Series(obit_titles)

    ## sent_topics_df = pd.concat([sent_topics_df, contents, names], axis=1)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data_lemmatized)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Contrib', 'Keywords', 'Text']
## df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Contrib', 'Keywords', 'Text', 'Name']

# Display
df_dominant_topic.head(2000)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Contrib,Keywords,Text
0,0,16.0,0.3609,"loss, var, train, pred, iter, score, backend, ...","[rmse, correct, pred, torch, tensor, repeat, t..."
1,1,17.0,0.6655,"cach, bar, foo, offens, tvm, observ, stack, al...","[diagon, add, scalar, torch, randn, plu, diag,..."
2,2,1.0,0.9926,"respons, output, json, type, user, content, ge...","[creat, token, first, first, tri, creat, datas..."
3,3,18.0,0.7096,"mock, call, obj, tabl, job, return, equal, con...","[dont, restart, download, job, still, run, job..."
4,4,11.0,0.9886,"sampl, job, percent, run, save, config, zero, ...","[first, set, mock, network, call, experi, comp..."
...,...,...,...,...,...
1994,1994,14.0,0.5918,"config, dtype, array, imag, matrix, shape, equ...","[matrix, matrix, array, noisi, random, rand, m..."
1995,1995,9.0,0.5178,"color, track, predict, encod, name, case, loca...","[max, cluster, max, cluster, array, result, ma..."
1996,1996,13.0,0.8578,"shape, tensor, size, torch, layer, data, input...","[trace, doe, contain, hard, code, constant, si..."
1997,1997,13.0,0.9736,"shape, tensor, size, torch, layer, data, input...","[output, box, convert, tensor, dtype, torch, f..."


In [None]:
lda_model.print_topics(num_words=15)

In [None]:
[ [(id2word[wid], s) for (wid, s) in lda_model.get_topic_terms(tid, topn=15)] for tid in range(20)]

In [None]:
x=lda_model.show_topics(num_topics=20, num_words=15,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Topics and Words
for topic,words in topics_words:
    print(str(topic)+ ", "+ str(words))
print()

#Below Code Prints Only Words 
## for topic,words in topics_words:
    ## print(" ".join(words))

In [23]:
# Finding the most representative document (top20) for each topic

sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(15)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Display
sent_topics_sorteddf.head(300)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9901,"expect, index, data, result, equal, frame, act...","[junction, comput, distanc, second, anndata, a..."
1,0.0,0.9852,"expect, index, data, result, equal, frame, act...","[singl, key, multi, key, groupbi, index, data,..."
2,0.0,0.9793,"expect, index, data, result, equal, frame, act...","[float, truncat, engin, parser, exp, result, e..."
3,0.0,0.9721,"expect, index, data, result, equal, frame, act...","[2015, 2016, 2015, 2016, concat, datafram, fir..."
4,0.0,0.9712,"expect, index, data, result, equal, frame, act...","[result, type, consist, matter, path, take, co..."
...,...,...,...,...
295,19.0,0.9756,"error, rais, messag, pytest, msg, valu, except...","[duplic, code, interv, right, open, loader, in..."
296,19.0,0.9672,"error, rais, messag, pytest, msg, valu, except...","[boundari, sub, bin, precis, boundari, boundar..."
297,19.0,0.9648,"error, rais, messag, pytest, msg, valu, except...","[node, address, error, node, address, miss, no..."
298,19.0,0.9472,"error, rais, messag, pytest, msg, valu, except...","[rais, error, endpoint, provid, endpoint, get,..."


In [6]:

# Finding the most representative document (top20) for each topic

sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Contrib", "Keywords", "Text"]

# Display
sent_topics_sorteddf.head(20)

ValueError: ignored