In [1]:
import os
import pandas as pd
from PyPDF2 import PdfReader
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from gensim.models import CoherenceModel 
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from pprint import pprint
from tqdm.auto import tqdm

  from imp import reload


# Reading pdf files from inside pdfs folder and storing inside a dataframe

In [2]:
folder = "pdfs/"
files = [f for f in os.listdir(folder) if os.path.isfile(folder+f) and f.endswith('.pdf')]

In [3]:
def doc_to_words(doc):
    return(gensim.utils.simple_preprocess(str(doc), deacc=True))

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'group', 'financial', 'report'])

def remove_stopwords(texts):
    return [word for word in texts if word not in stop_words]

In [4]:
data = []
print("Total pdfs - "+ str(len(files)))
pbar = tqdm(files, desc='Reading pdf files', ncols=100, unit="pdfs", colour="green",position=0, leave=True)
for name in pbar:
    pbar.set_description("Reading "+name[:10])
    reader = PdfReader(folder+name)
    if reader.is_encrypted:
        try:
            reader.decrypt("")
            pbar.set_description("File Decrypted (PyPDF2)")
        except:
            command = (
                "cp "
                + folder+name
                + " temp.pdf; qpdf --password='' --decrypt temp.pdf "
                + folder+name
                + "; rm temp.pdf"
                
            )
            os.system(command)
            pbar.set_description("File Decrypted (qpdf)")
            reader = PdfReader(folder+name)
    else:
        pass
    number_of_pages = len(reader.pages)
    text = ""
    pbar.set_description("Number of pages=" + str(number_of_pages) )
    for i in range(number_of_pages):
        pbar.set_description("Reading page number - " + str(i)+"/"+ str(number_of_pages) )
        page = reader.pages[i]
        tt = page.extract_text()
        text +=tt
    pbar.set_description("Creating text to words")
    text = list(doc_to_words(text))
    pbar.set_description("Removing stopwords")
    text=remove_stopwords(text)
    
    id2word = corpora.Dictionary([text])
    id2word.token2id
    pbar.set_description("Creating corpus")
    corpus = [id2word.doc2bow(text)]
    pbar.set_description("Creating lda model")
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)
    tops=lda_model.print_topics()
    perplexity=lda_model.log_perplexity(corpus)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=[text], dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    data.append([name,tops, perplexity, coherence_lda])
pbar.close()

Total pdfs - 26


Creating lda model: 100%|[32m█████████████████████████████████████████[0m| 26/26 [02:26<00:00,  5.64s/pdfs][0m


In [5]:
df = pd.DataFrame(data, index=None, columns=['Filename', 'Topics', 'Perplexity', 'Coherence Score'])

In [6]:
df

Unnamed: 0,Filename,Topics,Perplexity,Coherence Score
0,115212-19Co-26959608R16313558352T-Gl.pdf,"[(0, 0.001*""pharma"" + 0.001*""vifor"" + 0.001*""r...",-7.643647,0.706936
1,159623-21In-41661603Y58142996996E-Gl.pdf,"[(0, 0.001*""year"" + 0.001*""million"" + 0.001*""t...",-9.092584,0.326911
2,159463-21Su-35719712F60875638102E-Gl.pdf,"[(0, 0.001*""development"" + 0.001*""terna"" + 0.0...",-7.78415,0.356673
3,158852-21Su-37489072X12442877160G-It.pdf,"[(0, 0.001*""di"" + 0.001*""pag"" + 0.001*""health""...",-7.50107,0.253145
4,159350-21In-38244000A10099284300E-Sw.pdf,"[(0, 0.001*""skistar"" + 0.001*""company"" + 0.001...",-7.866288,0.27388
5,159621-21Su-37989798F25496581572M-Gl.pdf,"[(0, 0.001*""healthineers"" + 0.001*""business"" +...",-7.904073,0.223775
6,52769-13In-13878247F2105483100O-Eu.pdf,"[(0, 0.001*""year"" + 0.001*""data"" + 0.000*""dire...",-8.155629,0.320534
7,159806-21Su-40750530B3341223848E-Gl.pdf,"[(0, 0.001*""infineon"" + 0.001*""management"" + 0...",-7.556803,0.218826
8,159805-21In-38992420Q1573440030X-Gl.pdf,"[(0, 0.001*""infineon"" + 0.001*""year"" + 0.001*""...",-7.826061,0.241627
9,159861-21Su-35649003S3870234810V-Gl.pdf,"[(0, 0.001*""company"" + 0.001*""aegean"" + 0.001*...",-8.008579,0.235413
