In [1]:
import os
import pandas as pd
from PyPDF2 import PdfReader
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from pprint import pprint

  from imp import reload


# Reading the PDF files in the pdfs folder and storing their text in a DataFrame

In [2]:
# Collect the PDF file names in sorted order: os.listdir() returns entries in
# arbitrary order, which would make the row order of `df` (and of every
# per-document output further down) vary between machines and runs.
files = sorted(
    f for f in os.listdir("pdfs/")
    if os.path.isfile(os.path.join("pdfs/", f)) and f.endswith('.pdf')
)

data = []

for name in files:
    reader = PdfReader(os.path.join("pdfs/", name))
    # Walk the pages directly and join once, instead of growing a string
    # with += inside an index-based loop.
    text = "".join(page.extract_text() for page in reader.pages)
    data.append(text)

# One row per PDF; the single column is labelled 0 and holds the full text.
df = pd.DataFrame(data, index=None)
df

Unnamed: 0,0
0,Vifor Pharma Ltd. \n 2018 Responsibility Highl...
1,Powering the \n\ndigital economy\nTelecity Gro...
2,SUSTAINABILITY REPORT 2018\nABOUT THIS REPORT\...
3,GRI Report 2017\n \n \n \nPage \n2\n \n \nTabl...
4,SFS Group AG\nSustainability Report\nSustain\n...
5,1\nNorwegian Sustainability Report \n2018\n1.\...


# Converting the text of each PDF into a list of words

In [3]:
def doc_to_words(doc):
    """Split one document into tokens with gensim's ``simple_preprocess``.

    Args:
        doc: The document; it is converted with ``str()`` before tokenizing.

    Returns:
        The value returned by ``gensim.utils.simple_preprocess`` when called
        with ``deacc=True``.
    """
    raw_text = str(doc)
    tokens = gensim.utils.simple_preprocess(raw_text, deacc=True)
    return tokens

# Tokenize every document. The whole column is assigned at once rather than
# with the chained form `df[0][idx] = ...`: that form writes through an
# intermediate Series, which pandas warns about and which, with Copy-on-Write
# enabled, never updates `df` at all.
df[0] = df[0].map(doc_to_words)
df

Unnamed: 0,0
0,"[vifor, pharma, ltd, responsibility, highlight..."
1,"[powering, the, digital, economy, telecity, gr..."
2,"[sustainability, report, about, this, report, ..."
3,"[gri, report, page, table, of, content, gri, g..."
4,"[sfs, group, ag, sustainability, report, susta..."
5,"[norwegian, sustainability, report, highlights..."


# Printing the number of words in each PDF

In [4]:
# Print how many tokens each document holds, one count per line.
for n_tokens in map(len, df[0]):
    print(n_tokens)

12112
45323
49
6224
1254
6995


# Removing stopwords from the words list

In [5]:
# NLTK's English stopword list followed by a few extra words to drop.
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'group', 'financial', 'report',
]

def remove_stopwords(texts):
    """Return the tokens of ``texts`` that are not in the global ``stop_words``.

    Args:
        texts: Iterable of hashable tokens (e.g. strings) for one document.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    # Snapshot the stopword list as a set once per call so each membership
    # test is a hash lookup instead of a scan over the whole list.
    blocked = set(stop_words)
    return [word for word in texts if word not in blocked]

# Strip stopwords from every document. The whole column is assigned at once
# rather than with the chained form `df[0][idx] = ...`: that form writes
# through an intermediate Series, which pandas warns about and which, with
# Copy-on-Write enabled, never updates `df` at all.
df[0] = df[0].map(remove_stopwords)

df

Unnamed: 0,0
0,"[vifor, pharma, ltd, responsibility, highlight..."
1,"[powering, digital, economy, telecity, plc, an..."
2,"[sustainability, bp, emissions, employee, empl..."
3,"[gri, page, table, content, gri, general, disc..."
4,"[sfs, ag, sustainability, sustain, ability, sf..."
5,"[norwegian, sustainability, highlights, letter..."


# Printing the number of words left in each PDF after removing stopwords

In [6]:
# Print the per-document token counts, one per line.
token_counts = [len(tokens) for tokens in df[0]]
for count in token_counts:
    print(count)

8007
28399
42
4606
752
4389


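# Building a dictionary and bag-of-words corpus for each document, then training an LDA model and printing its topics, perplexity and coherence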

In [8]:
# NOTE(review): a separate 20-topic LDA model is fitted for every document, so
# each model is trained on a corpus of exactly one document, and `id2word`,
# `corpus` and `lda_model` hold only the last document's values once the loop
# has finished. Confirm that per-document models (rather than one model over
# all documents) are what is intended.
for doc in df[0]:
    # Single-document corpus, reused for the dictionary, the bag-of-words
    # conversion and the coherence computation.
    texts = [doc]

    # Map every distinct token of this document to an integer id.
    id2word = corpora.Dictionary(texts)
    # Bag-of-words representation: (token id, count) pairs per document.
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=20,
        random_state=100,  # fixed seed so repeated runs give the same topics
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    top_topics = pd.DataFrame(lda_model.print_topics())
    print(top_topics)
    # NOTE(review): gensim documents log_perplexity() as returning the per-word
    # likelihood bound, not the perplexity itself; the printed label is kept
    # unchanged.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)


     0                                                  1
0    0  0.001*"pharma" + 0.001*"vifor" + 0.001*"respon...
1    1  0.001*"vifor" + 0.001*"pharma" + 0.001*"emissi...
2    2  0.001*"pharma" + 0.001*"vifor" + 0.001*"patien...
3    3  0.001*"pharma" + 0.001*"vifor" + 0.001*"respon...
4    4  0.001*"pharma" + 0.001*"vifor" + 0.001*"patien...
5    5  0.001*"vifor" + 0.001*"employees" + 0.001*"pha...
6    6  0.001*"pharma" + 0.001*"vifor" + 0.001*"employ...
7    7  0.001*"vifor" + 0.001*"pharma" + 0.001*"employ...
8    8  0.001*"pharma" + 0.001*"vifor" + 0.001*"employ...
9    9  0.002*"vifor" + 0.001*"pharma" + 0.001*"employ...
10  10  0.001*"pharma" + 0.001*"vifor" + 0.001*"employ...
11  11  0.001*"pharma" + 0.001*"employees" + 0.001*"vi...
12  12  0.001*"pharma" + 0.001*"vifor" + 0.001*"employ...
13  13  0.001*"pharma" + 0.001*"employees" + 0.001*"vi...
14  14  0.001*"vifor" + 0.001*"pharma" + 0.001*"patien...
15  15  0.001*"pharma" + 0.001*"vifor" + 0.001*"patien...
16  16  0.001*

In [None]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

# pyLDAvis.display(vis)