In [1]:
import os
import pandas as pd
from PyPDF2 import PdfReader
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from pprint import pprint

  from imp import reload


# Reading pdf files from inside pdfs folder and storing inside a dataframe

In [2]:
files = [f for f in os.listdir("pdfs/") if os.path.isfile("pdfs/"+f) and f.endswith('.pdf')]

data = []

for name in files:
    reader = PdfReader("pdfs/"+name)
    number_of_pages = len(reader.pages)
    text = ""
    for i in range(number_of_pages):
        page = reader.pages[i]
        tt = page.extract_text()
        text +=tt
    data.append(text)
    
df = pd.DataFrame(data, index=None)
df

Unnamed: 0,0
0,Vifor Pharma Ltd. \n 2018 Responsibility Highl...
1,Powering the \n\ndigital economy\nTelecity Gro...
2,SUSTAINABILITY REPORT 2018\nABOUT THIS REPORT\...
3,GRI Report 2017\n \n \n \nPage \n2\n \n \nTabl...
4,SFS Group AG\nSustainability Report\nSustain\n...
5,1\nNorwegian Sustainability Report \n2018\n1.\...


# Converting the text from pdfs into words

In [3]:
def doc_to_words(doc):
    return(gensim.utils.simple_preprocess(str(doc), deacc=True))

for idx, item in enumerate(df[0]):
    df[0][idx]=list(doc_to_words(item))
df

Unnamed: 0,0
0,"[vifor, pharma, ltd, responsibility, highlight..."
1,"[powering, the, digital, economy, telecity, gr..."
2,"[sustainability, report, about, this, report, ..."
3,"[gri, report, page, table, of, content, gri, g..."
4,"[sfs, group, ag, sustainability, report, susta..."
5,"[norwegian, sustainability, report, highlights..."


# Printing number of words inside each pdfs

In [4]:
for doc in df[0]:
    print(len(doc))

12112
45323
49
6224
1254
6995


# Removing stopwords from the words list

In [5]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'group', 'financial', 'report'])

def remove_stopwords(texts):
    return [word for word in texts if word not in stop_words]

for idx, item in enumerate(df[0]):
    df[0][idx]=remove_stopwords(item)
    
df

Unnamed: 0,0
0,"[vifor, pharma, ltd, responsibility, highlight..."
1,"[powering, digital, economy, telecity, plc, an..."
2,"[sustainability, bp, emissions, employee, empl..."
3,"[gri, page, table, content, gri, general, disc..."
4,"[sfs, ag, sustainability, sustain, ability, sf..."
5,"[norwegian, sustainability, highlights, letter..."


# Printing length of words after removing stopwords

In [6]:
for doc in df[0]:
    print(len(doc))

8007
28399
42
4606
752
4389


# Creating unique token for each word

In [7]:
id2word = corpora.Dictionary(df[0])
id2word.token2id

{'abac': 0,
 'ability': 1,
 'able': 2,
 'absence': 3,
 'abuse': 4,
 'academic': 5,
 'academies': 6,
 'academy': 7,
 'acceptance': 8,
 'accepted': 9,
 'access': 10,
 'accessi': 11,
 'accessible': 12,
 'accident': 13,
 'accordance': 14,
 'according': 15,
 'accordingly': 16,
 'accounted': 17,
 'accurate': 18,
 'achieve': 19,
 'achieved': 20,
 'achievements': 21,
 'across': 22,
 'act': 23,
 'action': 24,
 'actions': 25,
 'actively': 26,
 'activities': 27,
 'activity': 28,
 'actors': 29,
 'actual': 30,
 'adapt': 31,
 'adapted': 32,
 'adaptive': 33,
 'added': 34,
 'addiction': 35,
 'addition': 36,
 'additional': 37,
 'additionally': 38,
 'address': 39,
 'addressed': 40,
 'addressing': 41,
 'adequate': 42,
 'adhere': 43,
 'adjusted': 44,
 'advance': 45,
 'advancement': 46,
 'advancing': 47,
 'advice': 48,
 'advocacy': 49,
 'advocate': 50,
 'advocating': 51,
 'af': 52,
 'afety': 53,
 'affect': 54,
 'affected': 55,
 'affordability': 56,
 'age': 57,
 'agency': 58,
 'agentur': 59,
 'aglobal': 60,

# Vectorizing the text 

In [8]:
corpus = [id2word.doc2bow(text) for text in df[0]]

In [9]:
for idx,doc in enumerate(corpus):
    print("Document no " +str(idx)+ " - " + str(doc[:10]))

Document no 0 - [(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 5), (8, 1), (9, 2)]
Document no 1 - [(1, 4), (2, 15), (3, 4), (9, 1), (10, 11), (13, 1), (14, 22), (15, 2), (16, 5), (18, 2)]
Document no 2 - [(448, 1), (680, 3), (684, 4), (877, 1), (1080, 1), (1370, 1), (2049, 1), (2050, 1), (4394, 1), (4678, 2)]
Document no 3 - [(0, 3), (1, 2), (2, 2), (10, 1), (14, 1), (15, 1), (16, 2), (22, 5), (23, 2), (24, 2)]
Document no 4 - [(1, 1), (2, 2), (13, 1), (14, 1), (15, 4), (20, 3), (22, 1), (26, 1), (34, 5), (61, 1)]
Document no 5 - [(1, 4), (2, 3), (9, 1), (10, 2), (14, 7), (15, 7), (17, 1), (18, 1), (19, 2), (22, 3)]


# Implementation of LDA

In [22]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Top 20 topics from all documents

In [23]:
top_topics = pd.DataFrame(lda_model.print_topics())
top_topics

Unnamed: 0,0,1
0,11,"0.000*""year"" + 0.000*""data"" + 0.000*""board"" + ..."
1,24,"0.000*""business"" + 0.000*""company"" + 0.000*""em..."
2,22,"0.001*""year"" + 0.000*""data"" + 0.000*""business""..."
3,28,"0.000*""year"" + 0.000*""data"" + 0.000*""business""..."
4,2,"0.000*""year"" + 0.000*""employees"" + 0.000*""exec..."
5,1,"0.000*""norwegian"" + 0.000*""data"" + 0.000*""year..."
6,7,"0.000*""year"" + 0.000*""data"" + 0.000*""committee..."
7,5,"0.001*""year"" + 0.000*""business"" + 0.000*""commi..."
8,12,"0.000*""year"" + 0.000*""data"" + 0.000*""board"" + ..."
9,19,"0.000*""business"" + 0.000*""year"" + 0.000*""emplo..."


# Topic matrix

In [24]:
topics_df = pd.DataFrame(lda_model.get_topics(), index=None)
topics_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8303,8304,8305,8306,8307,8308,8309,8310,8311,8312
0,0.000118,0.000129,0.00014,0.000119,0.000117,0.000117,0.000117,0.000119,0.000117,0.000119,...,0.000118,0.000118,0.000118,0.000117,0.000118,0.000118,0.00012,0.00012,0.000117,0.000123
1,0.000119,0.000128,0.000134,0.000119,0.000117,0.000117,0.000117,0.000119,0.000117,0.000119,...,0.000117,0.000118,0.000118,0.000117,0.000117,0.000118,0.000121,0.000119,0.000118,0.000123
2,0.000118,0.000128,0.000131,0.000119,0.000117,0.000118,0.000117,0.00012,0.000117,0.000119,...,0.000117,0.000117,0.000117,0.000117,0.000117,0.000117,0.000119,0.000117,0.000117,0.000119
3,0.000106,0.00011,0.000114,0.000103,0.000102,0.000102,0.000101,0.000103,0.000101,0.000103,...,0.000102,0.000102,0.000102,0.000101,0.000102,0.000101,0.000103,0.000102,0.000102,0.000103
4,0.000118,0.000134,0.00014,0.000119,0.000115,0.000116,0.000115,0.000121,0.000115,0.000119,...,0.000116,0.000116,0.000116,0.000115,0.000116,0.000116,0.000118,0.000117,0.000116,0.00012
5,0.000118,0.000128,0.000134,0.00012,0.000115,0.000116,0.000116,0.000119,0.000116,0.000118,...,0.000116,0.000116,0.000116,0.000116,0.000116,0.000116,0.000117,0.000116,0.000116,0.000118
6,0.000118,0.000128,0.000136,0.000119,0.000116,0.000118,0.000117,0.000122,0.000116,0.00012,...,0.000116,0.000117,0.000117,0.000116,0.000116,0.000116,0.000118,0.000117,0.000116,0.000118
7,0.000118,0.000123,0.000129,0.00012,0.000117,0.000118,0.000117,0.00012,0.000117,0.000118,...,0.000117,0.000117,0.000117,0.000117,0.000117,0.000117,0.000117,0.000117,0.000117,0.000117
8,0.00012,0.000131,0.000135,0.00012,0.000117,0.000118,0.000117,0.000119,0.000117,0.00012,...,0.000118,0.000117,0.000117,0.000118,0.000117,0.000117,0.000119,0.000118,0.000117,0.000119
9,0.000117,0.000128,0.000149,0.00012,0.000115,0.000116,0.000115,0.000121,0.000115,0.000119,...,0.000115,0.000115,0.000115,0.000115,0.000115,0.000115,0.00012,0.000117,0.000115,0.000119


In [25]:

print('\nPerplexity: ', lda_model.log_perplexity(corpus))

coherence_model_lda = CoherenceModel(model=lda_model, texts=df[0], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Perplexity:  -9.139611124876245

Coherence Score:  0.3283870977177921


In [26]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

pyLDAvis.display(vis)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
