# Topic Modeling using the BMW integrated report

In [14]:
# import the required packages

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [15]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'group', 'financial', 'report'])

In [16]:
# I previousely convert the pdf to txt and then to JSON using an online tool. 
import json

In [18]:
with open('bmw.json') as f:
    d = json.load(f)



# Tokenize words and Clean-up text

simple_preprocess() makes most of the work

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(d))


[['bmw', 'group', 'report', 'our', 'responsibility', 'our', 'future', 'report', 'on', 'the', 'bmw', 'group', 'economic', 'performance', 'and', 'its', 'ecological', 'and', 'social', 'contributions', 'link', 'to', 'the', 'online', 'report', 'bmw', 'group', 'contents', 'report', 'contents', 'about', 'this', 'report', 'part', 'of', 'the', 'combined', 'management', 'report', 'to', 'our', 'stakeholders', 'bmw', 'group', 'in', 'figures', 'report', 'of', 'the', 'supervisory', 'board', 'statement', 'of', 'the', 'chairman', 'of', 'the', 'board', 'of', 'management', 'dialogue', 'with', 'stakeholders', 'bmw', 'stock', 'and', 'capital', 'markets', 'in', 'combined', 'management', 'report', 'general', 'information', 'and', 'group', 'profile', 'products', 'and', 'services', 'production', 'purchasing', 'and', 'supplier', 'network', 'employees', 'and', 'society', 'report', 'on', 'economic', 'position', 'report', 'on', 'outlook', 'risks', 'and', 'opportunities', 'internal', 'control', 'system', 'relevant

In [7]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  


['bmw', 'group', 'report', 'our', 'responsibility', 'our', 'future', 'report', 'on', 'the', 'bmw', 'group', 'economic', 'performance', 'and', 'its', 'ecological', 'and', 'social', 'contributions', 'link', 'to', 'the', 'online', 'report', 'bmw', 'group', 'contents', 'report', 'contents', 'about', 'this', 'report', 'part', 'of', 'the', 'combined', 'management', 'report', 'to', 'our', 'stakeholders', 'bmw', 'group', 'in', 'figures', 'report', 'of', 'the', 'supervisory', 'board', 'statement', 'of', 'the', 'chairman', 'of', 'the', 'board', 'of', 'management', 'dialogue', 'with', 'stakeholders', 'bmw', 'stock', 'and', 'capital', 'markets', 'in', 'combined', 'management', 'report', 'general', 'information', 'and', 'group', 'profile', 'products', 'and', 'services', 'production', 'purchasing', 'and', 'supplier_network', 'employees', 'and', 'society', 'report', 'on', 'economic_position', 'report', 'on', 'outlook', 'risks', 'and', 'opportunities', 'internal_control_system', 'relevant', 'for', 'ac

In [8]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram[bigram[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)


# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['responsibility', 'future', 'economic', 'performance', 'ecological', 'social', 'contribution', 'link', 'online', 'content', 'content', 'part', 'combined', 'management', 'stakeholder', 'figure', 'supervisory', 'board', 'statement', 'management', 'dialogue', 'stakeholder', 'stock', 'capital', 'market', 'combine', 'management', 'general', 'information', 'profile', 'product', 'service', 'production', 'purchase', 'supplier_network', 'employee', 'society', 'outlook', 'risk', 'opportunitie', 'internal_control', 'system', 'relevant', 'accounting', 'reporting', 'process', 'disclosure', 'relevant', 'takeover', 'explanatory_comment', 'content', 'statement', 'corporate_governance', 'fundamental_aspect', 'corporate_governance', 'part', 'combine', 'management', 'remuneration', 'part', 'combine', 'management', 'glossary', 'explanation', 'key', 'figure', 'income', 'statement', 'segment', 'statement', 'balance_sheet', 'segment', 'responsibility', 'statement', 'company', 'legal', 'representative', 'ca

In [9]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 5), (13, 1), (14, 1), (15, 1), (16, 5), (17, 1), (18, 1), (19, 1), (20, 1), (21, 6), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 3), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 4), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 3), (56, 1), (57, 1), (58, 1), (59, 1), (60, 8), (61, 3), (62, 1), (63, 4), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 12), (78, 3), (79, 2), (80, 1), (81, 1), (82, 1), (83, 4), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 2), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 4), (101, 3), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 2), (109, 2), (110, 1

In [10]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

the keywords for each topic and the weightage(importance) of each keyword is shown with lda_model.print_topics(). Output gives the top 10 keywords that contribute to this topic. The weights reflect how important a keyword is to that topic.

In [11]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.073*"board" + 0.034*"supervisory" + 0.033*"member" + 0.032*"remuneration" '
  '+ 0.023*"year" + 0.023*"management" + 0.015*"amount" + 0.012*"target" + '
  '0.011*"meeting" + 0.011*"share"'),
 (1,
  '0.016*"employee" + 0.014*"management" + 0.011*"also" + 0.009*"supplier" + '
  '0.009*"work" + 0.008*"sustainability" + 0.008*"health" + 0.008*"safety" + '
  '0.008*"measure" + 0.008*"gri"'),
 (2,
  '0.001*"year" + 0.001*"board" + 0.001*"risk" + 0.001*"management" + '
  '0.001*"amount" + 0.001*"service" + 0.001*"supervisory" + 0.001*"also" + '
  '0.001*"share" + 0.001*"statement"'),
 (3,
  '0.029*"share" + 0.024*"control" + 0.016*"right" + 0.015*"change" + '
  '0.014*"agreement" + 0.013*"board" + 0.013*"company" + 0.013*"article" + '
  '0.011*"management" + 0.009*"voting_right"'),
 (4,
  '0.017*"compliance" + 0.014*"management" + 0.012*"audit" + 0.010*"vehicle" + '
  '0.010*"statement" + 0.010*"risk" + 0.009*"also" + 0.009*"information" + '
  '0.008*"consolidated" + 0.008*"use"'),


In [12]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.157199119434658

Coherence Score:  0.43395346425048276


In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(
