In [63]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nbinteract as nbi
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
# English stop-word list from NLTK.
# NOTE(review): none of the visible cells read `stop_words`; remove_stopwords()
# uses the name `stopwords`, which the next cell rebinds to the words read from
# ./stop.txt (replacing the NLTK corpus reader imported above).
stop_words = stopwords.words('english')

In [3]:
# Load the custom stop-word list (whitespace-separated words) into a set so
# membership tests are fast. The name `stopwords` is kept because
# remove_stopwords() reads it; note that this rebinds the NLTK corpus reader
# that was imported under the same name.
# The context manager closes the file handle as soon as the text is read.
with open('./stop.txt') as stop_file:
    stopwords = set(stop_file.read().split())

In [4]:
# Read the whole takeaways file as one string; the context manager closes the
# file handle as soon as the text has been read.
with open('./Takeaways.txt') as takeaways_file:
    document = takeaways_file.read()

In [5]:
# One entry per line of the file. str.split("\n") keeps an empty string for
# every blank line (and after a trailing newline), so such lines are carried
# along as empty documents by the cells that follow.
list_of_sentence=document.split("\n")

In [6]:
# Tokenize every line with gensim's simple_preprocess: accents are stripped
# (deacc=True) and tokens shorter than 2 characters are dropped (min_len=2).
list_of_simple_preprocess_data = [
    gensim.utils.simple_preprocess(sentence, deacc=True, min_len=2)
    for sentence in list_of_sentence
]

In [7]:
# Shorter alias for the tokenized lines; the phrase-model and stop-word cells
# below read `data`.
data = list_of_simple_preprocess_data
# Bare expression: the notebook echoes the full list of token lists as this
# cell's output.
data

[['ecosystem',
  'mindset',
  'is',
  'key',
  'to',
  'competitiveness',
  'but',
  'note',
  'that',
  'the',
  'competition',
  'is',
  'ecosystem',
  'vs',
  'ecosystem',
  'and',
  'not',
  'company',
  'vs',
  'company'],
 ['there',
  'are',
  'three',
  'types',
  'of',
  'ecosystems',
  'business',
  'innovation',
  'platform',
  'we',
  'need',
  'to',
  'find',
  'position',
  'in',
  'an',
  'ecosystem',
  'that',
  'creates',
  'value',
  'and',
  'to',
  'create',
  'value',
  'we',
  'need',
  'to',
  'use',
  'the',
  'right',
  'mindset'],
 ['companies',
  'in',
  'ecosystems',
  'create',
  'significant',
  'value',
  'companies',
  'with',
  'traditional',
  'mindsets',
  'using',
  'old',
  'business',
  'models',
  'do',
  'not'],
 ['one',
  'should',
  'invest',
  'in',
  'those',
  'assets',
  'and',
  'capabilities',
  'that',
  'are',
  'rare',
  'valuable',
  'and',
  'hard',
  'for',
  'competitors',
  'to',
  'imitate'],
 ['for',
  'an',
  'organization',
  '

In [8]:
# Build the bigram and trigram models
# The bigram model is trained on the tokenized lines in `data`.
bigram = gensim.models.Phrases(data, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed lines; no min_count is
# passed here, so the library default applies.
trigram = gensim.models.Phrases(bigram[data], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod / trigram_mod are the objects used by make_bigrams() and make_trigrams().
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# Applies the bigram model, then the trigram model, to the first line only.
print(trigram_mod[bigram_mod[data[0]]])

['ecosystem', 'mindset', 'is', 'key', 'to', 'competitiveness', 'but', 'note', 'that', 'the', 'competition', 'is', 'ecosystem', 'vs', 'ecosystem', 'and', 'not', 'company', 'vs', 'company']


In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return each document re-tokenized with stop words filtered out.

    Every document is converted with ``str()`` and passed through
    ``simple_preprocess`` again before filtering; tokens found in the
    module-level ``stopwords`` collection are dropped.

    Args:
        texts: Iterable of documents (token lists or strings).

    Returns:
        A list with one list of remaining tokens per input document.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        kept = [token for token in tokens if token not in stopwords]
        filtered_docs.append(kept)
    return filtered_docs

def make_bigrams(texts):
    """Apply the module-level bigram phraser ``bigram_mod`` to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list holding ``bigram_mod[doc]`` for each input document, in order.
    """
    bigram_docs = []
    for doc in texts:
        bigram_docs.append(bigram_mod[doc])
    return bigram_docs

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each input
        document, in order.
    """
    trigram_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        trigram_docs.append(trigram_mod[with_bigrams])
    return trigram_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be assigned before this
    function is called.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is a tuple rather than a list so the shared default object
            cannot be mutated between calls; callers may still pass a list.

    Returns:
        A list with one list of lemma strings per input document.

    Tag reference: https://spacy.io/api/annotation
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [67]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the small English spaCy pipeline with the parser and NER
# components disabled (for efficiency).
# Install it with: python3 -m spacy download en_core_web_sm
# The full package name is used instead of the 'en' shortcut because spaCy 3
# no longer resolves shortcut links; the full name loads on spaCy 2 as well.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first document as a quick sanity check.
print(data_lemmatized[:1])

[['note']]


In [11]:
# Create Dictionary
# Built from the lemmatized documents; passed to the LDA model and to pyLDAvis
# as `id2word` in the interactive cell below.
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
# `texts` is only an alias for the lemmatized documents.
texts = data_lemmatized

# Term Document Frequency
# One bag-of-words entry per document, in the same order as `texts`.
corpus = [id2word.doc2bow(text) for text in texts]


In [12]:
%%javascript
// Raise the output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps long outputs fully expanded instead of
// placing them in a scroll box; confirm it has an effect in the front end used.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [59]:
from ipywidgets import interact, fixed
from IPython.core.interactiveshell import InteractiveShell
import ipywidgets as widgets
from IPython.display import display

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Set pandas' display options to None so the tables rendered below are not
# limited in row count, column count, overall width or per-cell text width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Topic counts offered in the outer "Number" dropdown.
a_list = list(range(2, 51))

# A document is assigned to a topic only when that topic's share of the
# document exceeds this value.
DOMINANCE_THRESHOLD = 0.6
# A topic is offered in the inner dropdown only when more than this many
# documents are assigned to it.
MIN_DOCS_PER_TOPIC = 3


def _format_topics_sentences(ldamodel, corpus, texts, min_share=DOMINANCE_THRESHOLD):
    """Build a table with one row per document: dominant topic, share, keywords, text.

    Documents whose strongest topic does not exceed ``min_share`` keep their
    row, with missing values in the three topic columns. Keeping those rows is
    what makes row ``i`` always describe document ``i``, so the text joined on
    at the end belongs to the topic shown next to it.

    Args:
        ldamodel: Trained gensim LDA model.
        corpus: Bag-of-words corpus, one entry per document.
        texts: Original texts, in the same order as ``corpus``.
        min_share: Minimum topic share (exclusive) for a topic to count as
            a document's dominant topic.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Keywords`` followed by one unnamed column holding ``texts``.
    """
    keywords_by_topic = {}
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the model yields a tuple per document
        # whose first element is the (topic, share) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        record = (float('nan'), float('nan'), None)
        if row:
            topic_num, prop_topic = max(row, key=lambda pair: pair[1])
            if prop_topic > min_share:
                topic_num = int(topic_num)
                # Keyword strings are identical for every document of a topic,
                # so build each one only once.
                if topic_num not in keywords_by_topic:
                    keywords_by_topic[topic_num] = ", ".join(
                        word for word, _prob in ldamodel.show_topic(topic_num)
                    )
                record = (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        records.append(record)

    # Building the frame once from a list avoids row-by-row DataFrame.append
    # (removed in pandas 2.0) and yields the right columns even with no rows.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    return pd.concat([sent_topics_df, pd.Series(texts)], axis=1)


@interact(Number=a_list)
def my_function(Number):
    """Train an LDA model with ``Number`` topics and show it interactively.

    Displays the pyLDAvis panel for the model, then a second dropdown listing
    every topic that dominates more than ``MIN_DOCS_PER_TOPIC`` documents;
    selecting a topic shows its documents sorted by topic share.

    Args:
        Number: Number of topics to fit (chosen through the dropdown).

    Returns:
        The trained gensim LDA model.
    """
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=Number, id2word=id2word, random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, R=10, sort_topics=False, mds='tsne')
    display(vis)

    # Format
    df_dominant_topic = _format_topics_sentences(lda_model, corpus, list_of_sentence).reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Percentage', 'Keywords', 'Text']
    # Shift topic ids from 0-based to 1-based (presumably to match the topic
    # numbers shown in the pyLDAvis panel).
    df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'] + 1

    # value_counts() ignores documents without a dominant topic. The index is
    # read directly because the column names produced by reset_index() on a
    # value_counts() result differ between pandas versions.
    topic_counts = df_dominant_topic['Dominant_Topic'].value_counts()
    frequent_topics = sorted(
        int(topic) for topic in topic_counts[topic_counts > MIN_DOCS_PER_TOPIC].index
    )

    if frequent_topics:
        @interact(DominantTopic=frequent_topics)
        def show_topic_documents(DominantTopic):
            """Show the documents of the selected topic, strongest first."""
            selected = df_dominant_topic[df_dominant_topic['Dominant_Topic'] == DominantTopic]
            table = selected.sort_values(by=['Percentage'], ascending=False)[['Keywords', 'Percentage', 'Text']]
            styler = table.style
            # Styler.hide_index() was removed in pandas 2.0; Styler.hide()
            # is available from pandas 1.4 on. Support both.
            styler = styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()
            styler = styler.set_properties(**{'text-align': 'left'})
            styler = styler.set_properties(subset=['Keywords'], **{'width': '300px'})
            display(styler.set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))
    else:
        print("No topic dominates more than %d documents for this model." % MIN_DOCS_PER_TOPIC)

    return lda_model

interactive(children=(Dropdown(description='Number', options=(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …