In [1]:
import string
import warnings
from pprint import pprint
from urllib.parse import quote_plus
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis.sklearn
import requests
from bs4 import BeautifulSoup as soup
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from tqdm import tqdm

# Suppress every warning so the notebook output stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
%matplotlib inline

  formatvalue=lambda value: "")[1:-1]


In [2]:
# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
#!python3 -m spacy download en_core_web_sm

In [3]:
# Display wide frames without truncation: up to 500 rows and 500 columns.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# None means "no limit on cell text width". The former -1 sentinel was deprecated
# in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

### Build the Google News Scraper using BeautifulSoup
---

In [4]:
def covid19_news_scraper(search_query):
    """
    Fetch Google News (Australia) RSS headlines for a search term combined with "coronavirus".

    Parameters
    ----------
    search_query : str
        Client name or free-text query. It is converted with ``str()`` and
        URL-encoded, so multi-word queries are safe to pass.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` in the feed (100 in the recorded runs) with the
        columns ``Date`` (publication date), ``Headline`` and ``Source``.
        The frame is empty, with the same columns, when the feed has no items.

    Raises
    ------
    urllib.error.URLError
        If the feed cannot be fetched.
    """
    # Use this URL for Australian centric data. The query is URL-encoded: the
    # previous hand-built "{<query>%coronavirus}" string carried literal braces and
    # a malformed percent-escape, and failed on queries containing spaces.
    encoded_query = quote_plus("{} coronavirus".format(search_query))
    news_url = "https://news.google.com.au/rss/search?q=" + encoded_query
    # The context manager closes the connection even if read() raises.
    with urlopen(news_url) as response:
        xml_page = response.read()
    soup_page = soup(xml_page, "xml")
    news_list = soup_page.findAll("item")

    # Build every row in a single pass so an empty feed still yields a valid
    # (empty) frame instead of an unbound variable.
    records = []
    for news in news_list:
        # Titles are expected to look like "<headline> - <source>"; split on the LAST
        # " - " so hyphens inside the headline or the source name are left alone.
        headline, separator, source = news.title.text.strip().rpartition(" - ")
        if not separator:
            # rpartition() returns the whole string in the last slot when the
            # separator is missing: keep it as the headline, with no source.
            headline, source = source, None
        records.append((news.pubDate.text, headline, source))

    df = pd.DataFrame(records, columns=['Date', 'Headline', 'Source'])
    # Keep only the calendar date of the publication timestamp
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    return df

# Scrape the NAB headlines, print the frame's (rows, columns) shape and display the first rows.
nab_df = covid19_news_scraper('NAB')
print(nab_df.shape)
nab_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-20,"ASX drops 2.5pc as oil prices collapse, while NAB flags $1.1b hit to earnings",ABC News
1,2020-04-20,NAB flags $1.14bn triple hit to H1 result,The West Australian
2,2020-03-24,NAB worker sacked over false coronavirus test,The New Daily
3,2020-04-17,NAB's McEwan vows fight to save jobs in bank restructure,Sydney Morning Herald
4,2020-04-20,Coronavirus credit card cuts: Which banks have slashed rates?,Mozo.com.au


In [5]:
# Scrape the CBA headlines, print the frame's (rows, columns) shape and display the first rows.
cba_df = covid19_news_scraper('CBA')
print(cba_df.shape)
cba_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-14,Hoarding boom is over: CBA says shoppers have closed their wallets,The Age
1,2020-04-16,"Sydney, Melbourne house prices facing 10 per cent fall: CBA",Brisbane Times
2,2020-03-31,Coronavirus: CBA report reveals what we’re buying during pandemic,NEWS.com.au
3,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar
4,2020-04-01,CBA increases coronavirus loan support,7NEWS.com.au


In [6]:
# Scrape the ANZ headlines, print the frame's (rows, columns) shape and display the first rows.
anz_df = covid19_news_scraper('ANZ')
print(anz_df.shape)
anz_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-03-24,ANZ signals further relief for customers during crisis,6PR
1,2020-04-04,'Australia won't look the same': ANZ's Elliott warns coronavirus impact will be generational,The Age
2,2020-04-19,'Everybody is paying': ANZ Bank chairman David Gonski,Brisbane Times
3,2020-03-24,Big bank coronavirus support: How to defer your mortgage repayments,Mozo.com.au
4,2020-04-19,Gonski flags hit to dividend at ANZ,The Australian


In [7]:
# Scrape the Westpac headlines, print the frame's (rows, columns) shape and display the first rows.
wbc_df = covid19_news_scraper('Westpac')
print(wbc_df.shape)
wbc_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-16,Riskier funds have been exposed by coronavirus: Westpac,The Australian Financial Review
1,2020-04-14,"Westpac flags $1.4b hit to earnings, braces for coronavirus impact",Sydney Morning Herald
2,2020-04-15,Westpac survey reveals coronavirus caused consumer confidence to fall,NEWS.com.au
3,2020-04-20,"ASX drops 2.5pc as oil prices collapse, while NAB flags $1.1b hit to earnings",ABC News
4,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar


### Sentiment Analysis on Text
---

In [8]:
def covid19_news_scraper(search_query):
    """
    Fetch Google News (Australia) RSS headlines for a search term combined with "coronavirus".

    Parameters
    ----------
    search_query : str
        Client name or free-text query. It is converted with ``str()`` and
        URL-encoded, so multi-word queries are safe to pass.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` in the feed (100 in the recorded runs) with the
        columns ``Date`` (publication date), ``Headline``, ``Source`` and
        ``Client`` (the search query as a string). The frame is empty, with
        the same columns, when the feed has no items.

    Raises
    ------
    urllib.error.URLError
        If the feed cannot be fetched.
    """
    # Use this URL for Australian centric data. The query is URL-encoded: the
    # previous hand-built "{<query>%coronavirus}" string carried literal braces and
    # a malformed percent-escape, and failed on queries containing spaces.
    encoded_query = quote_plus("{} coronavirus".format(search_query))
    news_url = "https://news.google.com.au/rss/search?q=" + encoded_query
    # The context manager closes the connection even if read() raises.
    with urlopen(news_url) as response:
        xml_page = response.read()
    soup_page = soup(xml_page, "xml")
    news_list = soup_page.findAll("item")

    # Build every row in a single pass so an empty feed still yields a valid
    # (empty) frame instead of an unbound variable.
    records = []
    for news in news_list:
        # Titles are expected to look like "<headline> - <source>"; split on the LAST
        # " - " so hyphens inside the headline or the source name are left alone.
        headline, separator, source = news.title.text.strip().rpartition(" - ")
        if not separator:
            # rpartition() returns the whole string in the last slot when the
            # separator is missing: keep it as the headline, with no source.
            headline, source = source, None
        records.append((news.pubDate.text, headline, source))

    df = pd.DataFrame(records, columns=['Date', 'Headline', 'Source'])
    # Keep only the calendar date of the publication timestamp
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    # Tag every row with the query so frames for several clients can be stacked
    df['Client'] = str(search_query)
    return df

def sentiment_analyser(search_query):
    """
    Scrape the Covid-19 headlines for a query and score each one with VADER.

    Parameters
    ----------
    search_query : str
        Search string handed to ``covid19_news_scraper``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped headline with the columns ``Client``, ``Date``,
        ``Headline``, ``Source`` and ``Composite Score`` (VADER's ``compound``
        polarity value).
    """
    # Create a Covid-19 News DataFrame for the organization of interest
    news_df = covid19_news_scraper(search_query)
    # Initialize VADER Sentiment Intensity Analyzer
    sia = SIA()

    # Score the headlines in row order and keep the scores on the same index as
    # news_df. Joining back on the 'Headline' text duplicated rows whenever a
    # headline appeared more than once, and failed on an empty feed.
    scores = [sia.polarity_scores(headline) for headline in news_df['Headline']]
    sent_df = pd.DataFrame(scores, index=news_df.index,
                           columns=['neg', 'neu', 'pos', 'compound'])
    merge_df = pd.concat([news_df, sent_df], axis=1)

    # Re-order and Rename the columns
    merge_df = merge_df.rename(columns={'compound': 'Composite Score'})
    col_order = ['Client', 'Date', 'Headline', 'Source', 'Composite Score']
    print('Completed processing %s' % search_query, "...")
    return merge_df[col_order]

# Score the CBA headlines, print the frame's (rows, columns) shape and display the first rows.
cba_df = sentiment_analyser('CBA')
print(cba_df.shape)
cba_df.head()

Completed processing CBA ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,CBA,2020-04-14,Hoarding boom is over: CBA says shoppers have closed their wallets,The Age,0.0
1,CBA,2020-04-16,"Sydney, Melbourne house prices facing 10 per cent fall: CBA",Brisbane Times,0.0
2,CBA,2020-03-31,Coronavirus: CBA report reveals what we’re buying during pandemic,NEWS.com.au,0.0
3,CBA,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar,0.0
4,CBA,2020-04-01,CBA increases coronavirus loan support,7NEWS.com.au,0.4019


In [9]:
# Score the NAB headlines, print the frame's (rows, columns) shape and display the first rows.
nab_df = sentiment_analyser('NAB')
print(nab_df.shape)
nab_df.head()

Completed processing NAB ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,NAB,2020-04-20,"ASX drops 2.5pc as oil prices collapse, while NAB flags $1.1b hit to earnings",ABC News,-0.4939
1,NAB,2020-04-20,NAB flags $1.14bn triple hit to H1 result,The West Australian,0.0
2,NAB,2020-03-24,NAB worker sacked over false coronavirus test,The New Daily,0.0
3,NAB,2020-04-17,NAB's McEwan vows fight to save jobs in bank restructure,Sydney Morning Herald,0.1531
4,NAB,2020-04-20,Coronavirus credit card cuts: Which banks have slashed rates?,Mozo.com.au,-0.128


In [10]:
# Score the ANZ headlines, print the frame's (rows, columns) shape and display the first rows.
anz_df = sentiment_analyser('ANZ')
print(anz_df.shape)
anz_df.head()

Completed processing ANZ ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,ANZ,2020-03-24,ANZ signals further relief for customers during crisis,6PR,-0.25
1,ANZ,2020-04-04,'Australia won't look the same': ANZ's Elliott warns coronavirus impact will be generational,The Age,-0.1027
2,ANZ,2020-04-19,'Everybody is paying': ANZ Bank chairman David Gonski,Brisbane Times,0.0
3,ANZ,2020-03-24,Big bank coronavirus support: How to defer your mortgage repayments,Mozo.com.au,0.128
4,ANZ,2020-04-19,Gonski flags hit to dividend at ANZ,The Australian,0.0


In [11]:
# Score the Westpac headlines, print the frame's (rows, columns) shape and display the first rows.
wbc_df = sentiment_analyser('Westpac')
print(wbc_df.shape)
wbc_df.head()

Completed processing Westpac ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,Westpac,2020-04-16,Riskier funds have been exposed by coronavirus: Westpac,The Australian Financial Review,-0.4019
1,Westpac,2020-04-14,"Westpac flags $1.4b hit to earnings, braces for coronavirus impact",Sydney Morning Herald,0.0
2,Westpac,2020-04-15,Westpac survey reveals coronavirus caused consumer confidence to fall,NEWS.com.au,0.5106
3,Westpac,2020-04-20,"ASX drops 2.5pc as oil prices collapse, while NAB flags $1.1b hit to earnings",ABC News,-0.4939
4,Westpac,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar,0.0


### Modify the code to work for multiple clients
---

In [12]:
def covid19_news_scraper(search_query):
    """
    Fetch Google News (Australia) RSS headlines for a search term combined with "coronavirus".

    Parameters
    ----------
    search_query : str
        Client name or free-text query. It is converted with ``str()`` and
        URL-encoded, so multi-word queries are safe to pass.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` in the feed (100 in the recorded runs) with the
        columns ``Date`` (publication date), ``Headline``, ``Source`` and
        ``Client`` (the search query as a string). The frame is empty, with
        the same columns, when the feed has no items.

    Raises
    ------
    urllib.error.URLError
        If the feed cannot be fetched.
    """
    # Use this URL for Australian centric data. The query is URL-encoded: the
    # previous hand-built "{<query>%coronavirus}" string carried literal braces and
    # a malformed percent-escape, and failed on queries containing spaces.
    encoded_query = quote_plus("{} coronavirus".format(search_query))
    news_url = "https://news.google.com.au/rss/search?q=" + encoded_query
    # The context manager closes the connection even if read() raises.
    with urlopen(news_url) as response:
        xml_page = response.read()
    soup_page = soup(xml_page, "xml")
    news_list = soup_page.findAll("item")

    # Build every row in a single pass so an empty feed still yields a valid
    # (empty) frame instead of an unbound variable.
    records = []
    for news in news_list:
        # Titles are expected to look like "<headline> - <source>"; split on the LAST
        # " - " so hyphens inside the headline or the source name are left alone.
        headline, separator, source = news.title.text.strip().rpartition(" - ")
        if not separator:
            # rpartition() returns the whole string in the last slot when the
            # separator is missing: keep it as the headline, with no source.
            headline, source = source, None
        records.append((news.pubDate.text, headline, source))

    df = pd.DataFrame(records, columns=['Date', 'Headline', 'Source'])
    # Keep only the calendar date of the publication timestamp
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    # Tag every row with the query so frames for several clients can be stacked
    df['Client'] = str(search_query)
    return df

def sentiment_analyser(search_query):
    """
    Scrape the Covid-19 headlines for a query and score each one with VADER.

    Parameters
    ----------
    search_query : str
        Search string handed to ``covid19_news_scraper``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped headline with the columns ``Client``, ``Date``,
        ``Headline``, ``Source``, ``VADER Score`` (VADER's ``compound``
        polarity value) and the ``neg`` / ``neu`` / ``pos`` component scores.
    """
    # Create a Covid-19 News DataFrame for the organization of interest
    news_df = covid19_news_scraper(search_query)
    # Initialize VADER Sentiment Intensity Analyzer
    sia = SIA()

    # Score the headlines in row order and keep the scores on the same index as
    # news_df. Joining back on the 'Headline' text duplicated rows whenever a
    # headline appeared more than once, and failed on an empty feed.
    scores = [sia.polarity_scores(headline) for headline in news_df['Headline']]
    sent_df = pd.DataFrame(scores, index=news_df.index,
                           columns=['neg', 'neu', 'pos', 'compound'])
    merge_df = pd.concat([news_df, sent_df], axis=1)

    # Re-order and Rename the columns
    merge_df = merge_df.rename(columns={'compound': 'VADER Score'})
    col_order = ['Client', 'Date', 'Headline', 'Source', 'VADER Score', 'neg', 'neu', 'pos']
    print('Completed processing %s' % search_query, "...")
    return merge_df[col_order]

def client_c19_news_agg(client_list):
    """
    Run ``sentiment_analyser`` for every client and stack the per-client frames.

    Input: iterable of client names / search queries
    Output: one DataFrame holding the scored headlines of all clients; each
    client's rows keep the index they had in their own frame.
    """
    per_client_frames = []
    for client_name in client_list:
        per_client_frames.append(sentiment_analyser(client_name))
    combined = pd.concat(per_client_frames)

    # Blank line, then a short legend explaining how to read the score column
    print()
    print("VADER Score is a Normalized Weighted Sentiment Composite Score that ranges from +1 (Extremely Positive) to -1 (Extremely Negative)")
    return combined

# Score the headlines of the four banks, stack them into one frame and display ten random rows.
clients = ['CBA', 'NAB', 'Westpac', 'ANZ']
df = client_c19_news_agg(clients)
df.sample(10)

Completed processing CBA ...
Completed processing NAB ...
Completed processing Westpac ...
Completed processing ANZ ...

VADER Score is a Normalized Weighted Sentiment Composite Score that ranges from +1 (Extremely Positive) to -1 (Extremely Negative)


Unnamed: 0,Client,Date,Headline,Source,VADER Score,neg,neu,pos
82,CBA,2020-04-03,Why Commonwealth Bank shares are a no-brainer for income investors,Motley Fool Australia,0.296,0.0,0.784,0.216
84,CBA,2020-04-04,Banks urged to 'stretch' to help crisis customers,The Age,-0.34,0.32,0.469,0.211
20,ANZ,2020-04-14,Coronavirus support: What the Big Four banks are offering households and businesses,The New Daily,0.4019,0.0,0.803,0.197
4,Westpac,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar,0.0,0.0,1.0,0.0
44,Westpac,2020-04-09,Westpac supports Government Loan Guarantee Scheme,Mirage News,0.5423,0.0,0.471,0.529
1,CBA,2020-04-16,"Sydney, Melbourne house prices facing 10 per cent fall: CBA",Brisbane Times,0.0,0.0,1.0,0.0
14,ANZ,2020-03-26,Coronavirus pushes up contactless payment limit to contain virus spread,7NEWS.com.au,0.0,0.0,1.0,0.0
41,NAB,2020-03-28,NAB offers SMEs new $250000 low-rate loans,Mirage News,0.0,0.0,1.0,0.0
50,ANZ,2020-04-02,Australian banks face calls to scrap dividends during coronavirus crisis,The New Daily,-0.6249,0.313,0.687,0.0
42,NAB,2020-04-19,"U.S. Lawmakers Make Push to Assist Local Newspapers, Broadcasters",The New York Times,0.0,0.0,1.0,0.0


In [13]:
# (rows, columns) of the combined frame
df.shape

(400, 8)

### Topic Analysis on Text
---

In [14]:
# Load spaCy's 'en_core_web_sm' English pipeline (the model must already be installed)
nlp = spacy.load('en_core_web_sm')

#### Entity Recognition
---

In [15]:
# Print the first five headlines of each client
for client in clients:
    print(df.loc[df['Client'] == client, 'Headline'].head())

0    Hoarding boom is over: CBA says shoppers have closed their wallets 
1    Sydney, Melbourne house prices facing 10 per cent fall: CBA        
2    Coronavirus: CBA report reveals what we’re buying during pandemic  
3    Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall   
4    CBA increases coronavirus loan support                             
Name: Headline, dtype: object
0    ASX drops 2.5pc as oil prices collapse, while NAB flags $1.1b hit to earnings 
1    NAB flags $1.14bn triple hit to H1 result                                     
2    NAB worker sacked over false coronavirus test                                 
3    NAB's McEwan vows fight to save jobs in bank restructure                      
4    Coronavirus credit card cuts: Which banks have slashed rates?                 
Name: Headline, dtype: object
0    Riskier funds have been exposed by coronavirus: Westpac                       
1    Westpac flags $1.4b hit to earnings, braces for coronavirus impact

In [16]:
# Render the named entities spaCy finds in every headline of every client.
# Iterate over the headlines themselves: looking up a hard-coded range(100) of
# index labels raised KeyError whenever a client had fewer than 100 rows.
for client in clients:
    for headline in df.loc[df.Client == client, 'Headline']:
        doc = nlp(headline)
        spacy.displacy.render(doc, style='ent', jupyter=True)

In [17]:
# Filters used by spacy_tokenizer below.
# punctuations is a single string, so `token in punctuations` is a substring test.
punctuations = string.punctuation
# spaCy's English stop words, materialised as a list
stopwords = list(STOP_WORDS)

#### Lemmatization
---

In [19]:
# For the first 10 headlines of each client: replace every token by its lemma,
# re-parse the lemmatized text and render the entities found in it.
# head(10) tolerates clients with fewer than 10 rows, where looking up the
# hard-coded labels 0..9 raised KeyError.
for client in clients:
    for headline in df.loc[df.Client == client, 'Headline'].head(10):
        doc = nlp(headline)
        review = " ".join(token.lemma_ for token in doc)
        doc = nlp(review)
        spacy.displacy.render(doc, style='ent', jupyter=True)

In [20]:
parser = English()


def spacy_tokenizer(sentence):
    """
    Normalise a sentence into a space-separated string of filtered lemmas.

    Each token is replaced by its lower-cased, stripped lemma; tokens whose lemma
    is the "-PRON-" placeholder fall back to their lower-cased text. Entries found
    in ``stopwords`` or contained in the ``punctuations`` string are dropped.
    """
    normalised = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalised.append(token.lower_)
        else:
            normalised.append(token.lemma_.lower().strip())

    kept = []
    for lemma in normalised:
        if lemma in stopwords or lemma in punctuations:
            continue
        kept.append(lemma)
    return " ".join(kept)

In [21]:
# Register tqdm's progress_apply on pandas objects, then normalise every headline.
# NOTE: this overwrites df["Headline"] in place, so the raw headlines are no longer
# available afterwards and re-running the cell tokenizes already-tokenized text.
tqdm.pandas()
df["Headline"] = df["Headline"].progress_apply(spacy_tokenizer)
df.head()

100%|██████████| 400/400 [00:00<00:00, 1657.07it/s]


Unnamed: 0,Client,Date,Headline,Source,VADER Score,neg,neu,pos
0,CBA,2020-04-14,hoarding boom cba shopper close wallet,The Age,0.0,0.0,1.0,0.0
1,CBA,2020-04-16,sydney melbourne house price face 10 cent fall cba,Brisbane Times,0.0,0.0,1.0,0.0
2,CBA,2020-03-31,coronavirus cba report reveal buy pandemic,NEWS.com.au,0.0,0.0,1.0,0.0
3,CBA,2020-03-23,coronavirus asx coles bhp cba westpac nab afterpay fall,Canstar,0.0,0.0,1.0,0.0
4,CBA,2020-04-01,cba increase coronavirus loan support,7NEWS.com.au,0.4019,0.0,0.597,0.403


In [22]:
# Inspect the last rows of the frame after tokenization
df.tail()

Unnamed: 0,Client,Date,Headline,Source,VADER Score,neg,neu,pos
95,ANZ,2020-03-23,australian plenty place turn free financial assistance come month,Daily Telegraph,0.5106,0.0,0.82,0.18
96,ANZ,2020-04-02,commbank offer interest refund home loan credit card customer,Savings.com.au,0.6808,0.0,0.616,0.384
97,ANZ,2020-04-01,australian stock market jump 3.5 cent investor pin hope major bank,9News,0.4215,0.0,0.823,0.177
98,ANZ,2020-04-05,important step home loan customer coronavirus pandemic,PerthNow,0.2023,0.0,0.87,0.13
99,ANZ,2020-04-03,9 place hire worker right,7NEWS.com.au,0.0,0.0,1.0,0.0


### Topic Modeling with sklearn - to be redone using GENSIM
---

In [24]:
# Creating a vectorizer: bag-of-words counts over the tokenized headlines.
# Terms must occur in at least 5 headlines and in at most 90% of them; a token is
# three or more letters/hyphens. The pattern is a raw string because '\-' in a
# normal string literal is an invalid escape sequence (the regex is unchanged).
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english',
                             lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df["Headline"])
# Number of topics extracted by each of the models below
NUM_TOPICS = 10

In [25]:
# Latent Dirichlet Allocation Model
# random_state pins the stochastic online fit so the topics printed below can be
# reproduced on a fresh run.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                verbose=True, random_state=42)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [26]:
# Non-Negative Matrix Factorization Model
# random_state fixes the randomised parts of the fit so results are repeatable.
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
data_nmf = nmf.fit_transform(data_vectorized)

In [27]:
# Latent Semantic Indexing Model using Truncated SVD
# random_state fixes the randomised SVD solver so results are repeatable.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
data_lsi = lsi.fit_transform(data_vectorized)

In [28]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """
    Print the ``top_n`` highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Fitted decomposition model; each row of ``components_`` holds the term
        weights of one topic.
    vectorizer : object exposing ``get_feature_names_out()`` or ``get_feature_names()``
        Fitted vectorizer that maps column positions to terms.
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        The function only prints: "Topic <idx>:" followed by a list of
        ``(term, weight)`` tuples ordered by decreasing weight.
    """
    # Newer scikit-learn releases only provide get_feature_names_out(); fall back
    # to the legacy get_feature_names() on older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Resolve the vocabulary once instead of once per printed keyword
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending; the reversed slice yields the top_n largest weights
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(str(feature_names[i]), topic[i]) for i in top_indices])

In [29]:
# Keywords for topics clustered by Latent Dirichlet Allocation (top 10 terms per topic)
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('coronavirus', 12.108495026471733), ('cut', 9.691777020916298), ('rate', 9.467867584769303), ('dividend', 8.31703132216418), ('buy', 8.2906412593394), ('bank', 7.350987947705607), ('spend', 4.515033052400761), ('credit', 4.328008287618024), ('card', 4.319301902767212), ('asx', 3.8753238060334017)]
Topic 1:
[('crisis', 17.912544085768673), ('worker', 15.891386229007685), ('loan', 14.75646489665657), ('customer', 12.567537536462645), ('nab', 11.880317760957944), ('coronavirus', 11.327298154182317), ('credit', 10.800468250671095), ('card', 10.749134054743365), ('home', 10.56127183100596), ('offer', 10.464030505740533)]
Topic 2:
[('coronavirus', 26.730100748468928), ('pandemic', 19.62417912777104), ('home', 10.410343475253306), ('loan', 10.319935300544417), ('job', 9.920832428539269), ('customer', 8.689824171649104), ('relief', 7.806980415282623), ('hit', 6.1603713667008275), ('reveal', 5.77937928566801), ('million', 5.115857519621089)]
Topic 3:
[('bank', 25.330438152

In [30]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('coronavirus', 4.716631291649473), ('crisis', 0.4602247605758719), ('australia', 0.3883804996688655), ('pandemic', 0.2767452569078068), ('impact', 0.1910020102662019), ('job', 0.18715867856535462), ('push', 0.15451522326062977), ('spread', 0.1421902751359621), ('limit', 0.1291827339391156), ('payment', 0.12734347403907278)]
Topic 1:
[('bank', 3.3111000923531275), ('mortgage', 0.5244241624572922), ('commonwealth', 0.4075055501015735), ('branch', 0.3866712266332575), ('hold', 0.3714071642259443), ('amid', 0.36546115307722676), ('close', 0.3542282649246589), ('freeze', 0.33800595964550184), ('payment', 0.28855807174060233), ('australian', 0.20983251834066496)]
Topic 2:
[('nab', 3.69867366207072), ('branch', 0.34541373744897397), ('flag', 0.27173816720180805), ('close', 0.2693354587914116), ('collapse', 0.2544582305076172), ('low', 0.21475336375890328), ('commonwealth', 0.1473793882953752), ('consumer', 0.14273391971702853), ('oil', 0.14245888943432142), ('worker', 0.

In [31]:
# Keywords for topics clustered by Latent Semantic Indexing
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('coronavirus', 0.8177223413434412), ('bank', 0.37685601691614484), ('australian', 0.13530716993321396), ('business', 0.11794244523238191), ('nab', 0.11357574304847509), ('asx', 0.10011495434283109), ('westpac', 0.09189683618615513), ('loan', 0.08374972064393095), ('crisis', 0.08229721254911022), ('big', 0.08148087225333067)]
Topic 1:
[('bank', 0.7579592833411971), ('big', 0.2074186409429326), ('business', 0.1516377345353304), ('dividend', 0.12301603292445024), ('offer', 0.11549376650075129), ('mortgage', 0.11113105364385024), ('loan', 0.10432004018498592), ('support', 0.10180817418037655), ('commonwealth', 0.09079039289628142), ('hold', 0.07870549215379484)]
Topic 2:
[('asx', 0.47852646893201184), ('nab', 0.4248689893124374), ('westpac', 0.2880763440141423), ('dividend', 0.1984529657921083), ('hit', 0.17952462867900756), ('share', 0.15551583522016152), ('buy', 0.13703797455228325), ('fall', 0.1319743472268919), ('cba', 0.13126111780257665), ('price', 0.12776017350

In [32]:
# Prepare the pyLDAvis dashboard for the fitted LDA model (mds='tsne') and display
# it as the cell's output.
# NOTE(review): newer pyLDAvis releases appear to have renamed the pyLDAvis.sklearn
# module — confirm against the installed version.
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash