In [1]:
import requests
import pandas as pd
import numpy as np
import string
import warnings
import matplotlib.pyplot as pltimport 
import pyLDAvis.sklearn

from pprint import pprint
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
from tqdm import tqdm
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig

warnings.filterwarnings('ignore')
%matplotlib inline

  formatvalue=lambda value: "")[1:-1]


In [2]:
# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
#!python3 -m spacy download en_core_web_sm

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

### Build the Google News Scraper using BeautifulSoup
---

In [4]:
def covid19_news_scraper(search_query):
    """
    Pass in a client name or search query and returns last 100 headlines associating the client with Covid-19   
    """
    # Use this URL for Australian centric data
    news_url = "https://news.google.com.au/rss/search?q={"+str(search_query)+"%coronavirus}"
    Client = urlopen(news_url)
    xml_page = Client.read()
    Client.close()
    # Beautiful Soup Library is the bomb
    soup_page = soup(xml_page,"xml")
    news_list = soup_page.findAll("item")
    # Two separate lists for News Title and Publication Date
    l1 = []
    l2 = []
    for news in news_list:
        # Append to a list
        l1.append(news.title.text)
        l2.append(news.pubDate.text)
        # Zip the two together
        l_tup = list(zip(l1, l2))
    # Save this to a DataFrame
    df = pd.DataFrame(l_tup, columns=['Title', 'Date'])
    # Select Date of Headline
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    # Split the Title into Headline and Source columns and then drop the 'Title' column
    df[['Headline','Source']] = df['Title'].str.rsplit("-",1,expand=True)
    df.drop('Title', axis=1, inplace=True)
    return df

nab_df = covid19_news_scraper('NAB')
print(nab_df.shape)
nab_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-17,NAB's McEwan vows fight to save jobs in bank restructure,Sydney Morning Herald
1,2020-03-24,NAB worker sacked over false coronavirus test,The New Daily
2,2020-04-14,"A huge hit to the business sector from coronavirus, NAB anticipates",Property Observer
3,2020-04-14,"ASX lifts, while collapse in business confidence points to 'unprecedented' recession: NAB report",ABC News
4,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar


In [5]:
cba_df = covid19_news_scraper('CBA')
print(cba_df.shape)
cba_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-16,"Sydney, Melbourne house prices facing 10 per cent fall: CBA",Brisbane Times
1,2020-04-14,Hoarding boom is over: CBA says shoppers have closed their wallets,The Age
2,2020-03-24,Business orders crash on coronavirus: CBA index,The Australian Financial Review
3,2020-03-31,Coronavirus: CBA report reveals what we’re buying during pandemic,NEWS.com.au
4,2020-04-01,CBA increases coronavirus loan support,7NEWS.com.au


In [6]:
anz_df = covid19_news_scraper('ANZ')
print(anz_df.shape)
anz_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-19,'Everybody is paying': ANZ Bank chairman David Gonski,Brisbane Times
1,2020-03-20,"The big four banks are letting borrowers hit pause on their payments, but this is no mortgage holiday",ABC News
2,2020-04-04,'Australia won't look the same': ANZ's Elliott warns coronavirus impact will be generational,The Age
3,2020-03-24,ANZ signals further relief for customers during crisis,6PR
4,2020-04-19,Loan relief requests due to COVID-19 continue to pile up,Tasmania Examiner


In [7]:
wbc_df = covid19_news_scraper('Westpac')
print(wbc_df.shape)
wbc_df.head()

(100, 3)


Unnamed: 0,Date,Headline,Source
0,2020-04-16,Riskier funds have been exposed by coronavirus: Westpac,The Australian Financial Review
1,2020-04-14,"Westpac flags $1.4b hit to earnings, braces for coronavirus impact",Sydney Morning Herald
2,2020-04-15,Westpac survey reveals coronavirus caused consumer confidence to fall,NEWS.com.au
3,2020-04-18,Coronavirus money help: How government and banks like Westpac can help,NEWS.com.au
4,2020-03-20,"The big four banks are letting borrowers hit pause on their payments, but this is no mortgage holiday",ABC News


### Sentiment Analyis on Text
---

In [8]:
def covid19_news_scraper(search_query):
    """
    Pass in a client name or search query and returns last 100 headlines associating the client with Covid-19   
    """
    # Use this URL for Australian centric data
    news_url = "https://news.google.com.au/rss/search?q={"+str(search_query)+"%coronavirus}"
    Client = urlopen(news_url)
    xml_page = Client.read()
    Client.close()
    # Beautiful Soup Library is the bomb
    soup_page = soup(xml_page,"xml")
    news_list = soup_page.findAll("item")
    
    # Two separate lists for News Title and Publication Date
    l1 = []
    l2 = []
    for news in news_list:
        # Append to a list
        l1.append(news.title.text)
        l2.append(news.pubDate.text)
        # Zip the two together
        l_tup = list(zip(l1, l2))
    
    # Save this to a DataFrame
    df = pd.DataFrame(l_tup, columns=['Title', 'Date'])
    # Select Date of Headline
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    # Split the Title into Headline and Source columns and then drop the 'Title' column
    df[['Headline','Source']] = df['Title'].str.rsplit("-",1,expand=True)
    df.drop('Title', axis=1, inplace=True)
    df['Client'] = str(search_query)
    return df

def sentiment_analyser(search_query):
    """
    Runs a Google News Search on the input string and then uses VADER sentiment analysis engine on each returned headline.
    Input: Search Query String
    Output: DataFrame with compound sentiment score for each news article
    """
    # Create a Covid-19 News DataFrame for each organization of interest
    news_df = covid19_news_scraper(search_query)
    # Initialize VADER Sentiment Intensity Analyzer 
    sia = SIA()
    results = []

    # Calculate the polarity score for each headline associated with the organization
    for row in news_df['Headline']:
        pol_score = sia.polarity_scores(row)
        pol_score['Headline'] = row
        results.append(pol_score)
    
    # Create the Sentiment DataFrame
    sent_df = pd.DataFrame.from_records(results)
    # Merge the two dataframes together on the 'Headline' column
    merge_df = news_df.merge(sent_df, on='Headline')
    # Re-order and Rename the columns
    merge_df = merge_df.rename(columns={'compound':'Composite Score'})
    col_order = ['Client','Date','Headline','Source','Composite Score']#,'neg','neu','pos']
    print('Completed processing %s' % search_query, "...")
    return merge_df[col_order]

cba_df = sentiment_analyser('CBA')
print(cba_df.shape)
cba_df.head()

Completed processing CBA ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,CBA,2020-04-16,"Sydney, Melbourne house prices facing 10 per cent fall: CBA",Brisbane Times,0.0
1,CBA,2020-04-14,Hoarding boom is over: CBA says shoppers have closed their wallets,The Age,0.0
2,CBA,2020-03-24,Business orders crash on coronavirus: CBA index,The Australian Financial Review,-0.4019
3,CBA,2020-03-31,Coronavirus: CBA report reveals what we’re buying during pandemic,NEWS.com.au,0.0
4,CBA,2020-04-01,CBA increases coronavirus loan support,7NEWS.com.au,0.4019


In [9]:
nab_df = sentiment_analyser('NAB')
print(nab_df.shape)
nab_df.head()

Completed processing NAB ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,NAB,2020-04-17,NAB's McEwan vows fight to save jobs in bank restructure,Sydney Morning Herald,0.1531
1,NAB,2020-03-24,NAB worker sacked over false coronavirus test,The New Daily,0.0
2,NAB,2020-04-14,"A huge hit to the business sector from coronavirus, NAB anticipates",Property Observer,0.3182
3,NAB,2020-04-14,"ASX lifts, while collapse in business confidence points to 'unprecedented' recession: NAB report",ABC News,-0.4019
4,NAB,2020-03-23,"Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall",Canstar,0.0


In [10]:
anz_df = sentiment_analyser('ANZ')
print(anz_df.shape)
anz_df.head()

Completed processing ANZ ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,ANZ,2020-04-19,'Everybody is paying': ANZ Bank chairman David Gonski,Brisbane Times,0.0
1,ANZ,2020-03-20,"The big four banks are letting borrowers hit pause on their payments, but this is no mortgage holiday",ABC News,0.1901
2,ANZ,2020-04-04,'Australia won't look the same': ANZ's Elliott warns coronavirus impact will be generational,The Age,-0.1027
3,ANZ,2020-03-24,ANZ signals further relief for customers during crisis,6PR,-0.25
4,ANZ,2020-04-19,Loan relief requests due to COVID-19 continue to pile up,Tasmania Examiner,0.4767


In [11]:
wbc_df = sentiment_analyser('Westpac')
print(wbc_df.shape)
wbc_df.head()

Completed processing Westpac ...
(100, 5)


Unnamed: 0,Client,Date,Headline,Source,Composite Score
0,Westpac,2020-04-16,Riskier funds have been exposed by coronavirus: Westpac,The Australian Financial Review,-0.4019
1,Westpac,2020-04-14,"Westpac flags $1.4b hit to earnings, braces for coronavirus impact",Sydney Morning Herald,0.0
2,Westpac,2020-04-15,Westpac survey reveals coronavirus caused consumer confidence to fall,NEWS.com.au,0.5106
3,Westpac,2020-04-18,Coronavirus money help: How government and banks like Westpac can help,NEWS.com.au,0.7845
4,Westpac,2020-03-20,"The big four banks are letting borrowers hit pause on their payments, but this is no mortgage holiday",ABC News,0.1901


### Modify the code to work for multiple clients
---

In [12]:
def covid19_news_scraper(search_query):
    """
    Pass in a client name or search query and returns last 100 headlines associating the client with Covid-19   
    """
    # Use this URL for Australian centric data
    news_url = "https://news.google.com.au/rss/search?q={"+str(search_query)+"%coronavirus}"
    Client = urlopen(news_url)
    xml_page = Client.read()
    Client.close()
    # Beautiful Soup Library is the bomb
    soup_page = soup(xml_page,"xml")
    news_list = soup_page.findAll("item")
    
    # Two separate lists for News Title and Publication Date
    l1 = []
    l2 = []
    for news in news_list:
        # Append to a list
        l1.append(news.title.text)
        l2.append(news.pubDate.text)
        # Zip the two together
        l_tup = list(zip(l1, l2))
    
    # Save this to a DataFrame
    df = pd.DataFrame(l_tup, columns=['Title', 'Date'])
    # Select Date of Headline
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    # Split the Title into Headline and Source columns and then drop the 'Title' column
    df[['Headline','Source']] = df['Title'].str.rsplit("-",1,expand=True)
    df.drop('Title', axis=1, inplace=True)
    df['Client'] = str(search_query)
    return df

def sentiment_analyser(search_query):
    """
    Runs a Google News Search on the input string and then uses VADER sentiment analysis engine on each returned headline.
    Input: Search Query String
    Output: DataFrame with compound sentiment score for each news article
    """
    # Create a Covid-19 News DataFrame for each organization of interest
    news_df = covid19_news_scraper(search_query)
    # Initialize VADER Sentiment Intensity Analyzer 
    sia = SIA()
    results = []

    # Calculate the polarity score for each headline associated with the organization
    for row in news_df['Headline']:
        pol_score = sia.polarity_scores(row)
        pol_score['Headline'] = row
        results.append(pol_score)
    
    # Create the Sentiment DataFrame
    sent_df = pd.DataFrame.from_records(results)
    # Merge the two dataframes together on the 'Headline' column
    merge_df = news_df.merge(sent_df, on='Headline')
    # Re-order and Rename the columns
    merge_df = merge_df.rename(columns={'compound':'VADER Score'})
    col_order = ['Client','Date','Headline','Source','VADER Score','neg','neu','pos']
    print('Completed processing %s' % search_query, "...")
    return merge_df[col_order]

def client_c19_news_agg(client_list):
    """
    Provided a list of clients, this pulls up the past 100 covid-19 related news articles on each of them and calculates 
    a Composite Sentiment score for each article related to a client 
    """
    frames = [sentiment_analyser(c) for c in client_list]
    result = pd.concat(frames)
    print()
    print("VADER Score is a Normalized Weighted Sentiment Composite Score that ranges from +1 (Extremely Positive) to -1 (Extremely Negative)")
    return result

clients = ['CBA', 'NAB', 'Westpac', 'ANZ']
df = client_c19_news_agg(clients)
df.sample(10)

Completed processing CBA ...
Completed processing NAB ...
Completed processing Westpac ...
Completed processing ANZ ...

VADER Score is a Normalized Weighted Sentiment Composite Score that ranges from +1 (Extremely Positive) to -1 (Extremely Negative)


Unnamed: 0,Client,Date,Headline,Source,VADER Score,neg,neu,pos
77,CBA,2020-03-31,Where to invest your Commonwealth Bank dividends,Motley Fool Australia,0.0,0.0,1.0,0.0
76,NAB,2020-03-25,Victorians warned of life under lockdown as coronavirus measures tighten,Herald Sun,-0.2732,0.189,0.811,0.0
21,CBA,2020-04-15,"Coronavirus has people buying home improvement items over alcohol, data shows",NEWS.com.au,0.4588,0.0,0.769,0.231
90,NAB,2020-03-24,Renters desperate for relief as 1 million jobs set to disappear due to coronavirus,ABC News,-0.0258,0.243,0.578,0.179
28,ANZ,2020-03-31,ABA expands COVID-19 relief for businesses,Australian Broker,0.5423,0.0,0.471,0.529
71,NAB,2020-03-26,"Coronavirus hit almost half of Australian businesses even before social distancing, ABS reveals",ABC News,0.0,0.0,1.0,0.0
92,NAB,2020-04-07,"Coronavirus economic downturn dashes bank dividend hopes, says UBS",The Australian,0.4215,0.0,0.741,0.259
7,NAB,2020-04-02,"Consumer anxiety on the rise as coronavirus impacts economy, NAB survey shows",Domain News,-0.1779,0.134,0.866,0.0
87,CBA,2020-04-04,Banks urged to 'stretch' to help crisis customers,The Age,-0.34,0.32,0.469,0.211
74,Westpac,2020-04-01,Coronavirus Australia live updates: Five workers in remote Kimberley test positive,The Australian,0.5574,0.0,0.735,0.265


In [20]:
df.shape

(400, 8)

### Topic Analysis on Text
---

In [13]:
# Create a spaCy object
nlp = spacy.load('en_core_web_sm')

#### Entity Recognition
---

In [34]:
for client in clients:
    print(df[df.Client==client]['Headline'].head())

0    Sydney, Melbourne house prices facing 10 per cent fall: CBA        
1    Hoarding boom is over: CBA says shoppers have closed their wallets 
2    Business orders crash on coronavirus: CBA index                    
3    Coronavirus: CBA report reveals what we’re buying during pandemic  
4    CBA increases coronavirus loan support                             
Name: Headline, dtype: object
0    NAB's McEwan vows fight to save jobs in bank restructure                                         
1    NAB worker sacked over false coronavirus test                                                    
2    A huge hit to the business sector from coronavirus, NAB anticipates                              
3    ASX lifts, while collapse in business confidence points to 'unprecedented' recession: NAB report 
4    Coronavirus ASX: Coles, BHP Up, CBA, Westpac, NAB, Afterpay Fall                                 
Name: Headline, dtype: object
0    Riskier funds have been exposed by coronavirus: Westpac

In [42]:
for client in clients:
    for i in range(100):
        doc = nlp(df[df.Client==client]['Headline'][i])
        spacy.displacy.render(doc, style='ent', jupyter=True)

In [43]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

#### Lemmatization
---

In [43]:
for i in range(10):
    doc = nlp(df2['Headline'][i])
    review = str(" ".join([j.lemma_ for j in doc]))
    doc = nlp(review)
    spacy.displacy.render(doc, style='ent',jupyter=True)

In [44]:
parser = English()
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [45]:
tqdm.pandas()
df2["Headline"] = df2["Headline"].progress_apply(spacy_tokenizer)
df2.head()

100%|██████████| 100/100 [00:00<00:00, 949.04it/s]


Unnamed: 0,Headline,Source
0,coronavirus sees cba automatically cut mortgage payments minimum,Canstar
1,commonwealth bank boss spread coronavirus wrong impact global financial crisis,ABC News
2,hoarding boom cba shopper close wallet,The Age
3,business order crash coronavirus cba index,The Australian Financial Review
4,coronavirus cba report reveal buy pandemic,NEWS.com.au


### Topic Modeling
---

In [46]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english',
                             lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df2["Headline"])
NUM_TOPICS = 10

In [47]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [48]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized) 

In [49]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

In [50]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [51]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('coronavirus', 0.6215094512992357), ('mortgage', 0.4378688279865017), ('home', 0.3789002115614898), ('bank', 0.3568725500900768), ('cba', 0.3533836106260866), ('pandemic', 0.3399277946782544), ('covid-', 0.33433098220711976), ('commonwealth', 0.32724327219036947), ('cut', 0.3250315205742598), ('impact', 0.306140610348303)]
Topic 1:
[('coronavirus', 0.45688905973133326), ('commonwealth', 0.3468843734533722), ('australian', 0.3421965840911092), ('buy', 0.3300940819269943), ('cba', 0.3289515423275764), ('banks', 0.3252566898218607), ('asx', 0.3137900826953336), ('business', 0.30160128885678705), ('market', 0.296457743086696), ('cut', 0.2929025756868064)]
Topic 2:
[('coronavirus', 7.163214099590859), ('pandemic', 4.279851879727932), ('home', 3.434502767939538), ('cba', 3.2699996289268847), ('mortgage', 2.184684758976158), ('buy', 2.0290443687574595), ('cut', 1.4135377923477341), ('support', 1.1505285130691476), ('repayment', 1.1396372612621368), ('business', 0.3726212

In [52]:
# Keywords for topics clustered by Latent Semantic Indexing
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('coronavirus', 3.322555840882391), ('buy', 0.24768646980302325), ('pandemic', 0.23454928743329279), ('impact', 0.13903066899498498), ('home', 0.05649988341755386), ('support', 0.0), ('australian', 0.0), ('bank', 0.0), ('banks', 0.0), ('big', 0.0)]
Topic 1:
[('cba', 2.6042345880253452), ('cut', 0.2615336434453031), ('asx', 0.1392582938490559), ('pandemic', 0.1229639434749683), ('support', 0.08871573558364575), ('impact', 0.08384933904583663), ('market', 0.0), ('repayment', 0.0), ('australian', 0.0), ('bank', 0.0)]
Topic 2:
[('bank', 3.066340739035271), ('commonwealth', 1.0572951184085988), ('big', 0.16281111230185774), ('australian', 0.08510346462179846), ('asx', 0.0741862843797162), ('coronavirus', 0.028183889783995105), ('loan', 0.028099673701502795), ('mortgage', 0.010725360386537163), ('impact', 0.004642786183083784), ('banks', 0.0)]
Topic 3:
[('business', 2.210821534480327), ('support', 1.1091481497941587), ('australian', 0.32076239400534773), ('loan', 0.25302

In [53]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('coronavirus', 0.8140455008647512), ('bank', 0.3378159053997171), ('cba', 0.3039441296852416), ('business', 0.14252097305548347), ('loan', 0.11640546684512047), ('crisis', 0.11459614570801907), ('share', 0.10977043031517489), ('commonwealth', 0.10091163209886854), ('big', 0.08899253710111298), ('mortgage', 0.08845379614113864)]
Topic 1:
[('bank', 0.5708338330135357), ('commonwealth', 0.18277460167308873), ('big', 0.17872025310747364), ('crisis', 0.08989711669437683), ('mortgage', 0.04686538256663277), ('coronavirus', 0.03951301886538218), ('australian', 0.03026737830728309), ('repayment', 0.025156224364868716), ('banks', 0.019545599352505633), ('dividend', 0.004281090577142101)]
Topic 2:
[('bank', 0.5392686522873819), ('cba', 0.4102112655621962), ('share', 0.24687345458303195), ('big', 0.21805512996337975), ('commonwealth', 0.18451381427867874), ('dividend', 0.15843609500451414), ('covid-', 0.15770544170378645), ('business', 0.13227901099823255), ('asx', 0.1168529

In [54]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash