In [1]:
#!pip install yahooquery
#!pip install bs4
#!pip install urllib
#!pip install plotly
#!pip install nltk

In [2]:
# libraries for webscraping, parsing and getting stock data
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from yahooquery import Ticker
import time

import requests
import itertools
import numpy as np
from itertools import chain

# for plotting and data manipulation
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px

# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# for getting current date and time to print 'last updated'
from datetime import datetime
import datetime

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\damia\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Get All Tickers from Dow Jones Index

Get all tickers from the table in this Wikipedia page (https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average).

In [3]:
df_dow_jones = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")[1]
df_dow_jones

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,Index weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,2.41%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,3.02%
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31,,5.48%
3,Apple,NASDAQ,AAPL,Information technology,2015-03-19,,2.84%
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,3.36%
5,Caterpillar,NYSE,CAT,Construction and mining,1991-05-06,,4.52%
6,Chevron,NYSE,CVX,Petroleum industry,2008-02-19,Also 1930-07-18 to 1999-11-01,3.50%
7,Cisco,NASDAQ,CSCO,Information technology,2009-06-08,,0.96%
8,Coca-Cola,NYSE,KO,Drink industry,1987-03-12,Also 1932-05-26 to 1935-11-20,1.22%
9,Disney,NYSE,DIS,Broadcasting and entertainment,1991-05-06,,1.89%


In [4]:
tickers = df_dow_jones['Symbol'].tolist()
tickers

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DIS',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT']

# Scrape the Date, Time and News Headlines Data

Explanation of data scraping and sentiment analysis for the next few cells available here: https://medium.datadriveninvestor.com/sentiment-analysis-of-stocks-from-financial-news-using-python-82ebdcefb638

In [5]:
# Scrape the Date, Time and News Headlines Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    print(ticker)
    url = finwiz_url + ticker
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})

    try:
       response = urlopen(req)   
    except:
       time.sleep(10) # if there is an error and request is blocked, do it more slowly by waiting for 10 seconds before requesting again
       response = urlopen(req)  
        
    # Read the contents of the file into 'html'
    html = BeautifulSoup(response)
    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    # Add the table to our dictionary
    news_tables[ticker] = news_table

MMM
AXP
AMGN
AAPL
BA
CAT
CVX
CSCO
KO
DIS
DOW
GS
HD
HON
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PG
CRM
TRV
UNH
VZ
V
WBA
WMT


# Parse the Date, Time and News Headlines into a Python List

In [6]:
# Parse the Date, Time and News Headlines into a Python List
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.a.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element
        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        # else load 'date' as the 1st element and 'time' as the second    
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        # Extract the ticker from the file name, get the string up to the 1st '_'  
        ticker = file_name.split('_')[0]
        print(ticker)
        
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])
        
parsed_news[:5] # print first 5 rows of news

MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
MMM
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AXP
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN
AMGN


V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
V
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WBA
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT
WMT


[['MMM',
  'Nov-02-23',
  '09:00AM',
  '3M Company (MMM) Is a Trending Stock: Facts to Know Before Betting on It'],
 ['MMM',
  'Nov-01-23',
  '10:12AM',
  '3 Dividend Kings Whose Growth Streaks Could Break Before 2030'],
 ['MMM', 'Oct-30-23', '11:53AM', 'Is 3M Stock a Buy?'],
 ['MMM', 'Oct-30-23', '06:19AM', '11 Best Stocks to Buy for Income'],
 ['MMM', 'Oct-28-23', '06:00AM', "Why 3M's Long-Term Decline Will Continue"]]

# Perform Sentiment Analysis with Vader

In [7]:
# Perform Sentiment Analysis with Vader
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
today = datetime.date.today()
parsed_and_scored_news = parsed_and_scored_news.replace("Today", today)
#parsed_and_scored_news.date.unique()

# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)

# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
parsed_and_scored_news.head()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,MMM,2023-11-02,09:00AM,3M Company (MMM) Is a Trending Stock: Facts to...,0.0,1.0,0.0,0.0
1,MMM,2023-11-01,10:12AM,3 Dividend Kings Whose Growth Streaks Could Br...,0.0,0.755,0.245,0.3818
2,MMM,2023-10-30,11:53AM,Is 3M Stock a Buy?,0.0,1.0,0.0,0.0
3,MMM,2023-10-30,06:19AM,11 Best Stocks to Buy for Income,0.0,0.588,0.412,0.6369
4,MMM,2023-10-28,06:00AM,Why 3M's Long-Term Decline Will Continue,0.0,1.0,0.0,0.0


In [8]:
parsed_and_scored_news

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,MMM,2023-11-02,09:00AM,3M Company (MMM) Is a Trending Stock: Facts to...,0.000,1.000,0.000,0.0000
1,MMM,2023-11-01,10:12AM,3 Dividend Kings Whose Growth Streaks Could Br...,0.000,0.755,0.245,0.3818
2,MMM,2023-10-30,11:53AM,Is 3M Stock a Buy?,0.000,1.000,0.000,0.0000
3,MMM,2023-10-30,06:19AM,11 Best Stocks to Buy for Income,0.000,0.588,0.412,0.6369
4,MMM,2023-10-28,06:00AM,Why 3M's Long-Term Decline Will Continue,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...
2995,WMT,2023-10-25,05:36AM,The reason weight loss drug use tomorrow is su...,0.312,0.688,0.000,-0.6124
2996,WMT,2023-10-24,05:45PM,Walmart (WMT) Beats Stock Market Upswing: What...,0.000,1.000,0.000,0.0000
2997,WMT,2023-10-24,11:25AM,Walmart Chooses Georgia Site for Dairy Facility,0.000,1.000,0.000,0.0000
2998,WMT,2023-10-24,10:30AM,The Trade Desk (TTD) Benefits from Expanding C...,0.000,0.755,0.245,0.3818


# Calculate Mean Sentiment for Each Ticker

In [9]:
# Group by each ticker and get the mean of all sentiment scores
mean_scores = parsed_and_scored_news.groupby(['ticker']).mean()
mean_scores

  mean_scores = parsed_and_scored_news.groupby(['ticker']).mean()


Unnamed: 0_level_0,neg,neu,pos,compound
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,0.08038,0.81169,0.10794,0.037388
AMGN,0.05155,0.86552,0.08295,0.022988
AXP,0.01976,0.83012,0.15012,0.217738
BA,0.09816,0.83769,0.06415,-0.068298
CAT,0.03334,0.87225,0.09443,0.098926
CRM,0.01948,0.82573,0.15479,0.223908
CSCO,0.03181,0.88055,0.08763,0.092196
CVX,0.07302,0.83237,0.0946,0.031193
DIS,0.06319,0.85233,0.08449,0.034208
DOW,0.02495,0.86509,0.10996,0.117624


# Get Market Cap, Sector and Industry of each Ticker

The yahooquery python library contains a great deal of information about a company, including those we want. https://pypi.org/project/yahooquery/

In [11]:
# get the price, sector and industry of each ticker using the yahooquery api (UPDATE: yahooquery api no longer works)
# UPDATE: yahooquery api no longer works so this code instead scrapes the finviz website for the marketcap, sector and industry of each ticker

def get_ticker_data(ticker):
    data={}
    url_base = "https://finviz.com/quote.ashx?t="
    url = url_base + ticker
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    l = []
    # parse all values in table into dict
    for i, row in enumerate(soup.select('.snapshot-td2')):
        #print(row)
        #l.append([td.text for td in row.select(""'td')])
        if i%2 == 0:
            key = row.text
        else:   
            data[key]= row.text
    data['Sector'] = soup.select('.quote-links')[0].select('.tab-link')[0].text
    data['Industry'] = soup.select('.quote-links')[0].select('.tab-link')[1].text
    if data['Market Cap'][-1] == 'B':
        data['Market Cap'] = float(data['Market Cap'][:-1])*10e9
    elif data['Market Cap'][-1] == 'M':
        data['Market Cap'] = float(data['Market Cap'][:-1])*10e6
    else:
        pass
    return data

tickers_data = Ticker(tickers)
tickers_summary = tickers_data.summary_detail
tickers_profile = tickers_data.asset_profile

sectors = []
industries = []
marketcap = []

for ticker in tickers:
    print(ticker)
    data = get_ticker_data(ticker)
    #print(data)
    marketcap.append(data['Market Cap'])
    sectors.append(data['Sector'])
    industries.append(data['Industry'])

MMM
AXP
AMGN
AAPL
BA
CAT
CVX
CSCO
KO
DIS
DOW
GS
HD
HON
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PG
CRM
TRV
UNH
VZ
V
WBA
WMT


# Combine the Information Above and the Corresponding Tickers into a DataFrame

In [None]:
# dictionary {'column name': list of values for column} to be converted to dataframe
d = {'Symbol': tickers, 'Sector': sectors, 'Industry': industries, 'Market Cap': marketcap}
# create dataframe from 
df_info = pd.DataFrame(data=d)
df_info

# Get Names of Companies from the Dow Jones DataFrame obtained Earlier

In [None]:
df_info_name = df_info.merge(df_dow_jones[['Company', 'Symbol']], on = 'Symbol')
df_info_name

# Join Stock Information and Sentiment Information

Join the mean_scores (which stores sentiment info) and the df_info_name (which stores company info above) DataFrames by matching the 'ticker' column in mean_scores DataFrame with the 'Symbol' column in df_info_name Dataframe.

In [None]:
df = mean_scores.merge(df_info_name, left_on = 'ticker', right_on = 'Symbol')
df = df.rename(columns={"compound": "Sentiment Score", "neg": "Negative", "neu": "Neutral", "pos": "Positive"})
df

# Generate the Treemap Plot!

In [None]:
# group data into sectors at the highest level, breaks it down into industry, and then ticker, specified in the 'path' parameter
# the 'values' parameter uses the value of the column to determine the relative size of each box in the chart
# the color of the chart follows the sentiment score
# when the mouse is hovered over each box in the chart, the negative, neutral, positive and overall sentiment scores will all be shown
# the color is red (#ff0000) for negative sentiment scores, black (#000000) for 0 sentiment score and green (#00FF00) for positive sentiment scores
fig = px.treemap(df, path=[px.Constant("Dow Jones"), 'Sector', 'Industry', 'Symbol'], values='Market Cap',
                  color='Sentiment Score', hover_data=['Company', 'Negative', 'Neutral', 'Positive', 'Sentiment Score'],
                  color_continuous_scale=['#FF0000', "#000000", '#00FF00'],
                  color_continuous_midpoint=0)

fig.data[0].customdata = df[['Company', 'Negative', 'Neutral', 'Positive', 'Sentiment Score']].round(3) # round to 3 decimal places
fig.data[0].texttemplate = "%{label}<br>%{customdata[4]}"

fig.update_traces(textposition="middle center")
fig.update_layout(margin = dict(t=30, l=10, r=10, b=10), font_size=20)

fig.show()

# Get Current Date, Time and Timezone

In [None]:
# datetime object containing current date and time
now = datetime.datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
dt_string

In [None]:
timezone_string = datetime.datetime.now().astimezone().tzname()
timezone_string

# Generate HTML File with Updated Time and Treemap

In [None]:
with open('dow_jones_live_sentiment.html', 'a') as f:
    f.truncate(0) # clear file if something is already written on it
    title = "<h1>Dow Jones Stock Sentiment Dashboard</h1>"
    updated = "<h2>Last updated: " + dt_string + " (Timezone: " + timezone_string + ")</h2>"
    description = "This dashboard is updated every half an hour with sentiment analysis performed on latest scraped news headlines from the FinViz website."
    f.write(title + updated + description)
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn')) # write the fig created above into the html file

In [None]:
with open('dow_jones_live_sentiment.html', 'a') as f:
    f.truncate(0) # clear file if something is already written on it
    title = "<h1>Dow Jones Stock Sentiment Dashboard</h1>"
    updated = "<h2>Last updated: " + dt_string + " (Timezone: " + timezone_string + ")</h2>"
    description = "This dashboard is updated every half an hour with sentiment analysis performed on latest scraped news headlines from the FinViz website.<br><br>"
    code = """<a href="https://medium.com/datadriveninvestor/use-github-actions-to-create-a-live-stock-sentiment-dashboard-online-580a08457650">Explanatory Article</a> | <a href="https://github.com/damianboh/dow_jones_live_stock_sentiment_treemap">Source Code</a>"""
    author = """ | Created by Damian Boh, check out my <a href="https://damianboh.github.io/">GitHub Page</a>"""
   
    f.write(title + updated + description + code + author)
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn')) # write the fig created above into the html file