In [1]:
# One-time environment setup: uncomment and run the lines below if a package is
# missing from the kernel's environment.
# NOTE(review): urllib ships with the Python standard library, so the
# `pip install urllib` line below should not be needed - confirm and drop it.
#!pip install yfinance
#!pip install bs4
#!pip install urllib
#!pip install plotly
#!pip install nltk

In [2]:
# libraries for webscraping, parsing and getting stock data
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import yfinance as yf
import time

# for plotting and data manipulation
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px

# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# for getting current date and time to print 'last updated'
from datetime import datetime

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Get All Tickers from Dow Jones Index

Get all tickers from the table in this Wikipedia page (https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average).

In [3]:
# Parse every table on the Wikipedia page and keep the first one that has a
# 'Symbol' column. A hard-coded table position is fragile because the page
# layout (and therefore the table order) can change at any time.
dow_jones_tables = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")
df_dow_jones = next(
    (table for table in dow_jones_tables if 'Symbol' in table.columns),
    None,
)
if df_dow_jones is None:
    # Fail loudly here instead of with a confusing KeyError in a later cell
    raise ValueError("No table with a 'Symbol' column was found on the Dow Jones Wikipedia page")
df_dow_jones

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,Index weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,2.88%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,3.56%
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31,,4.88%
3,Apple,NASDAQ,AAPL,Information technology,2015-03-19,,3.15%
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,3.40%
5,Caterpillar,NYSE,CAT,Construction and Mining,1991-05-06,,4.19%
6,Chevron,NYSE,CVX,Petroleum industry,2008-02-19,Also 1930-07-18 to 1999-11-01,3.05%
7,Cisco,NASDAQ,CSCO,Information technology,2009-06-08,,1.00%
8,Coca-Cola,NYSE,KO,Drink industry,1987-03-12,Also 1932-05-26 to 1935-11-20,1.28%
9,Disney,NYSE,DIS,Broadcasting and entertainment,1991-05-06,,2.32%


In [4]:
# Pull the 'Symbol' column out of the constituents table as a plain Python list
tickers = df_dow_jones.loc[:, 'Symbol'].to_list()
tickers

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DIS',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT']

# Scrape the Date, Time and News Headlines Data

An explanation of the data scraping and sentiment analysis performed in the next few cells is available here: https://medium.datadriveninvestor.com/sentiment-analysis-of-stocks-from-financial-news-using-python-82ebdcefb638

In [5]:
# Scrape the Date, Time and News Headlines Data
# For every ticker, download its FinViz quote page and keep the element whose
# id is 'news-table'. The result is the dictionary 'news_tables', keyed by ticker.
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    print(ticker)
    url = finwiz_url + ticker
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})

    try:
        response = urlopen(req)
    except OSError:
        # urllib's URLError/HTTPError derive from OSError, so only network/HTTP
        # failures are retried; KeyboardInterrupt and programming errors are no
        # longer swallowed as they were by the bare 'except'.
        # If the request is blocked, wait for 1 second before requesting once more.
        time.sleep(1)
        response = urlopen(req)

    # Parse inside a 'with' block so the HTTP connection is always closed.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    with response:
        html = BeautifulSoup(response, 'html.parser')

    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    if news_table is None:
        # Nothing to parse for this ticker; storing None would only crash the parsing step later
        print('  no news table found for ' + ticker + ', skipping')
        continue

    # Add the table to our dictionary
    news_tables[ticker] = news_table

MMM
AXP
AMGN
AAPL
BA
CAT
CVX
CSCO
KO
DIS
DOW
GS
HD
HON
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PG
CRM
TRV
UNH
VZ
V
WBA
WMT


# Parse the Date, Time and News Headlines into a Python List

In [6]:
# Parse the Date, Time and News Headlines into a Python List
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.a.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element
        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        # else load 'date' as the 1st element and 'time' as the second    
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        # Extract the ticker from the file name, get the string up to the 1st '_'  
        ticker = file_name.split('_')[0]
        
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])
        
parsed_news[:5] # print first 5 rows of news

[['MMM',
  'Aug-20-22',
  '10:37AM',
  '3 Top Dividend Kings to Buy for the Long Haul'],
 ['MMM',
  'Aug-19-22',
  '09:45AM',
  "3M (MMM) Stock Declines 18% YTD: What's Pulling it Down?"],
 ['MMM',
  'Aug-18-22',
  '11:03AM',
  '3M Unit Defends Request to Shield Parent From Mass Earplug Lawsuits'],
 ['MMM',
  'Aug-18-22',
  '07:34AM',
  '1 Monster Risk for 3M That You Might Want to Avoid'],
 ['MMM',
  'Aug-18-22',
  '12:10AM',
  'Mayar Capital on 3M Company (MMM): Weve Always Liked this Business with its Diversified Revenues']]

# Perform Sentiment Analysis with Vader

In [7]:
# Perform Sentiment Analysis with Vader
# Build the analyzer that scores each headline
vader = SentimentIntensityAnalyzer()

# Column names for the parsed rows
columns = ['ticker', 'date', 'time', 'headline']
# Load the parsed rows into a DataFrame before scoring
headlines_df = pd.DataFrame(parsed_news, columns=columns)

# Score every headline; each result is a dict of polarity scores
scores = [vader.polarity_scores(headline) for headline in headlines_df['headline']]
# One column per score key
scores_df = pd.DataFrame(scores)

# Put the scores next to the headlines they belong to (aligned on the row index)
parsed_and_scored_news = headlines_df.join(scores_df, rsuffix='_right')
# Turn the date strings into date objects
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news['date']).dt.date
parsed_and_scored_news.head()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,MMM,2022-08-20,10:37AM,3 Top Dividend Kings to Buy for the Long Haul,0.0,0.816,0.184,0.2023
1,MMM,2022-08-19,09:45AM,3M (MMM) Stock Declines 18% YTD: What's Pullin...,0.0,1.0,0.0,0.0
2,MMM,2022-08-18,11:03AM,3M Unit Defends Request to Shield Parent From ...,0.138,0.862,0.0,-0.1531
3,MMM,2022-08-18,07:34AM,1 Monster Risk for 3M That You Might Want to A...,0.341,0.556,0.103,-0.4588
4,MMM,2022-08-18,12:10AM,Mayar Capital on 3M Company (MMM): Weve Always...,0.0,0.833,0.167,0.4215


In [8]:
# Show the full scored table; pandas abbreviates long frames in the rendered output
parsed_and_scored_news

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,MMM,2022-08-20,10:37AM,3 Top Dividend Kings to Buy for the Long Haul,0.000,0.816,0.184,0.2023
1,MMM,2022-08-19,09:45AM,3M (MMM) Stock Declines 18% YTD: What's Pullin...,0.000,1.000,0.000,0.0000
2,MMM,2022-08-18,11:03AM,3M Unit Defends Request to Shield Parent From ...,0.138,0.862,0.000,-0.1531
3,MMM,2022-08-18,07:34AM,1 Monster Risk for 3M That You Might Want to A...,0.341,0.556,0.103,-0.4588
4,MMM,2022-08-18,12:10AM,Mayar Capital on 3M Company (MMM): Weve Always...,0.000,0.833,0.167,0.4215
...,...,...,...,...,...,...,...,...
2995,WMT,2022-08-16,10:58AM,Dow Leads Mixed Stock Market Session; Oil Expl...,0.000,1.000,0.000,0.0000
2996,WMT,2022-08-16,10:54AM,Walmart Q2 FY 2023 Earnings Report Recap,0.000,1.000,0.000,0.0000
2997,WMT,2022-08-16,10:46AM,Housing Sector Remains Weak in July,0.367,0.633,0.000,-0.4404
2998,WMT,2022-08-16,10:37AM,"Walmart's Bouncing Back, but This Warren Buffe...",0.000,0.741,0.259,0.5927


# Calculate Mean Sentiment for Each Ticker

In [9]:
# Group by each ticker and get the mean of all sentiment scores
mean_scores = parsed_and_scored_news.groupby(['ticker']).mean()
mean_scores

Unnamed: 0_level_0,neg,neu,pos,compound
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,0.05145,0.85045,0.0981,0.092025
AMGN,0.03654,0.88651,0.07695,0.065582
AXP,0.07087,0.83066,0.09845,0.055924
BA,0.06471,0.80836,0.12695,0.095775
CAT,0.0473,0.83393,0.11876,0.101663
CRM,0.06568,0.81187,0.12245,0.107414
CSCO,0.06028,0.83907,0.10067,0.071071
CVX,0.0501,0.82214,0.12775,0.128233
DIS,0.05353,0.84214,0.10433,0.09358
DOW,0.03023,0.84751,0.12226,0.146543


# Get Market Cap, Sector and Industry of each Ticker

The yfinance Python library exposes a great deal of information about a company, including the fields we need here: market cap, sector and industry. https://pypi.org/project/yfinance/

In [None]:
# Collect market cap, sector and industry for every ticker, in the same order as 'tickers'
sectors = []
industries = []
marketcap = []
for ticker in tickers:
    print(ticker)
    tickerdata = yf.Ticker(ticker)
    # Read the info mapping once per ticker instead of once per field
    info = tickerdata.info
    # Plain indexing is kept on purpose: a missing field raises a KeyError here
    # rather than leaving a gap that only surfaces when the treemap is built
    marketcap.append(info['marketCap'])
    sectors.append(info['sector'])
    industries.append(info['industry'])

MMM
AXP
AMGN
AAPL
BA
CAT


# Combine the Information Above and the Corresponding Tickers into a DataFrame

In [None]:
# Pair each output column name with the list that holds that column's values
d = dict(zip(
    ['Symbol', 'Sector', 'Industry', 'Market Cap'],
    [tickers, sectors, industries, marketcap],
))
# Build the DataFrame from the column dictionary
df_info = pd.DataFrame(d)
df_info

# Get Names of Companies from the Dow Jones DataFrame obtained Earlier

In [None]:
# Attach each company's name by matching rows on the shared 'Symbol' column
company_names = df_dow_jones[['Company', 'Symbol']]
df_info_name = pd.merge(df_info, company_names, on='Symbol')
df_info_name

# Join Stock Information and Sentiment Information

Join the mean_scores DataFrame (which stores the sentiment info) and the df_info_name DataFrame (which stores the company info above) by matching the 'ticker' index of mean_scores with the 'Symbol' column of df_info_name.

In [None]:
# Combine sentiment means with company info, then give the score columns display-friendly names
df = (
    mean_scores
    .merge(df_info_name, left_on='ticker', right_on='Symbol')
    .rename(columns={
        "compound": "Sentiment Score",
        "neg": "Negative",
        "neu": "Neutral",
        "pos": "Positive",
    })
)
df

# Generate the Treemap Plot!

In [None]:
# Treemap layout:
#   - 'path' nests the boxes: a constant "Dow Jones" root, then sector, then industry, then ticker
#   - 'values' sizes each box by the 'Market Cap' column
#   - 'color' follows the sentiment score: red (#ff0000) for negative, black (#000000) at 0,
#     green (#00FF00) for positive
#   - hovering a box shows the company plus the negative, neutral, positive and overall scores
hover_columns = ('Company', 'Negative', 'Neutral', 'Positive', 'Sentiment Score')

fig = px.treemap(
    df,
    path=[px.Constant("Dow Jones"), 'Sector', 'Industry', 'Symbol'],
    values='Market Cap',
    color='Sentiment Score',
    hover_data=list(hover_columns),
    color_continuous_scale=['#FF0000', "#000000", '#00FF00'],
    color_continuous_midpoint=0,
)

# Overwrite the trace's custom data with the same columns rounded to 3 decimal places,
# and print the label followed by the 5th custom-data column (the sentiment score) in each box
treemap_trace = fig.data[0]
treemap_trace.customdata = df[list(hover_columns)].round(3)
treemap_trace.texttemplate = "%{label}<br>%{customdata[4]}"

fig.update_traces(textposition="middle center")
fig.update_layout(margin=dict(t=30, l=10, r=10, b=10), font_size=20)

fig.show()

# Get Current Date and Time

In [None]:
# Capture the current date and time once
now = datetime.now()
# Render it as day/month/year followed by the 24-hour time
dt_string = format(now, "%d/%m/%Y %H:%M:%S")
dt_string

# Generate HTML File with Updated Time and Treemap

In [None]:
# Write the dashboard page: heading, 'last updated' stamp, description and the treemap.
# Mode 'w' replaces any previous content, so no separate truncate step is needed.
# The encoding is set explicitly so the output does not depend on the platform default.
with open('dow_jones_live_sentiment.html', 'w', encoding='utf-8') as f:
    title = "<h1>Dow Jones Stock Sentiment Dashboard</h1>"
    updated = "<h2>Last updated: " + dt_string + "</h2>"
    description = "This dashboard is updated every hour with sentiment analysis performed on latest scraped news headlines from the FinViz website."
    f.write(title + updated + description)
    # Embed the figure as an HTML fragment; plotly.js is loaded from the CDN
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))