In [7]:
# libraries for webscraping, parsing and getting stock data
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import yfinance as yf
import time

# for plotting and data manipulation
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.express as px

# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# for getting current date and time to print 'last updated'
from datetime import datetime

# Get All Tickers from Dow Jones Index
df_dow_jones = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")[1]
tickers = df_dow_jones['Symbol'].tolist()

# Scrape the Date, Time and News Headlines Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url=url,headers = { "user-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'})

    try:
       response = urlopen(req)   
    except:
       time.sleep(10) # if there is an error and request is blocked, do it more slowly by waiting for 10 seconds before requesting again
       response = urlopen(req)  
        
    # Read the contents of the file into 'html'
    html = BeautifulSoup(response)

    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    # Add the table to our dictionary
    news_tables[ticker] = news_table

	

# Parse the Date, Time and News Headlines into a Python List
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.a.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element
        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        # else load 'date' as the 1st element and 'time' as the second    
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        # Extract the ticker from the file name, get the string up to the 1st '_'  
        ticker = file_name.split('_')[0]
        
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
parsed_news

[['MMM',
  'Sep-01-22',
  '05:13PM',
  '3M Should Be Blocked From Health Care Spinoff, New Suit Argues'],
 ['MMM',
  'Sep-01-22',
  '04:42PM',
  'UPDATE 3-Veterans seeking earplug damages sue to block 3M healthcare spinoff'],
 ['MMM',
  'Sep-01-22',
  '04:39PM',
  'Veterans seeking earplug damages sue to block 3M healthcare spinoff'],
 ['MMM',
  'Sep-01-22',
  '02:45PM',
  'Home Away From Home: 3M Brands Help College Students and Young Adults Prepare for Independent Living'],
 ['MMM',
  'Sep-01-22',
  '10:54AM',
  '3M Plans Job Cuts to Rein in Expenses as Legal Troubles Grow'],
 ['MMM',
  'Sep-01-22',
  '09:09AM',
  '3M Finalizes Separation of its Food Safety Business and Merger of the Business with Neogen; Accepts Shares Tendered in Exchange Offer'],
 ['MMM',
  'Sep-01-22',
  '09:04AM',
  "Is It Time to Buy the Dow Jones' 4 Worst-Performing August Stocks?"],
 ['MMM',
  'Sep-01-22',
  '08:38AM',
  '3M Plans Job Cuts to Rein in Expenses as Legal Troubles Grow'],
 ['MMM',
  'Sep-01-22',


In [9]:
!pip install bs4 --upgrade



