# Scrape Text from Yahoo Finance News


## Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import time
import random
import re
import os

import yfinance as yf

import requests
from bs4 import BeautifulSoup


In [2]:
# Calendar date of this run; reused below to stamp each scraped row
# and to name the output CSV file.
date_today = datetime.today().date()
print(f'Last updated {date_today}')


Last updated 2022-12-19


## Scrape news from yfinance library

In [3]:
# Ticker symbols whose Yahoo Finance news listings are scraped below.
# Each entry is passed to yf.Ticker(...) and recorded in the 'index' column of the output.
indices = ['MSFT', 'AAPL', 'GOOG', 'META', 'TSLA', 'SPY', 'NVDA', 'AMZN', 'COMP', 'FANG', '^GSPC',
           'CSCO', 'WFC', 'C', 'JPM', 'BCS', 'MS', 'CS', 'HPQ', 'BAC', 'CRM', 'NKE', 'DOW', 'VOO', '^IXIC', 'DELL', 'INTC', 'ADBE', 'PYPL', 'IBM', 'GS', 'AXP', 'MA', 'V', 'ORCL',
           'LLY', 'SPOT', 'UA', 'NFLX', 'WRBY', 'TGT', 'WMT', 'FORD', 'DUOL', 'TM', 'HD', 'CVS', 'ZM', 'DIS', 'ADDYY', 'ARHS', 'GE', 'GM', 'PTON', 'LOW', 'JNJ', 'WBD', 'SHOP', 'PINS', 'RTX', 'BIRD', 'LULU', 'BA', 'PFE', 'PG', 'SBUX', 'COST',
           'CVX', 'ACI', 'SFM', 'AJRD', 'LMT', 'HBAN', 'SONY', 'HON', 'COF', 'MRNA', 'CRWD', 'FDX', 'RLLCF', 'CFG', 'VZ', 'KTOS', 'XOM', 'GD', 'VWAGY', 'MZDAY', 'APH', 'KR', 'COSM', 'FWONA', 'UPS', 'MDB', 'NTDOY', 'HMC', 'MAXR', 'CAT', 'SYF', 'KO', 'AZN', 'T', 'SHEL', 'FUJHY', 'NTIC', 'DNUT', 'PEP', 'CMG', 'NSANY', 'WMG', 'FOX', 'TTE', 'CMCSA', 'PNC', 'TMUS', 'NVAX', 'NOC', 'BP', 'BNTX', 'NVS', 'EA', 'TXT', 'FITB',
           'AMD', 'CZNC', 'XPEV', 'NKLA', 'BYDDY', 'LUMN', 'BSRR', 'CSWI', 'AGCO', 'FBIZ', 'LNN', 'CCBG', 'LI', 'EBTC', 'CFFI', 'DE', 'FELE', 'MBWM', 'NIO', 'RIVN', 'FNLC', 'EQBK',
           'RS', 'W', 'TAL', 'X', 'GOTU', 'DAL', 'UAL', 'SXC', 'AAL', 'LUV', 'ZEUS', 'CMC', 'TMST', 'JBLU', 'ALK', 'EXAS', 'SAVE', 'WFRD', 'SCHN', 'BKR']     # edit this list to change which tickers are scraped


In [4]:
# function to scrape body text, headlines, and publish date from approved URLs
def scrape_news(url, columns=('news', 'headlines', 'raw_publish_date', 'publish_date'), verbose=0, timeout=30):
    """Download one article page and extract its body text, headlines and publish date.

    Args:
        url: Address of the article page to fetch.
        columns: Keys of the returned dict, in the order body text, headlines,
            raw publish date, parsed publish date.
        verbose: When greater than 0, print the URL and the HTTP status
            (or the request error) for this page.
        timeout: Seconds to wait for the server before abandoning the request.

    Returns:
        A dict mapping ``columns`` to the body text (str), the texts of all
        ``<h1>`` elements (list of str), the text of the first ``<time>``
        element (str, or None if the page has none) and that text parsed as a
        ``datetime`` (None if it is missing or not in the expected format).
        Returns None when the request fails, the status code is not 200, or
        the page has no ``<div class="caas-body">`` element.
    """
    if verbose > 0:
        print(url)

    # send request; the timeout keeps one stalled server from hanging the whole run
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        if verbose > 0:
            print(f'request failed: {err}\n')
        return None
    if verbose > 0:
        print(response.status_code, response.reason, '\n')

    if response.status_code != 200:
        return None

    # valid response; proceed
    # name the parser explicitly so results do not depend on which optional parsers are installed
    soup = BeautifulSoup(response.text, 'html.parser')

    # obtain body text; pages without the article container have nothing to scrape
    body = soup.find('div', attrs={'class': 'caas-body'})
    if body is None:
        return None
    text = body.text.replace('\xa0', ' ')

    # obtain headlines and titles
    headlines = [head.text for head in soup.find_all('h1')]

    # obtain time of publish; keep the article even when the page carries no <time> tag
    time_tag = soup.find('time')
    raw_publish_date = time_tag.text if time_tag is not None else None

    # obtain processed datetime, e.g. from 'December 19, 2022, 7:05 AM'
    try:
        publish_date = datetime.strptime(raw_publish_date, '%B %d, %Y, %I:%M %p')
    except (TypeError, ValueError):
        # TypeError: no date text at all; ValueError: text in some other format
        publish_date = None

    return dict(zip(columns, [text, headlines, raw_publish_date, publish_date]))


In [5]:
verbose = 0
scrape_dict = []     # collected article records: a list of dicts, despite the name

for index in indices:
    print(f'loading {index}', end=' -- ')

    # obtain news; any failure to fetch or size the listing skips this ticker
    try:
        news = yf.Ticker(index).news
        print(f'{len(news)} urls')

    except Exception:     # not a bare except, so Ctrl-C can still stop the run
        print('no news')
        continue

    for num, d in enumerate(news):
        link = d.get('link')
        related_tickers = d.get('relatedTickers', [])
        if not link:
            # entry carries no URL, so there is nothing to download
            continue

        if verbose > 1:
            print(f'\t{num}  |  {d.get("title")}',)
            print(f'\t{link}')
            print(f'\t{" ".join(related_tickers)}\n')

        # a single page that cannot be fetched or parsed must not discard
        # everything scraped so far; report it and move on
        try:
            scrape_results = scrape_news(url=link, verbose=verbose)
        except Exception as err:
            print(f'\tskipped {link}: {err!r}')
            continue

        if scrape_results is not None:
            scrape_results['scrape_date'] = date_today
            scrape_results['index'] = index
            scrape_results['url'] = link
            scrape_results['related_tickers'] = related_tickers
            scrape_dict.append(scrape_results)
        

loading MSFT -- 8 urls
loading AAPL -- 8 urls
loading GOOG -- 8 urls
loading META -- 8 urls
loading TSLA -- 8 urls
loading SPY -- 8 urls
loading NVDA -- 8 urls
loading AMZN -- 8 urls
loading COMP -- 8 urls
loading FANG -- 8 urls
loading ^GSPC -- 8 urls
loading CSCO -- 8 urls
loading WFC -- 8 urls
loading C -- 8 urls
loading JPM -- 8 urls
loading BCS -- 8 urls
loading MS -- 8 urls
loading CS -- 8 urls
loading HPQ -- 8 urls
loading BAC -- 8 urls
loading CRM -- 8 urls
loading NKE -- 8 urls
loading DOW -- 8 urls
loading VOO -- 3 urls
loading ^IXIC -- 8 urls
loading DELL -- 8 urls
loading INTC -- 8 urls
loading ADBE -- 8 urls
loading PYPL -- 8 urls
loading IBM -- 8 urls
loading GS -- 8 urls
loading AXP -- 8 urls
loading MA -- 8 urls
loading V -- 8 urls
loading ORCL -- 8 urls
loading LLY -- 8 urls
loading SPOT -- 8 urls
loading UA -- 8 urls
loading NFLX -- 8 urls
loading WRBY -- 8 urls
loading TGT -- 8 urls
loading WMT -- 8 urls
loading FORD -- 2 urls
loading DUOL -- 8 urls
loading TM -- 8 u

In [6]:
# Assemble the scraped article records into a table with a fixed column order.
scrape_df = pd.DataFrame(
    scrape_dict,
    columns=[
        'scrape_date',
        'index',
        'url',
        'news',
        'headlines',
        'raw_publish_date',
        'publish_date',
        'related_tickers',
    ],
)
scrape_df     # bare expression so the notebook displays the table


Unnamed: 0,scrape_date,index,url,news,headlines,raw_publish_date,publish_date,related_tickers
0,2022-12-19,MSFT,https://finance.yahoo.com/news/paid-hack-nouri...,'A paid hack': Nouriel Roubini called out Kevi...,"[Yahoo Finance, 'A paid hack': Nouriel Roubini...","December 19, 2022, 7:05 AM",2022-12-19 07:05:00,"[JNJ, MSFT, HD]"
1,2022-12-19,MSFT,https://finance.yahoo.com/news/cisco-csco-join...,Cisco Systems CSCO recently announced its part...,"[Yahoo Finance, Cisco (CSCO) Joins Forces With...","December 19, 2022, 6:19 AM",2022-12-19 06:19:00,"[CSCO, WIT, MSFT]"
2,2022-12-19,MSFT,https://finance.yahoo.com/news/12-most-advance...,"In this article, we discuss the 12 most advanc...","[Yahoo Finance, 12 Most Advanced Countries in ...","December 19, 2022, 6:01 AM",2022-12-19 06:01:00,"[TTC, LNN, MSFT]"
3,2022-12-19,MSFT,https://finance.yahoo.com/news/microsoft-corpo...,Microsoft (MSFT) has recently been on Zacks.co...,"[Yahoo Finance, Microsoft Corporation (MSFT) i...","December 19, 2022, 6:00 AM",2022-12-19 06:00:00,[MSFT]
4,2022-12-19,AAPL,https://finance.yahoo.com/news/investors-heavi...,Apple (AAPL) has recently been on Zacks.com's ...,"[Yahoo Finance, Investors Heavily Search Apple...","December 19, 2022, 6:00 AM",2022-12-19 06:00:00,[AAPL]
...,...,...,...,...,...,...,...,...
688,2022-12-19,BKR,https://finance.yahoo.com/news/permian-oil-dri...,"In its weekly release, Baker Hughes Company BK...","[Yahoo Finance, Permian Oil Drilling Rig Count...","December 19, 2022, 5:13 AM",2022-12-19 05:13:00,"[EOG, BKR, DVN]"
689,2022-12-19,BKR,https://finance.yahoo.com/news/baker-hughes-re...,"HOUSTON, TX and LONDON, ENGLAND / ACCESSWIRE /...","[Yahoo Finance, Baker Hughes Recognized by For...","December 14, 2022, 5:10 AM",2022-12-14 05:10:00,[BKR]
690,2022-12-19,BKR,https://finance.yahoo.com/news/baker-hughes-ad...,Baker HughesThe Nasdaq-100 is one of the world...,"[Yahoo Finance, Baker Hughes Added to the Nasd...","December 12, 2022, 5:00 AM",2022-12-12 05:00:00,[BKR]
691,2022-12-19,BKR,https://finance.yahoo.com/news/annual-changes-...,"Nasdaq, Inc.NEW YORK, Dec. 09, 2022 (GLOBE NEW...","[Yahoo Finance, Annual Changes to the Nasdaq-1...","December 9, 2022, 5:00 PM",2022-12-09 17:00:00,"[CSGP, WBD, BAIDF, BIDU, GFS, SWKS, FANG, ^NDX..."


In [7]:
# Write the scraped table to a CSV file named after the scrape date.
save_path = '../data/scraped_news_from_api'
save_file = f'yahoo_finance_news_{date_today}.csv'
# create the target folder on first use; to_csv does not create missing directories
os.makedirs(save_path, exist_ok=True)
# NOTE: list-valued columns (headlines, related_tickers) are written as their string form
scrape_df.to_csv(os.path.join(save_path, save_file), index=False)
