# Scrape Text from Yahoo Finance News via HTML


## Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import time
import random
import re
import os

import yfinance as yf

import requests
from bs4 import BeautifulSoup



In [2]:
# Capture the local calendar date of this run; it is echoed here and reused
# later to stamp scraped rows and output file names.
date_today = datetime.today().date()
print(f'Last updated {date_today}')


Last updated 2022-12-18


## Obtain index info and news from yfinance library

In [3]:
# Ticker symbols grouped into one list per batch. Each list name carries a
# date (with a _1/_2 suffix where a date has two lists) -- presumably the day
# that batch was scraped; confirm against the saved data.
# NOTE(review): the names say "indices", but the entries are mostly individual
# stock tickers, with a few index/ETF symbols such as '^GSPC' and 'SPY'.
indices_2022_12_12 = ['MSFT', 'AAPL', 'GOOG', 'META', 'TSLA', 'SPY', 'NVDA', 'AMZN', 'COMP', 'FANG', '^GSPC']     # specify indices of interest
indices_2022_12_13 = ['CSCO', 'WFC', 'C', 'JPM', 'BCS', 'MS', 'CS', 'HPQ', 'BAC', 'CRM', 'NKE', 'DOW', 'VOO', '^IXIC', 'DELL', 'INTC', 'ADBE', 'PYPL', 'IBM', 'GS', 'AXP', 'MA', 'V', 'ORCL']
indices_2022_12_16_1 = ['LLY', 'SPOT', 'UA', 'NFLX', 'WRBY', 'TGT', 'WMT', 'FORD', 'DUOL', 'TM', 'HD', 'CVS', 'ZM', 'DIS', 'ADDYY', 'ARHS', 'GE', 'GM', 'PTON', 'LOW', 'JNJ', 'WBD', 'SHOP', 'PINS', 'RTX', 'BIRD', 'LULU', 'BA', 'PFE', 'PG', 'SBUX', 'COST']
indices_2022_12_16_2 = ['CVX', 'ACI', 'SFM', 'AJRD', 'LMT', 'HBAN', 'SONY', 'HON', 'COF', 'MRNA', 'CRWD', 'FDX', 'RLLCF', 'CFG', 'VZ', 'KTOS', 'XOM', 'GD', 'VWAGY', 'MZDAY', 'APH', 'KR', 'COSM', 'FWONA', 'UPS', 'MDB', 'NTDOY', 'HMC', 'MAXR', 'CAT', 'SYF', 'KO', 'AZN', 'T', 'SHEL', 'FUJHY', 'NTIC', 'DNUT', 'PEP', 'CMG', 'NSANY', 'WMG', 'FOX', 'TTE', 'CMCSA', 'PNC', 'TMUS', 'NVAX', 'NOC', 'BO', 'BNTX', 'NVS', 'EA', 'TXT', 'FITB']
indices_2022_12_17 = ['AMD', 'CZNC', 'XPEV', 'NKLA', 'BYDDY', 'LUMN', 'BSRR', 'CSWI', 'AGCO', 'FBIZ', 'LNN', 'CCBG', 'LI', 'EBTC', 'CFFI', 'DE', 'FELE', 'MBWM', 'NIO', 'RIVN', 'FNLC', 'EQBK']
indices_2022_12_18 = ['RS', 'W', 'TAL', 'X', 'GOTU', 'DAL', 'UAL', 'SXC', 'AAL', 'LUV', 'ZEUS', 'CMC', 'TMST', 'JBLU', 'ALK', 'EXAS', 'SAVE', 'WFRD', 'SCHN', 'BKR']


In [4]:
# Flatten every per-date ticker batch into one list, keeping batch order and
# any duplicates, then show how many tickers it holds.
prev_indices = [
    ticker
    for batch in (
        indices_2022_12_12,
        indices_2022_12_13,
        indices_2022_12_16_1,
        indices_2022_12_16_2,
        indices_2022_12_17,
        indices_2022_12_18,
    )
    for ticker in batch
]
len(prev_indices)


144

## Scrape news

In [5]:
def load_text(dr, old_indices):
    """Collect Yahoo Finance article URLs from saved HTML text files.

    Every ``.txt`` file in ``dr`` is treated as saved page markup whose name
    starts with a ticker followed by an underscore (for example
    ``rs_2022-12-18.txt``). The upper-cased part before the first underscore
    is used as the ticker. Files whose ticker is listed in ``old_indices``
    are skipped, and a line is printed for each skipped or processed file.

    Args:
        dr: Directory that holds the saved ``.txt`` files.
        old_indices: Iterable of tickers to skip (``None`` skips nothing).

    Returns:
        A tuple ``(all_urls, new_indices)``. ``all_urls`` maps a ticker key to
        the list of article URLs found in one file; when several files share
        a ticker, later ones are stored under ``TICKER_0``, ``TICKER_1``, ...
        ``new_indices`` holds the ticker of every processed file, in
        processing order (so a ticker repeats if it had several files).
    """
    # Use the argument rather than a notebook global, and make lookups O(1).
    already_scraped = set(old_indices or ())
    all_urls = {}
    new_indices = []
    for file in os.listdir(dr):
        if not file.endswith('.txt'):
            continue
        index = file.split('_')[0].upper()
        if index in already_scraped:
            print(f'\tskipping {index}')
            continue

        print(file)
        with open(os.path.join(dr, file)) as f:
            txt_soup = f.read()

        urls = []
        # Each headline is an <h3 class="Mb(5px)"> directly wrapping an <a>;
        # splitting on the opening tag leaves the anchor at the chunk start.
        for h3 in txt_soup.split('<h3 class="Mb(5px)">'):
            if not h3.startswith('<a'):
                continue
            _, marker, tail = h3.partition('href="')
            if not marker:
                # Anchor without an href attribute: nothing to collect.
                continue
            link = tail.split('"')[0]
            urls.append(f'http://www.finance.yahoo.com{link}')

        # Pick a key that is not taken yet so a second file for the same
        # ticker does not overwrite the first one.
        index_key = index
        count = 0
        while index_key in all_urls:
            index_key = f'{index}_{count}'
            count += 1
        all_urls[index_key] = urls

        new_indices.append(index)

    return all_urls, new_indices


In [6]:
soup_dir = '../data/htmls'
# Keep the freshly found tickers under their own name: unpacking into
# `indices_2022_12_13` would overwrite the hard-coded 2022-12-13 ticker list.
all_urls, new_indices = load_text(soup_dir, old_indices=prev_indices)


	skipping CVX
	skipping LLY
	skipping SPOT
rs_2022-12-18.txt
	skipping ACI
	skipping COMP
	skipping UA
w_2022-12-18.txt
	skipping SFM
	skipping NFLX
	skipping AJRD
	skipping LMT
tal_2022-12-18.txt
	skipping WFC
x_2022-12-18.txt
	skipping HBAN
	skipping WRBY
	skipping CSCO
	skipping SONY
	skipping TGT
	skipping AMD
	skipping WFC
	skipping C
	skipping CZNC
	skipping WMT
	skipping XPEV
	skipping JPM
	skipping FORD
	skipping DUOL
gotu_2022-12-18.txt
	skipping HON
	skipping NKLA
	skipping BYDDY
	skipping COF
dal_2022-12-18.txt
	skipping BCS
	skipping TM
	skipping MS
	skipping MRNA
	skipping CRWD
	skipping GOOG
ual_2022-12-18.txt
	skipping HD
	skipping CVS
sxc_2022-12-18.txt
aal_2022-12-18.txt
	skipping META
	skipping FDX
	skipping CS
	skipping ZM
	skipping DIS
	skipping RLLCF
	skipping CFG
luv_2022-12-18.txt
	skipping HPQ
	skipping VZ
	skipping LUMN
	skipping BSRR
	skipping KTOS
zeus_2022-12-18.txt
	skipping BAC
	skipping XOM
	skipping GD
	skipping ADDYY
	skipping ARHS
	skipping VWAGY
	skip

In [7]:
# Report the total number of collected URLs, then preview the first two
# links stored under each key.
print(f'Number of urls found: {np.sum([len(urls) for urls in all_urls.values()])}')
for index in all_urls:
    print(index)
    for link in all_urls[index][:2]:
        print('\t', link)
    

Number of urls found: 2491
RS
	 http://www.finance.yahoo.com/news/3-reasons-why-reliance-steel-174505323.html
	 http://www.finance.yahoo.com/news/reliance-steel-rs-forms-hammer-145502989.html
W
	 http://www.finance.yahoo.com/m/80039d6d-86bb-38e5-bb3f-c453bae126e1/where-will-wayfair-stock-be.html
	 http://www.finance.yahoo.com/m/df19f68b-57b0-3f82-9be3-c8a355b8454a/wayfair-gets-a-new-bull-it.html
TAL
	 http://www.finance.yahoo.com/news/tal-education-group-tal-stock-144002474.html
	 http://www.finance.yahoo.com/news/4-promising-chinese-stocks-buy-130101039.html
X
	 http://www.finance.yahoo.com/m/1e370eb5-7ca6-3ae0-91cc-6f3d05384e94/stocks-extend-slump-twitter-.html
	 http://www.finance.yahoo.com/video/stocks-moving-hours-adobe-united-220540829.html
GOTU
	 http://www.finance.yahoo.com/news/gaotu-techedu-announces-third-quarter-060000154.html
	 http://www.finance.yahoo.com/news/gaotu-techedu-announces-receipt-nyse-123000091.html
DAL
	 http://www.finance.yahoo.com/m/062b8b5e-81cf-37e4-91d3-

In [8]:
# Print this round's keys formatted as a Python list literal so the output
# can be pasted back into the notebook as the next hard-coded ticker list.
print(f"""['{"', '".join(all_urls)}']""")


['RS', 'W', 'TAL', 'X', 'GOTU', 'DAL', 'UAL', 'SXC', 'AAL', 'LUV', 'ZEUS', 'CMC', 'TMST', 'JBLU', 'ALK', 'EXAS', 'SAVE', 'WFRD', 'SCHN', 'BKR']


In [9]:
# function to scrape body text, headlines, and publish date from approved URLs
def scrape_news(url, columns=('news', 'headlines', 'raw_publish_date', 'publish_date'), verbose=0):
    """Download one article page and extract its text, headlines and date.

    Args:
        url: Address of the article page to fetch.
        columns: Four keys used, in order, for the body text, the list of
            ``<h1>`` texts, the raw ``<time>`` text and the parsed datetime.
        verbose: When greater than 0, print the URL and the response status.

    Returns:
        A dict keyed by ``columns``, or ``None`` when the request fails, the
        response status is not 200, or the page has no ``caas-body`` div.
        ``raw_publish_date`` is ``None`` if the page has no ``<time>`` tag and
        ``publish_date`` is ``None`` if the raw text cannot be parsed.
    """
    if verbose > 0:
        print(url)

    # send request; a timeout keeps one stalled connection from hanging the run
    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as err:
        if verbose > 0:
            print(f'request failed: {err}', '\n')
        return None
    if verbose > 0:
        print(response.status_code, response.reason, '\n')

    if response.status_code != 200:
        return None

    # valid response; proceed
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    body = soup.find('div', attrs={'class': 'caas-body'})
    if body is None:
        # No article body on this page; treat it like a failed scrape.
        return None
    text = body.text.replace('\xa0', ' ')     # obtain body text

    headlines = [head.text for head in soup.find_all('h1')]     # obtain headlines and titles

    time_tag = soup.find('time')     # obtain time of publish
    raw_publish_date = time_tag.text if time_tag is not None else None
    try:
        publish_date = datetime.strptime(raw_publish_date, '%B %d, %Y, %I:%M %p')     # obtain processed datetime
    except (TypeError, ValueError):
        # TypeError: no <time> tag was found; ValueError: unexpected format.
        publish_date = None

    return dict(zip(columns, [text, headlines, raw_publish_date, publish_date]))


In [10]:
save_path = '../data/scraped_news_from_html'
verbose = 0
# Make sure the output directory exists before any scraping starts; otherwise
# the first to_csv call fails only after a whole ticker has been downloaded.
os.makedirs(save_path, exist_ok=True)
for index in all_urls.keys():
    print(f'--{index}--', end = '\t\t')
    scrape_dict = []     # one dict (row) per successfully scraped article
    for link in all_urls[index]:
        scrape_results = scrape_news(url=link, verbose=verbose)
        if scrape_results is not None:
            # attach run metadata to the scraped fields
            scrape_results['scrape_date'] = date_today
            scrape_results['index'] = index
            scrape_results['url'] = link
            scrape_results['related_tickers'] = [index]
            scrape_dict.append(scrape_results)

    # save the results for this index (one CSV per key, written even if empty)
    save_file = f'yahoo_finance_{index.lower()}_{date_today}.csv'
    scrape_df = pd.DataFrame(scrape_dict, columns=['scrape_date', 'index', 'url', 'news', 'headlines', 'raw_publish_date', 'publish_date', 'related_tickers'])
    scrape_df.to_csv(os.path.join(save_path, save_file), index=False)
    print(save_file)
        

--RS--		yahoo_finance_rs_2022-12-18.csv
--W--		yahoo_finance_w_2022-12-18.csv
--TAL--		yahoo_finance_tal_2022-12-18.csv
--X--		yahoo_finance_x_2022-12-18.csv
--GOTU--		yahoo_finance_gotu_2022-12-18.csv
--DAL--		yahoo_finance_dal_2022-12-18.csv
--UAL--		yahoo_finance_ual_2022-12-18.csv
--SXC--		yahoo_finance_sxc_2022-12-18.csv
--AAL--		yahoo_finance_aal_2022-12-18.csv
--LUV--		yahoo_finance_luv_2022-12-18.csv
--ZEUS--		yahoo_finance_zeus_2022-12-18.csv
--CMC--		yahoo_finance_cmc_2022-12-18.csv
--TMST--		yahoo_finance_tmst_2022-12-18.csv
--JBLU--		yahoo_finance_jblu_2022-12-18.csv
--ALK--		yahoo_finance_alk_2022-12-18.csv
--EXAS--		yahoo_finance_exas_2022-12-18.csv
--SAVE--		yahoo_finance_save_2022-12-18.csv
--WFRD--		yahoo_finance_wfrd_2022-12-18.csv
--SCHN--		yahoo_finance_schn_2022-12-18.csv
--BKR--		yahoo_finance_bkr_2022-12-18.csv


In [11]:
# Marker showing that every cell above finished running.
print("Done")

Done
