## Day30 - Web scraping with BS4

<img src="../images/day30.jpg" />

In [66]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

import pandas as pd
import numpy as np

from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import re

In [91]:
ticker = 'TSLA'

# html requests + bs4
URL = 'https://finviz.com/quote.ashx?t=' + ticker
req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'}) 
resp = urlopen(req)    
html = BeautifulSoup(resp, features="lxml")

# get news-table
news_table = html.find(id='news-table')
# get list of all <tr> elements (aka rows)
table = news_table.findAll('tr')

titles = []
links = []
sources = []

stemmer = SnowballStemmer(language="english")

for row in table:
    # date = row.td.text.strip() 
    
    sources.append(row.span.text.strip())
    links.append(row.a['href'])
    
    # text preprocessing on the fly 
    t = row.a.text.strip().lower()       # lowercase
    t = re.sub(r"[0-9]","",t)            # remove digits
    t = re.sub(r"[^\w\s]","",t)          # remove punctuation
    
    tokens = word_tokenize(t)
    stemmed_tokens = [stemmer.stem(t) for tk in tokens if tk not in stopwords.words('english')]
    titles.append(' '.join(stemmed_tokens))

In [93]:
df = pd.DataFrame({'source' : sources, 'title' : titles, 'link' : links})
df.head()

Unnamed: 0,source,title,link
0,Motley Fool,why tesla stock fell sharply on friday why tes...,https://www.fool.com/investing/2021/03/26/why-...
1,Investor's Business Daily,is nio stock a buy chip shortage ev competitio...,https://www.investors.com/news/nio-stock-buy-n...
2,LA Times,column feds rule that musk and tesla are no fr...,https://finance.yahoo.com/news/column-feds-rul...
3,Yahoo Finance Video,how the pandemic has led to a boom in used car...,https://finance.yahoo.com/video/pandemic-led-b...
4,Investor's Business Daily,dow tech stocks rally as treasury yields jump ...,https://www.investors.com/market-trend/stock-m...
