# Web Scraping: The CNBC

In [126]:
from datetime import datetime
import numpy as np
import pandas as pd


from bs4 import BeautifulSoup
from selenium import webdriver
import time

import requests
import re



### Obtain list of news from the coverpage

URL definition:

In [80]:
# url definition
url = "https://www.cnbc.com"

List of news:

In [81]:
# Request
r1 = requests.get(url)
print(r1.status_code)

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification
coverpage_news = soup1.find_all('a', class_='LazyLoaderPlaceholder-headline')
len(coverpage_news)

200


67

Now we have a list in which every element is a news article:

In [82]:
coverpage_news[0]

<a class="LazyLoaderPlaceholder-headline" href="https://www.cnbc.com/2019/10/02/chinas-fake-meat-trend-is-on-the-rise-researchers-say.html">China's fake meat trend is on the rise, researchers say</a>

In [83]:
n=0
link = coverpage_news[n]['href']
title = coverpage_news[n].get_text()
article = requests.get(link)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')


In [84]:
title

"China's fake meat trend is on the rise, researchers say"

In [85]:
body = soup_article.find_all('p')

In [86]:
body

[<p><input type="checkbox"/>Keep Me Logged In</p>,
 <p class="TickerStory-storyDescription">Wednesday's report of ADP private payrolls could give clues on the labor market, the next part of the economy to be scrutinized.</p>,
 <p class="TickerStory-storyDescription">Sen. Elizabeth Warren could see a major boost from donors in Silicon Valley who are drawn to her Big Tech proposals.</p>,
 <p class="TickerStory-storyDescription">Forever 21 on Tuesday released a list of the nearly 180 locations it expects to close as part of its bankruptcy proceedings, including those stores' owners. When the apparel...</p>,
 <p class="TickerStory-storyDescription">The pro-Trump groups, which also include his joint fundraising entities, have raised more than $308 million in total in 2019, and boast more than $156 million cash on hand.</p>,
 <p class="TickerStory-storyDescription">If current employees are going into the all-hands meetings and recording audio of their CEO, that is a sign that employee morale

In [87]:
len(body)

37

In [88]:
x = soup_article.find_all('p')

In [89]:
len(x)

37

In [90]:
x[0].get_text()

'Keep Me Logged In'

In [111]:
# options = webdriver.ChromeOptions() 
# options.add_experimental_option("excludeSwitches", ['enable-automation'])
# def Article(url):
#     browser = webdriver.Chrome(chrome_options=options, executable_path=r"/usr/bin/chromedriver")
#     browser.get(url)
#     time.sleep(5) # wait for the page to load javascript
#     pageSource = browser.page_source
#     browser.quit()
#     return pageSource

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [143]:
number_of_articles = len(coverpage_news)-1
now = datetime.now()

In [146]:
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []
scrapedTime=[]
postedDate=[]
for n in np.arange(0, number_of_articles):
        
    # Getting the link of the article
    link = coverpage_news[n]['href']
    print (link)
    list_links.append(link)
    posteddate=re.findall(r'/(\d{4})/(\d{1,2})/(\d{1,2})/', link)
    #posteddate=str(posteddate)
    if posteddate:
        posteddate=','.join(posteddate[0]) 
        posteddate=posteddate.replace(',','/')
    print (posteddate)
    postedDate.append(posteddate)
    
    # Getting the title
    title = coverpage_news[n].get_text()
    list_titles.append(title)
    
    
    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    x = soup_article.find_all('p')
    scraped = now.strftime("%d/%m/%Y %H:%M:%S")
    scrapedTime.append(scraped)
    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
        news_contents.append(final_article)

https://www.cnbc.com/2019/10/02/chinas-fake-meat-trend-is-on-the-rise-researchers-say.html
2019/10/02
https://www.cnbc.com/2019/10/01/bill-gurley-direct-listing-event-convinced-outreach-ceo-to-consider-it.html
2019/10/01
https://www.cnbc.com/2019/10/01/tesla-acquiring-deepscale-computer-vision-start-up-for-self-driving.html
2019/10/01
https://www.cnbc.com/2019/10/01/banned-coach-alberto-salazar-briefed-nike-ceo-on-doping-violations.html
2019/10/01
https://www.cnbc.com/2019/10/01/visa-mastercard-reconsider-backing-facebooks-libra-report-says.html
2019/10/01
https://www.cnbc.com/2019/10/01/meet-triple-five-group-the-developers-behind-american-dream.html
2019/10/01
https://www.cnbc.com/2019/10/01/those-weekly-splurges-cost-7400-extra-yearly-study-shows.html
2019/10/01
https://www.cnbc.com/2019/10/01/photos-final-home-designed-by-frank-lloyd-wright-to-go-up-for-auction.html
2019/10/01
https://www.cnbc.com/2019/10/01/johnson-johnson-settles-with-ohio-counties-ahead-of-opioid-trial.html
2019

In [150]:
len(postedDate)


66

Let's put them into:
* a dataset which will the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [151]:
# df_features
df_features = pd.DataFrame(
     {'Article Content': news_contents 
    })

# df_show_info
df_show_info = pd.DataFrame(
    {'Article Title': list_titles,
     'Article Link': list_links})

In [156]:
df=pd.DataFrame(columns= ["STOCK ID", "URL", "PUBLISH TIME", "EDITED TIME", "SCRAPED TIME", "TITLE", "CONTENT" ])
df['URL']=list_links
df['PUBLISH TIME']=postedDate
df['SCRAPED TIME']=scrapedTime
df['TITLE']=list_titles
df['CONTENT']=news_contents
df

Unnamed: 0,STOCK ID,URL,PUBLISH TIME,EDITED TIME,SCRAPED TIME,TITLE,CONTENT
0,,https://www.cnbc.com/2019/10/02/chinas-fake-me...,2019/10/02,,02/10/2019 17:24:34,"China's fake meat trend is on the rise, resear...",Keep Me Logged In U.S. markets fell broadly fo...
1,,https://www.cnbc.com/2019/10/01/bill-gurley-di...,2019/10/01,,02/10/2019 17:24:34,You don't have to be a big brand name to do a ...,Keep Me Logged In U.S. markets fell broadly fo...
2,,https://www.cnbc.com/2019/10/01/tesla-acquirin...,2019/10/01,,02/10/2019 17:24:34,Tesla is buying computer vision start-up DeepS...,Keep Me Logged In U.S. markets fell broadly fo...
3,,https://www.cnbc.com/2019/10/01/banned-coach-a...,2019/10/01,,02/10/2019 17:24:34,Banned coach Alberto Salazar briefed Nike CEO ...,Keep Me Logged In U.S. markets fell broadly fo...
4,,https://www.cnbc.com/2019/10/01/visa-mastercar...,2019/10/01,,02/10/2019 17:24:34,"Visa, Mastercard reconsider backing Facebook's...",Keep Me Logged In U.S. markets fell broadly fo...
5,,https://www.cnbc.com/2019/10/01/meet-triple-fi...,2019/10/01,,02/10/2019 17:24:34,Meet Triple Five Group: The developers behind ...,Keep Me Logged In U.S. markets fell broadly fo...
6,,https://www.cnbc.com/2019/10/01/those-weekly-s...,2019/10/01,,02/10/2019 17:24:34,"Those weekly splurges cost $7,400 extra annual...",Keep Me Logged In Sticking to a household budg...
7,,https://www.cnbc.com/2019/10/01/photos-final-h...,2019/10/01,,02/10/2019 17:24:34,The final home designed by Frank Lloyd Wright ...,"The legendary Norman Lykes house, designed by ..."
8,,https://www.cnbc.com/2019/10/01/johnson-johnso...,2019/10/01,,02/10/2019 17:24:34,Johnson & Johnson settles with Ohio counties a...,Keep Me Logged In U.S. markets fell broadly fo...
9,,https://www.cnbc.com/2019/10/01/health-care-st...,2019/10/01,,02/10/2019 17:24:34,Health-care stocks are in their longest losing...,Keep Me Logged In The health-care sector just ...


In [157]:
st=pd.DataFrame(pd.read_csv('c:/data/CS/tickers_list.csv'))
st['Issuer Name'][0].split(',')
issuerName=[]
stockabbre=[]
for i in range(len(st['Issuer Name'])):
    issuername=st['Issuer Name'][i].split(',')
    issuerName.append(issuername[0])
print (issuerName)
stockabbre=st["Symbol"]
stockabbre=list(stockabbre)
stockabbre.append('APPL')
print (stockabbre)
    

['ADOMANI', 'AdvisorShares Trust', 'Allied Esports Entertainment', 'Alta Mesa Resources', 'Alta Mesa Resources', 'BIQI International Holdings Corporation', 'China Lending Corporation', 'China TechFaith Wireless Communication Technology Limited', 'ChinaCache International Holdings Ltd.', 'Cray Inc', "Del Frisco's Restaurant Group", 'Diversicare Healthcare Services Inc.', 'Emmaus Life Sciences', 'Emmaus Life Sciences', 'Finisar Corporation', "Fred's", 'Gladstone Capital Corporation', 'Hunter Maritime Acquisition Corp.', 'Hunter Maritime Acquisition Corp.', 'Hunter Maritime Acquisition Corp.', 'Ivy NextShares', 'Ivy NextShares', 'Ivy NextShares', 'Peak Resorts', 'Repay Holdings Corporation', 'Shutterfly', 'USA Technologies', 'USA Technologies', 'Xynomic Pharmaceuticals Holdings', 'Xynomic Pharmaceuticals Holdings', 'Xynomic Pharmaceuticals Holdings', 'Yangtze River Port and Logistics Limited ']
['ADOM', 'BKCH', 'AESEW', 'AMR', 'AMRWW', 'BIQI', 'CLDC', 'CNTF', 'CCIH', 'CRAY', 'DFRG', 'DVCR

In [165]:
for i in range(len(df['CONTENT'])):
    for j in range(len(st['Symbol'])):
        if st['Symbol'][j] in df['CONTENT'][i] or issuerName[j] in df['CONTENT']:
            df['STOCK ID'][i]=st['Symbol'][j]
df['STOCK ID'].describe()

count     0
unique    0
Name: STOCK ID, dtype: int64

In [167]:
df.to_csv("C:/data/CS/CNBC_SCRAPE.csv")