# Web Scraping: The Nasdaq

In [32]:
from datetime import datetime
import numpy as np
import pandas as pd


from bs4 import BeautifulSoup
from selenium import webdriver
import time

import requests
import re



### Obtain list of news from the coverpage

URL definition:

In [33]:
# url definition
url = "https://www.nasdaq.com/news-and-insights"

List of news:

In [34]:
# Request
r1 = requests.get(url)
print(r1.status_code)

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification
coverpage_news = soup1.find_all('a', class_='topic-mini-feed__related-item__link')
len(coverpage_news)

200


32

Now we have a list in which every element is a news article:

In [35]:
coverpage_news[0]

<a alt="Soccer-Late goal saves Gremio in Libertadores semis v Flamengo" aria-label="Soccer-Late goal saves Gremio in Libertadores semis v Flamengo" class="topic-mini-feed__related-item__link" href="/articles/soccer-late-goal-saves-gremio-in-libertadores-semis-v-flamengo-2019-10-02">
  <p class="topic-mini-feed__related-item__title">Soccer-Late goal saves Gremio in Libertadores semis v Flamengo</p>
</a>

In [36]:
n=0
link = coverpage_news[n]['href']
title = coverpage_news[n].get_text()
article = requests.get(url+link)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')



In [37]:
title

'\n  Soccer-Late goal saves Gremio in Libertadores semis v Flamengo\n'

In [38]:
body = soup_article.find_all('p')

In [39]:
body

[<p class="news-insights__subhead">While looking at a biotech might seem counter-intuitive, it makes sense in this context.</p>,
 <p class="topic-mini-feed__related-item__title">Soccer-Late goal saves Gremio in Libertadores semis v Flamengo</p>,
 <p class="topic-mini-feed__related-item__title">Rugby-Mako Vunipola returns on the bench for England's Argentina clash</p>,
 <p class="topic-mini-feed__related-item__title">U.S. committee seeks to interview Boeing engineer on safety of 737 MAX</p>,
 <p class="topic-mini-feed__related-item__title">U.S. committee seeks to interview Boeing engineer on safety of 737 MAX</p>,
 <p class="topic-mini-feed__related-item__title">Japan services activity grows at slightly slower pace in Sept - PMI</p>,
 <p class="topic-mini-feed__related-item__title">PREVIEW-India cenbank set to cut rates again as fiscal measures fail to cheer</p>,
 <p class="topic-mini-feed__related-item__title">Argentine economists forecast deeper recession, hold inflation steady</p>,
 

In [40]:
len(body)

61

In [41]:
x = soup_article.find_all('p')

In [42]:
len(x)

61

In [43]:
x[0].get_text()

'While looking at a biotech might seem counter-intuitive, it makes sense in this context.'

In [44]:
# options = webdriver.ChromeOptions() 
# options.add_experimental_option("excludeSwitches", ['enable-automation'])
# def Article(url):
#     browser = webdriver.Chrome(chrome_options=options, executable_path=r"/usr/bin/chromedriver")
#     browser.get(url)
#     time.sleep(5) # wait for the page to load javascript
#     pageSource = browser.page_source
#     browser.quit()
#     return pageSource

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [45]:
number_of_articles = len(coverpage_news)-1
now = datetime.now()

In [47]:
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []
scrapedTime=[]
postedDate=[]
for n in np.arange(0, number_of_articles):
        
    # Getting the link of the article
    link = coverpage_news[n]['href']
    print (link)
    list_links.append(link)
    posteddate=re.findall(r'/(\d{4})/(\d{1,2})/(\d{1,2})/', link)
    #posteddate=str(posteddate)
    if posteddate:
        posteddate=','.join(posteddate[0]) 
        posteddate=posteddate.replace(',','/')
    print (posteddate)
    postedDate.append(posteddate)
    
    # Getting the title
    title = coverpage_news[n].get_text()
    list_titles.append(title)
    
    
    # Reading the content (it is divided in paragraphs)
    article = requests.get(url+link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    x = soup_article.find_all('p')
    scraped = now.strftime("%d/%m/%Y %H:%M:%S")
    scrapedTime.append(scraped)
    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    news_contents.append(final_article)

/articles/soccer-late-goal-saves-gremio-in-libertadores-semis-v-flamengo-2019-10-02
[]
/articles/rugby-mako-vunipola-returns-on-the-bench-for-englands-argentina-clash-2019-10-02
[]
/articles/u.s.-committee-seeks-to-interview-boeing-engineer-on-safety-of-737-max-2019-10-02
[]
/articles/u.s.-committee-seeks-to-interview-boeing-engineer-on-safety-of-737-max-2019-10-02
[]
/articles/japan-services-activity-grows-at-slightly-slower-pace-in-sept-pmi-2019-10-02
[]
/articles/preview-india-cenbank-set-to-cut-rates-again-as-fiscal-measures-fail-to-cheer-2019-10-02-0
[]
/articles/argentine-economists-forecast-deeper-recession-hold-inflation-steady-2019-10-02
[]
/articles/servicenow-now-stock-moves-1.31%3A-what-you-should-know-2019-10-02
[]
/articles/will-costco-cost-continue-its-run-with-strong-q4-earnings-2019-10-02
[]
/articles/will-q3-earnings-finish-negative-this-year-2019-10-02
[]
/articles/technology-sector-update-for-10-02-2019%3A-ayivhcaaplorcl-2019-10-02-0
[]
/articles/how-to-double-your-

In [48]:
len(postedDate)


31

Let's put them into:
* a dataset which will the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [50]:
df=pd.DataFrame(columns= ["STOCK ID", "URL", "PUBLISH TIME", "EDITED TIME", "SCRAPED TIME", "TITLE", "CONTENT" ])
df['URL']=list_links
df['PUBLISH TIME']=postedDate
df['SCRAPED TIME']=scrapedTime
df['TITLE']=list_titles
df['CONTENT']=news_contents
df

Unnamed: 0,STOCK ID,URL,PUBLISH TIME,EDITED TIME,SCRAPED TIME,TITLE,CONTENT
0,,/articles/soccer-late-goal-saves-gremio-in-lib...,[],,02/10/2019 23:59:44,\n Soccer-Late goal saves Gremio in Libertado...,While looking at a biotech might seem counter-...
1,,/articles/rugby-mako-vunipola-returns-on-the-b...,[],,02/10/2019 23:59:44,\n Rugby-Mako Vunipola returns on the bench f...,While looking at a biotech might seem counter-...
2,,/articles/u.s.-committee-seeks-to-interview-bo...,[],,02/10/2019 23:59:44,\n U.S. committee seeks to interview Boeing e...,While looking at a biotech might seem counter-...
3,,/articles/u.s.-committee-seeks-to-interview-bo...,[],,02/10/2019 23:59:44,\n U.S. committee seeks to interview Boeing e...,While looking at a biotech might seem counter-...
4,,/articles/japan-services-activity-grows-at-sli...,[],,02/10/2019 23:59:44,\n Japan services activity grows at slightly ...,While looking at a biotech might seem counter-...
5,,/articles/preview-india-cenbank-set-to-cut-rat...,[],,02/10/2019 23:59:44,\n PREVIEW-India cenbank set to cut rates aga...,While looking at a biotech might seem counter-...
6,,/articles/argentine-economists-forecast-deeper...,[],,02/10/2019 23:59:44,\n Argentine economists forecast deeper reces...,While looking at a biotech might seem counter-...
7,,/articles/servicenow-now-stock-moves-1.31%3A-w...,[],,02/10/2019 23:59:44,\n ServiceNow (NOW) Stock Moves -1.31%: What ...,While looking at a biotech might seem counter-...
8,,/articles/will-costco-cost-continue-its-run-wi...,[],,02/10/2019 23:59:44,\n Will Costco (COST) Continue Its Run with S...,While looking at a biotech might seem counter-...
9,,/articles/will-q3-earnings-finish-negative-thi...,[],,02/10/2019 23:59:44,\n Will Q3 Earnings Finish Negative This Year?\n,While looking at a biotech might seem counter-...


In [51]:
st=pd.DataFrame(pd.read_csv('c:/data/CS/tickers_list.csv'))
st['Issuer Name'][0].split(',')
issuerName=[]
stockabbre=[]
for i in range(len(st['Issuer Name'])):
    issuername=st['Issuer Name'][i].split(',')
    issuerName.append(issuername[0])
print (issuerName)
stockabbre=st["Symbol"]
stockabbre=list(stockabbre)
stockabbre.append('APPL')
print (stockabbre)
    

['ADOMANI', 'AdvisorShares Trust', 'Allied Esports Entertainment', 'Alta Mesa Resources', 'Alta Mesa Resources', 'BIQI International Holdings Corporation', 'China Lending Corporation', 'China TechFaith Wireless Communication Technology Limited', 'ChinaCache International Holdings Ltd.', 'Cray Inc', "Del Frisco's Restaurant Group", 'Diversicare Healthcare Services Inc.', 'Emmaus Life Sciences', 'Emmaus Life Sciences', 'Finisar Corporation', "Fred's", 'Gladstone Capital Corporation', 'Hunter Maritime Acquisition Corp.', 'Hunter Maritime Acquisition Corp.', 'Hunter Maritime Acquisition Corp.', 'Ivy NextShares', 'Ivy NextShares', 'Ivy NextShares', 'Peak Resorts', 'Repay Holdings Corporation', 'Shutterfly', 'USA Technologies', 'USA Technologies', 'Xynomic Pharmaceuticals Holdings', 'Xynomic Pharmaceuticals Holdings', 'Xynomic Pharmaceuticals Holdings', 'Yangtze River Port and Logistics Limited ']
['ADOM', 'BKCH', 'AESEW', 'AMR', 'AMRWW', 'BIQI', 'CLDC', 'CNTF', 'CCIH', 'CRAY', 'DFRG', 'DVCR

In [52]:
for i in range(len(df['CONTENT'])):
    for j in range(len(st['Symbol'])):
        if st['Symbol'][j] in df['CONTENT'][i] or issuerName[j] in df['CONTENT']:
            df['STOCK ID'][i]=st['Symbol'][j]
df['STOCK ID'].describe()

count     0
unique    0
Name: STOCK ID, dtype: int64

In [53]:
df.to_csv("C:/data/CS/Nasdaq_SCRAPE.csv")