# Web Scraping: The https://www.reuters.com/finance

In [2]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time

### Obtain list of news from the coverpage

URL definition:

In [33]:
# url definition
url = "https://www.reuters.com/finance"

List of news:

In [34]:
# Request
r1 = requests.get(url)
print(r1.status_code)

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification

coverpage_news = soup1.find_all('div', class_='story-content')
#coverpage_news=coverpage_news+coverpage_news2
print (len(coverpage_news))

coverpage_news

200
25


[<div class="story-content">
 					<a href="/article/us-tesla-deliveries/tesla-deliveries-miss-wall-street-estimates-shares-fall-6-idUSKBN1WH2EB">
 							<h3 class="story-title">
 								Tesla deliveries miss Wall Street estimates; shares fall 6%</h3>
 							</a>
 			        <div class="contributor"></div>
 			        <p>Tesla Inc said on Wednesday its deliveries rose less than 2% in the third quarter, missing Wall Street estimates and sending its shares down nearly 6% in trading after the bell.</p>
 					<time class="article-time">
 							<span class="timestamp">6:07pm EDT</span>
 						</time>
 					</div>, <div class="story-content">
 					<a href="/article/us-airbnb-listing-exclusive/exclusive-morgan-stanley-goldman-sachs-poised-to-lead-airbnbs-listing-sources-idUSKBN1WH2JM">
 							<h3 class="story-title">
 								Exclusive: Morgan Stanley, Goldman Sachs poised to lead Airbnb's listing - sources</h3>
 							</a>
 			        <div class="contributor"></div>
 			        <time 

Now we have a list in which every element is a news article:

In [35]:
coverpage_news[0]


<div class="story-content">
					<a href="/article/us-tesla-deliveries/tesla-deliveries-miss-wall-street-estimates-shares-fall-6-idUSKBN1WH2EB">
							<h3 class="story-title">
								Tesla deliveries miss Wall Street estimates; shares fall 6%</h3>
							</a>
			        <div class="contributor"></div>
			        <p>Tesla Inc said on Wednesday its deliveries rose less than 2% in the third quarter, missing Wall Street estimates and sending its shares down nearly 6% in trading after the bell.</p>
					<time class="article-time">
							<span class="timestamp">6:07pm EDT</span>
						</time>
					</div>

In [36]:
n=0
link = coverpage_news[n].find('a')['href']
title = coverpage_news[n].get_text()
article = requests.get(url+link)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')


In [37]:
title

'\n\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tTesla deliveries miss Wall Street estimates; shares fall 6%\n\t\t\t\t\t\t\t\n\t\t\t        \n\t\t\t        Tesla Inc said on Wednesday its deliveries rose less than 2% in the third quarter, missing Wall Street estimates and sending its shares down nearly 6% in trading after the bell.\n\t\t\t\t\t\n\t\t\t\t\t\t\t6:07pm EDT\n\t\t\t\t\t\t\n\t\t\t\t\t'

In [39]:
body = soup_article.find_all('p')

In [40]:
body

[<p class="BylineBar_reading-time" style="color:undefined">3 Min Read</p>,
 <p>(Reuters) - Tesla Inc said on Wednesday its deliveries rose less than 2% in the third quarter, missing Wall Street estimates and sending its shares down nearly 6% in trading after the bell. </p>,
 <p>Total deliveries came in at 97,000 units for the quarter, below analysts’ estimates of 97,477 vehicles, according to IBES data from Refinitiv. </p>,
 <p>The California-based electric car company led by billionaire Elon Musk is under pressure to increase production and prove that there is sustainable demand for its vehicles, as well as show it can make a profit, even as traditional luxury car makers begin to roll out their own electric models. </p>,
 <p>Recently introduced all-electric SUVs from Audi and Jaguar Land Rover have challenged the company’s S and X models, deliveries of which fell 1.4% to 17,400 from the second quarter and came below analysts’ estimates of 18,829 vehicles. </p>,
 <p>Although third-quar

In [41]:
len(body)

16

In [42]:
x = soup_article.find_all('p')

In [43]:
len(x)

16

In [44]:
x[0].get_text()

'3 Min Read'

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [23]:
number_of_articles = 25

In [24]:
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []

for n in np.arange(0, number_of_articles):
        
#     # We need to ignore "live" pages since they are not articles
#     if "live" in coverpage_news[n].find('a')['href']:  
#         continue
    
    # Getting the link of the article
    link = url+coverpage_news[n].find('a')['href']
    list_links.append(link)
    
    # Getting the title
    title = coverpage_news[n].find('a').get_text()
    list_titles.append(title)
    
    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('div', class_='content__article-body from-content-api js-article__body')
    #x = body[0].find_all('p')
    
    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    news_contents.append(final_article)

In [29]:
len(news_contents)

25

Let's put them into:
* a dataset which will the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [26]:
# df_features
df_features = pd.DataFrame(
     {'Article Content': news_contents 
    })

# df_show_info
df_show_info = pd.DataFrame(
    {'Article Title': list_titles,
     'Article Link': list_links})

In [28]:
df_features['Article Content'][24]

'3 Min Read (Reuters) - The United Auto Workers union said on Tuesday it rejected a new comprehensive offer from General Motors Co to end a two-week-old strike, saying the automaker came up short on several fronts including wages, healthcare and temporary workers.  The union said it made a counterproposal and warned “there are still many important issues that remain unresolved.” Also on Tuesday, GM said the strike by U.S. workers forced it to halt production at its pickup and transmission plants in Silao, Mexico, resulting in temporary layoffs of 6,000 workers.  About 48,000 UAW members went on strike on Sept. 16 seeking higher pay, greater job security, a bigger share of the leading U.S. automaker’s profit and protection of healthcare benefits.  UAW Vice President Terry Dittes told members in a letter the GM offer “came up short” on issues like healthcare, wages, temporary workers and job security, “to name a few.” The union said it is committed “to exploring all options in order to r

In [30]:
df_show_info

Unnamed: 0,Article Title,Article Link
0,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tUAW rejects ...,https://www.reuters.com/finance/article/us-usa...
1,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tFitch downgr...,https://www.reuters.com/finance/article/us-wew...
2,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tJuul hires A...,https://www.reuters.com/finance/article/us-juu...
3,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tJohnson & Jo...,https://www.reuters.com/finance/article/us-joh...
4,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tU.S. SEC pro...,https://www.reuters.com/finance/article/us-usa...
5,"\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tVisa, Master...",https://www.reuters.com/finance/article/us-fac...
6,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tGlobal share...,https://www.reuters.com/finance/article/us-glo...
7,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tCredit Suiss...,https://www.reuters.com/finance/article/us-cre...
8,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tOil prices r...,https://www.reuters.com/finance/article/us-glo...
9,\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tToo big to l...,https://www.reuters.com/finance/article/us-usa...


### Time Elapsed

We are interested in how much time the script takes to get the news because this will impact directly on user experience. For this, we'll put it all into a single function and then call it:

In [30]:
def get_news_themirror():
    
    # url definition
    url = "https://www.mirror.co.uk/"
    
    # Request
    r1 = requests.get(url)
    r1.status_code

    # We'll save in coverpage the cover page content
    coverpage = r1.content

    # Soup creation
    soup1 = BeautifulSoup(coverpage, 'html5lib')

    # News identification
    coverpage_news = soup1.find_all('a', class_='headline publication-font')
    len(coverpage_news)
    
    number_of_articles = 5

    # Empty lists for content, links and titles
    news_contents = []
    list_links = []
    list_titles = []

    for n in np.arange(0, number_of_articles):

        # Getting the link of the article
        link = coverpage_news[n]['href']
        list_links.append(link)

        # Getting the title
        title = coverpage_news[n].get_text()
        list_titles.append(title)

        # Reading the content (it is divided in paragraphs)
        article = requests.get(link)
        article_content = article.content
        soup_article = BeautifulSoup(article_content, 'html5lib')
        body = soup_article.find_all('div', class_='articulo-cuerpo')
        x = soup_article.find_all('p')

        # Unifying the paragraphs
        list_paragraphs = []
        for p in np.arange(0, len(x)):
            paragraph = x[p].get_text()
            list_paragraphs.append(paragraph)
            final_article = " ".join(list_paragraphs)

        news_contents.append(final_article)

    # df_features
    df_features = pd.DataFrame(
         {'Content': news_contents 
        })

    # df_show_info
    df_show_info = pd.DataFrame(
        {'Article Title': list_titles,
         'Article Link': list_links,
         'Newspaper': 'El Pais English'})
    
    return (df_features, df_show_info)

In [31]:
start = time.time()
x, y = get_news_themirror()
end =time.time()
te = end-start
print("The time elapsed is %f seconds" %(te))

UnboundLocalError: local variable 'final_article' referenced before assignment