# Web Scraping with lazy load

Page to be scraped: El Espectador

In [1]:
# Importing required libraries

import time

from bs4 import BeautifulSoup
from selenium import webdriver

from pymongo import MongoClient

In [2]:
# Creating a connection to MongoDB
client = MongoClient('localhost', 27017)
db = client['news']
collection = db['elespectador']

In [3]:
# Base URL of the site to be analyzed
SITE_URL = 'https://www.elespectador.com'

In [4]:
# Firefox web driver path
# Download the driver for you S.O. here: https://github.com/mozilla/geckodriver/releases
DRIVER_PATH = './geckodriver.exe'

In [5]:
# Creating a new firefox window
browser = webdriver.Firefox(executable_path = DRIVER_PATH)

  browser = webdriver.Firefox(executable_path = DRIVER_PATH)


In [6]:
def make_request(browser, categories):
    if (len(categories)<6):
        test = []
        for x in categories:    
            for i in range(5):
                print(i)
                # Making the request and rendering the browser
                if (i==0):
                    browser.get(SITE_URL + x )
                    print(SITE_URL + x)
                else:
                    print(SITE_URL + x + str(i+1)+'/')
                    browser.get(SITE_URL + x + str(i+1)+'/')
                # Simulating vertical scrolling for handling lazy load
                check_height = browser.execute_script('return document.body.scrollHeight;')
                while True:
                    browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                    time.sleep(3)
                    height = browser.execute_script('return document.body.scrollHeight;')
                    if height == check_height: 
                        break 
                    check_height = height
                test.append(BeautifulSoup(browser.page_source, 'html.parser'))
            # Getting HTML content and passing it to BeautifulSoup for scraping analysis
        return test
    else:
        # Making the request and rendering the browser
        browser.get(SITE_URL + categories)

        # Simulating vertical scrolling for handling lazy load
        check_height = browser.execute_script('return document.body.scrollHeight;')
        while True:
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(3)
            height = browser.execute_script('return document.body.scrollHeight;')
            if height == check_height: 
                break 
            check_height = height

        # Getting HTML content and passing it to BeautifulSoup for scraping analysis
        return BeautifulSoup(browser.page_source, 'html.parser')

In [None]:
# Getting HTML content for a particular news listing page
categories = ['/archivo/judicial/','/archivo/politica/','/archivo/colombia/','/archivo/economia/','/archivo/bogota/']
soup = make_request(browser, categories)

0
https://www.elespectador.com/archivo/judicial/
1
https://www.elespectador.com/archivo/judicial/2/
2
https://www.elespectador.com/archivo/judicial/3/
3
https://www.elespectador.com/archivo/judicial/4/
4
https://www.elespectador.com/archivo/judicial/5/
0
https://www.elespectador.com/archivo/politica/
1
https://www.elespectador.com/archivo/politica/2/
2
https://www.elespectador.com/archivo/politica/3/
3
https://www.elespectador.com/archivo/politica/4/
4
https://www.elespectador.com/archivo/politica/5/
0
https://www.elespectador.com/archivo/colombia/
1
https://www.elespectador.com/archivo/colombia/2/
2
https://www.elespectador.com/archivo/colombia/3/


In [None]:
# Finding the section where news are contained 
layout = []
for i in range(len(soup)):
    layout.append(soup[i].find(class_ = 'Layout-flexAds'))
print(len(layout))


In [None]:
# Getting blocks from layout
blocks = []
for i in range(len(layout)):
    blocks.append(layout[i].find_all(class_ = 'Container Block', recursive = True))
print(len(blocks))

In [None]:
# Finding and concatenating news cards
cards = []
for i in range(len(blocks)):
    cards.append(blocks[i][0].find_all(class_ = 'Card_rowCardLeft') + blocks[i][1].find_all(class_ = 'Card_rowCardLeft'))
print(len(cards))

In [None]:
# Building a list with title and relative path of the news founded

news = []
for i in range(len(cards)):
    for x in range(len(cards[i])):         
        news.append({
            'title': cards[i][x].find('h2', class_ = 'Card-Title').find('a').get_text(),
            'relative_path': cards[i][x].find('h2', class_ = 'Card-Title').find('a')['href']
        })
print(len(news))

In [None]:
news

In [None]:
for n in news:
    # Getting HTML content for each news page
    print(n['relative_path'])
    soup = make_request(browser, n['relative_path'])
    
    # Extracting news metadata
    n['datetime'] = soup.find(class_ = 'ArticleHeader-Date').get_text()
    #n['author'] = soup.find(class_ = 'ACredit-Author').find('a').get_text()
    n['summary'] = soup.find(class_ = 'ArticleHeader-Hook').find('div').get_text()
    
    # Extracting and concatenating news full text
    paragraphs = soup.find_all(class_ = 'font--secondary')
    n['full_text'] = ' '.join([p.get_text() for p in paragraphs])

In [None]:
news

In [None]:
# Storing extracted information for further analysis
collection.insert_many(news)