In [1]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
class IronhackSpider:
    """
    A small page-by-page web scraper.

    The constructor stores its parameters in instance variables so that the
    other methods can access them later.

    url_pattern: printf-style pattern of the web urls to scrape. It is
        formatted with the page number (`url_pattern % page`), so it must
        contain one placeholder such as `%s`.
    pages_to_scrape: how many pages to scrape; pages are numbered from 1.
    sleep_interval: the time interval in seconds to wait between two
        consecutive requests. If <= 0, requests are not delayed.
    content_parser: a callable that receives `response.content` and returns
        the info extracted from the scraped page.
    output_path: path of the csv file the results are appended to.
    """

    def __init__(self, url_pattern, pages_to_scrape=10, sleep_interval=-1, content_parser=None,
                 output_path='webscraping_project.csv'):
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser
        self.output_path = output_path

    def scrape_url(self, url):
        """
        Scrape the content of a single url.

        Downloads `url`, hands the body of a successful (status < 300)
        response to `content_parser` and passes the parsed result to
        `output_results()`. Request errors and any other status code are
        reported with `print` and end the call early.

        Always returns None.
        Raises TypeError when no callable `content_parser` is configured.
        """
        # Fail before touching the network: without a parser the download
        # would be wasted and the failure would surface as an opaque
        # "'NoneType' object is not callable".
        if not callable(self.content_parser):
            raise TypeError("content_parser must be a callable that accepts the response content")

        print(url)

        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.Timeout:
            print("Timeout")
            return None
        except requests.exceptions.TooManyRedirects:
            print("Too Many Redirects")
            return None
        except requests.exceptions.SSLError:
            print("SSL error")
            return None
        except requests.exceptions.RequestException as e:
            # Catch-all for every other requests failure; must stay last.
            print("RequestException", e)
            return None

        status = response.status_code
        if status >= 500:
            print('request failed because the response server encountered an error')
            return None
        if status >= 400:
            print('request failed because the ressource either does not exist or is forbidden')
            return None
        if status >= 300:
            # A 3xx that is still the final answer is not a server error,
            # so it gets its own message instead of the 5xx one.
            print('request failed because the server answered with an unexpected redirect status')
            return None

        self.output_results(self.content_parser(response.content))

    def output_results(self, r):
        """
        Export the scraped content by appending it to the csv file at
        `self.output_path`, one value per row, without header or index.

        r: the values returned by `content_parser` (anything `pd.Series`
           accepts).
        """
        scraped = pd.Series(r)
        # mode='a' appends, so the rows written for earlier pages are kept.
        scraped.to_csv(self.output_path, sep=',', index=False, header=False, mode='a')

    def kickstart(self):
        """
        After the class is instantiated, call this function to start the
        scraping jobs.

        Calls `scrape_url()` for pages 1..pages_to_scrape. When
        `sleep_interval` is larger than 0, the loop sleeps that many seconds
        between two consecutive requests.
        """
        for page in range(1, self.pages_to_scrape + 1):
            # Throttle only *between* requests: there is nothing to wait for
            # before the very first one.
            if page > 1 and self.sleep_interval > 0:
                time.sleep(self.sleep_interval)
            self.scrape_url(self.url_pattern % page)

In [3]:
URL_PATTERN = 'https://montessouricettes.fr/blog-montessori-ief/page/%s/' # printf-style pattern of the urls to scrape; %s is replaced by the page number
PAGES_TO_SCRAPE = 4 # how many webpages to scrape

def quotes_parser(content):
    """
    Custom parser function that is passed to the IronhackSpider class.

    Extracts the post titles from a scraped page: the text of the link found
    inside each `<h2 class="post-title entry-title">` of the first
    `<div class="col-sm-8">`.

    content: the html of the page to parse.
    Returns the list of title texts; the list is empty when the page does not
    contain the expected container.
    """
    soup = BeautifulSoup(content, "lxml")

    # find() returns None when nothing matches (e.g. an error page or a
    # changed layout); treat that as "no titles" instead of crashing.
    container = soup.find('div', {'class': 'col-sm-8'})
    if container is None:
        return []

    titles = []
    for h2_tag in container.find_all('h2', {'class': 'post-title entry-title'}):
        a_tag = h2_tag.find('a')
        # Skip headings that carry no link rather than failing on `.text`.
        if a_tag is not None:
            titles.append(a_tag.text)
    return titles

# Build the spider: default request delay, content parsed by quotes_parser
my_spider = IronhackSpider(
    URL_PATTERN,
    pages_to_scrape=PAGES_TO_SCRAPE,
    content_parser=quotes_parser,
)

# Launch the scraping jobs
my_spider.kickstart()

https://montessouricettes.fr/blog-montessori-ief/page/1/
https://montessouricettes.fr/blog-montessori-ief/page/2/
https://montessouricettes.fr/blog-montessori-ief/page/3/
https://montessouricettes.fr/blog-montessori-ief/page/4/
