# Import des bibliothèques

In [1]:
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
import time
#from scrapy.utils.project import get_project_settings

# Scraping de la page simple avec une quote

### Créer un Spider
Une Spider est une classe qui hérite de scrapy.Spider. Elle représente un robot qui va parcourir une ou plusieurs pages web pour en extraire des données.

In [2]:

# On crée une classe RandomQuoteSpider qui hérite de scrapy.Spider. Cela signifie qu'elle utilise les fonctionnalités de base de Scrapy pour naviguer sur le web et extraire des données.

class RandomQuoteSpider(scrapy.Spider):

    # On donne un nom à la Spider pour l'identifier lorsqu'elle sera lancée
    name = "randomquote"

    # URL depuis laquelle la Spider va partir
    start_urls = [
        'http://quotes.toscrape.com/random',
    ]

    # La fonction parse() est automatiquement appelée lorsque la Spider charge une page. Son rôle est d’extraire les informations de la page.
    # parse() prend en paramètre response, qui est une instance de la classe Response contenant le code HTML de la page.

    def parse(self, response):
        
        
        # On récupère l'élément "parent" des éléments qu'on veut scraper. Ici c'est la balise <div> dont la class est "quote".
        quote = response.css('div.quote') 
        
        # Dans cet élément parent, on récupère les sous éléments qui nous intéressent et on les met dans un dictionnaire
        return {
            'text': quote.css('span.text::text').get(), # le ::text signifie qu'on veut récupérer le text à l'intérieur de la balise identifiée par span.text
            'author': quote.css('span small.author::text').get(), # on va chercher la balise small.author contenue dans un span
            'tags': quote.css('div.tags a.tag::text').getall() #get ALL car il y a plusieurs éléments potentiellement
        }


### Créer un crawler

In [3]:
# Name of the file that will receive the results of the scraping
filename = "1_random_quote.json"
output_path = 'src/' + filename

# Remove the output of a previous run, if any.
# os.path.isfile() is used rather than os.listdir('src/') so that this cell
# does not fail with FileNotFoundError when the 'src/' directory is missing.
if os.path.isfile(output_path):
    os.remove(output_path)

# CrawlerProcess instance that will drive the scraping
process = CrawlerProcess(settings = {
    # Pretend to be a browser so the site does not block the requests
    'USER_AGENT':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    # Verbosity of the log messages emitted during the crawl
    'LOG_LEVEL': logging.INFO,
    # Where (and in which format) the extracted items are saved
    "FEEDS": {
        output_path : {"format": "json"},
    }
})

# Register the spider and start the scraping process.
# NOTE(review): Twisted's reactor cannot be restarted in the same process, so
# process.start() is expected to work only once per kernel -- restart the
# kernel before running another crawler cell.
process.crawl(RandomQuoteSpider)
process.start()

2025-02-21 11:05:23 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2025-02-21 11:05:23 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.10.0, Python 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 23.2.0 (OpenSSL 3.1.3 19 Sep 2023), cryptography 41.0.3, Platform Windows-10-10.0.26100-SP0
2025-02-21 11:05:23 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)



2025-02-21 11:05:24 [scrapy.extensions.telnet] INFO: Telnet Password: c5f2cfe1931d711b
2025-02-21 11:05:24 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2025-02-21 11:05:25 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.dow

# Scraping de la page avec toutes les quotes

### Créer un spider

In [None]:
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes every quote found on page 1 of quotes.toscrape.com."""

    # Identifier of the spider
    name = "quotes"

    # Page the crawl starts from
    start_urls = ['http://quotes.toscrape.com/page/1']

    def parse(self, response):
        """Yield one dict ('text', 'author', 'tags') per quote block of the page."""
        for quote_block in response.css('div.quote'):
            quote_text = quote_block.css('span.text::text').get()
            quote_author = quote_block.css('span small.author::text').get()
            quote_tags = quote_block.css('div.tags a.tag::text').getall()
            yield dict(text=quote_text, author=quote_author, tags=quote_tags)


### Créer un crawler

In [4]:
# Name of the file that will receive the results of the scraping
filename = "2_quotes.json"
output_path = 'src/' + filename

# Remove the output of a previous run, if any.
# os.path.isfile() is used rather than os.listdir('src/') so that this cell
# does not fail with FileNotFoundError when the 'src/' directory is missing.
if os.path.isfile(output_path):
    os.remove(output_path)

# Create the CrawlerProcess
process = CrawlerProcess(settings = {
    'USER_AGENT':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    # Where (and in which format) the extracted items are saved
    "FEEDS": {
        output_path : {"format": "json"},
    }
})

# Run the crawler with the spider.
# NOTE(review): Twisted's reactor cannot be restarted in the same process, so
# process.start() is expected to work only once per kernel -- restart the
# kernel before running another crawler cell.
process.crawl(QuotesSpider)
process.start()

2025-02-15 17:21:48 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2025-02-15 17:21:48 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.10.0, Python 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 23.2.0 (OpenSSL 3.1.3 19 Sep 2023), cryptography 41.0.3, Platform Windows-10-10.0.22631-SP0
2025-02-15 17:21:48 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-02-15 17:21:48 [scrapy.extensions.telnet] INFO: Telnet Password: 31bf8e56a0310cb8
2025-02-15 17:21:48 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.fee

# Crawler plus d'une page, naviguer entre les pages

### Spider

In [2]:
class QuotesMultipleSpider(scrapy.Spider):
    """Spider that scrapes the quotes of a page, then follows the "next" link."""

    # Identifier of the spider
    name = "quotesmultiplespider"

    # Page the crawl starts from
    start_urls = ['http://quotes.toscrape.com/page/1']

    def parse(self, response):
        """Yield one dict per quote, then a request for the next page if any.

        The follow-up request uses this same method as its callback.
        """
        for quote_block in response.css('div.quote'):
            quote_text = quote_block.css('span.text::text').get()
            quote_author = quote_block.css('span small.author::text').get()
            quote_tags = quote_block.css('div.tags a.tag::text').getall()
            yield dict(text=quote_text, author=quote_author, tags=quote_tags)

        # Link held by the <a> tag inside the <li> of class "next";
        # None when the attribute dict has no "href" entry.
        next_page = response.css('li.next a').attrib.get("href")
        if next_page is None:
            logging.info("No next page !")
        else:
            yield response.follow(next_page, callback=self.parse)
            

### Crawler

In [3]:
# Name of the file that will receive the results of the scraping
filename = "3_quotesmultiplepages.json"
output_path = 'src/' + filename

# Remove the output of a previous run, if any.
# os.path.isfile() is used rather than os.listdir('src/') so that this cell
# does not fail with FileNotFoundError when the 'src/' directory is missing.
if os.path.isfile(output_path):
    os.remove(output_path)

# Create the CrawlerProcess
process = CrawlerProcess(settings = {
    'USER_AGENT':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    # Where (and in which format) the extracted items are saved
    "FEEDS": {
        output_path : {"format": "json"},
    }
})

# Run the crawler with the spider.
# NOTE(review): Twisted's reactor cannot be restarted in the same process, so
# process.start() is expected to work only once per kernel -- restart the
# kernel before running another crawler cell.
process.crawl(QuotesMultipleSpider)
process.start()

2025-02-21 12:14:36 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2025-02-21 12:14:36 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.10.0, Python 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 23.2.0 (OpenSSL 3.1.3 19 Sep 2023), cryptography 41.0.3, Platform Windows-10-10.0.26100-SP0
2025-02-21 12:14:36 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-02-21 12:14:36 [scrapy.extensions.telnet] INFO: Telnet Password: eefc5aa935ddc780
2025-02-21 12:14:36 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.fee

# Crawler lemonde.fr/economie

In [None]:
class ArticlesLemondeEconomie(scrapy.Spider):
    """Spider that scrapes the article teasers of the lemonde.fr economy page."""

    # Identifier of the spider
    name = "lemonde_spider"

    # Page the crawl starts from
    start_urls = ['https://www.lemonde.fr/economie/']

    def parse(self, response):
        """Yield one dict ('titre', 'desc', 'date', 'author') per teaser section."""
        # Output key -> CSS selector, evaluated relative to each teaser
        field_selectors = {
            'titre': 'h3.teaser__title::text',
            'desc': 'p.teaser__desc::text',
            'date': 'span.meta__date::text',
            'author': 'span.meta__author::text',
        }

        # Original author's note: 'div.thread' also works as the parent selector
        for teaser in response.css('section.teaser'):
            yield {
                field: teaser.css(selector).get()
                for field, selector in field_selectors.items()
            }

        
            

In [3]:
# Name of the file that will receive the results of the scraping
filename = "article_lemonde_economie.json"
output_path = 'src/' + filename

# Remove the output of a previous run, if any.
# os.path.isfile() is used rather than os.listdir('src/') so that this cell
# does not fail with FileNotFoundError when the 'src/' directory is missing.
if os.path.isfile(output_path):
    os.remove(output_path)

# Create the CrawlerProcess
process = CrawlerProcess(settings = {
    'USER_AGENT':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    # Where (and in which format) the extracted items are saved
    "FEEDS": {
        output_path : {"format": "json"},
    }
})

# Run the crawler with the spider.
# NOTE(review): Twisted's reactor cannot be restarted in the same process, so
# process.start() is expected to work only once per kernel -- restart the
# kernel before running another crawler cell.
process.crawl(ArticlesLemondeEconomie)
process.start()

2025-02-21 11:42:18 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2025-02-21 11:42:18 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.10.0, Python 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 23.2.0 (OpenSSL 3.1.3 19 Sep 2023), cryptography 41.0.3, Platform Windows-10-10.0.26100-SP0
2025-02-21 11:42:18 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-02-21 11:42:18 [scrapy.extensions.telnet] INFO: Telnet Password: b9e68a1ce3f86d44
2025-02-21 11:42:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.fee

# Authentification puis scraping

### Spider

In [None]:
class QuotesLogin(scrapy.Spider):
    """Spider that submits the login form, then scrapes the page it lands on."""

    # Identifier of the spider
    name = "login"

    # Page the crawl starts from (the login form)
    start_urls = ['https://quotes.toscrape.com/login']

    def parse(self, response):
        """Build the form submission request from the login page.

        The form found in ``response`` is filled with the credentials below and
        the resulting response is handed to ``after_login``.
        """
        credentials = {'username':'john', 'password':'secret'}
        login_request = scrapy.FormRequest.from_response(
            response,
            formdata=credentials,
            callback=self.after_login,
        )
        return login_request

    def after_login(self, response):
        """Yield one dict ('text', 'author', 'tags') per quote block of the page."""
        for quote_block in response.css('div.quote'):
            quote_text = quote_block.css('span.text::text').get()
            quote_author = quote_block.css('span small.author::text').get()
            quote_tags = quote_block.css('div.tags a.tag::text').getall()
            yield dict(text=quote_text, author=quote_author, tags=quote_tags)

    

### Crawler

In [None]:
# Name of the file that will receive the results of the scraping
filename = "4_quotesauthentication.json"
output_path = 'src/' + filename

# Remove the output of a previous run, if any.
# os.path.isfile() is used rather than os.listdir('src/') so that this cell
# does not fail with FileNotFoundError when the 'src/' directory is missing.
if os.path.isfile(output_path):
    os.remove(output_path)

# Create the CrawlerProcess
process = CrawlerProcess(settings = {
    'USER_AGENT':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    # Where (and in which format) the extracted items are saved
    "FEEDS": {
        output_path : {"format": "json"},
    }
})

# Run the crawler with the spider.
# NOTE(review): Twisted's reactor cannot be restarted in the same process, so
# process.start() is expected to work only once per kernel -- restart the
# kernel before running another crawler cell.
process.crawl(QuotesLogin)
process.start()