In [1]:
# scrape webpage
import scrapy
from scrapy.crawler import CrawlerProcess
# text cleaning
import re

#define a spider for scraping the website
class CakesToCsv(scrapy.Spider):
    """Scrape the birthday-cake gallery of karachibakery.com into a CSV feed.

    Every ``<a class="fancybox">`` element found on the start page is yielded
    as its raw HTML under the ``cake_title`` key. The ``ExtractFirstLine``
    item pipeline registered in ``custom_settings`` then reduces that HTML to
    the ``cake_title`` / ``cake_img`` columns written to the feed.

    Only the single URL listed in ``start_urls`` is requested; ``parse`` does
    not follow any further links.
    """
    name = "CakesToCsv"
    start_urls = [
        'https://www.karachibakery.com/birthday-cakes1?pg=1',  # 1. URL to scrape
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            # 2. Pipeline run on every item yielded by parse(); the class is
            #    looked up in the notebook's own namespace (``__main__``).
            '__main__.ExtractFirstLine': 1
        },
        'FEEDS': {
            # 3. Where the extracted data is saved.
            # NOTE(review): the file name says "kaveri" while the site is
            # Karachi Bakery - confirm the name is intended.
            'kaveri_CAKES.csv': {
                'format': 'csv',    # other feed formats such as json and xml also exist
                'overwrite': True   # replace the file on each run instead of appending
            }
        }
    }

    def parse(self, response):
        """Yield one item per gallery anchor found in ``response``.

        Args:
            response: the downloaded page for one of ``start_urls``.

        Yields:
            dict: ``{'cake_title': <serialized HTML of the anchor element>}``.
        """
        for anchor in response.css('a.fancybox'):
            # .get() is the current spelling of the legacy .extract() on a
            # single Selector; both return the element serialized as a string.
            yield {'cake_title': anchor.get()}

#define extraction logic
class ExtractFirstLine(object):
    """Item pipeline that reduces a raw anchor HTML fragment to CSV columns.

    The incoming item carries the serialized ``<a>`` element under the
    ``cake_title`` key. Only the first line of that fragment is inspected;
    the ``title="..."`` value and the ``img src="..."`` value found on it
    become the ``cake_title`` and ``cake_img`` fields of the returned item.

    A field whose pattern does not occur is returned as ``None`` instead of
    raising ``TypeError`` (subscripting a failed ``re.search``), so a single
    malformed anchor no longer produces a traceback for that item.
    """

    # Same patterns as before, compiled once instead of on every item.
    _TITLE_RE = re.compile(r'title="(.*?)"')
    _IMG_SRC_RE = re.compile(r'img src="(.*?)"')

    def process_item(self, item, spider):
        """Build the output row for one scraped anchor.

        Args:
            item: mapping with the raw anchor HTML under ``cake_title``.
            spider: the spider that produced the item (unused).

        Returns:
            dict: ``{'cake_title': str | None, 'cake_img': str | None}``.
        """
        raw_html = dict(item).get("cake_title") or ""
        lines = raw_html.splitlines()
        # An empty fragment has no lines at all; fall back to "" rather than
        # raising IndexError on lines[0].
        first_line = lines[0] if lines else ""

        return {
            'cake_title': self.__get_cake_title__(first_line),
            'cake_img': self.__get_cake_img_link__(first_line),
        }

    def _first_group(self, pattern, text):
        """Return group 1 of the first ``pattern`` match in ``text``, or None."""
        match = pattern.search(text)
        return match.group(1) if match else None

    def __get_cake_title__(self, text):
        """Return the ``title`` attribute value found in ``text``, or None."""
        return self._first_group(self._TITLE_RE, text)

    def __get_cake_img_link__(self, text):
        """Return the ``img src`` value found in ``text``, or None."""
        return self._first_group(self._IMG_SRC_RE, text)

In [2]:
# Execute the crawler: run the CakesToCsv spider inside this kernel.
# NOTE(review): Twisted's reactor is not restartable, so re-running this cell
# in the same kernel is expected to fail - restart the kernel first; confirm.

process = CrawlerProcess() # create the crawler process (no settings passed here)
process.crawl(CakesToCsv) # schedule the spider class on the crawler
process.start() # start crawling; the log output of the run is printed below the cell

2025-01-07 13:16:46 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-01-07 13:16:46 [scrapy.utils.log] INFO: Versions: lxml 5.2.2.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.11.0, Python 3.10.4 (main, May 25 2024, 00:47:07) [Clang 15.0.0 (clang-1500.3.9.4)], pyOpenSSL 24.3.0 (OpenSSL 3.4.0 22 Oct 2024), cryptography 44.0.0, Platform macOS-15.2-arm64-arm-64bit
2025-01-07 13:16:46 [scrapy.addons] INFO: Enabled addons:
[]
2025-01-07 13:16:46 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-01-07 13:16:46 [scrapy.extensions.telnet] INFO: Telnet Password: 16143df697564bfb
2025-01-07 13:16:46 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2025-01-07 13:16:46 [scrapy.crawler] INFO: Overridden 

---------------
<a class="fancybox" rel="prodgal[]" title="1st Birthday Cake- Code: KB-1-BC-001" href="images/cakes/1stbirthday/big/kbbc-c189-b.jpg"><img src="images/cakes/1stbirthday/small/kbbc-c189-s.jpg"><span class="zoom"></span> </a>
---------------
<a class="fancybox" rel="prodgal[]" title="1st Birthday Cake- Code: KB-1-BC-002" href="images/cakes/1stbirthday/big/kbbc-c190-b.jpg"><img src="images/cakes/1stbirthday/small/kbbc-c190-s.jpg"><span class="zoom"></span> </a>
---------------
<a class="fancybox" rel="prodgal[]" title="1st Birthday Cake- Code: KB-1-BC-003" href="images/cakes/1stbirthday/big/kbbc-c191-b.jpg"><img src="images/cakes/1stbirthday/small/kbbc-c191-s.jpg"><span class="zoom"></span> </a>
---------------
<a class="fancybox" rel="prodgal[]" title="1st Birthday Cake- Code: KB-1-BC-004" href="images/cakes/1stbirthday/big/kbbc-c192-b.jpg"><img src="images/cakes/1stbirthday/small/kbbc-c192-s.jpg"><span class="zoom"></span> </a>
---------------
<a class="fancybox" rel="pro