In [1]:
import os

import scrapy
from scrapy.crawler import CrawlerProcess

class ClistSpider(scrapy.Spider):
    """Scrape title, price and URL from San Diego Craigslist search results for "bike".

    After the listings on a page are yielded, the spider follows the "next"
    link until the results counter reaches ``max_results`` or no next link
    is present.
    """

    name = "WS"

    # Search-results page the crawl starts from.
    start_urls = [
        'https://sandiego.craigslist.org/search/sss?query=bike&sort=rel'
        ]

    # Stop following "next" links once the on-page results counter reaches
    # this value. In the recorded run the counter printed 120, 240, 360,
    # i.e. it counts listings covered so far, not pages.
    max_results = 360

    def parse(self, response):
        """Yield one dict per listing, then (optionally) a request for the next page.

        Args:
            response: the Scrapy response for a search-results page.

        Yields:
            dict: ``title``, ``price`` and ``url`` of each listing; a field is
                ``None`` when its XPath matches nothing.
            scrapy.Request: a request for the next results page, parsed by
                this same method, while the counter is below ``max_results``.
        """
        for res in response.xpath('//*[@id="sortable-results"]/ul/li'):

            # Yield a dictionary with the values we want.
            yield {
                'title': res.xpath('*/a[@class="result-title hdrlnk"]/text()').extract_first(),
                'price': res.xpath('*/span[@class="result-price"]/text()').extract_first(),
                'url': res.xpath('*/a[@class="result-title hdrlnk"]/@href').extract_first()
            }

        # Both the "next" link and the results counter live in the same pager element.
        pager = '//*[@id="searchform"]/div[3]/div[3]/span[2]'
        next_page = response.xpath(pager + '/a[3]/@href').extract_first()
        counter_text = response.xpath(pager + '/span[3]/span[1]/span[2]/text()').extract_first()

        try:
            results_seen = int(counter_text)
        except (TypeError, ValueError):
            # Counter missing (None) or not a plain integer: we cannot tell how
            # far we are, so stop paginating instead of raising inside the
            # callback or crawling without a limit.
            return
        print(results_seen)

        # Follow the next page if there is one and the limit is not reached.
        # An empty href is treated like a missing link: joining it would only
        # resolve back to the current page.
        if next_page and results_seen < self.max_results:
            # Request the next page and parse it the same way as this one.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

# Configure and run the crawl. Besides the feed output, the settings cover
# scraping etiquette: obey robots.txt, identify the crawler, throttle requests.
FEED_PATH = 'bikecraigslist.json'

# Scrapy appends to an existing local feed file. A second JSON array written
# after the first makes the file unparseable for pd.read_json, so every run
# starts from a clean file.
if os.path.exists(FEED_PATH):
    os.remove(FEED_PATH)

process = CrawlerProcess({
    'FEED_FORMAT': 'json',         # Store data in JSON format.
    'FEED_URI': FEED_PATH,         # Name our storage file.
    'LOG_ENABLED': False,          # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True
})

# Start the crawler with our spider; start() blocks until the crawl finishes.
process.crawl(ClistSpider)
process.start()
print('Success!')

120
240
360
Success!


In [3]:
import pandas as pd

# Sanity check: reload the exported feed, then show its dimensions and last rows.
data = pd.read_json('bikecraigslist.json')
print(data.shape, data.tail(), sep='\n')

(360, 3)
    price                                              title  \
355  $200          Men's Shogun Chromoly 200 Road Bike 52 cm   
356  $260                      Sunny Health Indoor Spin Bike   
357  $500                                     X-Treme e-bike   
358  None  TREK BIKE NAVIGATOR 14.5 INCH BLACK $175 18 IN...   
359  $170  Giant Hybrid Commuter / Trail bike, Like New c...   

                                                   url  
355  https://sandiego.craigslist.org/nsd/bik/d/mens...  
356  https://sandiego.craigslist.org/csd/spo/d/sunn...  
357  https://sandiego.craigslist.org/csd/bik/d/trem...  
358  https://sandiego.craigslist.org/nsd/for/d/trek...  
359  https://sandiego.craigslist.org/csd/bik/d/gian...  


In [4]:
# Preview the first rows of the scraped listings (price, title, url).
data.head()

Unnamed: 0,price,title,url
0,$220,1969 Schwinn Varsity Sport Road Bike 57cm,https://sandiego.craigslist.org/ssd/bik/d/1969...
1,$60,Bauer/Brievo Tow Hitch Mount - 2 Bike Rack Holder,https://sandiego.craigslist.org/esd/bop/d/baue...
2,$60,20 in Huffy Mt Bike - Brand New - Youth,https://sandiego.craigslist.org/esd/bik/d/20-i...
3,,Motorized bike 4 trade brandnew 185mpg 2 avail...,https://sandiego.craigslist.org/csd/bar/d/moto...
4,$260,Men's Vintage Fuji Cadenza Chrome Mountain Bik...,https://sandiego.craigslist.org/nsd/bik/d/mens...


In [6]:
# Dimensions of the scraped frame as (rows, columns).
data.shape

(360, 3)

In [8]:
# Prices were scraped as text (e.g. '$100'), so describe() reports
# count/unique/top/freq instead of numeric statistics. The price count is
# lower than the row count because some listings have no price.
data.describe()

Unnamed: 0,price,title,url
count,335,360,360
unique,100,323,360
top,$100,Bike platform hitch rack carrier,https://sandiego.craigslist.org/csd/bik/d/56cm...
freq,20,6,1
