In [1]:
#import get to call a get request on the site
from requests import get

#Source Code taken from the following Site - https://towardsdatascience.com/web-scraping-craigslist-a-complete-tutorial-c41cea4f4981

#get the first page of bikes from craigslist with narrowed search
response = get('https://newyork.craigslist.org/search/bia?min_price=500&max_price=1500')
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')

#get the macro-container for the bike posts
posts = html_soup.find_all('li', class_= 'result-row')
print(type(posts)) #to double check that I got a ResultSet
print(len(posts)) #to double check I got 120 (elements/page)

<class 'bs4.element.ResultSet'>
120


In [2]:
#grab the first post
post_one = posts[0]

In [3]:
#grab the price of the first post
post_one_price = post_one.a.text
post_one_price.strip()

'$500'

In [4]:
#grab the time of the post in datetime format to save on cleaning efforts
post_one_time = post_one.find('time', class_= 'result-date')
post_one_datetime = post_one_time['datetime']

In [5]:
#title is a and that class, link is grabbing the href attribute of that variable
post_one_title = post_one.find('a', class_='result-title hdrlnk')
post_one_link = post_one_title['href']

#easy to grab the post title by taking the text element of the title variable
post_one_title_text = post_one_title.text


In [6]:
#the neighborhood is grabbed by finding the span class 'result-hood' and pulling the text element from that
post_one_hood = posts[0].find('span', class_='result-hood').text

In [7]:
#build out the loop
from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np


#find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text) #pulled the total count of posts as the upper bound of the pages array

#each page has 119 posts so each new page is defined as follows: s=120, s=240, s=360, and so on. So we need to step in size 120 in the np.arange function
pages = np.arange(0, results_total+1, 120)

iterations = 0

post_timing = []
post_hoods = []
post_title_texts = []
post_links = []
post_prices = []

for page in pages:
    
    #get request
    response = get("https://newyork.craigslist.org/d/for-sale/search/sss" 
                   + "s=" #the parameter for defining the page number 
                   + str(page) #the page number in the pages array from earlier
                   + "&hasPic=1"
                   + "&availabilityMode=0")

    sleep(randint(1,5))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
        
    #extract data item-wise
    for post in posts:

        if post.find('span', class_ = 'result-hood') is not None:

            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)
            
            #removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            post_price = (post.a.text.strip().replace("$", ""))
            post_prices.append(post_price)
            
            
                
    iterations += 1
    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!


Scrape complete!


In [8]:
import pandas as pd

cl_roadbikes = pd.DataFrame({'posted': post_timing,
                       'neighborhood': post_hoods,
                       'post title': post_title_texts,
                        'URL': post_links,
                        'price': post_prices})
print(cl_roadbikes.info())
cl_roadbikes.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 642 entries, 0 to 641
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   posted        642 non-null    object
 1   neighborhood  642 non-null    object
 2   post title    642 non-null    object
 3   URL           642 non-null    object
 4   price         642 non-null    object
dtypes: object(5)
memory usage: 25.2+ KB
None


Unnamed: 0,posted,neighborhood,post title,URL,price
0,2020-10-01 20:38,(Upper West Side),Bike Ross Apollo 3Sp Stickshifter,https://newyork.craigslist.org/mnh/bik/d/new-y...,500
1,2020-10-01 20:23,(Staten Island N Y),WANTED: SCHWINN ORANGE KRATE BIKE,https://newyork.craigslist.org/stn/bik/d/state...,1234
2,2020-10-01 20:06,(Brooklyn),Folding E-Bike-Blix Vika-like new,https://newyork.craigslist.org/brk/bik/d/brook...,1300
3,2020-10-01 20:03,(Brooklyn),Fuji touring bike. Very solid,https://newyork.craigslist.org/brk/bik/d/brook...,740
4,2020-10-01 20:03,(Brooklyn),Very nice all redone trek fast track 470,https://newyork.craigslist.org/brk/bik/d/brook...,540
5,2020-10-01 19:55,(Brooklyn),Bridgestone MB-3 Mountain Bike. 23 in. Excell...,https://newyork.craigslist.org/brk/bik/d/brook...,500
6,2020-10-01 19:53,"(Southampton, NY)",Ultra Motors Metro Electric Bicycle Bike Picku...,https://newyork.craigslist.org/lgi/bik/d/south...,750
7,2020-10-01 19:49,(West Village),2 Vintage Raleigh Folding Bikes - $500 for both!,https://newyork.craigslist.org/mnh/bik/d/new-y...,500
8,2020-10-01 19:48,(Oakland Gardens),Brand new,https://newyork.craigslist.org/que/bik/d/oakla...,525
9,2020-10-01 19:39,(Battery Park),New 36 volt electric bike - www.electricbikesa...,https://newyork.craigslist.org/mnh/bik/d/new-y...,899


In [9]:
#cl_roadbikes.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
cl_roadbikes.to_csv('cl_roadbikes.csv')