# This is an intro to web scraping with BeautifulSoup using HTML and CSS tags and attributes - notebook
*This text is in italics*

**This text is bold**


In [207]:
import os #to do system stuff, like making folders

# import the requests library
import requests

# import beautifulsoup
from bs4 import BeautifulSoup

#import pypdf
import pypdf

#import reportlab to create pdfs
from reportlab.pdfgen import canvas

In [324]:
# Set up the 4 working folders used by this notebook: .txt files, PDFs, images and experiment results
## exist_ok=True leaves folders that already exist untouched, so re-running this block is harmless
folder1, folder2, folder3, folder4 = "txtfiles", "PDFs", "images", "results"
# Create each folder inside the current directory
for folder in (folder1, folder2, folder3, folder4):
    os.makedirs(folder, exist_ok=True)
print("Created folders: " + ", ".join((folder1, folder2, folder3, folder4)))


Created folders: txtfiles, PDFs, images, results


In [287]:
# scrape the first page of a website
##remove or add the hashtags to test either craigslist or geological specimen supply!

##Craigslist:
# timeout=30 makes the request fail instead of hanging forever if the server never answers
response = requests.get('https://seattle.craigslist.org/search/roo?hasPic=1#search=1~gallery~0~0', timeout=30)

##geological specimen supply
##response = requests.get('https://geologicalspecimensupply.com/collections/whole-catalog-in-alphabetical-order', timeout=30)

# stop with a clear error if the site answered with a 4xx/5xx status instead of the page we asked for
response.raise_for_status()

# create a beautifulsoup object using the html
# naming the parser explicitly ('html.parser' ships with Python) avoids the "no parser was explicitly specified"
# warning and gives the same parse tree on every machine
soup = BeautifulSoup(response.text, 'html.parser')

In [290]:
# Collect the listing titles. The CSS selector has to match what we found when inspecting the page,
# so it is different for every website:

# craigslist: every <div class="title"> that sits somewhere inside an <a> tag
titles = soup.select("a div.title")

# geological supplies: every <h2> that carries both the classes "h1-style" and "title" (the description section).
# select() takes a CSS selector, so classes are written with dots rather than passed as class_=...:
#titles = soup.select("h2.h1-style.title")
# walk through the matches and print the text of each one
for title in titles:
    print(title.text)

A room in a shared house with Mt. Rainer view available now
1 Furnished Bedroom, shared bathroom
3 Rooms Available in 8 Person Group House
Huge room for rent with spacious wardrobe
1 br available 2/1/25 $800
Amazing 1bd in beautiful 3bd townhouse!
Bedroom available Ready 03/01/25
Shared house in Snoqualmie for male
Furnished Room Available Today
Large Bedroom in Redmond mansion 10-15min from MSFT, 5 min to SpaceX
Very private master suite: two large rooms w/private bath/closet rooms
ROOM AVAILABLE  All Bills Paid  790.00
Private room
Old Town Mukilteo
One big room in house for renting $600
HUGE: Three rooms available, 2 bathrooms
Looking for a roommate who can also help clean, etc
Room for Rent
SPACIOUS 1-BED 1-BATH APARTMENT IMMEDIATE AVAILABILITY
Partial furnished rm (27ft x11ft), private bath &kitchen, hardwood flo
Nice Room + 1/2 in Downtown Renton/ Prefer Female Roommate
Affordable Private Bedroom with Sink and Toilet in U-District
Spacious Newly Renovated Room for Rent Near UW
Af

In [291]:
# Save results to a .txt
file_name = 'txtfiles/aRoom.txt'  # Specify your file name here, change the name if you don't want to overwrite the file!
# encoding='utf-8' so titles containing emoji or accented characters can always be written,
# whatever the default encoding of the operating system is
with open(file_name, 'w', encoding='utf-8') as file:
    for t in titles:
        # Strip leading/trailing whitespace from each title
        title_text = t.text.strip()
        # Write one title per line
        file.write(f"{title_text}\n")

print(f"Titles saved to {file_name}")


Titles saved to txtfiles/aRoom.txt


# Extracting the links to images and saving them, first as links to a .txt file and then going through those links and getting the images


In [292]:
#both examples, the geological and craigslist websites, have images, lets see if we can fetch images from rocks!
##geological specimen supply
# timeout=30 makes the request fail instead of hanging forever if the server never answers
response = requests.get('https://geologicalspecimensupply.com/collections/whole-catalog-in-alphabetical-order', timeout=30)
# stop with a clear error if the site answered with a 4xx/5xx status instead of the page
response.raise_for_status()

# create a beautifulsoup object using the html (explicit parser = same parse tree on every machine)
soup = BeautifulSoup(response.text, 'html.parser')

#for the geological images, we identify the tag img where we can see a link to said image in the src(source) attribute,
images = soup.select("img")

# to get these links, we can iterate through all the elements under img tag and select the src attribute of each of them.
# links that start with "//" have no scheme, so we prepend "https:" to turn them into actual links!

##we can save this to a .txt file.
file_name = 'txtfiles/rocksImgsLinks.txt'  # Specify your file name here, change the name if you don't want to overwrite the file!

# Open the file in write mode
with open(file_name, 'w') as file:
    for img in images:
        # Get the src attribute; .get() returns None when the <img> tag has no src at all
        src = img.get('src')
        if not src:
            # nothing to save for this tag (calling .startswith on None would crash the loop)
            continue
        if src.startswith("//"):
            src = "https:" + src
        # Write one link per line
        file.write(f"{src}\n")
        print(src)

print(f"Image links saved to {file_name}")


https://geologicalspecimensupply.com/cdn/shop/t/10/assets/logo.png?v=41447789289055930281628149658
https://geologicalspecimensupply.com/cdn/shop/products/acid_dropper_bottles_large.jpg?v=1584165902
https://geologicalspecimensupply.com/cdn/shop/products/acid_dropper_bottles_large.jpg?v=1584165902
https://geologicalspecimensupply.com/cdn/shop/products/acid_dropper_bottles_ffe184d7-c9d7-49f5-a95d-0246d3df63a6_large.jpg?v=1589755472
https://geologicalspecimensupply.com/cdn/shop/products/acid_dropper_bottles_ffe184d7-c9d7-49f5-a95d-0246d3df63a6_large.jpg?v=1589755472
https://geologicalspecimensupply.com/cdn/shop/files/agateslicesSt_857bc042-ef33-4879-bd12-df455c37a197_large.jpg?v=1709892790
https://geologicalspecimensupply.com/cdn/shop/files/agateslicesSt_857bc042-ef33-4879-bd12-df455c37a197_large.jpg?v=1709892790
https://geologicalspecimensupply.com/cdn/shop/files/agateslice7.5kpcl3coins_db8b6389-291b-4d07-9c59-d5f397392841_large.jpg?v=1725058153
https://geologicalspecimensupply.com/cdn/sh

In [325]:
# Create a new folder named 'rockImages' (exist_ok=True: no error if it is already there)
folder_name = 'images/rockImages'
os.makedirs(folder_name, exist_ok=True)

# Open the file we just created with the image links
file_name = 'txtfiles/rocksImgsLinks.txt'
with open(file_name, 'r') as file:
    # Strip whitespace from every line and ignore blank lines, which are not links
    img_links = [line.strip() for line in file if line.strip()]

# Download each image and save it to the new folder
for i, link in enumerate(img_links):
    # Get or request the image from the website (we are entering bot territory and websites dont like so many requests!)
    try:
        img_response = requests.get(link, timeout=30)
        # a 4xx/5xx answer is an error page, not an image, so treat it as a failure
        img_response.raise_for_status()
    except requests.RequestException as err:
        # one broken link should not stop the remaining downloads
        print(f"Skipping {link}: {err}")
        continue
    # Create a file name for each image
    # NOTE: every file gets a .jpg extension, whatever the real format of the image is
    img_name = os.path.join(folder_name, f'image_{i+1}.jpg')
    # Save the image
    with open(img_name, 'wb') as img_file:
        img_file.write(img_response.content)
    print(f"Image saved: {img_name}")

print(f"got all imgs and saved them in '{folder_name}' folder.")


Image saved: images/rockImages/image_1.jpg
Image saved: images/rockImages/image_2.jpg
Image saved: images/rockImages/image_3.jpg
Image saved: images/rockImages/image_4.jpg
Image saved: images/rockImages/image_5.jpg
Image saved: images/rockImages/image_6.jpg
Image saved: images/rockImages/image_7.jpg
Image saved: images/rockImages/image_8.jpg
Image saved: images/rockImages/image_9.jpg
Image saved: images/rockImages/image_10.jpg
Image saved: images/rockImages/image_11.jpg
Image saved: images/rockImages/image_12.jpg
Image saved: images/rockImages/image_13.jpg
Image saved: images/rockImages/image_14.jpg
Image saved: images/rockImages/image_15.jpg
Image saved: images/rockImages/image_16.jpg
Image saved: images/rockImages/image_17.jpg
Image saved: images/rockImages/image_18.jpg
Image saved: images/rockImages/image_19.jpg
Image saved: images/rockImages/image_20.jpg
Image saved: images/rockImages/image_21.jpg
Image saved: images/rockImages/image_22.jpg
Image saved: images/rockImages/image_23.j

# With Craigslist, let's find the links to each posting, save them in a .txt file and then go through each link and get images and descriptions.

In [323]:
#Lets test with craigslist, similar methodology, but different tags to look for, we will look for the links to the postings.
# to save time and spare you having to uncomment and modify previous blocks to re-load our soup object with the craigslist website,
#i will do the whole code in one block:

##Craigslist:
# timeout=30 makes the request fail instead of hanging forever if the server never answers
response = requests.get('https://seattle.craigslist.org/search/roo?hasPic=1#search=1~gallery~0~0', timeout=30)
# stop with a clear error if the site answered with a 4xx/5xx status instead of the page
response.raise_for_status()

# create a beautifulsoup object using the html (explicit parser = same parse tree on every machine)
soup = BeautifulSoup(response.text, 'html.parser')

#for craigslist, we select every li tag with the class cl-static-search-result:
results = soup.find_all('li', class_='cl-static-search-result')

#specify the name of the file to contain these links:
file_name = 'txtfiles/roomsLinks.txt'  # Specify your file name here, change the name if you don't want to overwrite the file!

with open(file_name, 'w') as file:
    for result in results:
        a_tag = result.find('a')
        # href stays None when the result has no <a> tag, or the <a> tag has no href attribute
        href = a_tag.get('href') if a_tag else None
        if not href:
            # skip this result: otherwise we would write the previous posting's link again, or the text "None"
            continue
        # Write one link per line
        file.write(f"{href}\n")
        print(href)

print(f"posting links saved to {file_name}")

https://seattle.craigslist.org/skc/roo/d/auburn-room-in-shared-house-with-mt/7822499105.html
https://seattle.craigslist.org/tac/roo/d/university-place-furnished-bedroom/7822495792.html
https://seattle.craigslist.org/see/roo/d/seattle-rooms-available-in-person-group/7822495144.html
https://seattle.craigslist.org/est/roo/d/lynnwood-huge-room-for-rent-with/7820546886.html
https://seattle.craigslist.org/sno/roo/d/marysville-br-available/7820578662.html
https://seattle.craigslist.org/see/roo/d/seattle-amazing-1bd-in-beautiful-3bd/7822490335.html
https://seattle.craigslist.org/kit/roo/d/port-orchard-bedroom-available-ready/7822489707.html
https://seattle.craigslist.org/est/roo/d/snoqualmie-shared-house-in-snoqualmie/7822485774.html
https://seattle.craigslist.org/tac/roo/d/tacoma-furnished-room-available-today/7816920837.html
https://seattle.craigslist.org/est/roo/d/redmond-large-bedroom-in-redmond/7815500824.html
https://seattle.craigslist.org/est/roo/d/sammamish-very-private-master-suite-tw

### Let's open one of the posting links and inspect it to see if we can get a single image and a description 

In [368]:
##Craigslist single post:
# timeout=30 makes the request fail instead of hanging forever if the server never answers
response = requests.get('https://seattle.craigslist.org/kit/roo/d/port-orchard-bedroom-available-ready/7822489707.html', timeout=30)
# stop with a clear error if the posting is gone (4xx/5xx) instead of parsing an error page
response.raise_for_status()

# create a beautifulsoup object using the html
# we name the same parser that the automated loop further down uses, so the patterns we discover here also hold there
soup = BeautifulSoup(response.text, 'html.parser')

#Lets find the right tags and sections:
results = soup.find_all('figure', class_='iw multiimage')

#Iterate through the results and find the 'img' tags
for result in results:
    img_tag = result.find('img')
    if img_tag:
        src = img_tag.get('src')
        # only print real links (.get() returns None when the tag has no src attribute)
        if src:
            print(src)


https://images.craigslist.org/00w0w_gAEUoFW0hT2_0gx0t2_600x450.jpg


In [369]:
# Now the description of this single posting.
# On the posting page it lives inside the <section> tag whose id is "postingbody":

posting_body = soup.find('section', id='postingbody')

if not posting_body:
    # find() found no such section on this page
    print("Section with id 'postingbody' not found.")
else:
    # Take the text of the section, one text fragment per line, without leading/trailing whitespace
    raw_text = posting_body.get_text(separator='\n')
    text_content = raw_text.strip()
    print(text_content)

QR Code Link to This Post





Hello,



We are looking for a roommate to occupy one of our bedrooms that will be available to move-in 03/01/2025. We are very clean and respectful and looking for the same. We have 2-Boston Terriers that are very playful, please be pet friendly. Rent + utilities is $1,250 per month. 



This is a 4-bedroom, 3-bathroom house in a brand new home community. The home is equipped with AC and a large beautiful fenced backyard! 



Bedroom 1 - Occupied by a couple



Bedroom 2 - Occupied by military 



Bedroom 3 - available 



Bedroom 4 - Not for rent



Thank you for looking and if you are interested, please feel free to txt or call me. 😊


In [378]:
# The raw description is full of empty lines and starts with text that is not part of the
# description at all: "QR Code Link to This Post".
# A bit of regex (regular expressions) cleans both up.
#first import re
import re
posting_body = soup.find('section', id='postingbody')

if not posting_body:
    print("Section with id 'postingbody' not found.")
else:
    # Text of the section, one fragment per line, without leading/trailing whitespace
    raw_text = posting_body.get_text(separator='\n')
    text_content = raw_text.strip()

    # Step 1: delete the literal text "QR Code Link to This Post", the newline right after it
    # and any whitespace (spaces, tabs, further newlines) that follows
    text_content = re.sub(r'QR Code Link to This Post\n\s*', '', text_content)

    # Step 2: collapse empty lines. The pattern \n\s*\n reads as:
    #   \n   a newline (the end of a line of text)
    #   \s*  zero or more whitespace characters - spaces, tabs and also newlines
    #        (the * means "repeat the previous item as often as possible, including not at all")
    #   \n   another newline
    # Every stretch that matches is replaced by the second argument, a single '\n',
    # so each line of actual text is followed directly by the next one.
    cleaned_text_content = re.sub(r'\n\s*\n', '\n', text_content)

    print(cleaned_text_content)

Hello,
We are looking for a roommate to occupy one of our bedrooms that will be available to move-in 03/01/2025. We are very clean and respectful and looking for the same. We have 2-Boston Terriers that are very playful, please be pet friendly. Rent + utilities is $1,250 per month. 
This is a 4-bedroom, 3-bathroom house in a brand new home community. The home is equipped with AC and a large beautiful fenced backyard! 
Bedroom 1 - Occupied by a couple
Bedroom 2 - Occupied by military 
Bedroom 3 - available 
Bedroom 4 - Not for rent
Thank you for looking and if you are interested, please feel free to txt or call me. 😊


## Once we have figured out how to get an image and the description from a single post, whilst cleaning it with regex, we can, hopefully, safely assume that the other postings will follow the same patterns we have discovered here. 
### so instead of doing one post at a time, which would be time consuming, now we can attempt to automate this process for each of the 120 postings! 

### Using the links to the individual craigslist postings that we got from the last block, we will:
1. iterate through the list of links
2. open each of them and create a new beautiful soup request, like we did at the beginning of the notebook
3. in this new request, we will scrape the image link, like we did for the rocks, but will also scrape the longer descriptions that come inside each post
4. We will save the images in a folder and save the descriptions in a new .txt file

In [380]:
#import regex
import re

# Create a new folder to save the images (exist_ok=True: no error if it is already there)
folder_name = 'images/roomImgs'
os.makedirs(folder_name, exist_ok=True)

# Specify the name of the file to contain the descriptions
desc_file_name = 'txtfiles/roomsdescriptions.txt'  # Specify your file name here, change it if you dont want to overwrite the file!

# Open the file we made containing the links to each posting
links_file_name = 'txtfiles/roomsLinks.txt'
with open(links_file_name, 'r') as file:
    # Strip whitespace from every line and ignore blank lines, which are not links
    posting_links = [line.strip() for line in file if line.strip()]

# Open the descriptions file in write mode so that as we iterate through the postings, we can write the descriptions into the new .txt file
# encoding='utf-8' because descriptions regularly contain emoji and other characters that the
# default encoding of some operating systems cannot write
with open(desc_file_name, 'w', encoding='utf-8') as desc_file:
    # Iterate through each posting link to scrape the images and descriptions
    for i, link in enumerate(posting_links):
        try:
            post_response = requests.get(link, timeout=30)
            # a 4xx/5xx answer means the posting is gone or we were blocked
            post_response.raise_for_status()
        except requests.RequestException as err:
            # one bad link or deleted posting should not throw away the rest of the run
            print(f"Skipping posting {i+1} ({link}): {err}")
            continue
        post_soup = BeautifulSoup(post_response.text, 'html.parser')

        # Scrape the image links with the same formula we found in the previous step!
        image_tags = post_soup.find_all('figure', class_='iw multiimage')
        img_links = []
        for img_tag in image_tags:
            img = img_tag.find('img')
            if img:
                img_src = img.get('src')
                if img_src:
                    img_links.append(img_src)

        # Save the images to the folder, named image_<posting number>_<image number>.jpg
        for j, img_link in enumerate(img_links):
            try:
                img_response = requests.get(img_link, timeout=30)
                img_response.raise_for_status()
            except requests.RequestException as err:
                # skip just this image; the description of the posting is still worth saving
                print(f"Skipping image {img_link}: {err}")
                continue
            img_name = os.path.join(folder_name, f'image_{i+1}_{j+1}.jpg')
            with open(img_name, 'wb') as img_file:
                img_file.write(img_response.content)
            print(f"Image saved: {img_name}")

        # Scrape the description with the same formula we found previously!
        posting_body = post_soup.find('section', id='postingbody')
        if posting_body:
            # Get the text content with line breaks and strip start/ending whitespaces
            text_content = posting_body.get_text(separator='\n').strip()
            # Remove the very specific text "QR Code Link to This Post" using regex
            text_content = re.sub(r'QR Code Link to This Post\n\s*', '', text_content)
            # Use regex expression to remove empty lines
            cleaned_text_content = re.sub(r'\n\s*\n', '\n', text_content)

            # Write the description to the file
            desc_file.write(f"Description for posting {i+1}:\n{cleaned_text_content}\n\n")
            print(f"Description for posting {i+1}: {cleaned_text_content}")

print("All postings saved! yay!")


Image saved: images/roomImgs/image_1_1.jpg
Description for posting 1: One bedroom with beautiful Mt. Rainer view, in a shared house available now by Lea Hill in Auburn, WA. The house is located in a newer neighborhood. It is close to Green River College and the bus line of 181 &164. Easy access to highway 18. The room can be furnished with a bed/ desk and chair. Rent is $775. Utilities are shared by the roommates.
Please be advised that no Pets allowed.
Background check is required. Call / Text  
show contact info
 if you are interested in
Image saved: images/roomImgs/image_2_1.jpg
Description for posting 2: This is a house. Near bus lines. 10 minutes away from Chambers Bay. Single moms are welcome as I am a single mother as well.  No couples, no pets. Be hygienic, and respectful. Background check required which you pay for, proof of income. Rent includes utilities. $500 deposit, The closet will be emptied, I just wanted to post pictures ASAP.
PREFER FEMALE ROOMMATES
Image saved: image