In [1]:
import requests # http requests over the internet
from bs4 import BeautifulSoup

In [2]:
site = requests.get('https://translegislation.com/bills/2024/US')
html_source_code = site.content
soup = BeautifulSoup(html_source_code, 'lxml')

In [3]:
soup.title

<title>United States Bills | Anti-trans legislation</title>

In [4]:
soup.h3['class']

['chakra-heading', 'css-1vygpf9']

In [5]:
soup.find('h3', class_='css-1vygpf9').text

'US HB1064'

In [6]:
titles = soup.find_all('h3', class_='css-1vygpf9')

In [7]:
# for i in titles:
#     print(i.text)

In [8]:
soup.h3.a['href']

'/bills/2024/US/HB1064'

In [9]:
extention = soup.h3.a['href']

In [10]:
# f-strings

url = 'https://translegislation.com'

In [11]:
url + extention

'https://translegislation.com/bills/2024/US/HB1064'

In [12]:
### Goal: scrape each item from each bill card, save it to a CSV

# 1. identify the HTML element that encompasses all the bill cards
# 2. identify the specific elements within each bill card
# 3. process our data into lists
# 4. save our data as a csv file

In [13]:
cards = soup.find('div', class_='css-1ftdpv0')

In [14]:
cards.text

'US HB1064MILITARYINTRODUCEDEnsuring Military Readiness Act of 2023To provide requirements related to the eligibility of transgender individuals from serving in the Armed Forces.Transgender persons who require or have undergone gender transition are disqualified from military service.View BillUS HB1112MILITARYINTRODUCEDEnsuring Military Readiness Act of 2023To provide requirements related to the eligibility of individuals who identify as transgender from serving in the Armed Forces.View BillUS HB1276HEALTHCAREINTRODUCEDProtect Minors from Medical Malpractice Act of 2023To protect children from medical malpractice in the form of gender transition procedures.A medical practitioner, in any circumstance described in subsection (c), who performs a gender-transition procedure on an individual who is less than 18 years of age shall, as described in subsection (b), be liable to the individual if injured (including any physical, psychological, emotional, or physiological harms) by such procedur

In [15]:
type(cards)

bs4.element.Tag

In [16]:
# grabbing all of the cards individually using "find_all()"
cards = soup.find_all('div', class_ = 'css-4rck61')

In [17]:
titles = []
categories = []
captions = []
descriptions = []
links = []

for card in cards:
    title = card.h3.text # print the title
    category = card.span.text # category
    caption = card.h2.text # caption
    if card.p: # description
        description = card.p.text
    else:
        description = 'N/a'
    link = f"https://translegislation.com{card.a['href']}"

    titles.append(title)
    categories.append(category)
    captions.append(caption)
    descriptions.append(description)
    links.append(link)

In [18]:
import pandas as pd

In [19]:
df = pd.DataFrame({
    'title': titles,
    'category': categories,
    'caption': captions,
    'description': descriptions,
    'link': links
})

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        80 non-null     object
 1   category     80 non-null     object
 2   caption      80 non-null     object
 3   description  80 non-null     object
 4   link         80 non-null     object
dtypes: object(5)
memory usage: 3.3+ KB


In [21]:
df.to_csv('data.csv')

In [22]:
site = requests.get('https://www.moma.org/collection/?classifications=any&date_begin=Pre-1850&date_end=2024&include_uncataloged_works=false&on_view=false&q=van+gogh&recent_acquisitions=false&with_images=true')
source = site.content
soup = BeautifulSoup(source, 'lxml')

In [23]:
soup

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]--><!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]--><!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="/cdn-cgi/styles/cf.errors.css" id="cf_styles-css" rel="stylesheet"/>
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->
<style>body{margin:0;padding:0}</style>
<!--[if gte IE 10]><!-->
<script>
  if (!navigator.cookieEnabled) {
    window.addEventListener('DOMContentLoaded', function () {