# HW4

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Web Scraping
#
# Fetches the Data Science group page, pulls the per-section data-* attributes
# off every section link, and loads them into the DataFrame `df`.

# 1. Define the target URL and use a session (so we can attach cookies for logged-in view)
url = "https://hooslist.virginia.edu/1262/Group/DataScience"
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging the kernel on a stalled connection
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'})

# Optional: use your browser's cookies to get real locations. Copy the cookie
# value from your browser's DevTools into BROWSER_COOKIES and flip the flag.
# NOTE: a pasted session cookie is a credential -- clear it again before
# sharing or committing this notebook.
USE_LOGGED_IN_SESSION = False  # set True after pasting cookie below
BROWSER_COOKIES = {}  # e.g. {'_hooslist_session': 'paste_value_from_devtools'}
if USE_LOGGED_IN_SESSION and BROWSER_COOKIES:
    for name, value in BROWSER_COOKIES.items():
        session.cookies.set(name, value, domain='hooslist.virginia.edu')

response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
# Raise on 4xx/5xx so an error page is never parsed as if it were the listing.
response.raise_for_status()

# 2. Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# 3. Find all section links — data lives in <a> tags with class js-section-link (data-* attributes)
section_links = soup.find_all('a', class_='js-section-link')
print(f'Found {len(section_links)} section links')

# 4. Extract only fields needed for HW4 (4.2 schema + 4.3 queries):
#   - 4.3.1: subject, course name, meeting time, location, instructor
#   - 4.3.2: course name, meeting time, location, professor
#   - 4.3.3/4.3.4: room + meeting time for conflict detection
#   - 4.3.5: enrollment (salary not on Lou's List; add separately)
# Maps each DataFrame column to the data-* attribute it is read from. Keeping
# the mapping in one place means the column list below cannot drift from it.
SECTION_FIELD_ATTRS = {
    'class_number': 'data-classnumber',
    'title': 'data-title',
    'instructors': 'data-instructors',
    'subject': 'data-subject',
    'section': 'data-sectioncode',
    'term': 'data-term',
    'meeting_time': 'data-meetings',  # days, start, end
    'location': 'data-location',
    'enrollment': 'data-enrollment',
}
# link.get() returns None for a missing attribute, so every row has every key.
rows = [
    {field: link.get(attr) for field, attr in SECTION_FIELD_ATTRS.items()}
    for link in section_links
]

# 5. Load into Pandas
# Passing the columns explicitly keeps the schema intact even when no section
# links were found; otherwise the empty frame has no 'class_number' column and
# the dropna below raises KeyError.
df = pd.DataFrame(rows, columns=list(SECTION_FIELD_ATTRS))
# Drop rows where we didn't get a class number (e.g. header or non-section links)
df = df.dropna(subset=['class_number']).reset_index(drop=True)
print(f'Extracted {len(df)} course sections')
print(df[['class_number', 'title', 'instructors', 'subject', 'section', 'meeting_time', 'location', 'enrollment']].head(10))

Found 169 section links
Extracted 169 course sections
  class_number                         title       instructors subject  \
0        15256    Foundation of Data Science      Brian Wright      DS   
1        15257    Foundation of Data Science        Ali Rivera      DS   
2        15296    Foundation of Data Science               TBD      DS   
3        15258    Foundation of Data Science        Ali Rivera      DS   
4        15259    Foundation of Data Science               TBD      DS   
5        15260    Foundation of Data Science               TBD      DS   
6        15248  Programming for Data Science       Mai Dahshan      DS   
7        15295  Programming for Data Science       Mai Dahshan      DS   
8        15246          Data Science Systems       Jon Tupitza      DS   
9        15251          Data Science Systems  Jason Williamson      DS   

  section                                       meeting_time location  \
0     100  [{"days":"Mo|We","start":"02:00 PM","end":"03:.

# 4.1

Confidence: 7