## Guiding questions

Which locations does the Columbia Visits You program target?

In [25]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import re

In [2]:
# Base URL that each event link's href is appended to when building the
# full event-page URLs.
ROOT = 'https://apply.college.columbia.edu/portal/register'

In [7]:
# Open and parse an HTML file

fname = 'events_2019-05-11T01:23:54.html'
# Read through a context manager so the file handle is closed right away
# instead of lingering until garbage collection.
with open('../documents/' + fname, 'r') as f:
    html = f.read()
# Name the parser explicitly: without it BeautifulSoup uses whichever parser
# happens to be installed (and warns), so the tree can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

# Find and store links to each region's events

# Every <ul> under the content form's first <div> is treated as one region,
# and every <li> inside it as one event whose <a> carries the link.
content = soup.find('div', {'id': 'content'}).form.div
regions = content.find_all('ul')
event_hrefs = [
    ROOT + event.a['href']
    for region in regions
    for event in region.find_all('li')
]

In [47]:
# Captures the two values on either side of the URL-encoded comma (%2c) in a
# `center=<first>%2c<second>` parameter.
LATLNG_REGEX = re.compile(r'center=([^%]*)%2c([^&]*)')
# Captures the start and end clock times of an "at <time> until <time>" phrase.
# The hour accepts one or two digits so unpadded times ("7:00 PM") match as
# well as padded ones ("07:00 PM"); strptime's %I parses both.
TIME_RANGE_REGEX = re.compile(r'at (\d{1,2}:\d{2} \wM) until (\d{1,2}:\d{2} \wM)')

def parse_duration(string):
    """Return the length in seconds of the time range found in *string*.

    Args:
        string: text containing a phrase such as
            ``at 07:00 PM until 09:00 PM`` (12-hour clock with AM/PM).

    Returns:
        The number of seconds between the start and end times, as an int.

    Raises:
        ValueError: if *string* contains no such time range, or if a matched
            time cannot be parsed as a 12-hour clock time.
    """
    times = TIME_RANGE_REGEX.search(string)
    if times is None:
        # Fail with a descriptive error rather than an AttributeError on None.
        raise ValueError(
            'no "at <time> until <time>" range found in %r' % (string,)
        )
    start, end = (datetime.strptime(t, '%I:%M %p') for t in times.groups())
    # timedelta.seconds is the non-negative within-day component, so a range
    # that crosses midnight (end earlier than start) still yields its length.
    return (end - start).seconds

def get_event_info(href, timeout=10):
    """Fetch one event page and extract its details.

    Args:
        href: URL of the event page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        A tuple ``(title, date, duration, venue, lat, lng)``: the page's
        <h1> text, the value of the ``form_date`` input, the event length in
        seconds (from ``parse_duration``), the first node of the
        ``register_location`` paragraph, and the two coordinates as floats.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within *timeout*.
        ValueError: if the map element's style holds no ``center=`` pair,
            or if the date text holds no parseable time range.
    """

    # Create page soup and find event information

    response = requests.get(href, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the tree does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    event = soup.find('div', {'id': 'content'}).form

    # Get title, date, duration, and venue name

    title = event.h1.string
    date = event.find('input', {'id': 'form_date'})['value']
    # The second child node of the date paragraph is the one handed to
    # parse_duration, i.e. the text expected to carry the time range.
    duration = parse_duration(event.find('p', {'id': 'register_date'}).contents[1])
    venue = event.find('p', {'id': 'register_location'}).contents[0]

    # Parse lat/lng of event from event map

    map_styles = event.find('div', {'id': 'map'})['style']
    latlng = LATLNG_REGEX.search(map_styles)
    if latlng is None:
        # Report which page is missing coordinates instead of failing with
        # an AttributeError on None.
        raise ValueError('no map center coordinates found for %s' % href)
    lat, lng = (float(part) for part in latlng.groups())

    return title, date, duration, venue, lat, lng

In [50]:
# Fetch and parse every collected event link in order (one request per link).
list(map(get_event_info, event_hrefs))

[('EEE Evening Program in Wilmington, DE',
  '2019-06-05T19:00:00',
  7200,
  'Doubletree Hotel Wilmington',
  39.742,
  -75.548),
 ('EEE Evening Program in Washington, D.C.',
  '2019-05-21T19:00:00',
  7200,
  'Washington Marriott Wardman Park',
  38.925,
  -77.054),
 ('EEE Afternoon Program in Chicago, IL',
  '2019-06-02T14:00:00',
  7200,
  'Chicago Marriott Oak Brook',
  41.846,
  -87.953),
 ('EEE Evening Program in Chicago, IL',
  '2019-06-02T19:00:00',
  7200,
  'Palmer House a Hilton Hotel',
  41.88,
  -87.627),
 ('EEE Evening Program in Indianapolis, IN',
  '2019-06-05T19:00:00',
  7200,
  'Sheraton Indianapolis',
  39.77,
  -86.159),
 ('EEE Evening Program in Baltimore, MD',
  '2019-05-19T19:00:00',
  7200,
  'Baltimore Marriott Inner Harbor at Camden Yards',
  39.287,
  -76.622),
 ('EEE Evening Program in Gaithersburg, MD',
  '2019-05-20T19:00:00',
  7200,
  'Gaithersburg Marriott Washingtonian Center',
  39.116,
  -77.196),
 ('EEE Evening Program in Detroit, MI',
  '2019-06-