## Guiding questions

Where does Columbia Visits You visit?

In [4]:
from bs4 import BeautifulSoup
from datetime import datetime

import os
import requests
import re

In [6]:
# Base URL of the event registration page. The href scraped from each event
# link is appended to it to form that event's full URL.
ROOT = 'https://apply.college.columbia.edu/portal/register'

In [52]:
# Saved event-listing pages are named events_<label>.html; the capture group
# holds the <label> part (everything between 'events_' and the first dot).
EVENT_FNAME_REGEX = re.compile(r'events_([^\.]*)\.html')

# File names of the saved listing pages found in ../documents.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting keeps
#   the processing order (and the printed log) identical on every run.
# - fullmatch(): the whole name must fit the pattern, which rejects look-alikes
#   such as editor backups named events_x.html.bak.
documents = sorted(
    f for f in os.listdir('../documents')
    if EVENT_FNAME_REGEX.fullmatch(f) is not None
)

In [28]:
def get_event_hrefs(fname):
    """Return the full URL of every event linked from a saved listing page.

    Args:
        fname: Name of a file inside ``../documents`` containing the saved
            HTML of an event-listing page.

    Returns:
        A list of strings, one per ``<li>`` inside the ``<ul>`` elements of
        the page's content area, each built as ``ROOT`` + the link's href.

    Raises:
        OSError: if the file cannot be opened.
        AttributeError / TypeError: if the page lacks the expected
            ``div#content > form > div`` structure or a list item has no link.
    """
    # Parse the file that was actually requested. (An earlier version
    # overwrote ``fname`` with one fixed snapshot name, so every document
    # yielded the same links.) The context manager closes the handle promptly.
    with open('../documents/' + fname, 'r') as listing:
        html = listing.read()
    soup = BeautifulSoup(html)

    # Each <ul> under the content area is treated as one region's event list,
    # and each <li> inside it as one event.
    content = soup.find('div', {'id': 'content'}).form.div
    event_hrefs = []
    for region in content.find_all('ul'):
        for event in region.find_all('li'):
            event_hrefs.append(ROOT + event.a['href'])

    return event_hrefs

In [50]:
# Captures the event id that follows 'register?id=' in an event URL.
EVENT_ID_REGEX = re.compile(r'register\?id=([\d\w-]*)')
# Captures the two values on either side of '%2c' after 'center='; callers
# read group 1 as latitude and group 2 as longitude.
LATLNG_REGEX = re.compile(r'center=([^%]*)%2c([^&]*)')
# Matches 'at HH:MM xM until HH:MM xM'; group 1 is the start, group 2 the end.
TIME_RANGE_REGEX = re.compile(r'at (\d{2}:\d{2} \wM) until (\d{2}:\d{2} \wM)')


def parse_duration(string):
    """Return the length in seconds of the time range found in ``string``.

    The text must contain a range such as ``at 07:00 PM until 09:00 PM``.
    Both ends are parsed as 12-hour clock times without a date, and the
    ``seconds`` component of their difference is returned, so an end time
    earlier than the start is measured as if it fell on the following day.

    Raises:
        AttributeError: if no time range is present in ``string``.
        ValueError: if a matched time is not a valid 12-hour clock time.
    """
    clock_format = '%I:%M %p'
    found = TIME_RANGE_REGEX.search(string)
    began = datetime.strptime(found.group(1), clock_format)
    ended = datetime.strptime(found.group(2), clock_format)
    elapsed = ended - began
    return elapsed.seconds

# In-memory cache of parsed events keyed by event id, so that a given event
# page is fetched and parsed only once.
EVENTS = {}


def get_event_info(href, timeout=30):
    """Fetch an event's registration page and return its parsed details.

    Results are stored in ``EVENTS`` under the event id; a later call for the
    same id returns the cached dict without touching the network.

    Args:
        href: Event URL containing ``register?id=<event id>``.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        A dict with these keys:
            title: text of the form's ``<h1>``.
            timestamp: ``value`` attribute of ``input#form_date``.
            duration: event length in seconds, from ``parse_duration`` applied
                to the second child of ``p#register_date``.
            venue: first child of ``p#register_location``.
            location: ``[lat, lng]`` floats read from the map div's style.

    Raises:
        requests.exceptions.Timeout: if the server does not respond in time.
        requests.exceptions.HTTPError: if the response status is an error.
        AttributeError / TypeError: if the URL or page lacks an expected part.
    """
    # Serve repeat requests from the cache.
    event_id = EVENT_ID_REGEX.search(href).group(1)
    if event_id in EVENTS:
        return EVENTS[event_id]
    print('parsing', href)

    # Download and parse the event page.
    response = requests.get(href, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content)
    event = soup.find('div', {'id': 'content'}).form

    # The coordinates are embedded in the map element's inline style.
    map_styles = event.find('div', {'id': 'map'})['style']
    latlng = LATLNG_REGEX.search(map_styles)

    event_info = {
        'title': event.h1.string,
        'timestamp': event.find('input', {'id': 'form_date'})['value'],
        'duration': parse_duration(event.find('p', {'id': 'register_date'}).contents[1]),
        'venue': event.find('p', {'id': 'register_location'}).contents[0],
        'location': [float(latlng.group(1)), float(latlng.group(2))]
    }

    # Remember the result for subsequent calls with the same event id.
    EVENTS[event_id] = event_info

    return event_info

# Populate EVENTS: for every saved listing page, look up each event it links
# to. get_event_info stores each result in EVENTS, so its return value is
# not needed here; events already cached are not fetched again.
for d in documents:
    print('DOCUMENT', d)
    for href in get_event_hrefs(d):
        get_event_info(href)

DOCUMENT events_2019-04-30T10:39:28.html
parsing https://apply.college.columbia.edu/portal/register?id=0bcd2809-11ac-49b6-899d-ef0be516d202
parsing https://apply.college.columbia.edu/portal/register?id=4ef4a03c-e9fe-44df-8277-529962d87156
parsing https://apply.college.columbia.edu/portal/register?id=71cf1c0b-43e1-4b22-9fd0-83d573590ca1
parsing https://apply.college.columbia.edu/portal/register?id=15fdef21-b664-47cb-95eb-b6ec11345989
parsing https://apply.college.columbia.edu/portal/register?id=e33f2094-c0dd-477a-a8b9-6aefb3a13558
parsing https://apply.college.columbia.edu/portal/register?id=1f020ed7-44f9-4fc9-b711-7d6156b5d3a8
parsing https://apply.college.columbia.edu/portal/register?id=09300ce1-83a0-40e9-862c-aa4317981290
parsing https://apply.college.columbia.edu/portal/register?id=cb63f28e-73e6-418c-8f71-c16068cbe683
parsing https://apply.college.columbia.edu/portal/register?id=2aa092e3-705e-4a04-ab0a-68f81679e99b
parsing https://apply.college.columbia.edu/portal/register?id=ccbaa3

In [51]:
# Show the populated event cache (event id -> event details).
EVENTS

{'0bcd2809-11ac-49b6-899d-ef0be516d202': {'title': 'EEE Evening Program in Wilmington, DE',
  'timestamp': '2019-06-05T19:00:00',
  'duration': 7200,
  'venue': 'Doubletree Hotel Wilmington',
  'location': [39.742, -75.548]},
 '4ef4a03c-e9fe-44df-8277-529962d87156': {'title': 'EEE Evening Program in Washington, D.C.',
  'timestamp': '2019-05-21T19:00:00',
  'duration': 7200,
  'venue': 'Washington Marriott Wardman Park',
  'location': [38.925, -77.054]},
 '71cf1c0b-43e1-4b22-9fd0-83d573590ca1': {'title': 'EEE Afternoon Program in Chicago, IL',
  'timestamp': '2019-06-02T14:00:00',
  'duration': 7200,
  'venue': 'Chicago Marriott Oak Brook',
  'location': [41.846, -87.953]},
 '15fdef21-b664-47cb-95eb-b6ec11345989': {'title': 'EEE Evening Program in Chicago, IL',
  'timestamp': '2019-06-02T19:00:00',
  'duration': 7200,
  'venue': 'Palmer House a Hilton Hotel',
  'location': [41.88, -87.627]},
 'e33f2094-c0dd-477a-a8b9-6aefb3a13558': {'title': 'EEE Evening Program in Indianapolis, IN',
 