## Guiding questions

Where does Columbia Visits You visit?

In [86]:
from bs4 import BeautifulSoup
from datetime import datetime

import os
import requests
import re
import json

In [88]:
# DUPLICATE OF THE VERSION IN SCRAPER

EVENTS_URLS = {
    'brown': 'https://apply.college.brown.edu/portal/brown-near-you?c=&country=',
    'columbia': 'https://apply.college.columbia.edu/portal/register?c=&country=',
    'mit': 'https://mitadmissions.org/visit/mit-visits-you/fall-travel',
    'princeton': 'https://apply.princeton.edu/portal/upcoming_events?c=&country=',
    'stanford': 'https://apply.stanford.edu/portal/stanfordinyourarea?c=&country=',
    'uchicago': 'https://prospects.uchicago.edu/register/?c=&country=',
    'upenn': 'https://key.admissions.upenn.edu/portal/penn-in-your-town?c=&country=',
    'yale': 'https://apps.admissions.yale.edu/portal/events?c=&country=',    
    'eco': 'http://www.exploringcollegeoptions.org/',
}

In [89]:
documents = {}

for directory in os.listdir('../documents'):
    if directory == '.DS_Store':
        continue
    documents[directory] = sorted(
        [f for f in os.listdir('../documents/' + directory) if EVENT_FNAME_REGEX.match(f) is not None]
    )

In [96]:
# Returns event hrefs from a document
def get_event_hrefs(fname, uni):

    # soup file contents

    html = open('../documents/' + fname, 'r').read()
    soup = BeautifulSoup(html)
    # Find and store information to each region's events

    content = soup.find('div', {'id': 'content'}).form.div
    if content is None:
        content = []
        regions = []
    else:
        regions = content.find_all('ul')
        
    event_hrefs = []
    for region in regions:
        events = region.find_all('li')
        for event in events:
            event_hrefs.append(EVENTS_URLS[uni] + event.a['href'])

    return event_hrefs

In [94]:
EVENT_ID_REGEX = re.compile(r'register\?id=([\d\w-]*)')
LATLNG_REGEX = re.compile(r'center=([^%]*)%2c([^&]*)')
TIME_RANGE_REGEX = re.compile(r'at (\d{1,2}:\d{2} \wM) until (\d{1,2}:\d{2} \wM)')

# Get event duration from a time range
def parse_duration(string):
    times = TIME_RANGE_REGEX.search(string)
    start, end = [datetime.strptime(times.group(i), '%I:%M %p') for i in [1, 2]]
    return (end - start).seconds

EVENTS = {}

# Get information from an event href
def get_event_info(href):
    
    # Check event cache to see if event has already been stored

    event_id = EVENT_ID_REGEX.search(href).group(1)
    if event_id in EVENTS:
        return EVENTS[event_id]
    else:
        print('parsing', href)
    # soup the page and find event information
    
    request = requests.get(href)
    request.raise_for_status()
    soup = BeautifulSoup(request.content)
    event = soup.find('div', {'id': 'content'}).form
    
    # Parse lat/lng of event from event map
    
    map_styles = event.find('div', {'id': 'map'})['style']
    latlng = LATLNG_REGEX.search(map_styles)
    
    # Store event in cache
    
    event_info = {
        'title': event.h1.string,
        'timestamp': event.find('input', {'id': 'form_date'})['value'],
        'duration': parse_duration(event.find('p', {'id': 'register_date'}).contents[1]),
        'venue': event.find('p', {'id': 'register_location'}).contents[0],
        'lat': float(latlng.group(1)),
        'lng': float(latlng.group(2)),
    }

    EVENTS[event_id] = event_info
    
    return event_info

In [97]:
# Populate EVENTS
uni = 'brown'

for f in documents[uni]:
    f = uni + '/' + f
    hrefs = get_event_hrefs(f, uni)
    print(f'=== DOCUMENT {f}, with {len(hrefs)} events')
    for href in hrefs:
        get_event_info(href)

=== DOCUMENT brown/events_2019-05-12T10:29:01.html, with 17 events


AttributeError: 'NoneType' object has no attribute 'group'

In [58]:
EVENTS

{'3ef22548-0d01-4163-8303-a8956ff36f3c': {'title': 'EEE Evening Program in Irvine, CA',
  'timestamp': '2019-04-29T19:00:00',
  'duration': 7200,
  'venue': 'Hilton Irvine/Orange County Airport',
  'lat': 33.675,
  'lng': -117.861},
 '30fd0668-2d90-4373-ab12-4a1926bb0dbd': {'title': 'EEE Evening Program in San Diego, CA',
  'timestamp': '2019-04-30T19:00:00',
  'duration': 7200,
  'venue': 'DoubleTree by Hilton Hotel San Diego - Mission Valley',
  'lat': 32.77,
  'lng': -117.16},
 '40d3cb6f-1740-42b3-a324-224704faa8a3': {'title': 'EEE Evening Program in Berkeley, CA',
  'timestamp': '2019-05-01T19:00:00',
  'duration': 7200,
  'venue': 'DoubleTree by Hilton Hotel Berkeley Marina',
  'lat': 37.868,
  'lng': -122.314},
 '303bb4cf-35ae-47cf-b3f0-3f0656ec8f83': {'title': 'EEE Evening Program in San Jose, CA',
  'timestamp': '2019-05-02T19:00:00',
  'duration': 7200,
  'venue': 'San Jose Marriott',
  'lat': 37.33,
  'lng': -121.888},
 '0bcd2809-11ac-49b6-899d-ef0be516d202': {'title': 'EEE E

In [59]:
EVENTS_LIST = []

for key in EVENTS.keys():
    event = EVENTS[key]
    event['id'] = key
    EVENTS_LIST.append(event)

In [65]:
with open('../data/events.json', 'w') as f:
    f.write(json.dumps(EVENTS_LIST))