## Guiding questions

Where does Columbia Visits You visit?

In [78]:
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime

import os
import requests
import re
import json

In [139]:
# DUPLICATE OF THE VERSION IN SCRAPER

# Event-listing page of each source, keyed by the short name that the rest of
# the notebook uses to look up that source's saved documents.
EVENTS_URLS = dict(
    brown='https://apply.college.brown.edu/portal/brown-near-you',
    columbia='https://apply.college.columbia.edu/portal/register',
    mit='https://mitadmissions.org/visit/mit-visits-you/fall-travel',
    princeton='https://apply.princeton.edu/portal/upcoming_events',
    stanford='https://apply.stanford.edu/portal/stanfordinyourarea',
    uchicago='https://prospects.uchicago.edu/register/?c=&country=',
    upenn='https://key.admissions.upenn.edu/portal/penn-in-your-town?c=&country=',
    yale='https://apps.admissions.yale.edu/portal/events',
    eco='http://www.exploringcollegeoptions.org',
)

In [150]:
# Maps each sub-directory of ../documents to the sorted names of the files in
# it whose name contains 'events_'.
documents = {}

for directory in os.listdir('../documents'):
    directory_path = os.path.join('../documents', directory)
    # Only real directories can be listed. This skips '.DS_Store' as before,
    # and also any other stray file that would otherwise make the inner
    # os.listdir() call raise NotADirectoryError.
    if not os.path.isdir(directory_path):
        continue
    documents[directory] = sorted(
        fname for fname in os.listdir(directory_path) if 'events_' in fname
    )

In [172]:
# Returns event hrefs from a document
def get_event_hrefs(fname, uni):
    """Extract the per-event links from one saved event-listing page.

    Args:
        fname: Path of the saved HTML document relative to ``../documents/``.
        uni: Key into ``EVENTS_URLS``; selects which page layout is parsed and
            which listing URL the event links are appended to.

    Returns:
        A list of event URLs. For ``'mit'`` these are the ``href`` values of
        the links whose text is exactly ``RSVP``, excluding the generic
        ``https://mitadmissions.org/rsvp-az/`` page. For every other value of
        ``uni``, each ``<li>`` link found inside the listing form is appended
        to that university's listing URL with its query string removed. The
        list is empty when the page has no recognisable event listing.
    """
    # soup file contents; the context manager closes the handle promptly
    with open('../documents/' + fname, 'r') as document:
        html = document.read()
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one so results do not vary between environments.
    soup = BeautifulSoup(html)

    if uni == 'mit':
        content = soup.find('div', {'id': 'page-text-mod'})
        if content is None:
            return []
        # .get() tolerates an RSVP anchor that carries no href attribute.
        rsvps = [
            a.get('href') for a in content.find_all('a')
            if a.text == 'RSVP'
        ]
        return [
            r for r in rsvps
            if r is not None and r != 'https://mitadmissions.org/rsvp-az/'
        ]

    # Find the element that wraps each region's events
    if uni == 'uchicago':
        content = soup.find('div', {'class': 'content2'})
    else:
        content = soup.find('div', {'id': 'content'})

    # A page missing the wrapper, its form, or the form's first div lists no
    # events; treat all three the same way instead of raising AttributeError.
    form = content.form if content is not None else None
    listing = form.div if form is not None else None
    if listing is None:
        return []

    event_hrefs = []
    for region in listing.find_all('ul'):
        for event in region.find_all('li'):
            link = event.a
            # A list item without a link cannot be turned into an event URL.
            if link is None or link.get('href') is None:
                continue
            event_hrefs.append(EVENTS_URLS[uni].split('?')[0] + link['href'])

    return event_hrefs

In [195]:
EVENT_ID_REGEX = re.compile(r'\?id=([\d\w-]*)')
LATLNG_REGEX = re.compile(r'center=([^%]*)%2c([^&]*)')
TIME_RANGE_REGEX = re.compile(r'at (\d{1,2}:\d{2} \wM)( until (\d{1,2}:\d{2} \wM))?')

# Get event duration from a time range
def parse_duration(string):
    """Return an event's length in seconds from its "at ... until ..." text.

    Args:
        string: Text containing a start time and, optionally, an end time,
            e.g. ``'at 7:00 PM until 8:30 PM'``.

    Returns:
        The number of seconds between the start and end times, or ``None``
        when the text contains no start time or no end time.

    Raises:
        ValueError: If a matched time is not a valid 12-hour clock time.
    """
    # Only the first time range in the text is considered.
    match = TIME_RANGE_REGEX.search(string)
    if match is None:
        # No "at H:MM xM" at all: the duration is unknown, not an error.
        return None
    start_text, end_text = match.group(1), match.group(3)
    if end_text is None:
        return None
    start = datetime.strptime(start_text, '%I:%M %p')
    end = datetime.strptime(end_text, '%I:%M %p')
    # timedelta normalises a negative difference to days=-1 plus a positive
    # seconds part, so an end time past midnight still yields the wrapped
    # duration (11:00 PM until 1:00 AM -> 7200).
    return (end - start).seconds

# Cache of parsed events, keyed by event id. Re-running this cell resets it.
EVENTS = {}

# Get information from an event href
def get_event_info(href, timeout=30):
    """Fetch an event page and return its details, using EVENTS as a cache.

    Args:
        href: URL of the event's registration page.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        A dict with the keys ``'title'``, ``'timestamp'``, ``'duration'`` and
        ``'venue'``, plus ``'lat'`` and ``'lng'`` when the page has a map whose
        style contains coordinates, or ``'address'`` otherwise. The same dict
        object is stored in ``EVENTS`` and returned by later calls for the
        same event.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        Other exceptions (e.g. ``AttributeError``, ``TypeError``) may
        propagate when the page lacks the form, date or location elements
        this function reads.
    """
    # Check event cache to see if event has already been stored
    id_match = EVENT_ID_REGEX.search(href)
    if id_match is not None:
        event_id = id_match.group(1)
    elif href.startswith('https://apps.admissions.yale.edu/register/'):
        event_id = href.split('/')[-1]
    else:
        # No recognisable id: key the cache on the URL itself. A shared None
        # key would make all such events overwrite one another and make later
        # lookups return the wrong event.
        event_id = href
    if event_id in EVENTS:
        return EVENTS[event_id]
    print('parsing', href)

    # soup the page and find event information
    response = requests.get(href, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content)

    container = soup.find('div', {'id': 'content'})
    fallback_title = None
    if container is None:  # uchicago
        container = soup.find('div', {'class': 'content2'})
        fallback_title = soup.find('h1', {'class': 'title'})
    event = container.form

    # The form's own <h1> wins; the page-level title is only a fallback and
    # may itself be missing, in which case the title is recorded as None.
    title_tag = event.h1 or fallback_title
    location = event.find('p', {'id': 'register_location'})

    event_info = {
        'title': title_tag.string if title_tag is not None else None,
        'timestamp': event.find('input', {'id': 'form_date'})['value'],
        # assumes the second child of #register_date is the text holding the
        # time range — TODO confirm against the page markup
        'duration': parse_duration(event.find('p', {'id': 'register_date'}).contents[1]),
        'venue': location.contents[0],
    }

    # Parse location of event from event map
    map_div = event.find('div', {'id': 'map'})
    latlng = None
    if map_div is not None:
        latlng = LATLNG_REGEX.search(map_div.get('style', ''))

    if latlng is not None:
        event_info['lat'] = float(latlng.group(1))
        event_info['lng'] = float(latlng.group(2))
    else:
        # No map, or a map without coordinates: keep the textual address.
        # The exact type check leaves out NavigableString subclasses (bs4's
        # Comment is one) as well as tags.
        event_info['address'] = '\n'.join([
            x for x in location.children
            if type(x) is NavigableString
        ])

    EVENTS[event_id] = event_info

    return event_info

In [196]:
# Populate EVENTS
for uni in EVENTS_URLS:
    # 'eco' has an entry in EVENTS_URLS but is left out of this pass.
    if uni != 'eco':
        print(f'***** {uni} *****')
        # Each saved document is addressed as '<uni>/<file name>'.
        for f in (uni + '/' + name for name in documents[uni]):
            hrefs = get_event_hrefs(f, uni)
            print(f'=== DOCUMENT {f}, with {len(hrefs)} events')
            # Return values are discarded; get_event_info records each event
            # in EVENTS as a side effect.
            for href in hrefs:
                get_event_info(href)

***** brown *****
=== DOCUMENT brown/events_2019-05-12T10:29:01.html, with 17 events
parsing https://apply.college.brown.edu/portal/brown-near-you?id=7eecdbcc-8b39-47cb-b95c-fa6723bf2b70
parsing https://apply.college.brown.edu/portal/brown-near-you?id=1a45ac57-b6f9-4b27-8c77-8c1d758f8bad
parsing https://apply.college.brown.edu/portal/brown-near-you?id=043ad2a6-870e-4d0a-9ad5-0aace683644c
parsing https://apply.college.brown.edu/portal/brown-near-you?id=26beb80e-d2f6-4848-8006-980ea35034e3
parsing https://apply.college.brown.edu/portal/brown-near-you?id=f62f987d-15a6-424d-8a2e-424ae4a51fd9
parsing https://apply.college.brown.edu/portal/brown-near-you?id=320675e9-88c1-4036-b9eb-32624e1d2fad
parsing https://apply.college.brown.edu/portal/brown-near-you?id=edada695-dd94-4cf2-afcf-63d9a5daa70a
parsing https://apply.college.brown.edu/portal/brown-near-you?id=2c96a53b-26fc-4e4a-8247-9c54855cc4ae
parsing https://apply.college.brown.edu/portal/brown-near-you?id=81d48a5b-b85b-474b-92de-bf6bbc5e65

parsing https://apply.college.columbia.edu/portal/register?id=b61f6b51-8ae2-4c3d-9b7c-12dba5948505
parsing https://apply.college.columbia.edu/portal/register?id=346ca5c0-d44f-4a02-a786-70b752269194
parsing https://apply.college.columbia.edu/portal/register?id=798060c4-d8c3-4651-b4be-8ad65339bd3f
parsing https://apply.college.columbia.edu/portal/register?id=fa5ce5f4-4657-4c10-8628-eb6fda662611
parsing https://apply.college.columbia.edu/portal/register?id=c52eb675-7aef-4a20-87f5-6c4823e0ab62
=== DOCUMENT columbia/events_2019-04-30T10:39:28.html, with 26 events
=== DOCUMENT columbia/events_2019-05-02T17:02:08.html, with 20 events
=== DOCUMENT columbia/events_2019-05-04T14:43:02.html, with 17 events
=== DOCUMENT columbia/events_2019-05-06T10:54:59.html, with 17 events
=== DOCUMENT columbia/events_2019-05-07T10:25:50.html, with 17 events
=== DOCUMENT columbia/events_2019-05-07T11:08:36.html, with 17 events
=== DOCUMENT columbia/events_2019-05-09T12:00:47.html, with 17 events
=== DOCUMENT co

=== DOCUMENT mit/events_2019-06-15T00:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-16T10:00:07.html, with 5 events
=== DOCUMENT mit/events_2019-06-17T22:00:04.html, with 5 events
=== DOCUMENT mit/events_2019-06-18T08:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-19T00:00:03.html, with 5 events
=== DOCUMENT mit/events_2019-06-20T09:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-21T00:00:03.html, with 5 events
=== DOCUMENT mit/events_2019-06-22T10:00:01.html, with 5 events
=== DOCUMENT mit/events_2019-06-23T00:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-24T09:01:41.html, with 5 events
=== DOCUMENT mit/events_2019-06-25T01:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-26T00:00:03.html, with 5 events
=== DOCUMENT mit/events_2019-06-28T12:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-29T00:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-06-30T14:00:02.html, with 5 events
=== DOCUMENT mit/events_2019-07-01T22:00

parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=14af4a36-f8a1-4c1a-8674-3bf3a4de6715
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=b6197770-f0d9-4340-a62e-1267e2a09b58
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=1bb7dcf3-a66b-4da8-8f9d-abc881122229
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=c91f2da1-47fc-4448-b5f3-e9e85941249b
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=7cfb30d3-0195-4e61-9fe2-11a1400e4c45
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=b04a7cf5-12b1-4c27-9995-d1846479145f
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=002d46ab-8416-4462-9a28-2685b4783ba7
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=50aa5396-8c0b-455e-a56e-028953a07fda
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=b3536394-42ef-4c3c-8f44-d8985076c065
parsing https://apply.stanford.edu/portal/stanfordinyourarea?id=ea5302e4-6716-4b53-a58e-3db

=== DOCUMENT stanford/events_2019-06-20T09:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-06-21T00:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-06-22T10:00:02.html, with 0 events
=== DOCUMENT stanford/events_2019-06-23T00:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-06-24T09:01:42.html, with 0 events
=== DOCUMENT stanford/events_2019-06-25T01:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-06-26T00:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-06-28T12:00:02.html, with 0 events
=== DOCUMENT stanford/events_2019-06-29T00:00:02.html, with 0 events
=== DOCUMENT stanford/events_2019-06-30T14:00:02.html, with 0 events
=== DOCUMENT stanford/events_2019-07-01T22:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-07-02T08:00:02.html, with 0 events
=== DOCUMENT stanford/events_2019-07-03T00:00:05.html, with 0 events
=== DOCUMENT stanford/events_2019-07-07T10:00:03.html, with 0 events
=== DOCUMENT stanford/events_2019-

parsing https://prospects.uchicago.edu/register/?id=126b946b-fd70-4cde-a209-aa62351dc215
parsing https://prospects.uchicago.edu/register/?id=72f84dc6-28eb-4dee-966d-0696977f4fd4
parsing https://prospects.uchicago.edu/register/?id=2730515d-fbd0-40b5-92a7-d46b9a7d2edd
parsing https://prospects.uchicago.edu/register/?id=1adec1ae-5bbf-49f0-a141-30791e06efd5
=== DOCUMENT uchicago/events_2019-06-12T09:00:05.html, with 42 events
parsing https://prospects.uchicago.edu/register/?id=dfe8f643-17df-486f-b0d6-3635493cfc15
parsing https://prospects.uchicago.edu/register/?id=61e37325-4c3c-4e6e-bd92-20eff03fdd4e
parsing https://prospects.uchicago.edu/register/?id=b388cc59-0885-4cc6-bb9b-2cd3c406b067
parsing https://prospects.uchicago.edu/register/?id=aedd62c8-0c3f-4f57-8a85-69a12c2ecee7
parsing https://prospects.uchicago.edu/register/?id=5bfa0163-bed4-49e8-849a-74fd50dba751
=== DOCUMENT uchicago/events_2019-06-13T09:00:04.html, with 50 events
parsing https://prospects.uchicago.edu/register/?id=983d852

parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=c2737e66-f57e-47b3-b943-6fa73b1b5de8
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=fa1b4e07-6a51-4602-928e-be0ada7fb9dd
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=8cd7a9bc-acb2-4dbf-966d-4ece078ca22e
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=8173cf12-189b-4aac-80d2-49c9d666b2b2
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=6e345bc1-4e0e-460b-a841-416c7ab82235
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=2d5816f1-78c6-47b6-9578-4698f7df7eb2
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=5bcd5613-e271-4900-85f3-d3c922a945b2
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=c1864b0e-d554-4a5c-a43c-9f64de3e0ec5
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=e8b5d7de-951f-499d-af1c-0d61c4520ce3
parsing https://key.admissions.upenn.edu/porta

=== DOCUMENT upenn/events_2019-07-10T08:00:04.html, with 1 events
=== DOCUMENT upenn/events_2019-07-11T08:00:06.html, with 1 events
=== DOCUMENT upenn/events_2019-07-12T23:00:04.html, with 1 events
=== DOCUMENT upenn/events_2019-07-13T00:00:04.html, with 1 events
=== DOCUMENT upenn/events_2019-07-14T00:00:06.html, with 1 events
=== DOCUMENT upenn/events_2019-07-15T00:00:05.html, with 1 events
=== DOCUMENT upenn/events_2019-07-16T00:00:03.html, with 1 events
=== DOCUMENT upenn/events_2019-07-17T00:00:03.html, with 5 events
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=c300054e-64b9-4208-ac1b-bf4d491f2b10
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=a8d95db7-fc60-430d-99e3-ab69ceadf2f0
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=0049f578-0fda-4bfd-aab8-ddc63b1861bb
parsing https://key.admissions.upenn.edu/portal/penn-in-your-town?id=dacd8096-07c6-407b-a3d6-b524525ee1c1
=== DOCUMENT upenn/events_2019-07-18T00:00:03.ht

In [197]:
# Flatten the EVENTS cache into a list of records, each carrying its cache key
# under 'id'. Copies are built instead of adding 'id' to the cached dicts, so
# EVENTS is left untouched by this cell.
EVENTS_LIST = [
    {**event, 'id': key}
    for key, event in EVENTS.items()
]

In [198]:
# Save the collected events as a single JSON array.
with open('../data/events-20190719.json', 'w') as out_file:
    serialized_events = json.dumps(EVENTS_LIST)
    out_file.write(serialized_events)