# Cluster Data

Scrapes, embeds, and clusters past events. Meant to be run (hopefully) only once, manually.

In [4]:
import requests, json, pickle, urllib, time, random, os
from bs4 import BeautifulSoup
from datetime import datetime
import numpy as np
from sklearn.cluster import KMeans

import utils

# Root data folder, plus a scratch sub-folder for this notebook's artifacts.
# Both keep a trailing slash because later cells build paths by concatenation.
DATA_DIR = '../data/'
TMP_DIR = f'{DATA_DIR}cluster_data/'
os.makedirs(TMP_DIR, exist_ok = True)

# Key handed to utils.get_embedding; only warn when it is unset or empty.
OAI_KEY = os.getenv('OAI_KEY')
if not OAI_KEY:
    print('No OAI_KEY!')

ModuleNotFoundError: No module named 'sklearn'

In [16]:
def get_week_event_urls(n_weeks, baseURL = 'https://events.umich.edu/week/', timeout = 30):
    """Collect the URLs of up to n_weeks weekly pages by following 'Prev' links.

    Starting from baseURL, each page is fetched and searched for an anchor
    whose href contains '/week/' and whose text contains 'Prev'. That link is
    resolved against baseURL, recorded, and used as the next page to fetch.

    Args:
        n_weeks: Maximum number of previous-week URLs to collect. Zero or a
            negative value returns an empty list without making any request.
        baseURL: Page to start from. Its own URL is NOT part of the result.
        timeout: Seconds to wait on each HTTP request before requests raises,
            so that a stalled connection cannot hang the scrape forever.

    Returns:
        List of URLs in the order they were discovered (the page linked from
        baseURL first). It is shorter than n_weeks when a page has no
        matching 'Prev' link.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` is loaded.
    from urllib.parse import urljoin

    urls = []
    currentURL = baseURL

    for _ in range(n_weeks):
        # Fetch lazily, at the top of each step, so that no page is downloaded
        # unless its 'Prev' link is actually going to be read.
        r = requests.get(currentURL, timeout = timeout)
        b = BeautifulSoup(r.text, 'html.parser')

        # First link to '/week/...' with 'Prev' in its text, if any
        week_links = b.find_all('a', href = lambda x: x and '/week/' in x)
        prev_link = next(
            (link.get('href') for link in week_links if 'Prev' in link.text),
            None,
        )
        if prev_link is None:
            break

        currentURL = urljoin(baseURL, prev_link)
        urls.append(currentURL)

    return urls

In [17]:
# Get links for past 10 years
# urls = get_week_event_urls(53 * 10)
# with open(TMP_DIR + 'event_urls.json', 'w+') as f:
#     json.dump(urls, f)

In [None]:
# Scrape events for the urls we got
# events = []
# for i, url in enumerate(urls):
#     url += '/json?v=2'
#     r = requests.get(url)
#     tmp = r.json()
#     events.extend(tmp)
#     print(url, f'got {len(tmp)} events')
#     if i % 10 == 0: time.sleep(random.random() * 5)

# with open('events.json', 'w+') as f:
#     json.dump(events, f, indent = 2)

In [20]:
# Read the previously scraped events back from the scratch folder
with open(TMP_DIR + 'events.json', 'r') as f:
    events = json.loads(f.read())
len(events)

: 

In [None]:
# Keep one event per title, walking the list back to front, and drop every
# event whose occurrence_count is above 1.
unique_titles = set()
unique_events = []
for event in reversed(events):
    title = event['event_title']
    if title not in unique_titles and not event['occurrence_count'] > 1:
        unique_titles.add(title)
        unique_events.append(event)
len(unique_events)

In [None]:
# Turn every unique event into the text that will be embedded
to_embed = list(map(utils.stringify_event, unique_events))
len(to_embed)

In [None]:
# Spot-check one randomly chosen stringified event.
# to_embed is a plain list, so the index must be a scalar integer: a
# length-1 array (what np.random.choice(n, 1) returns) is not a valid index.
ix = np.random.randint(len(to_embed))
print(to_embed[ix])

In [None]:
# Embed every text, one utils.get_embedding call per item
embeddings = []
for n_done, text in enumerate(to_embed):
    vector = utils.get_embedding(text, OAI_KEY)
    embeddings.append(vector)
    # Every 200th item (including the first) pause for a random 0-5 s;
    # presumably to go easy on the embedding service.
    if n_done % 200 == 0:
        time.sleep(random.random() * 5)

In [None]:
# Persist the embedded texts as JSON (written to the working directory)
with open('to_embed.json', 'w+') as f:
    f.write(json.dumps(to_embed))

# Stack the embeddings into a single array and save it next to the texts
E = np.array([np.asarray(vec) for vec in embeddings])
np.save('embeddings.npy', E)
E.shape

In [None]:
# Fit k-means on the embedding matrix (fixed seed, 5 initialisations)
n_clusters = 500
m = KMeans(n_clusters = n_clusters, n_init = 5, random_state = 42).fit(E)

# Write the fitted model and its cluster centres to the working directory
np.save('centroids.npy', m.cluster_centers_)
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(m, f)

In [None]:
# Number of events assigned to each cluster, ordered by cluster label
cluster_sizes = np.unique(m.labels_, return_counts = True)[1]
cluster_sizes

In [None]:
# Inspect: print up to five sample texts from every cluster.
# The list is converted to an ndarray once, outside the loop, because
# boolean-mask indexing needs an array and the conversion never changes.
texts = np.array(to_embed)
for c in range(n_clusters):
    print(f'Cluster {c}')
    print(texts[m.labels_ == c][:5])
    print('\n\n')