# Cluster Data

Scrapes, embeds, and clusters past events. Meant to be run manually, (hopefully) only once.

In [1]:
import requests, json, pickle, urllib, time, random, os
from bs4 import BeautifulSoup
import numpy as np
from sklearn.cluster import KMeans


import utils

# Output locations used throughout the notebook: TMP_DIR is a sub-folder of
# DATA_DIR that holds the intermediate files written by the cells below.
DATA_DIR = '../data/'
TMP_DIR = f'{DATA_DIR}cluster_data/'

# Make sure the folder (and any missing parents) exists; no-op when it already does
os.makedirs(TMP_DIR, exist_ok = True)

# with open('oai.key') as f: OAI_KEY = f.read().strip()

In [2]:
def get_week_event_urls(n_weeks, baseURL = 'https://events.umich.edu/week/', timeout = 30):
    """Collect the URLs of the weekly event pages that precede the starting page.

    Beginning at `baseURL`, repeatedly follows the anchor whose href contains
    '/week/' and whose text contains 'Prev', recording every page reached this
    way. The starting page itself is NOT part of the result.

    Parameters
    ----------
    n_weeks : int
        Maximum number of previous-week URLs to collect. Values <= 0 return an
        empty list without making any request.
    baseURL : str
        Page to start from; also the base that 'Prev' hrefs are resolved against.
    timeout : float
        Seconds to wait on each HTTP request before giving up, so a stalled
        connection cannot hang the crawl indefinitely.

    Returns
    -------
    list of str
        Absolute URLs ordered from the week nearest the start to the furthest
        back. Shorter than `n_weeks` if a page has no 'Prev' week link.

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be fetched (including a timeout).
    """

    urls = []
    currentURL = baseURL

    # A page is only downloaded while another URL is still wanted, so no request
    # is made for n_weeks <= 0 and none is wasted after the last URL is collected.
    while len(urls) < n_weeks:

        r = requests.get(currentURL, timeout = timeout)
        b = BeautifulSoup(r.text, 'html.parser')

        # Find link to '/week/...' with 'Prev' in text (first match wins)
        week_links = b.find_all('a', href = lambda x: x and '/week/' in x)
        prev_link = next(
            (link.get('href') for link in week_links if 'Prev' in link.text),
            None,
        )
        if prev_link is None: break

        currentURL = urllib.parse.urljoin(baseURL, prev_link)
        urls.append(currentURL)

    return urls

In [5]:
# Get links for past 10 years
# urls = get_week_event_urls(53 * 10)
# with open(TMP_DIR + 'event_urls.json', 'w+') as f:
#     json.dump(urls, f)

In [6]:
# Scrape events for the URLs we got
# events = []
# for i, url in enumerate(urls):
#     url += '/json?v=2'
#     r = requests.get(url)
#     tmp = r.json()
#     events.extend(tmp)
#     print(url, f'got {len(tmp)} events')
#     if i % 10 == 0: time.sleep(random.random() * 5)

# with open(TMP_DIR + 'events.json', 'w+') as f:  # same path the 'Load events' cell reads from
#     json.dump(events, f, indent = 2)

In [5]:
# Load events
# Read the previously scraped events back from the scratch directory
events_path = TMP_DIR + 'events.json'
with open(events_path, 'r') as fh:
    events = json.loads(fh.read())
len(events)

170480

In [6]:
# Find unique ones
# Walk the list back to front, keeping the first event seen for each title and
# dropping every event whose occurrence_count is greater than 1.
unique_titles = set()
unique_events = []
for event in reversed(events):
    title = event['event_title']
    if title not in unique_titles and not event['occurrence_count'] > 1:
        unique_titles.add(title)
        unique_events.append(event)
len(unique_events)

59745

In [7]:
# Convert to strings
# One text per unique event, in the same order as unique_events
to_embed = list(map(utils.stringify_event, unique_events))
len(to_embed)

59745

In [8]:
# Spot-check: print one randomly chosen text
ix = np.random.randint(len(to_embed))
print(to_embed[ix])

Reception / Open House:Welcome Back to the Library
Welcome to the University of Michigan Library!

* Alumni can sign up to receive the Library's magazine to get a free U-M Library scarf.

* Record your personal U-M story on the StoryCorps App for archiving in the Library of Congress.

* Get an inside look at some of our collections and view demos as part of the U-M Third Century Expo, 4-7 p.m.
Where:Shapiro Library
When:Friday 10
Sponsors:University Library


In [26]:
# Get the embeddings
# OAI_KEY is not assigned by any live cell above, so read it here, in the only
# cell that needs it. A missing key file fails with a clear FileNotFoundError
# instead of a NameError on a fresh kernel.
with open('oai.key') as f: OAI_KEY = f.read().strip()

BATCH_SIZE = 1000  # number of texts passed to utils.get_embedding per call
PAUSE_S    = 15    # seconds to wait between calls (presumably for API rate limits -- confirm)

embeddings = []
for start in range(0, len(to_embed), BATCH_SIZE):
    embeddings.extend(utils.get_embedding(to_embed[start:start + BATCH_SIZE], OAI_KEY))
    # Pause between batches only; there is nothing to wait for after the last one
    if start + BATCH_SIZE < len(to_embed): time.sleep(PAUSE_S)

In [29]:
# Save them
# The texts are written as JSON; the vectors are gathered into a single
# NumPy array and stored next to them as .npy
to_embed_path = TMP_DIR + 'to_embed.json'
with open(to_embed_path, 'w+') as out:
    json.dump(to_embed, out)

E = np.array(list(map(np.array, embeddings)))
np.save(TMP_DIR + 'embeddings.npy', E)
E.shape

(59745, 1536)

In [51]:
# Cluster them
# Fixed random_state keeps the clustering repeatable across runs
n_clusters = 1000
m = KMeans(
    n_clusters = n_clusters,
    n_init = 5,
    random_state = 42,
).fit(E)

# Save the model and centroids
# The fitted model stays in the scratch directory; only the centroids go to DATA_DIR
with open(TMP_DIR + 'kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(m, model_file)
np.save(DATA_DIR + 'centroids.npy', m.cluster_centers_)

In [2]:
# View n events per cluster
# np.unique(m.labels_, return_counts = True)[1]

In [3]:
# Inspect
# for c in range(50):#range(n_clusters):
#     print(f'Cluster {c}')
#     ix = np.random.choice(np.where(m.labels_ == c)[0], 10)
#     print(np.array(to_embed)[ix])
#     print('\n\n')