# Cluster Data

Scrapes, embeds, and clusters past events. Meant to be run (hopefully) only once, manually.

In [2]:
import requests, json, pickle, urllib, time, random, os
from bs4 import BeautifulSoup
import numpy as np
from sklearn.cluster import KMeans


import utils

# Paths: final artifacts go to DATA_DIR, intermediate files to TMP_DIR
DATA_DIR = '../data/'
TMP_DIR  = DATA_DIR + 'cluster_data/'
os.makedirs(TMP_DIR, exist_ok = True)

# API key handed to utils.get_embedding by the embedding cell.
# Defined unconditionally so a fresh kernel never hits a NameError there:
# taken from the OPENAI_API_KEY environment variable, else from a local
# 'oai.key' file when one exists, else left as None.
OAI_KEY = os.environ.get('OPENAI_API_KEY')
if not OAI_KEY and os.path.isfile('oai.key'):
    with open('oai.key') as f: OAI_KEY = f.read().strip()

In [2]:
def get_week_event_urls(n_weeks, baseURL = 'https://events.umich.edu/week/'):
    """Collect the URLs of up to `n_weeks` weekly pages preceding the current week.

    Starts at `baseURL` and repeatedly follows the page's 'Prev' link,
    recording every URL it lands on.

    Args:
        n_weeks: Maximum number of weeks to walk back. Values <= 0 return an
            empty list without making any request.
        baseURL: Page to start from; 'Prev' hrefs are resolved against it.

    Returns:
        List of absolute week URLs in the order they were visited (each one is
        the 'Prev' target of the page before it). `baseURL` itself is not
        included. The list is shorter than `n_weeks` if a page has no link
        whose href contains '/week/' and whose text contains 'Prev'.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            within the timeout.
    """
    # Explicit submodule import: a bare `import urllib` does not by itself
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import urljoin

    urls = []
    currentURL = baseURL

    # A page is downloaded only while another link is still needed, so the
    # last collected week is never fetched just to be thrown away.
    while len(urls) < n_weeks:
        r = requests.get(currentURL, timeout = 30)
        b = BeautifulSoup(r.text, 'html.parser')

        # Find link to '/week/...' with 'Prev' in text
        prev_link = None
        for link in b.find_all('a', href = lambda x: x and '/week/' in x):
            if 'Prev' in link.text:
                prev_link = link.get('href')
                break
        if prev_link is None: break

        currentURL = urljoin(baseURL, prev_link)
        urls.append(currentURL)

    return urls

In [5]:
# Get links for past 10 years
# NOTE(review): this cell is left commented out, presumably because the crawl
# only had to run once — confirm before re-enabling.
# 53 * 10 weekly pages are requested to cover roughly ten years.
# urls = get_week_event_urls(53 * 10)
# with open(TMP_DIR + 'event_urls.json', 'w+') as f:
#     json.dump(urls, f)

In [6]:
# Scrape events for the urls we got
# Each weekly URL gets '/json?v=2' appended and the parsed JSON list is added
# to `events`; every 10th request is followed by a random pause of up to 5 s.
# NOTE(review): left commented out, presumably a one-time step — confirm.
# events = []
# for i, url in enumerate(urls):
#     url += '/json?v=2'
#     r = requests.get(url)
#     tmp = r.json()
#     events.extend(tmp)
#     print(url, f'got {len(tmp)} events')
#     if i % 10 == 0: time.sleep(random.random() * 5)

# NOTE(review): this writes 'events.json' into the working directory, while the
# load cell reads TMP_DIR + 'events.json' — the two paths do not match.
# with open('events.json', 'w+') as f:
#     json.dump(events, f, indent = 2)

In [5]:
# Read the scraped events back from disk
with open(TMP_DIR + 'events.json', 'r') as events_file:
    events = json.loads(events_file.read())
len(events)

170480

In [6]:
# Keep one event per title, walking the list back to front;
# events with more than one occurrence are dropped entirely
unique_titles = set()
unique_events = []
for event in reversed(events):
    title = event['event_title']
    if title in unique_titles:
        continue
    if event['occurrence_count'] > 1:
        continue
    unique_titles.add(title)
    unique_events.append(event)
len(unique_events)

59745

In [7]:
# One string per unique event, in the same order as unique_events
to_embed = list(map(utils.stringify_event, unique_events))
len(to_embed)

59745

In [8]:
# Spot-check one randomly chosen string
ix = np.random.randint(len(to_embed))
print(to_embed[ix])

Reception / Open House:Welcome Back to the Library
Welcome to the University of Michigan Library!

* Alumni can sign up to receive the Library's magazine to get a free U-M Library scarf.

* Record your personal U-M story on the StoryCorps App for archiving in the Library of Congress.

* Get an inside look at some of our collections and view demos as part of the U-M Third Century Expo, 4-7 p.m.
Where:Shapiro Library
When:Friday 10
Sponsors:University Library


In [26]:
# Get the embeddings
# Embedding every string takes one utils.get_embedding call per batch plus a
# pause between calls, so reuse the array saved by a previous run when there is
# one. Only the row count is compared against `to_embed`; delete
# TMP_DIR + 'embeddings.npy' to force a fresh run.
BATCH_SIZE  = 1000  # strings sent per utils.get_embedding call
BATCH_PAUSE = 15    # seconds to wait between consecutive calls

embeddings = []
embeddings_path = TMP_DIR + 'embeddings.npy'
if os.path.exists(embeddings_path):
    cached = np.load(embeddings_path)
    if len(cached) == len(to_embed): embeddings = list(cached)

if not embeddings:
    for i in range(0, len(to_embed), BATCH_SIZE):
        # Pause before every batch except the first: no wait after the last one
        if i > 0: time.sleep(BATCH_PAUSE)
        embeddings.extend(utils.get_embedding(to_embed[i:i + BATCH_SIZE], OAI_KEY))

In [29]:
# Persist the embedded strings and their vectors under TMP_DIR
with open(TMP_DIR + 'to_embed.json', 'w+') as out_file:
    json.dump(to_embed, out_file)

# Stack the per-string vectors into one 2-D array before saving
E = np.array(list(map(np.array, embeddings)))
np.save(TMP_DIR + 'embeddings.npy', E)
E.shape

(59745, 1536)

In [51]:
# Fit k-means on the embedding matrix (fixed seed, 5 initialisations)
n_clusters = 1000
m = KMeans(n_clusters = n_clusters, n_init = 5, random_state = 42).fit(E)

# The pickled model stays in TMP_DIR; the centroids go to DATA_DIR
with open(TMP_DIR + 'kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(m, model_file)
np.save(DATA_DIR + 'centroids.npy', m.cluster_centers_)

In [58]:
# Number of events assigned to each cluster label
cluster_ids, cluster_sizes = np.unique(m.labels_, return_counts = True)
cluster_sizes

array([ 43, 109,  89,  81,  63,  64,  61,  50, 105,  97,  20,  61,  71,
        54,  34, 114, 112,  10,  77,  38, 110,  48,  45,  84,  86,  48,
        92, 125,  74,  30,   8,  34, 179,  60,  67,  58,  21,  97,  44,
        66,  89,  42,  73,  46,  63,  78,  64,  94, 102,  47, 227,  66,
        14, 104, 112,  44,  55,  46,  62,  52,  73,  89,  71, 107, 150,
       100,  72, 110, 159,  34,  75,  66,  58,  64,  39, 100,  65,  94,
        64, 102,  76,  45,  77,  32,  80,  78,  24,  42,  33,  49,  16,
        47,  72,  89,  63,  84,  40,  38,  26, 107,  72,  34, 103,  92,
        25,  61,  35,  81,  97, 136,  73,  17,  54,  19,  64, 100,  30,
        39,  41,  61,  49,  94, 169, 117,  85,  20,  78,  90,  73, 122,
        48,  21,  73,  47,  48,  54,  64,  60,  70, 107,  81,  59,  33,
        73, 108, 117,  73,   6,  14,  60,  37,  94, 106,  93, 154,  60,
       181,  61,  85,  11, 101,  79, 174,  93,  60,  42,  61,  12,  65,
        89,  62,  18,  27,   8, 118,  17,  47,  15, 122,  54,  4

In [53]:
# Inspect: print up to 10 distinct random events from each of the first 50 clusters
for c in range(50):#range(n_clusters):
    print(f'Cluster {c}')
    members = np.where(m.labels_ == c)[0]
    # replace = False so a small cluster is not shown with repeated events;
    # the sample size is capped at the cluster size for the same reason
    ix = np.random.choice(members, min(10, len(members)), replace = False)
    # Build the array from the sampled strings only, instead of converting
    # the whole of to_embed on every iteration
    print(np.array([to_embed[i] for i in ix]))
    print('\n\n')

Cluster 0
['Film Screening:Stockholm: Old Friends Die Hard: Noa Yedlin\nBestselling author and screenwriter Noa Yedlin will present a Screening of episodes from the Israeli TV show "Stockholm", followed by academic talk about writing for the page and writing for the screen. \r\n\r\nNoa Yedlin is an Israeli author, the recipient of the Sapir Prize (the Israeli Man Booker) and the Prime Minister\'s Literature Award and author of the bestselling House Arrest, Stockholm, People Like Us and The Wrong Book. Yedlin was named by Haaretz Magazine one of "66 Israeli Women You Should Know”. Yedlin is also the creator of a two-season TV series based on her bestselling novel Stockholm (the Israeli Best Mini-Series TV Award).\r\n\r\nFill out this form to RSVP and receive a link to pre-screen episode 1: https://myumi.ch/AZ9DA\nWhere:North Quad\nWhen:Tuesday 16\nSponsors:Judaic Studies'
 'Film Screening:Arab American National Museum and CMENAS Film Screening. Flesh Out: 2019. Mauritania, Italy. 94 min

In [61]:
# Inspect a single cluster: print up to 10 distinct random events from it
for c in [280]:#range(n_clusters):
    print(f'Cluster {c}')
    members = np.where(m.labels_ == c)[0]
    # replace = False so a small cluster is not shown with repeated events;
    # the sample size is capped at the cluster size for the same reason
    ix = np.random.choice(members, min(10, len(members)), replace = False)
    # Build the array from the sampled strings only, instead of converting
    # the whole of to_embed on every iteration
    print(np.array([to_embed[i] for i in ix]))
    print('\n\n')

Cluster 280
['Lecture / Discussion:Keepin it Real with VP Harper\nJoin Trotter for another edition of Keepin it Real with VP Harper!Continue the current discussions about diversity on campus with guest, Regent Diggs*.\xa0*Regent Ryder Diggs, a Democrat from Grosse Pointe, is vice chair of the Board of Regents. A graduate of the University of Michigan Medical School in 1994. Regent Ryder Diggs is a fellow of the American Academy of Dermatology and the American Society of Dermatologic Surgeons. Regent Ryder Diggs is personally and professionally involved in her community. She serves as chair of the board of the Blue Cross Blue Shield of Michigan Foundation and is a member of the Blue Care Network Board. Regent Ryder Diggs was elected to the Board of Regents in 2012. Her term expires January 1, 2021.\nWhere:Trotter Multicultural Center\nWhen:Wednesday 17\nSponsors:Maize Pages Student Organizations'
 "Lecture / Discussion:My Brothers: Setting Goals\nMy Brothers: Setting Goals  \r\n\r\nWith