### Imports and settings

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm.notebook import tqdm
import pickle
import sys
import calendar

In [None]:
# Path of the CSV file that the "Save file" cell writes and the
# "Load csv" cell reads back into a DataFrame.
filename = 'trinity.csv'

### Get hrefs

In [None]:
# Collect the relative links (hrefs) of all events listed on the monthly
# concert calendar pages for the years 2010-2020.

# The site's base URL is kept in a local pickle file; close the file handle
# deterministically instead of leaking it.
with open('urlbase.pickle', 'rb') as urlbase_file:
    urlbase = pickle.load(urlbase_file)

hrefs = []
for year in tqdm(range(2010, 2021)):
    count_hrefs = 0
    for month in range(1, 13):
        # assemble URL of current month to extract hrefs of events
        url = urlbase + "/concerts?field_date_value%5Bvalue%5D%5Bmonth%5D=" + str(month)
        url += "&field_date_value%5Bvalue%5D%5Byear%5D=" + str(year)
        url += "&tid=All"

        # requests has no default timeout; without one a stalled connection
        # would hang the whole scrape indefinitely.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        for event in soup.find_all('div', class_="month mini-day-on"):
            link = event.find('a')
            href = link.get('href') if link is not None else None
            if href is None:
                # Calendar cell without a usable link: nothing to follow.
                continue
            hrefs.append(href)
            count_hrefs += 1

    print(year, "found", count_hrefs)

print(hrefs)

In [None]:
# Cache the collected hrefs on disk so the calendar pages need not be
# fetched again; the context manager flushes and closes the file.
with open('hrefs.pickle', 'wb') as hrefs_file:
    pickle.dump(hrefs, hrefs_file)

### Parse

In [None]:
# Restore the cached hrefs written by the previous cell; the context
# manager closes the file handle instead of leaking it.
with open('hrefs.pickle', 'rb') as hrefs_file:
    hrefs = pickle.load(hrefs_file)

In [None]:
# Fetch every event page and extract one row per event:
# [date, time, suptitle, title, location, description, artists, composers, url]

with open('urlbase.pickle', 'rb') as urlbase_file:
    urlbase = pickle.load(urlbase_file)

lines = []
for href in tqdm(hrefs):
    url = urlbase + href

    # requests has no default timeout; set one so a stalled connection
    # cannot hang the scrape indefinitely.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

    fields = soup.find_all('div', class_="field-items")

    try:
        # Values are converted to plain str so the rows that get pickled
        # later do not hold on to BeautifulSoup nodes.
        suptitle = str(soup.find_all('h1', class_="title gutter")[0].contents[0])
        location, title, description = '', '', ''
        # NOTE: deliberately NOT named `datetime` - that would shadow the
        # `datetime` class imported at the top, which count_dates() relies on.
        # The last 9 characters of the attribute are cut off here.
        timestamp = soup.find_all('span', class_="date-display-single")[0].get('content')[0:-9]
        # Only the first field that contains paragraphs is used: its
        # paragraphs are read as title, location and description, in order.
        for field in fields:
            paragraphs = field.find_all('p')
            if paragraphs:
                title = paragraphs[0].decode_contents()
                if len(paragraphs) > 1:
                    location = str(paragraphs[1].contents[0])
                if len(paragraphs) > 2:
                    description = str(paragraphs[2].contents[0])
                break

    except IndexError:
        # An expected element is missing: report the offending page and stop.
        print(url)
        first_paragraph = soup.find('p')
        if first_paragraph is not None:  # avoid a second IndexError here
            print(first_paragraph)
        sys.exit()

    artists, composers = [], []
    composer_mode = False  # whether we're in the artist (False) or composer (True) column

    # The first two fields are skipped; the remaining ones are scanned for
    # artist / composer entries.
    for field in fields[2:]:
        item = field.find('div', class_="field-item even")
        if item is None:
            continue
        entry = item.next_element  # enter subfield
        if entry is None:
            continue
        entry_text = str(entry)
        if 'composer' in entry_text:
            composer_mode = True
        # Keep plain-text entries only (no markup) that are longer than one character.
        if '<' not in entry_text and len(entry) > 1:
            if composer_mode:
                composers.append(entry_text)
            else:
                # TODO: append things like Director/soprano/alto/tenor/bass to the previous element
                artists.append(entry_text)

    lines.append([timestamp[0:10], timestamp[11:16], suptitle, title, location, description,
                  ', '.join(artists), ', '.join(composers), url])

print(len(lines), "lines added successfully")

In [None]:
# Cache the parsed rows on disk so the event pages need not be fetched
# again; the context manager flushes and closes the file.
with open('lines.pickle', 'wb') as lines_file:
    pickle.dump(lines, lines_file)

### Save file

In [None]:
# Restore the cached rows written by the previous cell; the context
# manager closes the file handle instead of leaking it.
with open('lines.pickle', 'rb') as lines_file:
    lines = pickle.load(lines_file)

In [None]:
# Write the parsed rows to `filename` as UTF-8 CSV with a header row.
# The module is imported under an alias because the notebook also uses the
# global name `csv` for the loaded DataFrame; a plain `import csv` here would
# rebind that name and break count_dates() when this cell is re-run.
import csv as csv_module

# newline='' lets the csv writer control line endings itself.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv_module.writer(file, delimiter=',')
    writer.writerow(["date", "time", "suptitle", "title", "location", "description", "artists", "composers", "url"])
    writer.writerows(lines)

### Load csv

In [None]:
# Optional: uncomment to display full cell contents instead of truncated columns.
#pd.set_option('display.max_colwidth', None)
# Load the scraped events into a DataFrame. keep_default_na=False keeps empty
# fields as '' instead of NaN, so the substring test in count_dates() always
# operates on strings.
# NOTE: this rebinds the name `csv` (the csv module imported in the
# "Save file" cell) to the DataFrame; count_dates() reads this global.
csv = pd.read_csv(filename, keep_default_na=False)
csv

### Analytics

In [None]:
def count_dates(column=None, query=None, plot=False, data=None):
    """Return the event dates, optionally restricted to rows matching a query.

    Args:
        column: Name of the column to search. Filtering is applied only when
            both ``column`` and ``query`` are given.
        query: Substring that must occur in ``column`` for a row to be kept.
        plot: If true, draw a bar chart of the number of events per year.
        data: DataFrame with a ``"date"`` column of ``YYYY-MM-DD`` strings.
            Defaults to the notebook-level DataFrame ``csv``.

    Returns:
        A DataFrame with a single datetime column ``"dates"`` and a fresh
        0..n-1 index.

    Raises:
        ValueError: If a date string does not match ``%Y-%m-%d``.
    """
    if data is None:
        data = csv  # notebook-level DataFrame loaded from the CSV file

    date_strings = data["date"]
    if column is not None and query is not None:
        # Pair each date with its own row by position rather than by index
        # label, so a non-default index cannot mismatch rows.
        date_strings = pd.Series(
            [date for date, value in zip(data["date"], data[column]) if query in value],
            dtype=object,
        )

    # Parsing through pandas avoids depending on the global name `datetime`,
    # which other cells of the notebook may rebind.
    dates = pd.to_datetime(date_strings.reset_index(drop=True), format='%Y-%m-%d')
    df = pd.DataFrame({"dates": dates})
    if plot:
        df.groupby(df.dates.dt.year).count().plot(kind='bar')
    return df

In [None]:
# Per-year bar charts of the events whose artist list mentions the given
# ensemble; the filtered date frames are kept for later inspection.
tcc = count_dates(column="artists", query="Trinity College", plot=True)
polyphony = count_dates(column="artists", query="Polyphony", plot=True)

In [None]:
# Heatmap of the number of events per calendar month (rows) and year (columns).
df = count_dates()
# Count events for every (year, month) pair that actually occurs in the data.
count = df['dates'].groupby([df.dates.dt.year, df.dates.dt.month]).agg('count')
count_years = [date[0] for date in count.index.values]
count_months = [date[1] for date in count.index.values]
df = pd.DataFrame({"Year": count_years, "Month": count_months, "Count": count.values})
pt = df.pivot_table(index="Month", columns="Year", values="Count", aggfunc="sum").fillna(0)
# Make sure all twelve months are present as rows: a month without any event
# would otherwise be missing from the pivot table and the month labels below
# would be attached to the wrong rows.
pt = pt.reindex(pd.Index(range(1, 13), name="Month"), fill_value=0)
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
sns.heatmap(pt, annot=True, cmap="Purples", yticklabels=months)