In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

In [2]:
# Disable cell text truncation
pd.set_option('display.max_colwidth', -1)

In [3]:
# Markdown output file
md_file_path = Path('output') / 'amld_workshop_schedule.md'
md_file_path_w_prereqs = Path('output') / 'amld_workshop_schedule_w_prereqs.md'

base_url = 'https://www.appliedmldays.org'
front_url = f'{base_url}/workshops.html'

r = requests.get(front_url)
front_soup = BeautifulSoup(r.text)

In [4]:
def get_time_from_soup(soup):
    h1 = soup.select_one('.masthead-text').find('h1')
    h1.find('strong').clear()
    time_str = h1.text.strip()
    parts = time_str.split()
    if len(parts) > 2:
        return ''.join(parts[:-2]), ' '.join(parts[-2:])
    return '', time_str

free = '-' * 3
busy = '█' * 3

def get_timeline_blocks(time):
    start, end = (int(v.split(':',1)[0]) for v in time.split('-'))
    tl = ''
    tl += free if start > 10 else busy
    tl += free if end < 13 else busy
    return tl

def get_level_from_soup(soup):
    badge = soup.select_one('.badge-grey')
    return badge.text.split()[0] if badge is not None else ''

def get_prerequisites_from_soup(soup):
    try:
        return [li.text.strip() for li in soup.find('h3', text='Prerequisites').findNext('ul').findAll('li')]
    except AttributeError:
        # One or more of the elements are None
        return []
    
def get_prerequisite_html_from(prerequisites):
    if not prerequisites:
        return ''
    lis = '</li><li>'.join(f' - {p}' for p in prerequisites)
    return f'<ul><li>{ lis }</li></ul>'

def generate_rows():
    for workshop_title in front_soup.find_all(class_='track-name'):
        title = workshop_title.text
        a = workshop_title.parent
        url = a.attrs.get('href')
        url = f'{base_url}{url}'
        ws_soup = BeautifulSoup(requests.get(url).text)
        time, date = get_time_from_soup(ws_soup)
        level = get_level_from_soup(ws_soup)
        prerequisites = get_prerequisites_from_soup(ws_soup)
        timeline = get_timeline_blocks(time)
        is_full_day = timeline[0] == timeline[-1]
        level_nr = 1 if level.lower().startswith('be') else 2 if level.lower().startswith('in') else 3
        yield timeline, time, date, title, level, prerequisites, url, is_full_day, level_nr

In [5]:
# bss = '▀ ▁ ▂ ▃ ▄ ▅ ▆ ▇ █ ▉ ▊ ▋ ▌ ▍ ▎ ▏'.split()
# bn = '█ ▀ ▄'

# for bi, b in enumerate(bss):
#     print(bn)
#     print(b * 5, bi, b)
#     print('')


In [6]:
df = pd.DataFrame(generate_rows(), columns=[
    'Timeline', 'Time', 'Date', 'Title', 'Level', 'Prerequsites', 'Link', 'FullDay', 'LevelNo'
])

In [7]:
sdf = df.sort_values(by=['Date', 'FullDay', 'Time', 'LevelNo'], ascending=[True, False, True, True])
sdf.head()

Unnamed: 0,Timeline,Time,Date,Title,Level,Prerequsites,Link,FullDay,LevelNo
2,██████,09:00-16:30,January 26,Document Digitization Challenge,Beginner,"[A laptop with Anaconda pre-installed, Python programming knowledge]",https://www.appliedmldays.org/workshops/document-digitization-challenge,True,1
5,██████,09:00-16:30,January 26,Data exploration and preparation for Machine Learning,Beginner,"[Basic Python and statistics knowledge, Running Python installation on own laptop, and Jupyter Notebook installed]",https://www.appliedmldays.org/workshops/data-exploration-and-preparation-for-machine-learning,True,1
0,██████,09:00-16:30,January 26,TensorFlow Basics - Saturday,Intermediate,[good Python programming skills],https://www.appliedmldays.org/workshops/tensorflow-basics-saturday,True,2
1,██████,09:00-16:30,January 26,TDA crash course: theory and practice for ML applications,Intermediate,"[basic Python programming skills (laptop with Python 2.7/3 installed), basic Jupyter notebook skills, basic git/data management skills, notions of linear algebra and elementary topology are helpful but not needed]",https://www.appliedmldays.org/workshops/tda-crash-course-theory-and-practice-for-ml-applications,True,2
3,██████,09:00-16:30,January 26,Applied Machine Learning for Anomaly Detection on Equipment,Intermediate,"[Intermediate level in ML and data science, Working knowledge in open source Python Machine Learning stack is preferred, but R and Matlab users welcome, No business knowledge or expertise required, Only Python can be deployed to the cloud]",https://www.appliedmldays.org/workshops/applied-machine-learning-for-anomaly-detection-on-equipment,True,2


In [8]:
markdown_str = f"""
## Applied Machine Learning Days 2018
Table of workshops generated from [the overview page][ws]. 

Versions: [normal][n] | [with prerequisites][wp] 



\[ For the curious on [how it was created][jnb] \]

[n]: ./{md_file_path.name}
[wp]: ./{md_file_path_w_prereqs.name}
[ws]: https://www.appliedmldays.org/workshops.html
[jnb]: https://nbviewer.jupyter.org/github/fauskanger/public/blob/master/AppliedMachineLearningDaysWorkshopTableGenerator.ipynb

"""

In [9]:
def create_md_table():
    md_table = '| Timeline | Time | Date | Level | Title \n'
    md_table += '|---|---|---|---|---|\n'
    prev_date = None
    for row in sdf.iterrows():
        i, r = row
        md_table += f'| `{r.Timeline}` | {r.Time} | {r.Date} | {r.Level} | [{r.Title}]({r.Link})| \n'
        if prev_date is not None and prev_date != r.Date:
            md_table += '| | | | | |\n'
        prev_date = r.Date
    return md_table

def create_md_table_with_prereqs(use_br=True):
    md_table = '| Timeline | Time | Date | Level | Title | Prerequisites \n'
    md_table += '|---|---|---|---|---|---|\n'
    prev_date = None
    for row in sdf.iterrows():
        i, r = row
    #     prereqs = get_prerequisite_html_from(r.Prerequsites)
        if use_br:
            prereqs = '<br/>'.join(' - ' + k[:1].upper() + k[1:] for k in r.Prerequsites)
        else:
            prereqs = ' :: '.join(k[:1].upper() + k[1:] for k in r.Prerequsites)
        md_table += f'| `{r.Timeline}` | {r.Time} | {r.Date} | {r.Level} | [{r.Title}]({r.Link}) | {prereqs} | \n'
        if prev_date is not None and prev_date != r.Date:
            md_table += '| | | | | |\n'
    return md_table

In [10]:
md_file_path.write_text(markdown_str + create_md_table(), encoding='utf8')
md_file_path_w_prereqs.write_text(markdown_str + create_md_table_with_prereqs(), encoding='utf8')

11142