## Notes
There's a pattern of workshop data from year 2017 to 2022, can be scraped by incrementing the year in the following url:
https://icml.cc/Conferences/2017/Schedule?type=Workshop to https://icml.cc/Conferences/2022/Schedule?type=Workshop
After which the format of the website changes and so new code will have to be written.

In [1]:
import requests
import csv
from bs4 import BeautifulSoup as bs

In [None]:
fields_title = ['title', 'year']

In [None]:
for year in range(17,23):
    
    URL = f'https://icml.cc/Conferences/20{year}/Schedule?type=Workshop'

    #print(URL, year)
    
    req = requests.get(URL)
    soup = bs(req.text, 'html.parser')
      
    titles = soup.find_all('div',attrs = {'class','maincardBody'})
    
    with open('titles_icml', 'a') as f:
        
        write = csv.writer(f)
        
        #write.writerow(fields_title)
        for titleNumber in range(0,len(titles)):
            #print(titles[titleNumber].text)
            
            write.writerow([titles[titleNumber].text, f'20{year}'])
  
    #print(titles[0].text)

In [None]:
print(titles[0].text)

## Scraping abstracts
Annoyingly there's no obvious pattern in the website for the different abstracts, so a manual inspect of each html element must be performed

In [None]:
array17 = [[2017], [*range(1,22),930]]
array18 = [[2018], [*range(3280, 3353)]]
array19 = [[2019], [*range(3502, 3533)]]
array20 = [[2020],[*range(5715, 5749)]]
array21 = [[2021],[*range(8347, 8377)]]
array22 = [[2022], [*range(13446, 13479),21435]]

In [None]:
allArrays = [array17, array18, array19, array20, array21, array22]

### After manually inspecting each year to assertain the eventID we iterate over each year and list of eventIDs to produce a list of abstracts, sadly there are no abstracts for year 2018

In [None]:
for array in allArrays:
    
    for eventID in array[1]:
    
        #Change below later so can iterate through all arrays from 17-22
        year = array[0][0]
    
        URL = f'https://icml.cc/Conferences/{year}/Schedule?showEvent={eventID}'

        #print(URL, year)
    
        req = requests.get(URL)
        soup = bs(req.text, 'html.parser')
      
        abstracts = soup.find_all('div',attrs = {'class','abstractContainer'})
    
        with open('abstracts_icml', 'a') as f:
        
            write = csv.writer(f)
        
            for abstractNumber in range(0,len(abstracts)):
                #print(abstracts[abstractNumber].text)
            
                write.writerow([abstracts[abstractNumber].text, f'{year}'])

### With both titles and abstracts for each year and workshop we now perform some simple data exploration, starting with BOW

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
data = pd.read_csv(r'abstracts_icml')
df = pd.DataFrame(data)
df.columns = ['abstract', 'year']
df = df.dropna(axis=0)

In [4]:
df

Unnamed: 0,abstract,year
0,"In recent years, deep learning has revolutioni...",2017
1,For details see:http://machlearn.gitlab.io/hit...,2017
2,Although dramatic progress has been made in th...,2017
3,Probabilistic models are a central implement i...,2017
4,Retrieval techniques operating on text or sema...,2017
...,...,...
144,A long-standing objective of AI research has b...,2022
145,We propose the 1st ICML Workshop on Safe Learn...,2022
146,As modern astrophysical surveys deliver an unp...,2022
147,This workshop proposal builds on the success o...,2022


In [5]:
df2017 = df.loc[df['year'] == 2017]
df2019 = df.loc[df['year'] == 2019]
df2020 = df.loc[df['year'] == 2020]
df2021 = df.loc[df['year'] == 2021]
df2022 = df.loc[df['year'] == 2022]

In [6]:
from sklearn.feature_extraction import text 

stopwords = text.ENGLISH_STOP_WORDS.union(['machine learning','machine','ml','learning','workshop', 'community', 'ai'])

CountVec17 = CountVectorizer(ngram_range=(1,2), stop_words = stopwords)
CountVec19 = CountVectorizer(ngram_range=(1,2), stop_words = stopwords)
CountVec20 = CountVectorizer(ngram_range=(1,2), stop_words = stopwords)
CountVec21 = CountVectorizer(ngram_range=(1,2), stop_words = stopwords)
CountVec22 = CountVectorizer(ngram_range=(1,2), stop_words = stopwords)
CountVecAll = CountVectorizer(ngram_range=(1,2), stop_words = stopwords)

### Now lets explore the different topic as a function of year

## Interesting, now lets run LDA on the entire dataset

In [7]:
bow17 = CountVec17.fit_transform(df2017['abstract'])
bow19 = CountVec19.fit_transform(df2019['abstract'])
bow20 = CountVec20.fit_transform(df2020['abstract'])
bow21 = CountVec21.fit_transform(df2021['abstract'])
bow22 = CountVec22.fit_transform(df2022['abstract'])
bowAll = CountVecAll.fit_transform(df['abstract'])

In [13]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [33]:
ldaBow17  = LDA(n_components=10, random_state=42) 
ldaBow17.fit(bow17)
ldaBow19  = LDA(n_components=10, random_state=42) 
ldaBow19.fit(bow19)
ldaBow20  = LDA(n_components=10, random_state=42) 
ldaBow20.fit(bow20)
ldaBow21  = LDA(n_components=10, random_state=42) 
ldaBow21.fit(bow21)
ldaBow22  = LDA(n_components=10, random_state=42) 
ldaBow22.fit(bow22)
ldaBowAll  = LDA(n_components=50, random_state=42) 
ldaBowAll.fit(bowAll)

In [34]:
import pyLDAvis 
import pyLDAvis.sklearn 

pyLDAvis.enable_notebook()

In [35]:
display17 = pyLDAvis.sklearn.prepare(ldaBow17, bow17, CountVec17)
display19 = pyLDAvis.sklearn.prepare(ldaBow19, bow19, CountVec19)
display20 = pyLDAvis.sklearn.prepare(ldaBow20, bow20, CountVec20)
display21 = pyLDAvis.sklearn.prepare(ldaBow21, bow21, CountVec21)
display22 = pyLDAvis.sklearn.prepare(ldaBow22, bow22, CountVec22)
displayAll = pyLDAvis.sklearn.prepare(ldaBowAll, bowAll, CountVecAll)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  by='saliency', ascending=False).head(R).drop('saliency', 1)
  by='saliency', ascending=False).head(R).drop('saliency', 1)
  by='saliency', ascending=False).head(R).drop('saliency', 1)
  by='saliency', ascending=False).head(R).drop('saliency', 1)
  by='saliency', ascending=False).head(R).drop('saliency', 1)


## LDA 40 topics, entire dataset

In [37]:
pyLDAvis.display(displayAll)

## LDA 20 topics, 2017

In [38]:
pyLDAvis.display(display17)

## LDA 20 topics, 2019

In [39]:
pyLDAvis.display(display19)

## LDA 20 topics, 2020

In [40]:
pyLDAvis.display(display20)

## LDA 20 topics, 2021

In [41]:
pyLDAvis.display(display21)

## LDA 20 topics, 2022

In [42]:
pyLDAvis.display(display22)