In [1]:
import pandas as pd
from datetime import datetime, time
import requests
from bs4 import BeautifulSoup
import re
# Raise the row display cap so long scraped tables are rendered in full rather than truncated.
pd.options.display.max_rows = 10000

In [2]:
# Catalogue of the workshop series to scrape. Each entry of seriesList is one series, in column order:
#   SeriesID    - simple id used to tell apart Series Names that occur more than once a week
#                 or repeat during a different semester
#   Series Name - name of the workshop series as listed in the Qualtrics form
#   URL         - event page for the series (this is the address handed to the scraper)
#   Start Time  - start time of the workshop, 24 hour clock
#   End Time    - end time of the workshop, 24 hour clock
#   Semester    - semester code, stored as a string (presumably 'SP' = spring -- confirm)
#   Year        - year the series ran, stored as a string
# NOTE(review): earlier notes described a day-of-week column (Monday = 0 ... Sunday = 6);
# no such column is built here.
seriesList = [
    [
        1, 'Data Engineering',
        'https://datascience.arizona.edu/events/navigating-world-data-engineering',
        time(hour=14), time(hour=15), 'SP', '2024',
    ],
    [
        2, 'NextGen Geospatial Data Science',
        'https://datascience.arizona.edu/events/nextgen-geospatial-data-science',
        time(hour=14), time(hour=15), 'SP', '2024',
    ],
    [
        3, 'Intro To Data Science',
        'https://datascience.arizona.edu/events/data-science-essentials-jupyter-ai-tools',
        time(hour=15), time(hour=16), 'SP', '2024',
    ],
    [
        4, 'Cracking The Coding Interview',
        'https://datascience.arizona.edu/events/cracking-coding-interview',
        time(hour=16), time(hour=17), 'SP', '2024',
    ],
    [
        5, 'Data Science Tapas',
        'https://datascience.arizona.edu/events/data-science-tapas-savor-tools-data-mastery',
        time(hour=13), time(hour=14), 'SP', '2024',
    ],
    [
        6, 'Classical Machine Learning',
        'https://datascience.arizona.edu/events/mastering-machine-learning-your-path-data-driven-research',
        time(hour=14), time(hour=15), 'SP', '2024',
    ],
    [
        7, 'Intro to Deep Learning',
        'https://datascience.arizona.edu/events/introduction-deep-learning',
        time(hour=15), time(hour=16), 'SP', '2024',
    ],
]

# Tabular view of the catalogue; column labels follow the field order documented above.
series = pd.DataFrame(
    seriesList,
    columns=['SeriesID', 'Series Name', 'URL', 'Start Time', 'End Time', 'Semester', 'Year'],
)

In [3]:
# Matches a schedule date token: mm/dd or mm/dd/yy, optionally followed by a colon.
_DATE_TOKEN = re.compile(r'(\d{1,2}/\d{1,2})(/\d{1,2})?:?')


def _parseWorkshopDate(token, year):
    """Return the datetime for a 'mm/dd[/yy][:]' token, or None if the token is not a valid date."""
    match = _DATE_TOKEN.fullmatch(token)
    if match is None:
        return None
    monthDay, shortYear = match.groups()
    try:
        if shortYear is None:
            # No year on the page: fall back to the year supplied by the caller
            return datetime.strptime(monthDay + '/' + str(year), '%m/%d/%Y')
        return datetime.strptime(monthDay + shortYear, '%m/%d/%y')
    except ValueError:
        # Looks like a date but is not one (e.g. 13/45, or 2/29 in a non-leap year)
        return None


def _collectTopicSpans(topicParagraph):
    """Return the spans holding the schedule entries that belong to a 'Topic' paragraph."""
    spans = topicParagraph.find_all('span')
    #Sometimes the header is disconnected from the main list so we need the next paragraph.
    #A paragraph with no spans at all is treated the same way (assumption -- confirm against the site).
    if len(spans) <= 1:
        nextParagraph = topicParagraph.find_next('p')
        if nextParagraph is None:
            return []
        return nextParagraph.find_all('span')
    #Drop the first span since it is the header 'Topic'
    return spans[1:]


def _parseTopicSpans(spans, year):
    """Turn the schedule spans into [date, workshop name] rows."""
    rows = []
    i = 0
    while i < len(spans):
        text = spans[i].get_text().strip()

        #Checks if the next element is a 'c-mrkdwn__tab' and skips it if so
        if i + 1 < len(spans):
            classes = spans[i + 1].attrs.get('class') or []
            if len(classes) != 0 and classes[0] == 'c-mrkdwn__tab':
                i += 1

        workshopDate = _parseWorkshopDate(text, year)
        if workshopDate is not None:
            #The span holds only the date, so the name is in the following span (if there is one)
            i += 1
            name = spans[i].get_text().strip() if i < len(spans) else None
            rows.append([workshopDate, name])
        elif text:
            #Otherwise the date and name share one span and need to be split
            parts = text.split(maxsplit=1)
            workshopDate = _parseWorkshopDate(parts[0], year)
            if workshopDate is not None:
                rows.append([workshopDate, parts[1] if len(parts) > 1 else None])
            else:
                #No date at all: keep the text as the name so the columns stay aligned
                rows.append([None, text])
        #Empty spans carry no information and are skipped
        i += 1
    return rows


def _parseWhenBlock(soup, seriesName):
    """Fallback parser: read the dates listed in the div after the 'When' heading."""
    whenHeader = soup.find(lambda tag: tag.name == 'h2' and 'When' in tag.get_text())
    container = None if whenHeader is None else whenHeader.find_next('div')
    if container is None:
        return []

    rows = []
    #The first child is skipped and then every second child is read, as the page layout requires
    for node in container.contents[1::2]:
        #Entries look like '<weekday>, <date>'; only the part after the first ', ' is the date
        pieces = node.get_text().strip().split(', ', maxsplit=1)
        if len(pieces) < 2:
            continue
        for fmt in ('%b. %d, %Y', '%B %d, %Y'):
            try:
                rows.append([datetime.strptime(pieces[1], fmt), seriesName])
                break
            except ValueError:
                continue
    return rows


def scrapeWorkshops(url, seriesName, year=None, timeout=30):
    """Scrape the workshop schedule of one series from its event page.

    The page is first searched for a paragraph containing the word 'Topic';
    the spans of that paragraph (or of the next paragraph when the header
    stands alone) are read as date / workshop-name pairs. Dates may be written
    as mm/dd or mm/dd/yy, with or without a trailing colon, either in their own
    span or at the start of the span that also holds the name. When no 'Topic'
    paragraph exists, the dates listed after the 'When' heading are used and
    every row is named after the series.

    Parameters
    ----------
    url : str
        Address of the event page.
    seriesName : str
        Name given to each row in the 'When' fallback.
    year : int or str, optional
        Year applied to dates written without one. Defaults to the current
        year, so pass it explicitly when re-scraping an earlier semester.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout error.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns 'Date' and 'Workshop Name' (possibly empty), or
        None when the page does not answer with HTTP status 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    if year is None:
        year = datetime.now().year

    #Finds all paragraphs with the word 'Topic' in their text
    topicParagraphs = soup.find_all(lambda tag: tag.name == 'p' and 'Topic' in tag.get_text())

    if len(topicParagraphs) != 0:
        data = _parseTopicSpans(_collectTopicSpans(topicParagraphs[0]), year)
    else:
        data = _parseWhenBlock(soup, seriesName)

    return pd.DataFrame(data, columns=['Date', 'Workshop Name'])

In [6]:
# Scrape every series listed in `series` and stack the per-series schedules into one table.
# The frames are gathered in a list and concatenated once, instead of re-copying a growing
# DataFrame on every iteration.
scrapedFrames = []

for seriesID, seriesName, seriesURL in zip(series["SeriesID"], series["Series Name"], series["URL"]):
    tempDF = scrapeWorkshops(seriesURL, seriesName)
    # scrapeWorkshops returns None when the page does not answer with HTTP 200;
    # report the series and carry on rather than failing on the column assignment.
    if tempDF is None:
        print(f"Skipping series {seriesID} ({seriesName}): could not fetch {seriesURL}")
        continue
    # Tag each row with the series it came from (new frame, the scraped one is left untouched)
    scrapedFrames.append(tempDF.assign(SeriesID=seriesID))

# pd.concat refuses an empty list, so fall back to an empty table with the expected columns.
# Each series keeps its own 0-based index, exactly as before.
if scrapedFrames:
    workshops = pd.concat(scrapedFrames)
else:
    workshops = pd.DataFrame(columns=['Date', 'Workshop Name', 'SeriesID'])

In [7]:
# Bare expression as the last line of the cell, so the notebook renders the workshops DataFrame as its output.
workshops

Unnamed: 0,Date,Workshop Name,SeriesID
0,2024-01-29,Building Python based webapps with Streamlit a...,1
1,2024-02-05,Deploying ML models and creating demos with St...,1
2,2024-02-12,Introduction to SQL Part-1 (Basic Commands and...,1
3,2024-02-19,"Introduction to SQL Part-2 (Functions, Sub-que...",1
4,2024-02-26,Introduction to noSQL Part-1 (Types of noSQL D...,1
5,2024-03-04,Spring Break (No Class),1
6,2024-03-11,Introduction to noSQL Part-2 (Basics of Cassan...,1
7,2024-03-18,Introduction to Spark and Hadoop Part-1 (Hadoo...,1
8,2024-03-25,Introduction to Spark and Hadoop Part-2 (Intro...,1
0,2024-01-16,Geospatial Data APIs,2
