# Scrape job postings from [CESNET-L](https://www.cesnet-l.net/)

Emilio Lehoucq - 3/4/24

Note: this code is not optimal, but it solves the problem in a reasonable way.

## Import libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep
from pandas import DataFrame
from pandas import read_csv
from pandas import concat
from glob import glob

## Define custom functions for this script

In [2]:
# I'm not turning the first three functions into a class with three methods because it doesn't work within
# soup_compilation.find_all('a', href = True, string = Class.method)
def contains_posting(text):
    '''
    Function to filter HTML <a> elements containing postings.

    A text counts as a posting when it mentions 'faculty position' or 'professor'
    (case-insensitive).

    Input: text in an <a> element (may be None or empty).
    Output: boolean--True if contains posting, False otherwise (including None or empty text).
    '''
    # Guard first: None has no .lower(), and an empty string cannot match anything
    if not text:
        return False
    # Lowercase once and reuse for both checks
    lowered = text.lower()
    return 'faculty position' in lowered or 'professor' in lowered

def contains_plain_text(text):
    '''
    Function to filter HTML <a> elements containing 'text/plain'.

    The match is case-insensitive.

    Input: text in an <a> element (may be None or empty).
    Output: boolean--True if contains 'text/plain', False otherwise (including None or empty text).
    '''
    # Guard first: None has no .lower(), and an empty string cannot match anything
    if not text:
        return False
    return 'text/plain' in text.lower()

def contains_html(text):
    '''
    Function to filter HTML <a> elements containing 'text/html'.

    The match is case-insensitive.

    Input: text in an <a> element (may be None or empty).
    Output: boolean--True if contains 'text/html', False otherwise (including None or empty text).
    '''
    # Guard first: None has no .lower(), and an empty string cannot match anything
    if not text:
        return False
    return 'text/html' in text.lower()

def get_compilations(web_driver):
    '''
    Function to log into the Kent State University listserv, open the CESNET-L archive,
    and collect every HTML <li> element on the archive page.
    The <li> elements contain the hyperlinks to all previous weekly compilations of messages in the listserv.

    Input: web driver.
    Output: list of selenium.webdriver.remote.webelement.WebElement.

    Dependencies: selenium webdriver.

    Note: this function exists because looping over <li> elements directly produced a stale element error.
    There's probably a better way to fix that error, but this solves the problem and works in this context.
    '''
    # https://listserv.kent.edu/ credentials (placeholders to be replaced by the user)
    email = "your_email"
    password = "your_password"

    # Open the log in page
    web_driver.get("https://listserv.kent.edu/cgi-bin/wa.exe?LOGON")

    # Fill in the email field
    email_field = web_driver.find_element("id", "Email Address")
    email_field.send_keys(email)

    # Fill in the password field
    password_field = web_driver.find_element("id", "Password")
    password_field.send_keys(password)

    # Submit the log in form
    login_button = web_driver.find_element("name", "e")
    login_button.click()

    # Follow the CESNET-L link to reach the archive
    web_driver.find_element(By.LINK_TEXT, "CESNET-L").click()

    # Hand back every <li> element found on the archive page
    return web_driver.find_elements(By.CSS_SELECTOR, "li")

def get_message_data(web_driver, url_message, dictionary, beginning_key):
    '''
    Function to go to URL of message, add source code to dictionary, parse page source and add
    BeautifulSoup object to dictionary, and get text and add to dictionary.

    The three values are stored under the keys beginning_key + '_message_source_code',
    beginning_key + '_message_soup', and beginning_key + '_message_text'.

    Inputs:
        web_driver: web driver.
        url_message: url (string).
        dictionary: dictionary to store data (modified in place).
        beginning_key: string that should be at the beginning of each of the keys in the dictionary.
    Output: none.

    Dependencies: selenium webdriver, BeautifulSoup.
    '''
    # Go to the URL of the message
    web_driver.get(url_message)
    # Adding sleep time before getting source code
    sleep(10)
    # Read the page source once so that the stored source code and the parsed soup
    # are guaranteed to come from the same snapshot of the page
    page_source = web_driver.page_source
    # Add source code to dictionary
    dictionary[beginning_key + '_message_source_code'] = page_source
    # Parse page source
    # NOTE(review): no parser is passed, so BeautifulSoup picks whichever one is installed;
    # consider pinning one (e.g. 'html.parser') after confirming it gives the same results
    soup_message = BeautifulSoup(page_source)
    # Add BeautifulSoup object to dictionary
    dictionary[beginning_key + '_message_soup'] = soup_message
    # Get text and add to dictionary
    dictionary[beginning_key + '_message_text'] = soup_message.get_text()
    
def populate_none(dictionary, beginning_key):
    '''
    Function to mark the data of a message as missing in a dictionary.

    Sets the source code, soup, and text entries for beginning_key to None.

    Inputs:
        dictionary: dictionary to populate (modified in place).
        beginning_key: string that should be at the beginning of each of the keys in the dictionary.
    Outputs: none.
    '''
    # Same keys, in the same order, as a successfully collected message
    for key_ending in ('_message_source_code', '_message_soup', '_message_text'):
        dictionary[beginning_key + key_ending] = None

## Get current number of weekly compilations

The number of weekly compilations changes every week, so it is worth computing it at run time, even if this script is only worked on over two or three separate weeks.

In [3]:
# Initialize driver
driver = webdriver.Chrome()

try:
    # Get current number of weekly compilations in CESNET-L archive
    number_compilations = len(get_compilations(driver))
finally:
    # Quit driver even if logging in or finding the archive fails,
    # so that no browser is left running
    driver.quit()

## Collect postings

In [4]:
# Define URL base used to build the URLs of the messages
url_base = 'https://listserv.kent.edu'
# Message formats to collect: (beginning of the dictionary keys, filter for the <a> element)
# Some messages have only 'text/plain', others only 'text/html', and others both (maybe some neither?)
# 'text/plain' and 'text/html' seem to give the same results. Could keep only one, but better safe than sorry
message_formats = [('plain_text', contains_plain_text), ('html', contains_html)]

# Iterate over current number of weekly compilations
# To test the code across compilations in different years in case structure changes,
# iterate over range(1, number_compilations, 50) instead
for i in range(number_compilations):
    # Create list to store the data for current weekly compilation
    data_weekly_compilation = []
    # Initialize Chrome driver (one fresh session per compilation)
    driver = webdriver.Chrome()
    try:
        # Get compilations
        weekly_compilations = get_compilations(driver)
        # Get current weekly compilation
        compilation = weekly_compilations[i]
        # Get the week of the compilation
        week_compilation = compilation.text
        # Go to compilation
        driver.get(compilation.find_element(By.TAG_NAME, "a").get_attribute("href"))
        # Adding sleep time before getting source code
        sleep(10)
        # For the next two steps, I'm sure there's a way to do it with selenium without having to
        # get the source code and parse it. But this gets the job done and came to mind first
        # Get and parse source code of compilation
        soup_compilation = BeautifulSoup(driver.page_source)
        # Find URLs for postings
        url_postings = [
            a_element['href']
            for a_element in soup_compilation.find_all('a', href = True, string = contains_posting)
            if 'https' in a_element['href']
        ]
        # Iterate over each posting
        for url in url_postings:
            # Create dictionary to store posting data
            posting_data = {}
            # Add current timestamp to dictionary
            posting_data['timestamp_collection'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # Add week of the compilation to dictionary
            posting_data['week_compilation'] = week_compilation
            # Go to the URL of the posting
            driver.get(url)
            # Adding sleep time before getting source code
            sleep(10)
            # Again, the next two steps can probably be done differently
            # Get and parse source code
            soup_posting = BeautifulSoup(driver.page_source)
            # Try to get data for each message format
            for key_base, link_filter in message_formats:
                # Find the link to the message in this format
                message_link = soup_posting.find('a', href = True, string = link_filter)
                # The posting has no message in this format
                if message_link is None:
                    populate_none(posting_data, key_base)
                    continue
                # Build the URL from the link of THIS format (the html message must not reuse the plain text URL)
                url_message = url_base + message_link['href']
                try:
                    # Get message data
                    get_message_data(driver, url_message, posting_data, key_base)
                except Exception as error:
                    # Best effort: record the message as missing and keep going with the next one.
                    # Not a bare except, so that KeyboardInterrupt can still stop the script
                    print('Could not collect', key_base, 'message from', url_message, '-', repr(error))
                    populate_none(posting_data, key_base)
            # Appending posting data dictionary to list for current weekly compilation
            data_weekly_compilation.append(posting_data)
            # Wait before going to next posting
            sleep(10)
        # Convert list for current weekly compilation to data frame
        data_weekly_compilation_df = DataFrame(data_weekly_compilation)
        # Save data frame to csv. Saving for each compilation in case something happens, I can restart there
        data_weekly_compilation_df.to_csv('data_compilation_' + week_compilation.replace(",", "").replace(" ", "_").lower() + '.csv', index = False)
    finally:
        # Quit driver even if something fails, so that no browser is left running
        driver.quit()