# Webscraping Information from 'https://www.usf.edu/business/graduate/ms-bais/'

In [2]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
import re
import dateparser

In [12]:
# Start a Firefox session. GeckoDriverManager().install() supplies the path of
# the geckodriver binary, so no manually placed driver file is required.
driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))

# Load the landing page in a real browser so that any content added by
# JavaScript while scrolling is rendered; a plain HTTP fetch would only return
# the initial HTML.
url = 'https://www.usf.edu/business/graduate/ms-bais/'
driver.get(url)
# Timeout (seconds) for element lookups made through this driver.
driver.implicitly_wait(10)

# Keep scrolling to the bottom until the document height stops growing.
pause_scroll = 4   # seconds to let newly requested content render
max_scrolls = 50   # safety cap so an endlessly growing page cannot hang the run
initialcoord = driver.execute_script("return document.body.scrollHeight")
for _ in range(max_scrolls):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause_scroll)
    newcoord = driver.execute_script("return document.body.scrollHeight")
    if newcoord == initialcoord:
        # Height unchanged after a full pause: nothing more was loaded.
        break
    initialcoord = newcoord

In [13]:
# Site root; every path listed below is relative to it.
base_url = 'https://www.usf.edu'

# Site-relative paths of the pages to scrape, in visiting order.
relative_links = [
    '/business/graduate/ms-info-session.aspx',
    '/business/graduate/ms-bais/application-process.aspx',
    '/business/graduate/ms-bais-global/index.aspx',
    '/business/graduate/ms-bais/faculty.aspx',
    '/business/graduate/ms-bais/',
    '/business/graduate/ms-bais/faq.aspx',
    '/business/graduate/ms-bais/student-spotlights.aspx',
    '/business/graduate/ms-bais/new-student.aspx',
]

# Absolute URLs, one per relative path and in the same order.
link_list = [f"{base_url}{path}" for path in relative_links]


In [14]:
page_wait = 3  # seconds to wait after each navigation before reading the DOM

# Parallel result lists: index i of each list describes the i-th page visited.
page_urls = []
content = []
titles = []

try:
    for link in link_list:
        # Render the page in the browser, then hand the resulting HTML to
        # BeautifulSoup for parsing.
        driver.get(link)
        time.sleep(page_wait)
        page_source = driver.page_source
        bsobject_msbais = bs(page_source, 'lxml')

        page_urls.append(link)

        # Title: text of the first matching <h3>, or a placeholder when the
        # page has none.
        # NOTE(review): this selector uses the same class string as the content
        # <div> below; if no <h3> carries those classes every title will be the
        # placeholder - confirm against the live page markup.
        title_tag = bsobject_msbais.find('h3', class_="mainContent_well u-flexItem--largeExtra")
        if title_tag is not None:
            title = title_tag.text
            titles.append(title.rstrip())
        else:
            titles.append("No title available")

        # Content: paragraph texts followed by list-item texts, newline-joined.
        main_content = bsobject_msbais.find('div', class_="mainContent_well u-flexItem--largeExtra")
        if main_content is not None:
            paragraphs = [p.text for p in main_content.find_all('p')]
            # Collect only <li> elements. The text of a <ul>/<ol> is the
            # concatenation of its items' text, so also collecting the
            # containers would emit every list item twice.
            lists = [li.text for li in main_content.find_all('li')]
            content.append('\n'.join(paragraphs + lists))
        else:
            content.append("No content available")
finally:
    # quit() ends the whole WebDriver session (all windows and the driver
    # process); close() would only close the current window. Running it in
    # `finally` releases the browser even when a page fails to load or parse.
    driver.quit()


In [16]:
import json

# Build one record per page from the parallel URL / title / content lists.
data = [
    dict(url=page_url, title=page_title, content=page_text)
    for page_url, page_title, page_text in zip(link_list, titles, content)
]

# Write the records to disk as indented, non-ASCII-escaped UTF-8 JSON.
with open("scraped_data.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(data, ensure_ascii=False, indent=4))


# Making the json file more readable

In [19]:
import json

def make_content_readable(content_str):
    """Split newline-separated text into a list of trimmed, non-empty lines.

    The input is split on "\\n" only; each piece has its surrounding
    whitespace removed, and pieces that are empty after trimming are dropped.
    """
    cleaned = []
    for raw_line in content_str.split("\n"):
        trimmed = raw_line.strip()
        if not trimmed:
            # Blank or whitespace-only line: nothing worth keeping.
            continue
        cleaned.append(trimmed)
    return cleaned

# Read the scraped records from scraped_data.json.
with open("scraped_data.json", "r", encoding="utf-8") as file:
    data = json.loads(file.read())

# Replace each record's content string with its list of non-empty lines.
for record in data:
    record['content'] = make_content_readable(record['content'])

# Write the reshaped records to a separate file, leaving the input untouched.
with open("readable_data.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(data, ensure_ascii=False, indent=4))
