# Webscraping Information from 'https://www.usf.edu/business/graduate/ms-bais/'

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
import re
import dateparser

In [2]:
# the following works on macos if I have gecko driver in the same folder as the script
driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))

# load page with Selenium
# we need to use selenium because the page loads additional records as you scroll down
# if we used requests, we would only get the first page of speeches
url = 'https://www.usf.edu/business/graduate/ms-bais/' 
driver.get(url)
driver.implicitly_wait(10)

#keep scrolling down until page stops loading additional records#
pause_scroll = 4
last_try = 0
initialcoord = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause_scroll)
    newcoord = driver.execute_script("return document.body.scrollHeight")
    if newcoord == initialcoord:
        break
    initialcoord = newcoord

In [3]:
base_url = 'https://www.usf.edu'

relative_links = [
    "/business/graduate/ms-info-session.aspx",
    "/business/graduate/ms-bais/application-process.aspx",
    "/business/graduate/ms-bais-global/index.aspx",
    "/business/graduate/ms-bais/faculty.aspx",
    "/business/graduate/ms-bais/",
    "/business/graduate/ms-bais/faq.aspx",
    "/business/graduate/ms-bais/student-spotlights.aspx",
    "/business/graduate/ms-bais/new-student.aspx"
]

link_list = [base_url + link for link in relative_links]


In [4]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

# Set the waiting time for the page to load (in seconds)
page_wait = 3

# Lists to store scraped data
page_urls = []
content = []
titles = []

for link in link_list:
    try:
        # Access the page with Selenium and load the HTML source
        driver.get(link)
        time.sleep(page_wait)  # Wait for the page to load properly
        page_source = driver.page_source
        bsobject_msbais = BeautifulSoup(page_source, 'html.parser')

        # Add current URL to the page_urls list
        page_urls.append(link)

        # Scrape titles (h3 tags)
        h3_elements = bsobject_msbais.find_all('h3')
        if h3_elements:
            for h3 in h3_elements:
                title = h3.text.strip()  # Remove leading/trailing whitespace
                titles.append(title)
        else:
            titles.append("No title available")

        # Scrape content (you can modify this part based on your needs)
        main_content = bsobject_msbais.find('div', class_="mainContent_well u-flexItem--largeExtra")
        if main_content:
            paragraphs = [p.text for p in main_content.find_all('p')]
            lists = [li.text for li in main_content.find_all(['li', 'ol', 'ul'])]
            content.append('\n'.join(paragraphs + lists))
        else:
            content.append("No content available")

    except Exception as e:
        print(f"Error while scraping: {e}")

# Ensure that you properly close the driver at the end
driver.quit()

# Print the scraped data
print("Titles:")
for title in titles:
    print(title)

print("\nContent:")
for c in content:
    print(c)


Titles:
INFO SESSION SCHEDULE
Contact Information
Office Hours
Program Admission Requirements
Prerequisites
Admission Process
Accepting applications for SUMMER 2023
Leadership Focus
Independent Study
Tuition
Contact
No title available
ADMISSIONS
Admission Process
Costs
Graduate Assistantships
CURRICULUM
Global executive Program
CONTACT
Rankings
Fit
Placement
Finances
Admissions
International students
Application
Program
Others
No title available
New & Prospective Students:

Content:
Information sessions help potential students make sound decisions and USF graduate
               business admissions counselors can help find and identify the program that is right
               for each student, based on career goals and student interests. The information sessions
               are casual and relaxed, giving potential graduate students a chance to ask questions
               in order to make informed decisions.
The USF graduate business recruitment team will discuss curriculum structur

In [5]:
import json

# Combine data into a list of dictionaries
data = [{"url": link, "title": title, "content": cont} for link, title, cont in zip(link_list, titles, content)]

# Save to a JSON file
with open("scraped_data.json", "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=4)


# Making the json file more readable

In [6]:
import json

def make_content_readable(content_str):
    """Converts the single-string content into a list of paragraphs."""
    return [paragraph.strip() for paragraph in content_str.split("\n") if paragraph.strip()]

# Load the JSON file
with open("scraped_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Modify each item in the JSON data
for item in data:
    item['content'] = make_content_readable(item['content'])

# Save the modified data to a new JSON file
with open("readable_data.json", "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=4)
