In [None]:
!pip install selenium

In [None]:
import os
import time
import urllib.parse

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from tqdm import tqdm

# Module-level accumulator: the scraping loop appends CourseListing objects
# here, and save_data() serializes the whole list each time it is called.
course_listings = []

def save_data():
    """Write every listing collected so far to ``course_listings_2.pkl``.

    Each object in the module-level ``course_listings`` list is converted to
    its attribute dictionary, the dictionaries are assembled into a
    DataFrame, the running total is printed, and the frame is pickled
    (overwriting any previous file of the same name).
    """
    records = []
    for listing in course_listings:
        records.append(vars(listing))

    frame = pd.DataFrame(records)
    total = len(frame)
    print(f'Total number of courses collected: {total}')
    frame.to_pickle('course_listings_2.pkl')

class CourseListing:
    """Plain record describing one course found on a search-results page.

    Attributes are stored in the order title, url, partner, description,
    rating so that ``vars(instance)`` yields columns in that order.
    """

    def __init__(self, title, url, partner, description, rating=None):
        """Store the scraped fields; ``rating`` defaults to ``None``."""
        self.title, self.url, self.partner = title, url, partner
        self.description = description
        self.rating = rating

    def __str__(self):
        """Return a five-line, human-readable summary of the listing."""
        lines = (
            f'Title: {self.title}',
            f'URL: {self.url}',
            f'Partner: {self.partner}',
            f'Description: {self.description}',
            f'Rating: {self.rating}',
        )
        return '\n'.join(lines)

# SECURITY NOTE(review): the proxy credentials below are committed in plain
# text. They should be rotated and supplied only through the SBR_AUTH
# environment variable; the literal is kept solely as a backward-compatible
# fallback so existing runs keep working until that is done.
# `or` (rather than a .get default) also falls back when SBR_AUTH is set but empty.
AUTH = os.environ.get('SBR_AUTH') or 'brd-customer-hl_2e9584da-zone-scraping_itzik1313:fhyvb16u7jgt'

# WebDriver endpoint of the remote scraping browser, with credentials embedded
# in the URL's userinfo part.
SBR_WEBDRIVER = f'https://{AUTH}@zproxy.lum-superproxy.io:9515'

# Search phrases submitted one at a time to the site's search page; each is
# URL-encoded by the scraping loop before use.
search_terms = [
    'AI and Machine Learning',
    'Business and Management',
    'Data Science and Analytics',
    'Digital Marketing',
    'Programming and Development',
    'Healthcare and Medicine',
    'Psychology and Mental Health',
    'Sustainability and Environmental Studies',
    'Project Management',
    'Cyber Security',
    'Law and Legal Studies',
    'Engineering and Robotics',
    'Education and Teaching',
    'Finance and Accounting',
    'Artificial Intelligence Ethics',
    'Creative Arts and Media',
    'Social Sciences',
    'Leadership and Personal Development',
    'Design and Architecture',
    'Data Engineering',
    'Science and Technology',
    'Economics',
    'Humanities and History',
    'Languages and Linguistics',
    'Entrepreneurship',
]

def _parse_course(course_container):
    """Build a ``CourseListing`` from one search-result ``<li>`` element.

    The first ``<a>`` inside the container supplies the title and link, the
    next ``<a>`` after it supplies the partner name, and the first ``<p>``
    supplies the description.

    Raises:
        AttributeError: if an expected ``<a>`` or ``<p>`` element is missing.
        KeyError: if the first link has no ``href`` attribute.
    """
    link = course_container.find('a')
    title = link.text.strip()
    # urljoin copes with site-relative hrefs as well as already-absolute ones,
    # which plain string concatenation would turn into a malformed URL.
    url = urllib.parse.urljoin('https://www.futurelearn.com', link['href'])
    partner = link.find_next('a').text.strip()
    description = course_container.find('p').text.strip()
    return CourseListing(title, url, partner, description)


if __name__ == '__main__':
    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    driver = Remote(sbr_connection, options=ChromeOptions())

    # try/finally guarantees the remote browser session is released even when
    # navigation or parsing raises part-way through the list of terms.
    try:
        for term in search_terms:
            print(f'Starting search for term: {term}')
            driver.get(f"https://www.futurelearn.com/search?q={urllib.parse.quote_plus(term)}")
            # Fixed pause before reading the page source — presumably to let
            # the results finish rendering.
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            course_containers = soup.find_all('li', class_='m-link-list__item')
            if not course_containers:
                print(f'No courses found for {term}')
                continue

            # Counts only successfully parsed listings, capped at 100 per term.
            course_count = 0
            for course_container in course_containers:
                if course_count >= 100:
                    break

                try:
                    course_listing = _parse_course(course_container)
                except Exception as e:
                    # Deliberate best-effort boundary: one malformed result
                    # must not abort the rest of the scrape.
                    print(f'Error scraping course: {e}')
                    continue

                course_listings.append(course_listing)
                course_count += 1

            # Persist after every term so partial progress survives a failure.
            save_data()
    finally:
        driver.quit()


Connecting to Scraping Browser...
Starting search for term: AI and Machine Learning
Total number of courses collected: 60
Starting search for term: Business and Management
Total number of courses collected: 160
Starting search for term: Data Science and Analytics
Total number of courses collected: 232
Starting search for term: Digital Marketing
Total number of courses collected: 306
Starting search for term: Programming and Development
Total number of courses collected: 406
Starting search for term: Healthcare and Medicine
Total number of courses collected: 506
Starting search for term: Psychology and Mental Health
Total number of courses collected: 606
Starting search for term: Sustainability and Environmental Studies
Total number of courses collected: 640
Starting search for term: Project Management
Total number of courses collected: 740
Starting search for term: Cyber Security
Total number of courses collected: 784
Starting search for term: Law and Legal Studies
Total number of cour

In [None]:
import pandas as pd

# Reload the pickle written by the scraping cell and print a quick summary.
# NOTE: read_pickle should only be used on files from a trusted source.
df = pd.read_pickle('course_listings_2.pkl')

row_count, column_count = df.shape
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}")

column_names = df.columns.tolist()
print(f"Columns: {column_names}")

# First five rows of the full frame.
preview = df.head()
print(preview)

print("Missing values in each column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

print("Example course listing:")
display_columns = ['title', 'url', 'partner', 'description']
print(df[display_columns].head())




Total rows: 1705
Total columns: 5
Columns: ['title', 'url', 'partner', 'description', 'rating']
                                               title  \
0  Practical Machine Learning for AI: Foundationa...   
1  Artificial Intelligence on Microsoft Azure: Ma...   
2  Microsoft Future Ready: Using Python Programmi...   
3    AI and Machine Learning Algorithms Using Python   
4  Introduction to Artificial Intelligence and Ma...   

                                                 url  \
0  https://www.futurelearn.com/courses/practical-...   
1  https://www.futurelearn.com/microcredentials/m...   
2  https://www.futurelearn.com/courses/cloudswyft...   
3  https://www.futurelearn.com/courses/cloudswyft...   
4  https://www.futurelearn.com/courses/introducti...   

                           partner  \
0               Cardiff University   
1  CloudSwyft Global Systems, Inc.   
2  CloudSwyft Global Systems, Inc.   
3  CloudSwyft Global Systems, Inc.   
4  CloudSwyft Global Systems, Inc.   

 