In [53]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Every course page lives at <base_url><semester>/<course_number>
base_url = "https://kurser.dtu.dk/course/"

# Course numbers to look up; append further numbers here as needed
course_numbers = ["01005"]

# Academic years "2013-2014" through "2023-2024", in the form used in the URL
semesters = [f"{start}-{start + 1}" for start in range(2013, 2024)]

# Function to fetch course table information
def fetch_course_table(semester, course_number):
    """Download one course page and return the cell texts of its fixed-layout table.

    Args:
        semester: Academic-year string placed in the URL, e.g. "2013-2014".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        On success, a list with one entry per <tr> of the matched table; each
        entry is the list of whitespace-stripped <td> texts of that row (it may
        be empty for rows without <td> cells).
        Otherwise a message string: "No table found.", "Timeout occurred" or
        "Request failed: ...". Callers distinguish the two cases with
        isinstance(result, list).
    """
    page_url = f"{base_url}{semester}/{course_number}"
    try:
        reply = requests.get(page_url, timeout=60)
        reply.raise_for_status()  # turn HTTP error statuses into exceptions
        document = BeautifulSoup(reply.content, "html.parser")

        # The table is recognised solely by its inline style attribute
        layout_table = document.find("table", {"style": "table-layout:fixed"})
        if not layout_table:
            return "No table found."

        return [
            [cell.get_text(strip=True) for cell in table_row.find_all("td")]
            for table_row in layout_table.find_all("tr")
        ]
    except requests.exceptions.Timeout:
        return "Timeout occurred"
    except requests.exceptions.RequestException as err:
        return f"Request failed: {err}"

# Collect one output row per scraped table row, across all semesters and courses
all_courses_data = []

for semester in semesters:
    for course_number in course_numbers:
        print(f"Fetching table data for course {course_number} in semester {semester}")
        table_data = fetch_course_table(semester, course_number)
        if isinstance(table_data, list):  # a list means the table was found
            # Prefix every table row with the semester and course it came from
            all_courses_data.extend(
                [semester, course_number] + row for row in table_data
            )
        else:
            print(table_data)  # anything else is a status/error message

# Size the header from the widest scraped row instead of assuming exactly
# three table cells: a fixed five-name header makes pandas raise ValueError as
# soon as a row has a different width. Shorter rows are padded with missing
# values. With no data at all, fall back to the original three generic columns.
id_columns = ["Semester", "Course Number"]
widest_row = max((len(row) for row in all_courses_data), default=len(id_columns) + 3)
column_names = id_columns + [
    f"Column{i}" for i in range(1, widest_row - len(id_columns) + 1)
]
df = pd.DataFrame(all_courses_data, columns=column_names)

# utf-8-sig writes a BOM at the start of the file
df.to_csv("dtu_courses_table.csv", index=False, encoding='utf-8-sig')

print("Data fetching complete. Results saved to dtu_courses_table.csv")

Fetching table data for course 01005 in semester 2013-2014
No table found.
Fetching table data for course 01005 in semester 2014-2015
No table found.
Fetching table data for course 01005 in semester 2015-2016
No table found.
Fetching table data for course 01005 in semester 2016-2017
No table found.
Fetching table data for course 01005 in semester 2017-2018
No table found.
Fetching table data for course 01005 in semester 2018-2019
No table found.
Fetching table data for course 01005 in semester 2019-2020
No table found.
Fetching table data for course 01005 in semester 2020-2021
No table found.
Fetching table data for course 01005 in semester 2021-2022
No table found.
Fetching table data for course 01005 in semester 2022-2023
No table found.
Fetching table data for course 01005 in semester 2023-2024
No table found.
Data fetching complete. Results saved to dtu_courses_table.csv


In [65]:
# Imported here so the cell also runs on a fresh kernel
import requests
from bs4 import BeautifulSoup

# Probe a single course page with plain `requests` to see what a client
# without a browser actually receives from the server.
url = 'https://kurser.dtu.dk/course/2021-2022/01005'
headers = {'User-Agent' : 'Clara'}

# TLS certificate verification stays at its default (enabled). The earlier
# requests-based cell reaches this host without `verify=False` and gets a
# response rather than a request failure, so disabling verification is not
# needed and would only expose the request to interception.
response = requests.get(url, headers=headers, timeout=5)

soup = BeautifulSoup(response.content, 'html.parser')

# Last expression of the cell: display the parsed document
soup




<!DOCTYPE html>

<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8"/>
<title>kurser.dtu.dk</title>
</head>
<body>
<script>
        setTimeout(function() {
            window.location.reload();
        }, 500);
    </script>
<iframe id="loginFrame" src="?forceLogin=true" style="display:none"></iframe>
</body>
</html>

In [47]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def fetch_all_text_with_selenium(semester, course_number):
    """Open a course page in a fresh Chrome session and return its text.

    Args:
        semester: Academic-year string placed in the URL, e.g. "2013-2014".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The text of the rendered page: whitespace-stripped text fragments
        joined by newlines.
    """
    browser = webdriver.Chrome()  # default options, default driver lookup

    try:
        page_url = f"https://kurser.dtu.dk/course/{semester}/{course_number}"
        browser.get(page_url)

        # Scroll to the bottom in case content loads on scroll, then pause
        # briefly before reading the page source.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        rendered = BeautifulSoup(browser.page_source, "html.parser")
        return rendered.get_text(separator="\n", strip=True)

    finally:
        # The browser is shut down even when loading or parsing fails
        browser.quit()

def save_text_to_csv(text_data, filename="course_info.csv"):
    """Write text to a one-column CSV file, one row per line of text.

    Args:
        text_data: The text to store; it is split on "\\n".
        filename: Destination path. An existing file is overwritten.
    """
    # Build the rows first so nothing is opened if text_data cannot be split
    single_column_rows = [[text_line] for text_line in text_data.split("\n")]

    with open(filename, mode='w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows(single_column_rows)

# Demo run: scrape one course page and show its text
semester, course_number = "2013-2014", "01005"
page_text = fetch_all_text_with_selenium(semester, course_number)
print(page_text)

# Keep a copy on disk, one CSV row per line of page text
save_text_to_csv(page_text, filename="course_info.csv")

01005 Matematik 1 (2013/2014)
English
|
Log ind
Kursussøgning
Studieplanlæggeren
01005
2013/2014
01005 Matematik 1
Engelsk titel:
Advanced Engineering Mathematics 1
Sprog:
Dansk
Point( ECTS )
17,5
Kursustype:
Bachelor
Kurset udbydes under åben uddannelse
Skemaplacering:
Efterår og Forår
Kurset udbydes på tre forskellige skemaplaceringer afhængig af
bachelorlinje. Skema A: E1A, E2 og F1A, F2 Skema B: E3A, E4 og F3A,
F4 Skema C: E5, E3B og F5, F3B
Undervisningens placering:
Campus Lyngby
Undervisningsform:
Pr. uge: 2 forelæsninger, 5 timers
gruppearbejde/​klasseundervisning og 2 timers skemalagt
selvstudium. Derudover projektarbejde i nogle uger.
Kursets varighed:
13-uger + 13-uger
Eksamensplacering:
Særlig dag, Særlig dag
Evalueringsform:
Skriftlig eksamen og
bedømmelse af rapport(er)
Evalueringen består af fire delelementer med lige stor vægt: 1) 8
sæt hjemmeopgaver 2) En skr. prøve i to dele, en time efter 4. uge
og to timer i december/januar. 3) En 3-ugers gruppe-projektopgave.
4) En

In [68]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time

from selenium.webdriver.chrome.options import Options

def fetch_all_text_with_selenium(semester, course_number):
    """Open a course page in a fresh headless Chrome session and return its text.

    Args:
        semester: Academic-year string placed in the URL, e.g. "2013-2014".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The text of the rendered page: whitespace-stripped text fragments
        joined by newlines.
    """
    options = Options()
    # Ask for headless mode through the Chrome command-line switch. The
    # `options.headless = True` setter was deprecated and later removed in
    # Selenium 4; on releases without it the assignment only creates an unused
    # attribute and a visible browser window opens.
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        url = f"https://kurser.dtu.dk/course/{semester}/{course_number}"
        driver.get(url)

        # Scroll to the bottom in case content loads on scroll, then pause
        # briefly before reading the page source.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        # Parse the rendered DOM and flatten it to plain text
        soup = BeautifulSoup(driver.page_source, "html.parser")
        all_text = soup.get_text(separator="\n", strip=True)
        return all_text

    finally:
        # The browser is shut down even when loading or parsing fails
        driver.quit()

def save_text_to_csv(semester, course_number, text_data, filename="course_info.csv"):
    """Append page text to a CSV file, one row per line of text.

    Each row has three fields: semester, course number and one line of text.

    Args:
        semester: Value written to the first field of every row.
        course_number: Value written to the second field of every row.
        text_data: The text to store; it is split on "\\n".
        filename: Destination path; rows are appended, never overwritten.
    """
    # Build the rows first so nothing is opened if text_data cannot be split
    labelled_rows = [
        [semester, course_number, text_line] for text_line in text_data.split("\n")
    ]

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows(labelled_rows)

# Academic years 2013-2014 through 2023-2024
semesters = [f"{start}-{start + 1}" for start in range(2013, 2024)]

# The single course scraped in this run
course_number = "01005"

# Start from a fresh CSV that holds only the header row
with open("course_info.csv", mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerow(["Semester", "Course Number", "Text"])

# Scrape the course page of every semester and append its text to the CSV
for semester in semesters:
    page_text = fetch_all_text_with_selenium(semester, course_number)
    print(f"Fetched data for {semester} - {course_number}")
    save_text_to_csv(semester, course_number, page_text, filename="course_info.csv")

print("Data fetching complete. Results saved to course_info.csv")

Fetched data for 2013-2014 - 01005
Fetched data for 2014-2015 - 01005


KeyboardInterrupt: 

In [70]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def fetch_all_text_with_selenium(driver, semester, course_number):
    """Load a course page in an existing browser session and return its text.

    Args:
        driver: Selenium WebDriver used for the request. It is not closed
            here; the caller keeps ownership of the session.
        semester: Academic-year string placed in the URL, e.g. "2017-2018".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The text of the rendered page: whitespace-stripped text fragments
        joined by newlines.
    """
    driver.get(f"https://kurser.dtu.dk/course/{semester}/{course_number}")

    # Scroll to the bottom in case content loads on scroll, then pause briefly
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

    rendered = BeautifulSoup(driver.page_source, "html.parser")
    return rendered.get_text(separator="\n", strip=True)

def save_text_to_csv(semester, course_number, text_data, filename="course_info.csv"):
    """Append page text to a CSV file, one row per line of text.

    Each row has three fields: semester, course number and one line of text.

    Args:
        semester: Value written to the first field of every row.
        course_number: Value written to the second field of every row.
        text_data: The text to store; it is split on "\\n".
        filename: Destination path; rows are appended, never overwritten.
    """
    # Build the rows first so nothing is opened if text_data cannot be split
    labelled_rows = [
        [semester, course_number, text_line] for text_line in text_data.split("\n")
    ]

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows(labelled_rows)

# Academic years 2017-2018 through 2023-2024
semesters = [f"{start}-{start + 1}" for start in range(2017, 2024)]

# The single course scraped in this run
course_number = "01005"

# Start from a fresh CSV that holds only the header row
with open("course_info.csv", mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerow(["Semester", "Course Number", "Text"])

# One browser session is shared by all page loads
driver = webdriver.Chrome()  # default options, default driver lookup

try:
    # Scrape the course page of every semester and append its text to the CSV
    for semester in semesters:
        page_text = fetch_all_text_with_selenium(driver, semester, course_number)
        print(f"Fetched data for {semester} - {course_number}")
        save_text_to_csv(semester, course_number, page_text, filename="course_info.csv")
finally:
    # Shut the browser down whether or not the loop finished
    driver.quit()

print("Data fetching complete. Results saved to course_info.csv")

Fetched data for 2017-2018 - 01005
Fetched data for 2018-2019 - 01005
Fetched data for 2019-2020 - 01005
Fetched data for 2020-2021 - 01005
Fetched data for 2021-2022 - 01005
Fetched data for 2022-2023 - 01005
Fetched data for 2023-2024 - 01005
Data fetching complete. Results saved to course_info.csv


In [None]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def fetch_all_text_with_selenium(driver, semester, course_number):
    """Load a course page in an existing browser session and return its text.

    Args:
        driver: Selenium WebDriver used for the request. It is not closed
            here; the caller keeps ownership of the session.
        semester: Academic-year string placed in the URL, e.g. "2017-2018".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The text of the rendered page: whitespace-stripped text fragments
        joined by newlines.
    """
    driver.get(f"https://kurser.dtu.dk/course/{semester}/{course_number}")

    # Fixed pause before the page source is read
    time.sleep(2)

    rendered = BeautifulSoup(driver.page_source, "html.parser")
    return rendered.get_text(separator="\n", strip=True)

def save_text_to_csv(semester, course_number, text_data, filename="course_info.csv"):
    """Append page text to a CSV file, one row per line of text.

    Each row has three fields: semester, course number and one line of text.

    Args:
        semester: Value written to the first field of every row.
        course_number: Value written to the second field of every row.
        text_data: The text to store; it is split on "\\n".
        filename: Destination path; rows are appended, never overwritten.
    """
    # Build the rows first so nothing is opened if text_data cannot be split
    labelled_rows = [
        [semester, course_number, text_line] for text_line in text_data.split("\n")
    ]

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows(labelled_rows)

# Academic years 2017-2018 through 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2017, 2024)]

# Candidate course numbers 01005 through 88717, zero-padded to five digits
course_numbers = [str(num).zfill(5) for num in range(1005, 88718)]

# Start from a fresh CSV that holds only the header row
with open("course_info.csv", mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Semester", "Course Number", "Text"])

# One browser session is shared by all page loads
driver = webdriver.Chrome()  # Simple instantiation without custom path or options

try:
    # Visit every (semester, course number) pair. The loop previously read a
    # `course_number` variable this cell never defines (NameError on a fresh
    # kernel) and left the generated `course_numbers` list unused.
    for semester in semesters:
        for course_number in course_numbers:
            page_text = fetch_all_text_with_selenium(driver, semester, course_number)
            print(f"Fetched data for {semester} - {course_number}")

            # Append the extracted text to the CSV file
            save_text_to_csv(semester, course_number, page_text, filename="course_info.csv")
finally:
    # Shut the browser down whether or not the loop finished
    driver.quit()

print("Data fetching complete. Results saved to course_info.csv")

In [72]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def fetch_table_data_with_selenium(driver, semester, course_number):
    """Load a course page and return the cell texts of its information table.

    Args:
        driver: Selenium WebDriver used for the request. It is not closed
            here; the caller keeps ownership of the session.
        semester: Academic-year string placed in the URL, e.g. "2017-2018".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        A list with one entry per <tr> of the first table inside the
        div with class "box information"; each entry is the list of
        whitespace-stripped <td> texts of that row. Returns None when the
        div or the table is missing from the page.
    """
    url = f"https://kurser.dtu.dk/course/{semester}/{course_number}"
    driver.get(url)

    time.sleep(2)  # Fixed pause before the page source is read

    # Get the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Look the container up first: chaining .find("table") directly onto the
    # div lookup raises AttributeError when the div is absent, which made the
    # "no table" return path unreachable for such pages.
    info_box = soup.find("div", class_="box information")
    table = info_box.find("table") if info_box is not None else None
    if table is None:
        return None

    # One list of cell texts per table row
    return [
        [col.get_text(strip=True) for col in row.find_all("td")]
        for row in table.find_all("tr")
    ]

def save_table_data_to_csv(semester, course_number, table_data, filename="course_info.csv"):
    """Append scraped table rows to a CSV file, labelled with their origin.

    Args:
        semester: Value written to the first field of every data row.
        course_number: Value written to the second field of every data row.
        table_data: Iterable of lists of cell texts, one list per table row.
        filename: Destination path; rows are appended, never overwritten.
    """
    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)

        # A position of 0 right after opening means the file is new or empty,
        # so it still needs its header row.
        if handle.tell() == 0:
            out.writerow(["Semester", "Course Number", "Column1", "Column2", "Column3", "Column4"])  # generic header names

        # Prefix each table row with the semester and course number
        out.writerows([semester, course_number] + cells for cells in table_data)

# Academic years 2017-2018 through 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2017, 2024)]

# Course numbers to scrape. Defined in this cell because the loop below
# iterates `course_numbers`; previously only a single `course_number` was set
# here and the list had to be left over from another cell's execution.
course_numbers = ["01005"]  # Add more course numbers as needed

# One browser session is shared by all page loads
driver = webdriver.Chrome()  # Simple instantiation without custom path or options

try:
    # Loop through each semester and course number and scrape data
    for semester in semesters:
        for course_number in course_numbers:
            table_data = fetch_table_data_with_selenium(driver, semester, course_number)
            if table_data:
                print(f"Fetched data for {semester} - {course_number}")
                # Save the extracted table data to a CSV file
                save_table_data_to_csv(semester, course_number, table_data, filename="course_info.csv")
            else:
                print(f"No table found for {semester} - {course_number}")
finally:
    # Shut the browser down whether or not the loop finished
    driver.quit()

print("Data fetching complete. Results saved to course_info.csv")

Fetched data for 2017-2018 - 01005
Fetched data for 2018-2019 - 01005
Fetched data for 2019-2020 - 01005
Fetched data for 2020-2021 - 01005
Fetched data for 2021-2022 - 01005
Fetched data for 2022-2023 - 01005
Fetched data for 2023-2024 - 01005
Data fetching complete. Results saved to course_info.csv


In [73]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options

def fetch_information_with_selenium(driver, semester, course_number):
    """Load a course page and return the text of its "information" div.

    Args:
        driver: Selenium WebDriver used for the request. It is not closed
            here; the caller keeps ownership of the session.
        semester: Academic-year string placed in the URL, e.g. "2017-2018".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The whitespace-stripped text fragments of the first div with class
        "information", joined by newlines, or the string
        "No information found" when the page has no such div.
    """
    driver.get(f"https://kurser.dtu.dk/course/{semester}/{course_number}")

    # Fixed pause before the page source is read
    time.sleep(2)

    rendered = BeautifulSoup(driver.page_source, "html.parser")
    info_box = rendered.find("div", class_="information")

    if info_box is None:
        return "No information found"

    return info_box.get_text(separator="\n", strip=True)

def save_information_to_csv(semester, course_number, information_text, filename="course_info.csv"):
    """Append one course's information text to a CSV file as a single row.

    Args:
        semester: Value written to the first field.
        course_number: Value written to the second field.
        information_text: Text written, unsplit, to the third field.
        filename: Destination path; the row is appended, never overwritten.
    """
    record = [semester, course_number, information_text]

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)

        # A position of 0 right after opening means the file is new or empty,
        # so it still needs its header row.
        if not handle.tell():
            out.writerow(["Semester", "Course Number", "Information"])

        out.writerow(record)

# Academic years 2017-2018 through 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2017, 2024)]

# Course numbers to scrape. Defined in this cell because the loop below
# iterates `course_numbers`; previously only a single `course_number` was set
# here and the list had to be left over from another cell's execution.
course_numbers = ["01005"]  # Add more course numbers as needed

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Optional: Start maximized
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

# One browser session is shared by all page loads
driver = webdriver.Chrome(options=chrome_options)

try:
    # Loop through each semester and course number and scrape data
    for semester in semesters:
        for course_number in course_numbers:
            information_text = fetch_information_with_selenium(driver, semester, course_number)
            # The fetch helper signals a missing section with the non-empty
            # placeholder "No information found". A plain truthiness test
            # treats that as success and writes the placeholder to the CSV,
            # so it is checked for explicitly.
            if information_text and information_text != "No information found":
                print(f"Fetched information for {semester} - {course_number}")
                # Save the extracted information to a CSV file
                save_information_to_csv(semester, course_number, information_text, filename="course_info.csv")
            else:
                print(f"No information found for {semester} - {course_number}")
finally:
    # Shut the browser down whether or not the loop finished
    driver.quit()

print("Data fetching complete. Results saved to course_info.csv")

Fetched information for 2017-2018 - 01005
Fetched information for 2018-2019 - 01005
Fetched information for 2019-2020 - 01005
Fetched information for 2020-2021 - 01005
Fetched information for 2021-2022 - 01005
Fetched information for 2022-2023 - 01005
Fetched information for 2023-2024 - 01005
Data fetching complete. Results saved to course_info.csv


In [74]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options

def fetch_kursusinformation_with_selenium(driver, semester, course_number):
    """Load a course page and return the text of its "box information" div.

    Args:
        driver: Selenium WebDriver used for the request. It is not closed
            here; the caller keeps ownership of the session.
        semester: Academic-year string placed in the URL, e.g. "2017-2018".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The whitespace-stripped text fragments of the first div found with
        class "box information", joined by newlines, or the string
        "No Kursusinformation found" when the page has no such div.
    """
    driver.get(f"https://kurser.dtu.dk/course/{semester}/{course_number}")

    # Fixed pause before the page source is read
    time.sleep(2)

    rendered = BeautifulSoup(driver.page_source, "html.parser")
    info_box = rendered.find("div", class_="box information")

    if info_box is None:
        return "No Kursusinformation found"

    return info_box.get_text(separator="\n", strip=True)

def save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="course_info.csv"):
    """Append one course's Kursusinformation text to a CSV file as a single row.

    Args:
        semester: Value written to the first field.
        course_number: Value written to the second field.
        kursusinformation_text: Text written, unsplit, to the third field.
        filename: Destination path; the row is appended, never overwritten.
    """
    record = [semester, course_number, kursusinformation_text]

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)

        # A position of 0 right after opening means the file is new or empty,
        # so it still needs its header row.
        if not handle.tell():
            out.writerow(["Semester", "Course Number", "Kursusinformation"])

        out.writerow(record)

# Academic years 2017-2018 through 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2017, 2024)]

# Course numbers to scrape. Defined in this cell because the loop below
# iterates `course_numbers`; previously only a single `course_number` was set
# here and the list had to be left over from another cell's execution.
course_numbers = ["01005"]  # Add more course numbers as needed

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Optional: Start maximized
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

# One browser session is shared by all page loads
driver = webdriver.Chrome(options=chrome_options)

try:
    # Loop through each semester and course number and scrape data
    for semester in semesters:
        for course_number in course_numbers:
            kursusinformation_text = fetch_kursusinformation_with_selenium(driver, semester, course_number)
            # The fetch helper signals a missing section with the non-empty
            # placeholder "No Kursusinformation found". A plain truthiness
            # test treats that as success and writes the placeholder to the
            # CSV, so it is checked for explicitly.
            if kursusinformation_text and kursusinformation_text != "No Kursusinformation found":
                print(f"Fetched Kursusinformation for {semester} - {course_number}")
                # Save the extracted Kursusinformation to a CSV file
                save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="course_info.csv")
            else:
                print(f"No Kursusinformation found for {semester} - {course_number}")
finally:
    # Shut the browser down whether or not the loop finished
    driver.quit()

print("Data fetching complete. Results saved to course_info.csv")

Fetched Kursusinformation for 2017-2018 - 01005
Fetched Kursusinformation for 2018-2019 - 01005
Fetched Kursusinformation for 2019-2020 - 01005
Fetched Kursusinformation for 2020-2021 - 01005
Fetched Kursusinformation for 2021-2022 - 01005
Fetched Kursusinformation for 2022-2023 - 01005
Fetched Kursusinformation for 2023-2024 - 01005
Data fetching complete. Results saved to course_info.csv


In [77]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options

def fetch_kursusinformation_with_selenium(driver, semester, course_number):
    """Load a course page and return the text of its "box information" div.

    Args:
        driver: Selenium WebDriver used for the request. It is not closed
            here; the caller keeps ownership of the session.
        semester: Academic-year string placed in the URL, e.g. "2017-2018".
        course_number: Course number string placed in the URL, e.g. "01005".

    Returns:
        The whitespace-stripped text fragments of the first element matching
        the CSS selector "div.box.information", joined by newlines, or the
        string "No Kursusinformation found" when nothing matches.
    """
    driver.get(f"https://kurser.dtu.dk/course/{semester}/{course_number}")

    # Fixed pause before the page source is read
    time.sleep(2)

    rendered = BeautifulSoup(driver.page_source, "html.parser")
    info_box = rendered.select_one("div.box.information")

    if info_box is None:
        return "No Kursusinformation found"

    return info_box.get_text(separator="\n", strip=True)

def save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="kursusinformation.csv"):
    """Append one course's Kursusinformation text to a CSV file as a single row.

    Args:
        semester: Value written to the first field.
        course_number: Value written to the second field.
        kursusinformation_text: Text written, unsplit, to the third field.
        filename: Destination path; the row is appended, never overwritten.
    """
    record = [semester, course_number, kursusinformation_text]

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)

        # A position of 0 right after opening means the file is new or empty,
        # so it still needs its header row.
        if not handle.tell():
            out.writerow(["Semester", "Course Number", "Kursusinformation"])

        out.writerow(record)

# Academic years 2017-2018 through 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2017, 2024)]

# Course numbers to scrape. Defined in this cell because the loop below
# iterates `course_numbers`; previously only a single `course_number` was set
# here and the list had to be left over from another cell's execution.
course_numbers = ["01005"]  # Add more course numbers as needed

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Optional: Start maximized
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

# One browser session is shared by all page loads
driver = webdriver.Chrome(options=chrome_options)

try:
    # Loop through each semester and course number and scrape data
    for semester in semesters:
        for course_number in course_numbers:
            kursusinformation_text = fetch_kursusinformation_with_selenium(driver, semester, course_number)
            # The fetch helper signals a missing section with the non-empty
            # placeholder "No Kursusinformation found". A plain truthiness
            # test treats that as success and writes the placeholder to the
            # CSV, so it is checked for explicitly.
            if kursusinformation_text and kursusinformation_text != "No Kursusinformation found":
                print(f"Fetched Kursusinformation for {semester} - {course_number}")
                # Save the extracted Kursusinformation to a CSV file
                save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="kursusinformation.csv")
            else:
                print(f"No Kursusinformation found for {semester} - {course_number}")
finally:
    # Shut the browser down whether or not the loop finished
    driver.quit()

print("Data fetching complete. Results saved to kursusinformation.csv")

Fetched Kursusinformation for 2017-2018 - 01005
Fetched Kursusinformation for 2018-2019 - 01005
Fetched Kursusinformation for 2019-2020 - 01005
Fetched Kursusinformation for 2020-2021 - 01005
Fetched Kursusinformation for 2021-2022 - 01005
Fetched Kursusinformation for 2022-2023 - 01005
Fetched Kursusinformation for 2023-2024 - 01005
Data fetching complete. Results saved to kursusinformation.csv
