In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the range of years
start_year = 2017
end_year = 2023

# Example list of department codes (replace these with actual department codes)
departments = ['1', '10', '11', '12', '13', '22', '23', '24', '25', '26', '27', '28', '29', '30', '33', '34', '36', '38', '41', '42', '46', '47', '56', '62', '63', '83', '88', 'CB', 'KU']
course_codes = set()  # Using a set to automatically handle duplicates

# Loop over each academic year
for year in range(start_year, end_year + 1):
    academic_year = f"{year}-{year+1}"
    
    # Loop over each department code
    for dept in departments:
        url = f"https://kurser.dtu.dk/archive/{academic_year}/department/{dept}"
        
        # Fetch the page content
        response = requests.get(url)
        if response.status_code == 200:  # Proceed only if the page exists
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all course codes (assuming they are in <td> tags inside a table row <tr>)
            rows = soup.find_all('tr')

            for row in rows:
                cells = row.find_all('td')
                if cells:
                    # Assuming the course code is in the first cell (td)
                    course_code = cells[0].get_text(strip=True)
                    course_codes.add(course_code)  # Add to the set to avoid duplicates

# Convert the set to a sorted list
course_codes_list = sorted(course_codes)

# Create a DataFrame for better visualization and export
df = pd.DataFrame(course_codes_list, columns=["Course Code"])

# Display the first few rows to check
print(df.head())

# Save to Excel or CSV for further analysis
df.to_excel('course_codes.xlsx', index=False)

  Course Code
0       01001
1       01002
2       01003
3       01004
4       01005


In [16]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import random
from selenium.webdriver.chrome.options import Options

def fetch_kursusinformation_with_selenium(driver, semester, course_number):
    url = f"https://kurser.dtu.dk/course/{semester}/{course_number}"
    driver.get(url)
    
    time.sleep(random.uniform(1.08, 1.44))  # Give it a moment to load fully if necessary
    
    # Get the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # Find the specific "Kursusinformation" table or section within the div
    kursusinformation_section = soup.select_one("div.box.information")
    
    if kursusinformation_section:
        # Extract all the text within the Kursusinformation section
        kursusinformation_text = kursusinformation_section.get_text(separator="\n", strip=True)
        return kursusinformation_text
    else:
        return None  # Return None if no information is found

def save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="kursusinformation.csv"):
    # Write Kursusinformation data to a CSV file
    with open(filename, mode='a', newline='', encoding='utf-8') as file:  # Use 'a' mode to append to the file
        writer = csv.writer(file)
        
        # Write the header only if the file is empty
        if file.tell() == 0:
            writer.writerow(["Semester", "Course Number", "Kursusinformation"])
        
        # Write the Kursusinformation with the semester and course number
        writer.writerow([semester, course_number, kursusinformation_text])

# List of semesters from 2017-2018 to 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2017, 2024)]

# Generate the first 20 course numbers between 01000 to 88718
course_numbers = df['Course Code'].tolist()

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Optional: Start maximized
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

# Start the browser session once
driver = webdriver.Chrome(options=chrome_options)

try:
    # Loop through each semester and the first 20 course numbers and scrape data
    for semester in semesters:
        for course_number in course_numbers:
            kursusinformation_text = fetch_kursusinformation_with_selenium(driver, semester, course_number)
            if kursusinformation_text:
                print(f"Fetched Kursusinformation for {semester} - {course_number}")
                # Save the extracted Kursusinformation to a CSV file
                save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="kursusinformation.csv")
finally:
    # Close the browser session when done
    driver.quit()

print("Data fetching complete. Results saved to kursusinformation.csv")

Fetched Kursusinformation for 2017-2018 - 01003
Fetched Kursusinformation for 2017-2018 - 01005
Fetched Kursusinformation for 2017-2018 - 01006
Fetched Kursusinformation for 2017-2018 - 01015
Fetched Kursusinformation for 2017-2018 - 01016
Fetched Kursusinformation for 2017-2018 - 01017
Fetched Kursusinformation for 2017-2018 - 01018
Fetched Kursusinformation for 2017-2018 - 01019
Fetched Kursusinformation for 2017-2018 - 01025
Fetched Kursusinformation for 2017-2018 - 01034
Fetched Kursusinformation for 2017-2018 - 01035
Fetched Kursusinformation for 2017-2018 - 01036
Fetched Kursusinformation for 2017-2018 - 01037
Fetched Kursusinformation for 2017-2018 - 01125
Fetched Kursusinformation for 2017-2018 - 01227
Fetched Kursusinformation for 2017-2018 - 01236
Fetched Kursusinformation for 2017-2018 - 01237
Fetched Kursusinformation for 2017-2018 - 01257
Fetched Kursusinformation for 2017-2018 - 01325
Fetched Kursusinformation for 2017-2018 - 01405
Fetched Kursusinformation for 2017-2018 

TimeoutError: [Errno 60] Operation timed out

In [19]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import random
from selenium.webdriver.chrome.options import Options

def fetch_kursusinformation_with_selenium(driver, semester, course_number):
    url = f"https://kurser.dtu.dk/course/{semester}/{course_number}"
    driver.get(url)
    
    time.sleep(random.uniform(1.08, 1.44))  # Give it a moment to load fully if necessary
    
    # Get the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # Find the specific "Kursusinformation" table or section within the div
    kursusinformation_section = soup.select_one("div.box.information")
    
    if kursusinformation_section:
        # Extract all the text within the Kursusinformation section
        kursusinformation_text = kursusinformation_section.get_text(separator="\n", strip=True)
        return kursusinformation_text
    else:
        return None  # Return None if no information is found

def save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="kursusinformation.csv"):
    # Write Kursusinformation data to a CSV file
    with open(filename, mode='a', newline='', encoding='utf-8') as file:  # Use 'a' mode to append to the file
        writer = csv.writer(file)
        
        # Write the header only if the file is empty
        if file.tell() == 0:
            writer.writerow(["Semester", "Course Number", "Kursusinformation"])
        
        # Write the Kursusinformation with the semester and course number
        writer.writerow([semester, course_number, kursusinformation_text])

# List of semesters from 2017-2018 to 2023-2024
semesters = [f"{year}-{year+1}" for year in range(2019, 2020)]

# Generate the first 20 course numbers between 01000 to 88718
course_numbers = df['Course Code'].tolist()

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Optional: Start maximized
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

# Start the browser session once
driver = webdriver.Chrome(options=chrome_options)

try:
    # Loop through each semester and the first 20 course numbers and scrape data
    for semester in semesters:
        for course_number in course_numbers:
            kursusinformation_text = fetch_kursusinformation_with_selenium(driver, semester, course_number)
            if kursusinformation_text:
                print(f"Fetched Kursusinformation for {semester} - {course_number}")
                # Save the extracted Kursusinformation to a CSV file
                save_kursusinformation_to_csv(semester, course_number, kursusinformation_text, filename="kursusinformation.csv")
finally:
    # Close the browser session when done
    driver.quit()

print("Data fetching complete. Results saved to kursusinformation.csv")

Fetched Kursusinformation for 2019-2020 - 01003
Fetched Kursusinformation for 2019-2020 - 01005
Fetched Kursusinformation for 2019-2020 - 01006
Fetched Kursusinformation for 2019-2020 - 01015
Fetched Kursusinformation for 2019-2020 - 01016
Fetched Kursusinformation for 2019-2020 - 01017
Fetched Kursusinformation for 2019-2020 - 01018
Fetched Kursusinformation for 2019-2020 - 01019
Fetched Kursusinformation for 2019-2020 - 01025
Fetched Kursusinformation for 2019-2020 - 01034
Fetched Kursusinformation for 2019-2020 - 01035
Fetched Kursusinformation for 2019-2020 - 01036
Fetched Kursusinformation for 2019-2020 - 01037
Fetched Kursusinformation for 2019-2020 - 01125
Fetched Kursusinformation for 2019-2020 - 01227
Fetched Kursusinformation for 2019-2020 - 01237
Fetched Kursusinformation for 2019-2020 - 01238
Fetched Kursusinformation for 2019-2020 - 01257
Fetched Kursusinformation for 2019-2020 - 01325
Fetched Kursusinformation for 2019-2020 - 01405
Fetched Kursusinformation for 2019-2020 

In [None]:
from pathlib import Path
from zipfile import ZipFile
fp_data = Path.cwd() / "data" #Create path 
Path.mkdir(fp_data, exist_ok=True) #Create subfolder

In [None]:
import pandas as pd
import os

# Define the years and periods
years = range(2017, 2024)  # Adjust as needed
seasons = ['Summer', 'Winter']

# Prepare the CSV file path
output_file_path = './data/combined_course_data.csv'

# Check if the file already exists; if not, create it with headers
if not os.path.exists(output_file_path):
    with open(output_file_path, 'w') as f:
        # Writing header for the first time
        f.write('Course,Year,Season,Column1,Column2,...\n')  # Adjust columns as needed

# Loop through each course number, year, and season
for course in course_codes:  # Assuming you're using the list course_codes from the Excel file
    for year in years:
        for season in seasons:
            # Construct the URL dynamically
            period = f'{season}-{year}'
            url = f'https://karakterer.dtu.dk/Histogram/1/{course}/{period}'
            
            try:
                # Parse all tables found on the page
                dfs = pd.read_html(url)
                
                # Check if any tables were found
                if dfs:
                    # Assuming the first table is the one you want
                    df = dfs[0]
                    
                    # Add additional columns to the DataFrame for course, year, and season
                    df.insert(0, 'Course', course)
                    df.insert(1, 'Year', year)
                    df.insert(2, 'Season', season)
            
                    
                    # Append the DataFrame to the CSV file
                    df.to_csv(output_file_path, mode='a', header=False, index=False)
                    
                    print(f"Data for {course} during {period} added.")
                else:
                    print(f"No data available for {course} during {period}. Skipping...")
            
            except ValueError:
                # Handle the case where no tables are found on the page
                print(f"No tables found for {course} during {period}. Skipping...")
            except Exception as e:
                print(f"An error occurred while processing {course} during {period}: {e}")

print(f"Data collection completed. Data saved to '{output_file_path}'.")