In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time
import logging

# Substring that identifies a course-detail link on a curriculum page.
_COURSE_HREF_MARKER = '/Course/Detail?catalog_courseId='

# Seconds to wait on any single HTTP request before giving up; without a
# timeout `requests` can block forever on a stalled connection.
_REQUEST_TIMEOUT = 30


def _format_hms(total_seconds):
    """Return a non-negative duration as 'Xh Ym Zs', truncated to whole seconds."""
    minutes, seconds = divmod(int(total_seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours}h {minutes}m {seconds}s"


def _fetch_soup(url):
    """Download *url* and return it parsed; raises on timeout or HTTP error status."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _extract_course_links(soup, base_url):
    """Return [(course_name, absolute_course_url), ...] for every course anchor in *soup*."""
    from urllib.parse import urljoin

    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs, and leaves hrefs that are already absolute untouched.
    return [
        (anchor.text.strip(), urljoin(base_url, anchor['href']))
        for anchor in soup.find_all('a', href=True)
        if _COURSE_HREF_MARKER in anchor['href']
    ]


def _parse_course_details(course_soup):
    """Return (description, learning_outcomes) read from a course detail page's table rows."""
    description = ""
    learning_outcomes = []

    for row in course_soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 2:
            continue

        title = cols[0].text.strip()
        content = cols[1]

        if title == "Course Description":
            description = content.text.strip()
        elif "Learning Outcomes of the Course Unit" in title:
            learning_outcomes = [li.text.strip() for li in content.find_all('li')]

    return description, learning_outcomes


def scrape_courses(file_name):
    """
    Scrape course information for every department listed in a JSON file.

    The input file '{file_name}.json' must contain a JSON object mapping
    department names to curriculum page URLs. Each curriculum page is
    downloaded once to collect its course links; every course detail page is
    then downloaded to extract its description and learning outcomes.

    Failures on a single department or course are logged and skipped so the
    rest of the run can continue.

    Args:
        file_name (str): Name of the JSON file (without extension) containing department links

    Returns:
        dict | None: Mapping of department name to a list of course records
        (keys: "course_name", "course_link", "course_description",
        "course_learning_outcomes"). Departments with no successfully scraped
        course are absent. Returns None when the input file is missing, is
        not valid JSON, or does not contain a JSON object.

    The function also writes the results to a JSON file named '{file_name}_results.json'
    """
    # Set up logging (no-op if the root logger is already configured)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler()
        ]
    )

    start_time = time.time()
    logging.info(f"Starting scraping process using links from {file_name}.json")

    # Load department links from JSON file
    try:
        with open(f'{file_name}.json', 'r', encoding='utf-8') as f:
            links = json.load(f)
    except FileNotFoundError:
        logging.error(f"File {file_name}.json not found")
        return None
    except json.JSONDecodeError:
        logging.error(f"Invalid JSON format in {file_name}.json")
        return None

    # Anything other than an object cannot be iterated as name -> URL pairs.
    if not isinstance(links, dict):
        logging.error(
            f"Expected a JSON object mapping department names to URLs in {file_name}.json"
        )
        return None

    logging.info(f"Successfully loaded {len(links)} departments from {file_name}.json")

    # Print links for verification
    logging.info(f"Department links: {links}")

    base_url = "https://ects.bilgi.edu.tr"

    # Department URLs from the loaded JSON
    departments = links

    all_data = {}
    total_courses = 0
    processed_courses = 0

    # Course links found during the counting pass, kept so each curriculum
    # page is only downloaded once.
    dept_course_links = {}

    # First pass to count total courses for estimation
    for dept_name, curriculum_url in departments.items():
        try:
            found_links = _extract_course_links(_fetch_soup(curriculum_url), base_url)
        except Exception as e:
            logging.error(f"Error counting courses for department '{dept_name}': {str(e)}")
            continue

        dept_course_links[dept_name] = found_links
        total_courses += len(found_links)
        logging.info(f"Department '{dept_name}' has {len(found_links)} courses")

    logging.info(f"Total courses to process: {total_courses}")

    # Estimate processing time (assuming ~3 seconds per course)
    seconds_per_course = 3
    logging.info(
        f"Estimated completion time: {_format_hms(total_courses * seconds_per_course)}"
    )

    # Second pass to actually scrape the data
    scrape_start_time = time.time()
    for dept_name, curriculum_url in departments.items():
        dept_start_time = time.time()
        logging.info(f"Processing department: '{dept_name}' - URL: {curriculum_url}")

        course_links = dept_course_links.get(dept_name)
        if course_links is None:
            # The counting pass failed for this department; give it one more try.
            try:
                course_links = _extract_course_links(_fetch_soup(curriculum_url), base_url)
            except Exception as e:
                logging.error(f"Error processing department '{dept_name}': {str(e)}")
                continue
            # Keep the total in sync so progress never exceeds 100% and the
            # percentage below never divides by zero.
            total_courses += len(course_links)

        dept_course_count = 0

        for course_name, course_link in course_links:
            logging.info(f"Processing course: '{course_name}' - {processed_courses+1}/{total_courses}")
            course_start_time = time.time()

            # Only the network fetch and parse are guarded, so a bookkeeping
            # problem can never be misreported as a course failure.
            try:
                course_description, course_learning_outcomes = _parse_course_details(
                    _fetch_soup(course_link)
                )
            except Exception as e:
                logging.error(f"Error processing course '{course_name}': {str(e)}")
                continue

            # Save each course's data
            all_data.setdefault(dept_name, []).append({
                "course_name": course_name,
                "course_link": course_link,
                "course_description": course_description,
                "course_learning_outcomes": course_learning_outcomes
            })

            dept_course_count += 1
            processed_courses += 1

            course_end_time = time.time()
            course_duration = course_end_time - course_start_time

            # Update time estimation based on actual performance
            if processed_courses > 5:  # After processing a few courses to get a better average
                # Average over the scraping pass only; the counting pass is
                # not per-course work and would skew the estimate.
                seconds_per_course = (course_end_time - scrape_start_time) / processed_courses
                eta_seconds = max(total_courses - processed_courses, 0) * seconds_per_course
                elapsed = course_end_time - start_time

                logging.info(f"Course processed in {course_duration:.2f}s - " +
                             f"Progress: {processed_courses}/{total_courses} " +
                             f"({(processed_courses/total_courses*100):.1f}%) - " +
                             f"Elapsed: {_format_hms(elapsed)} - " +
                             f"ETA: {_format_hms(eta_seconds)}")

        dept_duration = time.time() - dept_start_time
        logging.info(f"Department '{dept_name}' completed - {dept_course_count} courses processed in {dept_duration:.2f}s")

    # Write all data to JSON file
    output_file = f"{file_name}_results.json"
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, ensure_ascii=False, indent=4)
        logging.info(f"Done! All data saved to '{output_file}'")
    except Exception as e:
        # Best effort: the scraped data is still returned to the caller.
        logging.error(f"Error writing to output file: {str(e)}")

    total_duration = time.time() - start_time

    logging.info(f"Scraping completed in {_format_hms(total_duration)}")
    logging.info(f"Processed {processed_courses} courses from {len(departments)} departments")

    return all_data

# Script entry point: run one scrape when this file is executed directly.
if __name__ == "__main__":
    # Base name of the links file, without the ".json" extension
    # (e.g. "links" for "links.json"). The function appends ".json" itself,
    # so the empty string below makes it look for a file named ".json".
    links_file_name = ""
    scrape_courses(links_file_name)

2025-03-11 23:33:40,260 - INFO - Starting scraping process using links from linkler_v2.json
2025-03-11 23:33:40,263 - INFO - Successfully loaded 2 departments from linkler_v2.json
2025-03-11 23:33:40,265 - INFO - Department links: {'International Finance': 'https://ects.bilgi.edu.tr/Department/Curriculum?catalog_departmentId=214496', 'International Trade and Business': 'https://ects.bilgi.edu.tr/Department/Curriculum?catalog_departmentId=214500'}
2025-03-11 23:33:40,511 - INFO - Department 'International Finance' has 37 courses
2025-03-11 23:33:40,746 - INFO - Department 'International Trade and Business' has 38 courses
2025-03-11 23:33:40,747 - INFO - Total courses to process: 75
2025-03-11 23:33:40,749 - INFO - Estimated completion time: 0h 3m 45s
2025-03-11 23:33:40,751 - INFO - Processing department: 'International Finance' - URL: https://ects.bilgi.edu.tr/Department/Curriculum?catalog_departmentId=214496
2025-03-11 23:33:40,991 - INFO - Processing course: 'BUS 120' - 1/75
2025-03-