In [7]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import math
import os
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from openpyxl import Workbook
from openpyxl.drawing.image import Image as ExcelImage

# Step 1: Gather input from the user
print("=" * 100)
print("This is a web crawler for collecting lecture data from the Hunet site.")
print("=" * 100)

query_txt = input('1. What keyword would you like to collect data for? (e.g., Python): ')

try:
    cnt = int(input('2. How many lectures would you like to collect? (default: 10): '))
except ValueError:
    cnt = 10
    print('Proceeding with the default value of 10 lectures.')

# Calculate the number of pages to scrape based on the number of lectures
page_cnt = math.ceil(cnt / 12)

f_dir = input('3. Enter the folder path to save files (default: c:\\py_temp\\): ')
if not f_dir:
    f_dir = 'c:\\py_temp\\'

# Step 2: Create a folder to save the results
n = time.localtime()
d = f'{n.tm_year:04d}-{n.tm_mon:02d}-{n.tm_mday:02d}-{n.tm_hour:02d}-{n.tm_min:02d}-{n.tm_sec:02d}'

sec_name = 'hunet_lecture'
save_path = os.path.join(f_dir, f'{d}-{query_txt}-{sec_name}')
os.makedirs(save_path, exist_ok=True)

img_dir = os.path.join(save_path, 'images')
os.makedirs(img_dir, exist_ok=True)

# Step 3: Set up the Chrome WebDriver and open the Hunet website
s = Service("/Users/ghulom97/Downloads/chromedriver-mac-arm64/chromedriver")
driver = webdriver.Chrome(service=s)

url = 'https://www.hunet.co.kr/'
driver.get(url)
driver.maximize_window()
time.sleep(3)

# Step 4: Search for the keyword
search_box = driver.find_element(By.ID, 'txtKeyword')
search_box.click()
search_box.send_keys(query_txt)
search_box.send_keys("\n")

# Step 5: Click 'Load More' to get the desired number of lectures
for _ in range(page_cnt):
    try:
        load_more_button = driver.find_element(By.XPATH, '//*[@id="divEducationList"]/div/div[2]/div[5]/a[1]')
        load_more_button.click()
        time.sleep(2)
    except Exception as e:
        print("No more pages to load or error occurred:", e)
        break

# Step 6: Extract lecture data
lecture_data = {
    'Number': [],
    'Title': [],
    'Satisfaction': [],
    'Reviews': [],
    'Price': []
}

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
lectures = soup.find('ul', class_='vod_list').find_all('li')

print(f'Total {len(lectures)} lectures found.')
print('Collecting images and lecture details...')

img_urls = [img['src'] for img in soup.find('ul', class_='vod_list').find_all('img')]
for idx, img_url in enumerate(img_urls):
    try:
        urllib.request.urlretrieve(img_url, os.path.join(img_dir, f'{idx+1}.jpg'))
        print(f'{idx+1} image saved.')
    except Exception as e:
        print(f"Error saving image {idx+1}: {e}")

for idx, lecture in enumerate(lectures, start=1):
    title = lecture.find('div', class_='title').get_text(strip=True)
    satisfaction = lecture.find('span', class_='ex').find('strong').get_text()
    reviews = lecture.find('span', class_='review').find('strong').get_text()
    price = lecture.find('span', class_='expense').find('strong').get_text()

    lecture_data['Number'].append(idx)
    lecture_data['Title'].append(title)
    lecture_data['Satisfaction'].append(satisfaction)
    lecture_data['Reviews'].append(reviews)
    lecture_data['Price'].append(price)

    print(f'{idx}. Title: {title}, Satisfaction: {satisfaction}, Reviews: {reviews}, Price: {price}')

# Step 7: Save the data to CSV and Excel files
lecture_df = pd.DataFrame(lecture_data)

csv_path = os.path.join(save_path, f'{d}-{query_txt}-{sec_name}.csv')
lecture_df.to_csv(csv_path, encoding='utf-8-sig', index=False)
print(f"CSV file saved at: {csv_path}")

excel_path = os.path.join(save_path, f'{d}-{query_txt}-{sec_name}.xlsx')
lecture_df.to_excel(excel_path, index=False)
print(f"Excel file saved at: {excel_path}")

# Adding images to Excel (Optional)
workbook = Workbook()
sheet = workbook.active

# Add headers
headers = ['Number', 'Title', 'Satisfaction', 'Reviews', 'Price', 'Image']
sheet.append(headers)

# Add lecture data and images to the Excel sheet
for idx, (number, title, satisfaction, reviews, price) in enumerate(zip(
    lecture_data['Number'], 
    lecture_data['Title'], 
    lecture_data['Satisfaction'], 
    lecture_data['Reviews'], 
    lecture_data['Price']
), start=1):
    # Add lecture data to the sheet
    sheet.append([number, title, satisfaction, reviews, price])
    
    # Add image if it exists
    img_path = os.path.join(img_dir, f'{number}.jpg')
    if os.path.exists(img_path):
        img = ExcelImage(img_path)
        img.width = 130
        img.height = 100
        sheet.add_image(img, f'F{idx+1}')  # Adding image in the 6th column

# Save the workbook with images
workbook.save(excel_path)
print(f"Excel file with images saved at: {excel_path}")

# Clean up
driver.quit()


This is a web crawler for collecting lecture data from the Hunet site.
1. What keyword would you like to collect data for? (e.g., Python): Python
2. How many lectures would you like to collect? (default: 10): 20
3. Enter the folder path to save files (default: c:\py_temp\): /Users/ghulom97/Desktop/gooo
No more pages to load or error occurred: Alert Text: 마지막 페이지입니다.
Message: unexpected alert open: {Alert text : 마지막 페이지입니다.}
  (Session info: chrome=127.0.6533.89)
Stacktrace:
0   chromedriver                        0x0000000104925088 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x000000010491d764 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x000000010452c82c cxxbridge1$string$len + 88524
3   chromedriver                        0x00000001045a89e8 cxxbridge1$string$len + 596872
4   chromedriver                        0x0000000104565474 cxxbridge1$string$len + 321044
5   chromedriver                        0x00000001045660e4 cxxbridge1$string