In [1]:
# -*- coding: utf-8 -*-
"""
교수 '리스트' 페이지 URL들을 미리 알고 있을 때의 간단 수집기 (requests + bs4)
출력: pcu_professors_bs.csv
"""
import re
import csv
import time
import pandas as pd
import openpyxl
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

In [2]:
file_path = 'prof_pages.xlsx'

# 엑셀 파일 읽기 (기본적으로 첫 번째 시트를 읽음)
df = pd.read_excel(file_path)

df = df [['department', 'link_address']]
# 데이터 출력
# print(df)


In [3]:
LIST_URLS = df['link_address']

In [4]:
HEADERS = {"User-Agent": "Mozilla/5.0"}
OUTPUT = "pcu_professors_bs.csv"

In [None]:
# LIST_URLS = [
#     # 예시: 컴공/조경 등 '교수 소개' 리스트 페이지들
#     "https://hakgwa.pcu.ac.kr/klle/56/professor/25100200",
#     "https://hakgwa.pcu.ac.kr/scape/77/professor/25500200",
# ]

In [19]:
url ='https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/380'
url ='https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/40'

for _ in range(3):
    r = requests.get(url, headers=HEADERS, timeout=20)
    if r.status_code == 200:
        print( r.text)
    time.sleep(1)
r.raise_for_status()







<!DOCTYPE html>
<html>
<head>



<meta charset="UTF-8">
<meta http-equiv='X-UA-Compatible' content='IE=Edge' />
<meta name="viewport" content="width=device-width,initial-scale=1.0,minimum-scale=1.0,maximum-scale=2.0,user-scalable=yes">
<meta name="format-detection" content="telephone=no,email=no,address=no" />
<meta name="keywords" content="배재대학교 국어국문한국어교육학과" />
<meta property="og:title" content="배재대학교 국어국문한국어교육학과">
<title>교수 소개 &lt; 교수 소개배재대학교-국어국문한국어교육학과</title>

<script>
    /* js의 contextpath 적용 */
    var contextPath = '';
</script>

<script src="/js/babel.min.js"></script>
<script src="/js/babel.polyfill.min.js"></script>
<script src="/js/bluebird.js"></script>

<script type="text/babel" data-presets="es2015,stage-2">
    const CONTEXT_PATH = '';
    const mno = '';
</script>

<link rel="stylesheet" type="text/css" href="/css/site/core/core.css" media="all" />
<script type="text/javascript" src="/js/site/js.js"></script>
<!-- skin :s -->
<link rel="stylesheet" type="text/cs

In [21]:
print(r.raise_for_status())

None


In [5]:
def get(url):
    for _ in range(3):
        r = requests.get(url, headers=HEADERS, timeout=20)
        if r.status_code == 200:
            return r.text
        time.sleep(1)
    r.raise_for_status()

In [6]:
def collect_detail_urls(list_url):
    html = get(list_url)
    soup = BeautifulSoup(html, "html.parser")
    urls = set()
    for a in soup.select("a[href]"):
        href = a["href"]
        if re.search(r"/professor/\d+/\d+", href):
            urls.add(urljoin(list_url, href))
    return list(urls), soup

In [7]:
def extract_name_title(soup):
    # 페이지 상단/프로필에서 이름/직함 후보 찾기
    name = ""
    title = ""
    for sel in ["h3", "h2", ".name", ".prof_name", ".tit", ".title", ".professor_name", ".info .name"]:
        el = soup.select_one(sel)
        if el and el.get_text(strip=True):
            t = el.get_text(strip=True)
            m = re.match(r"(.+?)(?:\s*(교수|조교수|부교수|겸임교원|명예교수|학과장).*)?$", t)
            if m:
                name = m.group(1).strip()
                if m.group(2):
                    title = m.group(2).strip()
                break
    if not name:
        img = soup.select_one("img[alt]")
        if img:
            name = img.get("alt", "").strip()
    return name, title

In [8]:
def section_raw(soup, keywords):
    # '학력사항', '경력 및 활동' 섹션 텍스트 보존 추출
    for header in soup.find_all(["h2","h3","h4","strong","dt"]):
        if any(k in (header.get_text() or "") for k in keywords):
            nxt = header.find_next_sibling()
            tries = 0
            while nxt and tries < 5:
                if nxt.name in ["ul","ol","div","p","section"]:
                    break
                nxt = nxt.find_next_sibling()
                tries += 1
            if nxt:
                if nxt.name in ["ul","ol"]:
                    return "\n".join(li.get_text("\n", strip=True) for li in nxt.find_all("li", recursive=False))
                lis = nxt.find_all("li")
                if lis:
                    return "\n".join(li.get_text("\n", strip=True) for li in lis)
                ps = nxt.find_all("p")
                if ps:
                    return "\n".join(p.get_text("\n", strip=True) for p in ps)
                return nxt.get_text("\n", strip=True)
    return ""

In [16]:
def main():
    rows = []
    for list_url in LIST_URLS:
        detail_urls, soup_list = collect_detail_urls(list_url)
        # 대략적인 학과명 추출(타이틀/브레드크럼 등)
        dept = ""
        for sel in [".sub_visual .title",".page-tit","h2.title","h1.title",".location .now",".breadcrumb li.active"]:
            el = soup_list.select_one(sel)
            if el and el.get_text(strip=True):
                dept = el.get_text(strip=True)
                break

        for durl in detail_urls:
            html = get(durl)
            soup = BeautifulSoup(html, "html.parser")
            name, title = extract_name_title(soup)
            edu = section_raw(soup, ["학력","학력사항"])
            career = section_raw(soup, ["경력","경력 및 활동","경력사항","활동"])
            rows.append({
                "department": dept,
                "professor_name": name,
                "title": title,
                "detail_url": durl,
                "education_raw": edu,
                "career_raw": career,
            })
            print(f"[OK] {dept} | {name} | {durl}")

        print("****", rows)

    with open(OUTPUT, "w", newline="", encoding="utf-8-sig") as f:
        w = csv.DictWriter(f, fieldnames=["department","professor_name","title","detail_url","education_raw","career_raw"])
        w.writeheader()
        w.writerows(rows)

    print(f"\n완료! {len(rows)}건 저장 → {OUTPUT}")

In [17]:
if __name__ == "__main__":
    main()

[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/380
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/381
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/48
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/41
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/1985
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/1224
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/40
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/39
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/46
[OK]  | 교수 소개 | https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/382
**** [{'department': '', 'professor_name': '교수 소개', 'title': '', 'detail_url': 'https://hakgwa.pcu.ac.kr/klle/56/professor/25100200/380', 'education_raw': '', 'career_raw': ''}, {'department': '', 'professor_name': '교수 소개', 'title': '', 'detail_url': 'https://hakgwa.pcu.ac.kr/klle/5

KeyboardInterrupt: 