In [5]:
# pip install habanero backoff requests beautifulsoup4 pandas

import re, json, time, pandas as pd, requests
from typing import List, Dict
from bs4 import BeautifulSoup
from habanero import Crossref

HDRS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
}

def clean(s: str) -> str:
    if not s: return ""
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def clean_abs(s: str) -> str:
    if not s: return ""
    s = re.sub(r"</?jats:[^>]*>", " ", s)     # JATS 제거
    s = re.sub(r"<[^>]+>", " ", s)            # 기타 태그 제거
    return clean(s)

def fetch_urls_from_crossref(journal="Decision Support Systems", volume=164, year=2023) -> List[Dict]:
    """Crossref에서 URL, DOI, 제목/초록(폴백용)만 가져오기"""
    cr = Crossref(mailto="you@example.com")
    flt = {"container-title": journal}
    if year:
        flt["from-pub-date"] = f"{year}-01-01"
        flt["until-pub-date"] = f"{year}-12-31"

    res = cr.works(filter=flt, cursor="*", cursor_max=2000, limit=200)

    def items(it):
        if isinstance(it, dict):
            yield from it.get("message", {}).get("items", [])
        else:
            for chunk in it:
                yield from chunk.get("message", {}).get("items", [])

    out = []
    for it in items(res):
        if str(it.get("volume","")) != str(volume):
            continue
        url = it.get("URL") or (f"https://doi.org/{it.get('DOI')}" if it.get("DOI") else "")
        out.append({
            "url": url,
            "doi": it.get("DOI",""),
            "title_xref": (it.get("title") or [""])[0],
            "abstract_xref": clean_abs(it.get("abstract","")),
            "keywords_xref": ", ".join(it.get("subject", []) or [])
        })
    return out

def get_soup(url: str) -> BeautifulSoup | None:
    try:
        r = requests.get(url, headers=HDRS, timeout=30, allow_redirects=True)
        if r.status_code >= 400:
            return None
        return BeautifulSoup(r.text, "html.parser")
    except:
        return None

def parse_title_abs_kw_from_page(url: str, xref_fallback: Dict) -> Dict:
    soup = get_soup(url)
    title = abstract = ""
    keywords: List[str] = []

    if soup:
        # title
        cand_title = [
            "h1", "h1.article-title", "span.Title", "span.headline",
            "meta[property='og:title']"
        ]
        for sel in cand_title:
            if sel.startswith("meta"):
                el = soup.select_one(sel)
                if el and el.get("content"):
                    title = clean(el["content"]); break
            else:
                el = soup.select_one(sel)
                if el and clean(el.get_text()):
                    title = clean(el.get_text()); break

        # abstract
        cand_abs = [
            "section#abstract p", ".article__abstract p", "div.abstract p",
            "div.Abstracts div.abstract", "div#abspara p"
        ]
        for sel in cand_abs:
            el = soup.select_one(sel)
            if el and clean(el.get_text()):
                abstract = clean(el.get_text()); break
        if not abstract:
            meta = soup.select_one("meta[name='description']")
            if meta and meta.get("content"):
                abstract = clean(meta["content"])

        # keywords
        # 일반 블록
        blk = soup.select_one("ul.keywords, .keywords, #keywords, section.keywords")
        if blk:
            keywords = [clean(x.get_text()) for x in blk.select("li, span, a") if clean(x.get_text())]
        # 메타
        if not keywords:
            meta_kw = soup.select_one("meta[name='keywords']")
            if meta_kw and meta_kw.get("content"):
                keywords = [clean(x) for x in re.split(r",|;", meta_kw["content"]) if clean(x)]
        # 스키마(JSON-LD)
        if not keywords:
            for s in soup.select("script[type='application/ld+json']"):
                try:
                    data = json.loads(s.string or "{}")
                    if isinstance(data, dict) and "keywords" in data:
                        v = data["keywords"]
                        if isinstance(v, str):
                            keywords = [clean(x) for x in re.split(r",|;", v) if clean(x)]
                        elif isinstance(v, list):
                            keywords = [clean(str(x)) for x in v if clean(str(x))]
                        if keywords: break
                except:  # 잘못된 JSON 무시
                    pass

    # 폴백: Crossref 메타
    if not title:
        title = xref_fallback.get("title_xref","")
    if not abstract:
        abstract = xref_fallback.get("abstract_xref","")
    if not keywords:
        kw = xref_fallback.get("keywords_xref","")
        keywords = [k.strip() for k in kw.split(",") if k.strip()] if kw else []

    # 정리
    # 중복 제거
    seen, dedup = set(), []
    for k in keywords:
        if k not in seen:
            seen.add(k); dedup.append(k)

    return {
        "url": url,
        "title": title,
        "abstract": abstract,
        "keywords": ", ".join(dedup)
    }

def main():
    journal = "Decision Support Systems"
    volume, year = 164, 2023

    print("1) Crossref에서 URL 수집")
    seeds = fetch_urls_from_crossref(journal, volume, year)
    if not seeds:
        print("Crossref 결과 없음"); return

    # URL만 저장
    pd.DataFrame([{"url": s["url"], "doi": s["doi"]} for s in seeds]).to_csv(
        f"dss_vol{volume}_urls.csv", index=False, encoding="utf-8-sig"
    )
    print(f"URL 저장 완료 -> dss_vol{volume}_urls.csv (총 {len(seeds)}건)")

    print("2) 각 URL에서 title/abstract/keywords 수집")
    rows = []
    for i, sx in enumerate(seeds, 1):
        row = parse_title_abs_kw_from_page(sx["url"], sx)
        rows.append(row)
        print(f"[{i}/{len(seeds)}] {row['title'][:80]}")
        time.sleep(0.3)  # 예의상

    pd.DataFrame(rows, columns=["title","abstract","keywords","url"]).to_csv(
        f"dss_vol{volume}_details.csv", index=False, encoding="utf-8-sig"
    )
    print(f"세부 저장 완료 -> dss_vol{volume}_details.csv")

if __name__ == "__main__":
    main()

1) Crossref에서 URL 수집
URL 저장 완료 -> dss_vol164_urls.csv (총 8건)
2) 각 URL에서 title/abstract/keywords 수집
[1/8] Pay-for-performance schemes and hospital HIT adoption
[2/8] CATCHM: A novel network-based credit card fraud detection method using node repr
[3/8] Impact of content ideology on social media opinion polarization: The moderating 
[4/8] Assuring quality and waiting time in real-time spatial crowdsourcing
[5/8] IFC/Editorial Board
[6/8] The role of web browsing in credit risk prediction
[7/8] Exploring the effects of relationship quality and c-commerce behavior on firms' 
[8/8] A novel label-based multimodal topic model for social media analysis
세부 저장 완료 -> dss_vol164_details.csv


In [10]:
import requests
from bs4 import BeautifulSoup
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def extract_paper_info(url):
    """
    ScienceDirect 논문에서 제목, 초록, 키워드 추출
    """
    # Chrome 옵션 설정
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # 브라우저 창 숨기기
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

    paper_data = {
        'title': '',
        'abstract': '',
        'keywords': [],
        'url': url
    }

    try:
        # WebDriver 실행
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        time.sleep(5)  # 페이지 로딩 대기

        # HTML 파싱
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # 제목 추출 - #screen-reader-main-title > span
        title_element = soup.select_one('#screen-reader-main-title > span')
        if title_element:
            paper_data['title'] = title_element.get_text().strip()
            print(f"✓ 제목 추출 성공: {paper_data['title'][:50]}...")
        else:
            print("✗ 제목 추출 실패")

        # 초록 추출 - #sp0055
        abstract_element = soup.select_one('#sp0055')
        if abstract_element:
            paper_data['abstract'] = abstract_element.get_text().strip()
            print(f"✓ 초록 추출 성공: {len(paper_data['abstract'])}자")
        else:
            print("✗ 초록 추출 실패")

        # 키워드 추출 - #ks0005 div
        keyword_elements = soup.select('#ks0005 div')
        if keyword_elements:
            keywords = []
            for element in keyword_elements:
                keyword_text = element.get_text().strip()
                # 빈 텍스트나 "Keywords" 라벨 제외
                if keyword_text and keyword_text.lower() not in ['keywords', 'keyword']:
                    keywords.append(keyword_text)

            paper_data['keywords'] = keywords
            print(f"✓ 키워드 추출 성공: {len(keywords)}개")
            for i, keyword in enumerate(keywords):
                print(f"  {i+1}. {keyword}")
        else:
            print("✗ 키워드 추출 실패")

        driver.quit()
        return paper_data

    except Exception as e:
        print(f"오류 발생: {e}")
        if 'driver' in locals():
            driver.quit()
        return paper_data

def save_data(data, filename='paper_data'):
    """데이터를 JSON과 텍스트 파일로 저장"""

    # JSON 저장
    with open(f'{filename}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # 읽기 쉬운 텍스트 파일로 저장
    with open(f'{filename}.txt', 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("논문 정보\n")
        f.write("="*80 + "\n\n")

        f.write(f"URL: {data['url']}\n\n")

        f.write("제목 (Title):\n")
        f.write(f"{data['title']}\n\n")

        f.write("키워드 (Keywords):\n")
        if data['keywords']:
            for i, keyword in enumerate(data['keywords']):
                f.write(f"{i+1}. {keyword}\n")
        else:
            f.write("키워드 없음\n")
        f.write("\n")

        f.write("초록 (Abstract):\n")
        f.write(f"{data['abstract']}\n")
        f.write("\n" + "="*80)

    print(f"\n데이터가 {filename}.json과 {filename}.txt에 저장되었습니다.")

def main():
    url = "https://www.sciencedirect.com/science/article/pii/S0167923622001166"

    print("ScienceDirect 논문 정보 추출 시작...")
    print(f"URL: {url}")
    print("-" * 50)

    # 데이터 추출
    paper_data = extract_paper_info(url)

    # 결과 출력
    print("\n" + "="*50)
    print("추출 결과:")
    print("="*50)

    if paper_data['title']:
        print(f"제목: {paper_data['title']}")

    if paper_data['keywords']:
        print(f"\n키워드 ({len(paper_data['keywords'])}개):")
        for i, keyword in enumerate(paper_data['keywords']):
            print(f"  {i+1}. {keyword}")

    if paper_data['abstract']:
        print(f"\n초록 (길이: {len(paper_data['abstract'])}자):")
        # 초록이 너무 길면 일부만 출력
        if len(paper_data['abstract']) > 200:
            print(f"{paper_data['abstract'][:200]}...")
        else:
            print(paper_data['abstract'])

    # 파일로 저장
    if any([paper_data['title'], paper_data['abstract'], paper_data['keywords']]):
        save_data(paper_data, 'sciencedirect_paper')
    else:
        print("\n추출된 데이터가 없어 파일을 저장하지 않습니다.")
        print("\n문제 해결 방법:")
        print("1. Chrome WebDriver가 올바르게 설치되었는지 확인")
        print("2. 인터넷 연결 확인")
        print("3. VPN 사용 또는 대학 네트워크에서 실행")

if __name__ == "__main__":
    main()

ScienceDirect 논문 정보 추출 시작...
URL: https://www.sciencedirect.com/science/article/pii/S0167923622001166
--------------------------------------------------
✓ 제목 추출 성공: Impact of content ideology on social media opinion...
✓ 초록 추출 성공: 922자
✓ 키워드 추출 성공: 6개
  1. Social media
  2. Opinion polarization
  3. Sentiment analysis
  4. Ideology
  5. Functional affordance
  6. Symbolic expression

추출 결과:
제목: Impact of content ideology on social media opinion polarization: The moderating role of functional affordances and symbolic expressions

키워드 (6개):
  1. Social media
  2. Opinion polarization
  3. Sentiment analysis
  4. Ideology
  5. Functional affordance
  6. Symbolic expression

초록 (길이: 922자):
We offer theory and evidence regarding the impact of content ideology (i.e., emotionally charged beliefs expressed in sentiments) on opinion polarization (i.e., conflicting attitudes about an event) o...

데이터가 sciencedirect_paper.json과 sciencedirect_paper.txt에 저장되었습니다.


In [18]:
# pip install habanero backoff requests beautifulsoup4 pandas selenium

import re, json, time, pandas as pd, requests
from typing import List, Dict
from bs4 import BeautifulSoup
from habanero import Crossref
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

HDRS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
}

def clean(s: str) -> str:
    if not s: return ""
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def clean_abs(s: str) -> str:
    if not s: return ""
    s = re.sub(r"</?jats:[^>]*>", " ", s)     # JATS 제거
    s = re.sub(r"<[^>]+>", " ", s)            # 기타 태그 제거
    return clean(s)

class IntegratedPaperScraper:
    def __init__(self):
        self.driver = None
        self.setup_selenium()

    def setup_selenium(self):
        """Selenium WebDriver 설정"""
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # 브라우저 창 숨기기
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        except Exception as e:
            print(f"Chrome driver 설정 실패: {e}")
            self.driver = None

    def fetch_urls_from_crossref(self, journal="Decision Support Systems", volume=164, year=2023) -> List[Dict]:
        """Crossref에서 URL, DOI, 제목/초록(폴백용) 가져오기 (타임아웃 및 재시도 처리)"""
        print(f"Crossref에서 {journal} Vol.{volume} ({year}) 논문 검색 중...")

        # 여러 전략으로 시도
        strategies = [
            # 전략 1: 기본 검색
            {"filter": {"container-title": journal, "from-pub-date": f"{year}-01-01", "until-pub-date": f"{year}-12-31"}},
            # 전략 2: DOI 패턴으로 검색 (ScienceDirect는 보통 10.1016으로 시작)
            {"query": f'"{journal}" volume:{volume} {year}'},
            # 전략 3: 간단한 쿼리
            {"query": f"Decision Support Systems {year}"}
        ]

        out = []

        for i, strategy in enumerate(strategies, 1):
            try:
                print(f"  시도 {i}/{len(strategies)}: ", end="")
                cr = Crossref(mailto="researcher@example.com", timeout=30)

                if "filter" in strategy:
                    print("필터 방식")
                    res = cr.works(filter=strategy["filter"], limit=100, timeout=30)
                else:
                    print("쿼리 방식")
                    res = cr.works(query=strategy["query"], limit=100, timeout=30)

                # 결과 처리
                items = res.get("message", {}).get("items", []) if isinstance(res, dict) else []

                for it in items:
                    # 볼륨과 저널명 확인 (더 유연하게)
                    item_journal = ""
                    if it.get("container-title"):
                        item_journal = it["container-title"][0] if isinstance(it["container-title"], list) else str(it["container-title"])

                    item_volume = str(it.get("volume", ""))

                    # 저널명과 볼륨 매칭 (부분 매칭 허용)
                    journal_match = "decision support" in item_journal.lower() and "systems" in item_journal.lower()
                    volume_match = item_volume == str(volume) or not item_volume  # 볼륨 정보가 없으면 일단 포함

                    if not journal_match:
                        continue

                    url = it.get("URL") or (f"https://doi.org/{it.get('DOI')}" if it.get("DOI") else "")
                    if not url:
                        continue

                    # 중복 제거 (DOI 기준)
                    doi = it.get("DOI", "")
                    if doi and any(existing.get("doi") == doi for existing in out):
                        continue

                    out.append({
                        "url": url,
                        "doi": doi,
                        "title_xref": (it.get("title") or [""])[0],
                        "abstract_xref": clean_abs(it.get("abstract","")),
                        "keywords_xref": ", ".join(it.get("subject", []) or []),
                        "authors": self.extract_authors(it.get("author", [])),
                        "year": self.extract_year(it.get("published-print", it.get("published-online"))),
                        "volume": item_volume,
                        "journal": item_journal
                    })

                if out:
                    print(f"    -> {len(out)}개 논문 발견")
                    break
                else:
                    print("    -> 결과 없음")

            except Exception as e:
                print(f"    -> 오류: {e}")
                continue

        # 볼륨으로 한번 더 필터링
        if volume and out:
            filtered_out = [item for item in out if str(item.get("volume", "")) == str(volume)]
            if filtered_out:
                out = filtered_out
                print(f"볼륨 {volume} 필터링 후: {len(out)}개")

        if not out:
            print("⚠️  Crossref에서 논문을 찾을 수 없습니다. 대안 방법을 시도합니다...")
            # 대안: 알려진 DOI 패턴으로 URL 생성
            out = self.generate_fallback_urls(journal, volume, year)

        print(f"최종 수집: {len(out)}개 논문")
        return out

    def generate_fallback_urls(self, journal, volume, year) -> List[Dict]:
        """CrossRef 실패 시 대안 URL 생성"""
        print("대안 방법: 알려진 논문 패턴으로 URL 생성 중...")

        # ScienceDirect Decision Support Systems의 일반적인 DOI 패턴
        # 실제로는 이 방법보다는 저널 홈페이지를 크롤링하는 것이 좋습니다
        fallback_urls = [
            "https://www.sciencedirect.com/journal/decision-support-systems/vol/164/suppl/C"  # 볼륨 페이지
        ]

        out = []
        for i, url in enumerate(fallback_urls):
            out.append({
                "url": url,
                "doi": f"fallback_{i}",
                "title_xref": f"Fallback Paper {i+1}",
                "abstract_xref": "",
                "keywords_xref": "",
                "authors": "",
                "year": str(year),
                "volume": str(volume),
                "journal": journal
            })

        return out

    def extract_authors(self, authors_list: List[Dict]) -> str:
        """저자 정보 추출 및 포맷팅"""
        if not authors_list:
            return ''

        author_names = []
        for author in authors_list:
            given = author.get('given', '')
            family = author.get('family', '')
            if given and family:
                author_names.append(f"{given} {family}")
            elif family:
                author_names.append(family)

        return ', '.join(author_names)

    def extract_year(self, date_info) -> str:
        """발행년도 추출"""
        if not date_info:
            return ''

        if isinstance(date_info, dict):
            date_parts = date_info.get('date-parts', [[]])[0]
            if date_parts:
                return str(date_parts[0])

        return ''

    def extract_paper_details_selenium(self, url: str, xref_fallback: Dict) -> Dict:
        """Selenium을 사용하여 ScienceDirect에서 상세 정보 추출"""
        paper_data = {
            'url': url,
            'title': '',
            'abstract': '',
            'keywords': [],
            'authors': xref_fallback.get('authors', ''),
            'year': xref_fallback.get('year', ''),
            'doi': xref_fallback.get('doi', '')
        }

        if not self.driver:
            # Selenium이 없으면 requests로 시도
            return self.extract_paper_details_requests(url, xref_fallback)

        try:
            self.driver.get(url)
            time.sleep(3)  # 페이지 로딩 대기

            # HTML 파싱
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # 제목 추출 - ScienceDirect 특화 셀렉터 우선
            title_element = soup.select_one('#screen-reader-main-title > span')
            if title_element:
                paper_data['title'] = title_element.get_text().strip()
            else:
                # 백업 셀렉터들
                title_selectors = [
                    'h1', 'h1.article-title', 'span.Title', 'span.headline',
                    'meta[property="og:title"]', 'meta[name="citation_title"]'
                ]

                for selector in title_selectors:
                    if selector.startswith('meta'):
                        el = soup.select_one(selector)
                        if el and el.get("content"):
                            paper_data['title'] = clean(el["content"])
                            break
                    else:
                        el = soup.select_one(selector)
                        if el and clean(el.get_text()):
                            paper_data['title'] = clean(el.get_text())
                            break

            # 초록 추출 - ScienceDirect 특화 셀렉터 우선
            abstract_element = soup.select_one('#sp0055')
            if abstract_element:
                paper_data['abstract'] = abstract_element.get_text().strip()
            else:
                # 백업 셀렉터들
                abstract_selectors = [
                    "section#abstract p", ".article__abstract p", "div.abstract p",
                    "div.Abstracts div.abstract", "div#abspara p", "#abstract",
                    ".abstract", '[data-testid="abstract"]'
                ]

                for selector in abstract_selectors:
                    el = soup.select_one(selector)
                    if el and clean(el.get_text()):
                        paper_data['abstract'] = clean(el.get_text())
                        break

            # 키워드 추출 - ScienceDirect 특화 셀렉터 우선
            keyword_elements = soup.select('#ks0005 div')
            if keyword_elements:
                keywords = []
                for element in keyword_elements:
                    keyword_text = element.get_text().strip()
                    if keyword_text and keyword_text.lower() not in ['keywords', 'keyword']:
                        keywords.append(keyword_text)
                paper_data['keywords'] = keywords
            else:
                # 백업 키워드 추출
                keywords = self.extract_keywords_fallback(soup)
                paper_data['keywords'] = keywords

            return paper_data

        except Exception as e:
            print(f"Selenium 추출 중 오류 (URL: {url}): {e}")
            # 오류 시 requests로 대체
            return self.extract_paper_details_requests(url, xref_fallback)

    def extract_paper_details_requests(self, url: str, xref_fallback: Dict) -> Dict:
        """requests를 사용하여 논문 상세 정보 추출 (백업)"""
        paper_data = {
            'url': url,
            'title': '',
            'abstract': '',
            'keywords': [],
            'authors': xref_fallback.get('authors', ''),
            'year': xref_fallback.get('year', ''),
            'doi': xref_fallback.get('doi', '')
        }

        try:
            response = requests.get(url, headers=HDRS, timeout=30, allow_redirects=True)
            if response.status_code >= 400:
                # Crossref 폴백 데이터 사용
                paper_data.update(self.use_crossref_fallback(xref_fallback))
                return paper_data

            soup = BeautifulSoup(response.text, "html.parser")

            # 제목 추출
            title_selectors = [
                "h1", "h1.article-title", "span.Title", "span.headline",
                "meta[property='og:title']", "meta[name='citation_title']"
            ]

            for selector in title_selectors:
                if selector.startswith("meta"):
                    el = soup.select_one(selector)
                    if el and el.get("content"):
                        paper_data['title'] = clean(el["content"])
                        break
                else:
                    el = soup.select_one(selector)
                    if el and clean(el.get_text()):
                        paper_data['title'] = clean(el.get_text())
                        break

            # 초록 추출
            abstract_selectors = [
                "section#abstract p", ".article__abstract p", "div.abstract p",
                "div.Abstracts div.abstract", "div#abspara p"
            ]

            for selector in abstract_selectors:
                el = soup.select_one(selector)
                if el and clean(el.get_text()):
                    paper_data['abstract'] = clean(el.get_text())
                    break

            if not paper_data['abstract']:
                meta = soup.select_one("meta[name='description']")
                if meta and meta.get("content"):
                    paper_data['abstract'] = clean(meta["content"])

            # 키워드 추출
            paper_data['keywords'] = self.extract_keywords_fallback(soup)

        except Exception as e:
            print(f"Requests 추출 중 오류 (URL: {url}): {e}")

        # 폴백: Crossref 메타 사용
        if not paper_data['title']:
            paper_data['title'] = xref_fallback.get("title_xref", "")
        if not paper_data['abstract']:
            paper_data['abstract'] = xref_fallback.get("abstract_xref", "")
        if not paper_data['keywords']:
            kw = xref_fallback.get("keywords_xref", "")
            paper_data['keywords'] = [k.strip() for k in kw.split(",") if k.strip()] if kw else []

        return paper_data

    def extract_keywords_fallback(self, soup) -> List[str]:
        """키워드 추출 백업 메소드"""
        keywords = []

        # 일반 블록에서 추출
        blk = soup.select_one("ul.keywords, .keywords, #keywords, section.keywords")
        if blk:
            keywords = [clean(x.get_text()) for x in blk.select("li, span, a") if clean(x.get_text())]

        # 메타 태그에서 추출
        if not keywords:
            meta_kw = soup.select_one("meta[name='keywords']")
            if meta_kw and meta_kw.get("content"):
                keywords = [clean(x) for x in re.split(r",|;", meta_kw["content"]) if clean(x)]

        # JSON-LD 스키마에서 추출
        if not keywords:
            for s in soup.select("script[type='application/ld+json']"):
                try:
                    data = json.loads(s.string or "{}")
                    if isinstance(data, dict) and "keywords" in data:
                        v = data["keywords"]
                        if isinstance(v, str):
                            keywords = [clean(x) for x in re.split(r",|;", v) if clean(x)]
                        elif isinstance(v, list):
                            keywords = [clean(str(x)) for x in v if clean(str(x))]
                        if keywords:
                            break
                except:
                    pass

        # 중복 제거
        seen, dedup = set(), []
        for k in keywords:
            if k not in seen:
                seen.add(k)
                dedup.append(k)

        return dedup

    def use_crossref_fallback(self, xref_fallback: Dict) -> Dict:
        """Crossref 폴백 데이터 사용"""
        return {
            'title': xref_fallback.get("title_xref", ""),
            'abstract': xref_fallback.get("abstract_xref", ""),
            'keywords': [k.strip() for k in xref_fallback.get("keywords_xref", "").split(",") if k.strip()]
        }

    def save_results(self, results: List[Dict], filename_prefix: str):
        """결과를 CSV와 JSON으로 저장"""
        if not results:
            print("저장할 데이터가 없습니다.")
            return

        # DataFrame 생성
        df_data = []
        for result in results:
            df_data.append({
                'title': result.get('title', ''),
                'authors': result.get('authors', ''),
                'year': result.get('year', ''),
                'abstract': result.get('abstract', ''),
                'keywords': ', '.join(result.get('keywords', [])) if isinstance(result.get('keywords'), list) else result.get('keywords', ''),
                'doi': result.get('doi', ''),
                'url': result.get('url', '')
            })

        df = pd.DataFrame(df_data)

        # CSV 저장
        csv_filename = f"{filename_prefix}.csv"
        df.to_csv(csv_filename, index=False, encoding='utf-8-sig')

        # JSON 저장
        json_filename = f"{filename_prefix}.json"
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"결과 저장 완료:")
        print(f"  - CSV: {csv_filename}")
        print(f"  - JSON: {json_filename}")
        print(f"  - 총 {len(results)}개 논문")

    def close(self):
        """리소스 정리"""
        if self.driver:
            self.driver.quit()

def main():
    # 설정
    journal = "Decision Support Systems"
    volume = 164
    year = 2023

    scraper = IntegratedPaperScraper()

    try:
        print("=" * 60)
        print(f"논문 정보 수집 시작: {journal} Vol.{volume} ({year})")
        print("=" * 60)

        # 1단계: Crossref에서 URL 수집
        seeds = scraper.fetch_urls_from_crossref(journal, volume, year)

        if not seeds:
            print("Crossref에서 논문을 찾을 수 없습니다.")
            return

        # URL 목록만 먼저 저장
        url_data = [{"url": s["url"], "doi": s["doi"], "title_crossref": s["title_xref"]} for s in seeds]
        pd.DataFrame(url_data).to_csv(f"dss_vol{volume}_urls.csv", index=False, encoding="utf-8-sig")
        print(f"URL 목록 저장: dss_vol{volume}_urls.csv ({len(seeds)}개)")

        # 2단계: 각 논문 페이지에서 상세 정보 추출
        print(f"\n각 논문 페이지에서 상세 정보 추출 중...")
        print("-" * 50)

        results = []
        failed_count = 0

        for i, seed in enumerate(seeds, 1):
            print(f"[{i}/{len(seeds)}] 처리 중...")

            try:
                result = scraper.extract_paper_details_selenium(seed["url"], seed)
                results.append(result)

                # 진행상황 출력
                title = result.get('title', 'No title')[:60]
                keywords_count = len(result.get('keywords', []))
                abstract_length = len(result.get('abstract', ''))

                print(f"  ✓ {title}")
                print(f"    키워드: {keywords_count}개, 초록: {abstract_length}자")

            except Exception as e:
                print(f"  ✗ 오류: {e}")
                failed_count += 1
                # 오류가 있어도 기본 정보는 저장
                results.append({
                    'url': seed["url"],
                    'title': seed.get("title_xref", ""),
                    'abstract': seed.get("abstract_xref", ""),
                    'keywords': [],
                    'authors': seed.get('authors', ''),
                    'year': seed.get('year', ''),
                    'doi': seed.get('doi', '')
                })

            # 예의상 잠시 대기
            time.sleep(0.5)

        # 3단계: 결과 저장
        print(f"\n" + "=" * 50)
        print("수집 완료!")
        print("=" * 50)
        print(f"성공: {len(seeds) - failed_count}개")
        print(f"실패: {failed_count}개")

        scraper.save_results(results, f"dss_vol{volume}_complete")

        # 간단한 통계
        titles_with_content = sum(1 for r in results if r.get('title'))
        abstracts_with_content = sum(1 for r in results if r.get('abstract'))
        keywords_with_content = sum(1 for r in results if r.get('keywords'))

        print(f"\n수집 통계:")
        print(f"  - 제목 수집: {titles_with_content}/{len(results)}")
        print(f"  - 초록 수집: {abstracts_with_content}/{len(results)}")
        print(f"  - 키워드 수집: {keywords_with_content}/{len(results)}")

    except Exception as e:
        print(f"전체 프로세스 오류: {e}")

    finally:
        scraper.close()

if __name__ == "__main__":
    main()

논문 정보 수집 시작: Decision Support Systems Vol.164 (2023)
Crossref에서 Decision Support Systems Vol.164 (2023) 논문 검색 중...
  시도 1/3: 필터 방식
    -> 오류: Request.__init__() got multiple values for argument 'timeout'
  시도 2/3: 쿼리 방식
    -> 오류: Request.__init__() got multiple values for argument 'timeout'
  시도 3/3: 쿼리 방식
    -> 오류: Request.__init__() got multiple values for argument 'timeout'
⚠️  Crossref에서 논문을 찾을 수 없습니다. 대안 방법을 시도합니다...
대안 방법: 알려진 논문 패턴으로 URL 생성 중...
최종 수집: 1개 논문
URL 목록 저장: dss_vol164_urls.csv (1개)

각 논문 페이지에서 상세 정보 추출 중...
--------------------------------------------------
[1/1] 처리 중...
  ✓ Decision Support Systems
    키워드: 0개, 초록: 0자

수집 완료!
성공: 1개
실패: 0개
결과 저장 완료:
  - CSV: dss_vol164_complete.csv
  - JSON: dss_vol164_complete.json
  - 총 1개 논문

수집 통계:
  - 제목 수집: 1/1
  - 초록 수집: 0/1
  - 키워드 수집: 0/1
