In [None]:
import time
import csv
import logging
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException

# ---------------- 로깅 설정 ----------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

# ---------------- 대상 인물 리스트 ----------------
professors = {
    "조형태 교수님": "https://scholar.google.com/citations?hl=ko&user=QsjS7I4AAAAJ",
    "이재원 교수님": "https://scholar.google.co.kr/citations?user=guPkb4cAAAAJ&hl=ko",
    "가성빈 교수님": "https://scholar.google.com/citations?hl=ko&user=YZ5rW_gAAAAJ",
    "최동호": "https://scholar.google.com/citations?hl=ko&user=JHi2ay4AAAAJ",
    "안나현": "https://scholar.google.com/citations?hl=ko&user=5gsoA1EAAAAJ",
    "주종효": "https://scholar.google.co.kr/citations?user=GdFcc0QAAAAJ&hl=ko&oi=sra",
    "김유림": "https://scholar.google.co.kr/citations?user=66OKDcgAAAAJ&hl=ko&oi=sra",
    "윤승관": "https://scholar.google.co.kr/citations?user=HDVnd5YAAAAJ&hl=ko&oi=sra",
    # 권혁원, 강현진, 이세영 등 URL 없는 분들은 제외
}

# ---------------- 브라우저 실행 ----------------
options = Options()
# options.add_argument("--headless")  # 브라우저 창 안 띄움
options.add_argument("--no-sandbox")

service = Service("c:/Users/PSID_PC_20/Desktop/[00]Projects/PSID_ARCHIVE/chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

all_papers = []

# ---------------- 개별 Scholar 페이지 순회 ----------------
for name, url in professors.items():
    logging.info(f"====== {name} ({url}) 크롤링 시작 ======")
    driver.get(url)
    time.sleep(2)

    # "더보기" 버튼 계속 클릭
    while True:
        try:
            more_button = driver.find_element(By.ID, "gsc_bpf_more")
            if more_button.is_enabled():
                driver.execute_script("arguments[0].click();", more_button)
                time.sleep(1.5)
            else:
                break
        except (NoSuchElementException, ElementNotInteractableException):
            break

    # 논문 데이터 추출
    rows = driver.find_elements(By.CSS_SELECTOR, "#gsc_a_t tr.gsc_a_tr")

    for row in tqdm(rows, desc=f"{name} 논문 추출 중"):
        try:
            title_elem = row.find_element(By.CSS_SELECTOR, "a.gsc_a_at")
            title = title_elem.text
            link = title_elem.get_attribute("href")
        except:
            title, link = "", ""

        try:
            authors = row.find_element(By.CSS_SELECTOR, ".gsc_a_at+div").text
        except:
            authors = ""

        try:
            journal = row.find_element(By.CSS_SELECTOR, ".gs_gray+ .gs_gray").text
        except:
            journal = ""

        try:
            year = row.find_element(By.CSS_SELECTOR, ".gsc_a_y").text
        except:
            year = ""

        try:
            cites_elem = row.find_element(By.CSS_SELECTOR, ".gsc_a_c a")
            cites = cites_elem.text if cites_elem.text else "0"
        except:
            cites = "0"

        all_papers.append([name, title, authors, journal, year, cites, link])

logging.info("크롤링 완료")

# ---------------- CSV 저장 ----------------
csv_filename = "scholar_papers_all.csv"
with open(csv_filename, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["Professor", "Title", "Authors", "Journal/Conference", "Year", "Citations", "Link"])
    writer.writerows(all_papers)

logging.info(f"총 {len(all_papers)}개 논문을 {csv_filename}에 저장 완료")

driver.quit()


조형태 교수님 논문 추출 중: 100%|██████████| 154/154 [00:12<00:00, 12.72it/s]
이재원 교수님 논문 추출 중: 100%|██████████| 56/56 [00:03<00:00, 15.30it/s]
가성빈 교수님 논문 추출 중: 100%|██████████| 48/48 [00:03<00:00, 15.64it/s]
최동호 논문 추출 중: 100%|██████████| 40/40 [00:02<00:00, 13.77it/s]
안나현 논문 추출 중: 100%|██████████| 18/18 [00:01<00:00, 14.60it/s]
주종효 논문 추출 중: 100%|██████████| 49/49 [00:03<00:00, 15.27it/s]
김유림 논문 추출 중: 100%|██████████| 26/26 [00:01<00:00, 15.36it/s]
윤승관 논문 추출 중: 100%|██████████| 8/8 [00:00<00:00, 14.92it/s]
2025-09-17 20:29:44,606 [INFO] 크롤링 완료
2025-09-17 20:29:44,611 [INFO] 총 399개 논문을 scholar_papers_all.csv에 저장 완료


In [25]:
import csv
import time
import requests
import logging
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

input_csv = "scholar_papers_all.csv"
output_csv = "scholar_papers_with_abstract.csv"

HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; MyResearchBot/1.0)"}

# ---------------- 1단계: CrossRef에서 DOI 가져오기 ----------------
def get_doi_from_crossref(title):
    url = "https://api.crossref.org/works"
    params = {"query.title": title, "rows": 1}
    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=15)
        if r.status_code == 200:
            data = r.json()
            items = data.get("message", {}).get("items", [])
            if items:
                return items[0].get("DOI", "")
    except Exception as e:
        logging.warning(f"CrossRef DOI 오류: {e}")
    return ""

# ---------------- 2단계: Semantic Scholar DOI로 초록 가져오기 ----------------
def get_abstract_from_semanticscholar(doi):
    url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}"
    params = {"fields": "title,abstract,venue"}
    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=15)
        if r.status_code == 200:
            data = r.json()
            return data.get("venue", ""), data.get("abstract", "")
    except Exception as e:
        logging.warning(f"Semantic Scholar DOI 오류: {e}")
    return "", ""

# ---------------- 실행 ----------------
results = []
with open(input_csv, "r", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)
    for row in tqdm(reader, desc="논문 초록 수집 중"):
        title = row["Title"]

        # 1) DOI 먼저 확보
        doi = get_doi_from_crossref(title)

        # 2) DOI 있으면 Semantic Scholar에서 초록 시도
        journal, abstract = ("", "")
        if doi:
            journal, abstract = get_abstract_from_semanticscholar(doi)

        results.append({
            "Title": title,
            "Journal": journal if journal else row["Journal/Conference"],
            "Abstract": abstract
        })

# ---------------- 저장 ----------------
with open(output_csv, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["Title", "Journal", "Abstract"])
    writer.writeheader()
    writer.writerows(results)

logging.info(f"총 {len(results)}개 논문 초록을 {output_csv}에 저장 완료")


논문 초록 수집 중: 2it [00:40, 20.38s/it]


KeyboardInterrupt: 

In [28]:
import time
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# ---------------- 브라우저 실행 ----------------
options = Options()
# options.add_argument("--headless")
service = Service("chromedriver.exe")   # 경로 수정
driver = webdriver.Chrome(service=service, options=options)

# ---------------- Scholar 접속 ----------------
driver.get("https://scholar.google.com/citations?hl=ko&user=QsjS7I4AAAAJ")
time.sleep(2)

# 첫 번째 논문 클릭
first_paper = driver.find_element(By.CSS_SELECTOR, "tr.gsc_a_tr a.gsc_a_at")
first_paper.click()
time.sleep(2)

# 오른쪽 패널에서 원문 링크 추출
link_elem = driver.find_element(By.CSS_SELECTOR, "#gsc_oci_title a.gsc_oci_title_link")
paper_url = link_elem.get_attribute("href")
logging.info(f"원문 링크: {paper_url}")

# ---------------- 새 탭 열어서 원문 페이지 접속 ----------------
driver.execute_script("window.open('');")
driver.switch_to.window(driver.window_handles[1])
driver.get(paper_url)
time.sleep(5)  # 페이지 로딩 대기 (네트워크 환경에 맞게 조정)

abstract_text = ""

# ---------------- 여러 저널별 Abstract CSS Selector 시도 ----------------
try:
    # ScienceDirect
    elem = driver.find_element(By.CSS_SELECTOR, "div.abstract.author")
    abstract_text = elem.text
except NoSuchElementException:
    try:
        # Springer
        elem = driver.find_element(By.CSS_SELECTOR, "section.Abstract")
        abstract_text = elem.text
    except NoSuchElementException:
        try:
            # Wiley
            elem = driver.find_element(By.CSS_SELECTOR, "div.article-section__content")
            abstract_text = elem.text
        except NoSuchElementException:
            logging.warning("Abstract 영역을 찾을 수 없음")

logging.info(f"추출된 Abstract: {abstract_text[:300]}...")

# ---------------- 종료 ----------------
driver.quit()


2025-09-18 13:20:17,194 [INFO] 원문 링크: https://www.sciencedirect.com/science/article/pii/S0196890421006142
2025-09-18 13:20:23,289 [INFO] 추출된 Abstract: Abstract
The production and application of hydrogen, an environmentally friendly energy source, have been attracting increasing interest of late. Although steam methane reforming (SMR) method is used to produce hydrogen, it is difficult to build a high-fidelity model because the existing equation-or...


In [None]:
import os 
os.path.abspath(os.path.curdir)
strr = 'c:\\Users\\PSID_PC_20\\AppData\\Local\\Programs\\Microsoft VS Code'
strr.replace('\\','/')

'c:/Users/PSID_PC_20/AppData/Local/Programs/Microsoft VS Code'

In [13]:
# os.chdir("c:/Users/PSID_PC_20/Downloads/")
import pandas as pd
df = pd.read_csv("./scholar_papers_all.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: 'scholar_papers.csv'

In [14]:
import pandas as pd
import re

# ----------- 유틸 함수 -----------
def normalize_journal(journal: str) -> str:
    if pd.isna(journal) or str(journal).strip() == "":
        return "Unknown"
    j = re.sub(r"\d.*$", "", str(journal)).strip()  # 숫자부터 뒤 제거
    j = re.sub(r"[-,:.;\u00B7\u2013\u2014\s]+$", "", j).strip()  # 끝의 불필요 문자 제거
    return j if j else "Unknown"

# ----------- CSV 불러오기 -----------
df = pd.read_csv("scholar_papers_all.csv")  # 파일명 바꿔주세요

# ----------- 정규화 컬럼 추가 -----------
df["JournalClean"] = df["Journal/Conference"].apply(normalize_journal)

# ----------- Unknown 항목만 필터링 -----------
unknown_df = df[df["JournalClean"] == "Unknown"]

print(f"총 Unknown 항목 수: {len(unknown_df)}")
print(unknown_df[["Professor", "Title", "Journal/Conference"]].head(20))  # 상위 20개 미리보기

# ----------- Unknown 원본 저널값 유니크 확인 -----------
print("\nUnique 원본 Journal 값:")
print(unknown_df["Journal/Conference"].unique())

총 Unknown 항목 수: 45
    Professor                                              Title  \
99    조형태 교수님  Chemical Property-Guided Neural Networks for N...   
116   조형태 교수님  Quantum Computing Assisted Data-Driven Modelin...   
117   조형태 교수님  Analysis of Correlation between Microbubble an...   
118   조형태 교수님  Data-Driven Adaptive Sparse Identification of ...   
119   조형태 교수님  Probabilistic Prediction Model-Based High-Thro...   
125   조형태 교수님  Improved Long-Short Term Memory Model for Dyna...   
126   조형태 교수님  Cost-Optimal Multi-Effect Mechanical Vapor Rec...   
127   조형태 교수님  Multi-objective Optimization of Process Effici...   
128   조형태 교수님  Guide to COF Adsorbent for Ammonia-based Green...   
129   조형태 교수님  Data-Driven Modeling to Predict the Physical P...   
130   조형태 교수님  Development of Cement Kiln Dust Recovery Proce...   
132   조형태 교수님  A Data-Driven Approach for Modeling and Energy...   
134   조형태 교수님  Development of Two Sequential Kmc Models to De...   
135   조형태 교수님  Data-Driven Pr

In [32]:
import time
import csv
import logging
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException

# ---------------- 로깅 설정 ----------------
logging.basicConfig(level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

# ---------------- 대상 인물 리스트 ----------------
professors = {
    "조형태 교수님": "https://scholar.google.com/citations?hl=ko&user=QsjS7I4AAAAJ",
    "이재원 교수님": "https://scholar.google.co.kr/citations?user=guPkb4cAAAAJ&hl=ko",
    "가성빈 교수님": "https://scholar.google.com/citations?hl=ko&user=YZ5rW_gAAAAJ",
    "최동호": "https://scholar.google.com/citations?hl=ko&user=JHi2ay4AAAAJ",
    "안나현": "https://scholar.google.com/citations?hl=ko&user=5gsoA1EAAAAJ",
    "주종효": "https://scholar.google.co.kr/citations?user=GdFcc0QAAAAJ&hl=ko&oi=sra",
    "김유림": "https://scholar.google.co.kr/citations?user=66OKDcgAAAAJ&hl=ko&oi=sra",
    "윤승관": "https://scholar.google.co.kr/citations?user=HDVnd5YAAAAJ&hl=ko&oi=sra",
}

# ---------------- 브라우저 실행 (논문 리스트만 Selenium으로) ----------------
options = Options()
# options.add_argument("--headless")   # 무창 실행 원하면 주석 해제
service = Service("chromedriver.exe")   # 경로 수정 필요
driver = webdriver.Chrome(service=service, options=options)

results = []
base_url = "https://scholar.google.com"

# ---------------- 개별 교수님 순회 ----------------
for prof_name, url in professors.items():
    logging.info(f"====== {prof_name} ({url}) 크롤링 시작 ======")
    driver.get(url)
    time.sleep(2)

    # "더보기" 버튼 계속 눌러서 모든 논문 로딩
    while True:
        try:
            more_button = driver.find_element(By.ID, "gsc_bpf_more")
            if more_button.is_enabled():
                driver.execute_script("arguments[0].click();", more_button)
                time.sleep(1.2)
            else:
                break
        except (NoSuchElementException, ElementNotInteractableException):
            break

    # 논문 리스트 가져오기
    rows = driver.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr")

    for row in tqdm(rows, desc=f"{prof_name} 논문 처리 중"):
        try:
            title_elem = row.find_element(By.CSS_SELECTOR, "a.gsc_a_at")
            title = title_elem.text
            year = row.find_element(By.CSS_SELECTOR, ".gsc_a_y").text

            detail_url = title_elem.get_attribute("href")
            if detail_url.startswith("/"):  # 상대 경로일 경우만 base_url 붙임
                detail_url = base_url + detail_url

            # requests로 citation_for_view 페이지 요청
            r = requests.get(detail_url, headers={"User-Agent":"Mozilla/5.0"}, timeout=15)
            soup = BeautifulSoup(r.text, "html.parser")
            link_tag = soup.select_one("#gsc_oci_title a.gsc_oci_title_link")
            link = link_tag["href"] if link_tag else ""

            results.append([prof_name, title, year, link])

            time.sleep(0.5)  # 요청 속도 제한 (차단 방지)

        except Exception as e:
            logging.warning(f"{prof_name} 논문 처리 실패: {e}")

logging.info("===== 모든 교수님 크롤링 완료 =====")

# ---------------- CSV 저장 ----------------
csv_filename = "scholar_papers_links.csv"
with open(csv_filename, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["Professor", "Title", "Year", "Link"])
    writer.writerows(results)

logging.info(f"총 {len(results)}개 논문 링크를 {csv_filename}에 저장 완료")

driver.quit()


조형태 교수님 논문 처리 중:   2%|▏         | 3/154 [00:05<04:52,  1.94s/it]


KeyboardInterrupt: 

In [34]:
import time
import csv
import logging
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------------- 로깅 설정 ----------------
logging.basicConfig(level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

professors = {
    "조형태 교수님": "https://scholar.google.com/citations?hl=ko&user=QsjS7I4AAAAJ",
    "이재원 교수님": "https://scholar.google.co.kr/citations?user=guPkb4cAAAAJ&hl=ko",
    "가성빈 교수님": "https://scholar.google.com/citations?hl=ko&user=YZ5rW_gAAAAJ",
    "최동호": "https://scholar.google.com/citations?hl=ko&user=JHi2ay4AAAAJ",
    "안나현": "https://scholar.google.com/citations?hl=ko&user=5gsoA1EAAAAJ",
    "주종효": "https://scholar.google.co.kr/citations?user=GdFcc0QAAAAJ&hl=ko&oi=sra",
    "김유림": "https://scholar.google.co.kr/citations?user=66OKDcgAAAAJ&hl=ko&oi=sra",
    "윤승관": "https://scholar.google.co.kr/citations?user=HDVnd5YAAAAJ&hl=ko&oi=sra",
}

# ---------------- 브라우저 실행 (논문 리스트만 Selenium으로) ----------------
options = Options()
# options.add_argument("--headless")   # 무창 실행 원하면 주석 해제
service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

results = []
base_url = "https://scholar.google.com"

# ---------------- 원문 링크 추출 함수 ----------------
def fetch_detail(prof_name, title, year, detail_url):
    try:
        r = requests.get(detail_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
        soup = BeautifulSoup(r.text, "html.parser")
        link_tag = soup.select_one("#gsc_oci_title a.gsc_oci_title_link")
        link = link_tag["href"] if link_tag else ""
        return [prof_name, title, year, link]
    except Exception as e:
        logging.warning(f"{prof_name} 논문 '{title}' 원문 링크 요청 실패: {e}")
        return [prof_name, title, year, ""]

# ---------------- 개별 교수님 순회 ----------------
for prof_name, url in professors.items():
    logging.info(f"====== {prof_name} ({url}) 크롤링 시작 ======")
    driver.get(url)
    time.sleep(2)

    # "더보기" 버튼 계속 눌러서 모든 논문 로딩
    while True:
        try:
            more_button = driver.find_element(By.ID, "gsc_bpf_more")
            if more_button.is_enabled():
                driver.execute_script("arguments[0].click();", more_button)
                time.sleep(1.0)
            else:
                break
        except (NoSuchElementException, ElementNotInteractableException):
            break

    # 논문 리스트 가져오기
    rows = driver.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr")
    tasks = []

    with ThreadPoolExecutor(max_workers=10) as executor:  # 동시 10개 요청
        future_to_title = {}
        for row in rows:
            try:
                title_elem = row.find_element(By.CSS_SELECTOR, "a.gsc_a_at")
                title = title_elem.text
                year = row.find_element(By.CSS_SELECTOR, ".gsc_a_y").text
                detail_url = title_elem.get_attribute("href")
                if detail_url.startswith("/"):
                    detail_url = base_url + detail_url

                future = executor.submit(fetch_detail, prof_name, title, year, detail_url)
                future_to_title[future] = title
            except Exception as e:
                logging.warning(f"{prof_name} 논문 row 처리 실패: {e}")

        for future in tqdm(as_completed(future_to_title), total=len(future_to_title), desc=f"{prof_name} 논문 처리 중"):
            results.append(future.result())

logging.info("===== 모든 교수님 크롤링 완료 =====")

# ---------------- CSV 저장 ----------------
csv_filename = "scholar_papers_links.csv"
with open(csv_filename, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["Professor", "Title", "Year", "Link"])
    writer.writerows(results)

logging.info(f"총 {len(results)}개 논문 링크를 {csv_filename}에 저장 완료")

driver.quit()


조형태 교수님 논문 처리 중: 100%|██████████| 154/154 [00:13<00:00, 11.65it/s]
이재원 교수님 논문 처리 중: 100%|██████████| 56/56 [00:03<00:00, 14.31it/s] 
가성빈 교수님 논문 처리 중: 100%|██████████| 48/48 [00:04<00:00, 11.01it/s]
최동호 논문 처리 중: 100%|██████████| 40/40 [00:01<00:00, 23.04it/s] 
안나현 논문 처리 중: 100%|██████████| 18/18 [00:00<00:00, 19.63it/s]
주종효 논문 처리 중: 100%|██████████| 49/49 [00:02<00:00, 18.71it/s]
김유림 논문 처리 중: 100%|██████████| 26/26 [00:01<00:00, 14.96it/s]
윤승관 논문 처리 중: 100%|██████████| 8/8 [00:00<00:00, 10.17it/s]
2025-09-18 13:26:07,113 [INFO] ===== 모든 교수님 크롤링 완료 =====
2025-09-18 13:26:07,117 [INFO] 총 399개 논문 링크를 scholar_papers_links.csv에 저장 완료


In [37]:
import pandas as pd

# 데이터 불러오기
df_all = pd.read_csv("scholar_papers_all.csv")
df_links = pd.read_csv("scholar_papers_links.csv")

# Title 기준 inner join (같은 논문만 매칭)
df_merged = pd.merge(
    df_links,
    df_all[["Title", "Journal/Conference"]],
    on="Title",
    how="left"
)

# 결과 저장
df_merged.to_csv("scholar_papers_merged.csv", index=False, encoding="utf-8-sig")

print("병합된 데이터:", df_merged.head())

# ---------------- 저널별 그룹핑 ----------------
journal_group = (
    df_merged.groupby("Journal/Conference")
    .size()
    .reset_index(name="Count")
    .sort_values("Count", ascending=False)
)

print("저널별 그룹핑 결과:", journal_group.head())

# CSV로 저장
journal_group.to_csv("journal_grouping.csv", index=False, encoding="utf-8-sig")


병합된 데이터:   Professor                                              Title    Year  \
0   조형태 교수님  Techno‐economic comparison of amine regenerati...  2021.0   
1   조형태 교수님  Multiobjective Optimization of Plastic Waste S...  2022.0   
2   조형태 교수님  Multiobjective Optimization of Plastic Waste S...  2022.0   
3   조형태 교수님  Optimization of wet flue gas desulfurization s...  2021.0   
4   조형태 교수님  Novel waste heat and oil recovery system in th...  2022.0   

                                                Link  \
0  https://scijournals.onlinelibrary.wiley.com/do...   
1  https://pubs.acs.org/doi/abs/10.1021/acssusche...   
2  https://pubs.acs.org/doi/abs/10.1021/acssusche...   
3  https://www.sciencedirect.com/science/article/...   
4  https://onlinelibrary.wiley.com/doi/abs/10.100...   

                                  Journal/Conference  
0     Energy Science & Engineering 9 (12), 2529-2543  
1  ACS Sustainable Chemistry & Engineering 10 (40...  
2  ACS Sustainable Chemistry & Engineering 1

In [43]:
import pandas as pd
import re

df_all = pd.read_csv("scholar_papers_all.csv")

# Journal/Conference 컬럼에서 저널명만 추출
def clean_journal(text):
    if pd.isna(text):
        return ""
    # 숫자, 콤마 이후 제거 (볼륨/페이지 제거)
    cleaned = re.sub(r"\s\d.*$", "", text.strip())
    return cleaned

df_all["Journal_clean"] = df_all["Journal/Conference"].apply(clean_journal)

# 고유 저널 리스트
unique_journals = sorted(df_all["Journal_clean"].dropna().unique().tolist())

print("고유 저널 개수:", len(unique_journals))
print(unique_journals[:20])  # 앞 20개만 보기

# 필요시 CSV로 저장
pd.Series(unique_journals).to_csv("unique_journals.csv", index=False, header=["Journal"])


고유 저널 개수: 111
['', '.', '19th AIChE Annual Meeting', '2015', '2017 AIChE Annual Meeting', '2018 AIChE Annual Meeting', '2018 KIChE Fall Meeting', '2020 Virtual AIChE Annual Meeting', '2021 AIChE Annual Meeting', '2022 AIChE Annual Meeting', '2022 Spring Meeting &', '2023 AIChE Annual Meeting', '2023 IEEE', '2025 AIChE Annual Meeting', '8th International Symposium on Design, Operation and Control of Chemical …', 'ACS Sustainable Chemistry & Engineering', 'ACS omega', 'AIChE Annual Meeting', 'Advanced Science', 'Algal Research']


In [None]:
import re

filtered_journals = [
    x for x in unique_journals
    if len(x) > 5
    and "Meeting" not in x
    and "Conference" not in x
    and not re.search(r"[가-힣]", x)   # 한글 포함된 경우 제외
]

print("필터링 후 저널 개수:", len(filtered_journals))
print(filtered_journals[:20])  # 앞 20개만 확인
filtered_journals['2023 IEEE',
 '8th International Symposium on Design, Operation and Control of Chemical …',
 'ACS Sustainable Chemistry & Engineering',
 'ACS omega',
 'Advanced Science',
 'Algal Research',
 'Applied Catalysis B: Environmental',
 'Applied Chemistry for Engineering',
 'Applied Crystallography',
 'Applied Energy',
 'Applied Soft Computing',
 'Applied Thermal Engineering',
 'Aquaculture International',
 'Available at SSRN',
 'Bioresource Technology',
 'Catalysis Today',
 'Cell Reports Physical Science',
 'ChemSusChem,',
 'Chemical Engineering Journal',
 'Chemical Engineering Transactions',
 'Chemical Engineering Transactions,',
 'Chemosphere',
 'Clean Technology',
 'Computer Aided Chemical Engineering',
 'Computer Aided Chemical Engineering,',
...
 'Science of The Total Environment',
 'Science of the Total Environment',
 'Sustainable Cities and Society',
 'US Patent',
 'Vsa Processes for Ammonia-Based Green Hydrogen Separation Via Multiscale Hts']
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

필터링 후 저널 개수: 81
['2023 IEEE', '8th International Symposium on Design, Operation and Control of Chemical …', 'ACS Sustainable Chemistry & Engineering', 'ACS omega', 'Advanced Science', 'Algal Research', 'Applied Catalysis B: Environmental', 'Applied Chemistry for Engineering', 'Applied Crystallography', 'Applied Energy', 'Applied Soft Computing', 'Applied Thermal Engineering', 'Aquaculture International', 'Available at SSRN', 'Bioresource Technology', 'Catalysis Today', 'Cell Reports Physical Science', 'ChemSusChem,', 'Chemical Engineering Journal', 'Chemical Engineering Transactions']


['2023 IEEE',
 '8th International Symposium on Design, Operation and Control of Chemical …',
 'ACS Sustainable Chemistry & Engineering',
 'ACS omega',
 'Advanced Science',
 'Algal Research',
 'Applied Catalysis B: Environmental',
 'Applied Chemistry for Engineering',
 'Applied Crystallography',
 'Applied Energy',
 'Applied Soft Computing',
 'Applied Thermal Engineering',
 'Aquaculture International',
 'Available at SSRN',
 'Bioresource Technology',
 'Catalysis Today',
 'Cell Reports Physical Science',
 'ChemSusChem,',
 'Chemical Engineering Journal',
 'Chemical Engineering Transactions',
 'Chemical Engineering Transactions,',
 'Chemosphere',
 'Clean Technology',
 'Computer Aided Chemical Engineering',
 'Computer Aided Chemical Engineering,',
 'Computer Physics Communications',
 'Computer aided chemical engineering',
 'Computers & Chemical Engineering',
 'Computers in Industry',
 'DYCOPS-CAB',
 'Data in brief',
 'Desalination',
 'ENGINEERING WITH COMPUTERS',
 'Energies',
 'Energy',
 'En

In [51]:
import requests
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

logging.basicConfig(level=logging.INFO)

HEADERS = {"User-Agent": "Mozilla/5.0"}
def fetch_papers_from_semanticscholar(journal, limit=100):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        "query": f'journal:"{journal}"',  # 저널 필드 지정
        "limit": limit,
        "fields": "title,abstract"
    }
    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=30)
        if r.status_code == 200:
            data = r.json()
            texts = []
            for item in data.get("data", []):
                title = item.get("title", "")
                abstract = item.get("abstract", "")
                if title or abstract:
                    texts.append(f"{title} {abstract}")
            return texts
    except Exception as e:
        logging.warning(f"Semantic Scholar API 오류 ({journal}): {e}")
    return []


def extract_keywords_tfidf(texts, topn=10):
    """TF-IDF로 키워드 상위 topn 추출"""
    if not texts:
        return []
    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=5000,
        ngram_range=(1,2)  # unigram + bigram
    )
    X = vectorizer.fit_transform(texts)
    scores = X.toarray().sum(axis=0)
    indices = scores.argsort()[::-1][:topn]
    features = vectorizer.get_feature_names_out()
    return [features[i] for i in indices]

# ---------------- 실행 예시 ----------------
journals = [
    "Chemical Engineering Journal",
    "Energy Conversion and Management",
    "Journal of Cleaner Production"
]

results = {}
for j in journals:
    logging.info(f"==== {j} 논문 수집 중 ====")
    texts = fetch_papers_from_semanticscholar(j, limit=100)
    keywords = extract_keywords_tfidf(texts, topn=10)
    results[j] = keywords

# DataFrame 변환
df_keywords = pd.DataFrame.from_dict(results, orient="index").reset_index()
df_keywords.columns = ["Journal"] + [f"Keyword_{i}" for i in range(1, df_keywords.shape[1])]
print(df_keywords)

2025-09-18 13:41:22,080 [INFO] ==== Chemical Engineering Journal 논문 수집 중 ====
2025-09-18 13:41:44,127 [INFO] ==== Energy Conversion and Management 논문 수집 중 ====
2025-09-18 13:41:44,725 [INFO] ==== Journal of Cleaner Production 논문 수집 중 ====


                            Journal
0      Chemical Engineering Journal
1  Energy Conversion and Management
2     Journal of Cleaner Production


In [49]:
filtered_journals

['2023 IEEE',
 '8th International Symposium on Design, Operation and Control of Chemical …',
 'ACS Sustainable Chemistry & Engineering',
 'ACS omega',
 'Advanced Science',
 'Algal Research',
 'Applied Catalysis B: Environmental',
 'Applied Chemistry for Engineering',
 'Applied Crystallography',
 'Applied Energy',
 'Applied Soft Computing',
 'Applied Thermal Engineering',
 'Aquaculture International',
 'Available at SSRN',
 'Bioresource Technology',
 'Catalysis Today',
 'Cell Reports Physical Science',
 'ChemSusChem,',
 'Chemical Engineering Journal',
 'Chemical Engineering Transactions',
 'Chemical Engineering Transactions,',
 'Chemosphere',
 'Clean Technology',
 'Computer Aided Chemical Engineering',
 'Computer Aided Chemical Engineering,',
 'Computer Physics Communications',
 'Computer aided chemical engineering',
 'Computers & Chemical Engineering',
 'Computers in Industry',
 'DYCOPS-CAB',
 'Data in brief',
 'Desalination',
 'ENGINEERING WITH COMPUTERS',
 'Energies',
 'Energy',
 'En