In [8]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# 옵션 설정
options = Options()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# 드라이버 실행
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

# 대상 URL
url = 'https://pubsonline.informs.org/doi/abs/10.1287/isre.2021.0997'
driver.get(url)

# 데이터 추출
try:
    title = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="pb-page-content"]/div/main/article/div/div[2]/div[1]/h1'))).text

    # abstract
    try:
        abstract = driver.find_element(By.XPATH, '//*[@id="pb-page-content"]/div/main/article/div/div[2]/div[3]/div/div[1]/div[2]/p').text
    except:
        abstract = "No abstract"

    # authors
    try:
        author_elements = driver.find_elements(By.XPATH, '//*[@id="sb-1"]/div/div/a')
        authors = ", ".join([a.text for a in author_elements])
    except:
        authors = "No author"

    # year
    try:
        date = driver.find_element(By.XPATH, '//*[@id="pb-page-content"]/div/main/article/div/div[2]/div[2]/span[1]/span[2]').text
    except:
        date = "Unknown"

    # keywords
    try:
        keyword_elements = driver.find_elements(By.XPATH, '//*[@id="pane-pcw-details"]/section[2]/div/ul/li')
        keywords = ", ".join([k.text for k in keyword_elements])
    except:
        keywords = "None"

    # 결과 출력
    print(f"title: {title}")
    print(f"authors: {authors}")
    print(f"date: {date}")
    print(f"abstract: {abstract}")
    print(f"keywords: {keywords}")

    # CSV 저장
    with open('informs_article.csv', mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'author', 'year', 'abstract', 'keywords', 'url'])
        writer.writerow([title, authors, date, abstract, keywords, url])
        print("✅ CSV 저장 완료: informs_article.csv")

except Exception as e:
    print(f"❌ 오류 발생: {e}")

driver.quit()

title: FinTech as a Game Changer: Overview of Research Frontiers
authors: , Terrence Hendershott, , , Xiaoquan (Michael) Zhang, , , J. Leon Zhao, , , Zhiqiang (Eric) Zheng, 
date: 1 Mar 2021
abstract: Technologies have spawned finance innovations since the early days of computer applications in businesses, most recently reaching the stage of disruptive innovations, such as mobile payments, cryptocurrencies, and digitization of business assets. This has led to the emerging field called financial technology or simply FinTech. In this editorial review, we first provide an overview on relevant technological, pedagogical, and managerial issues pertaining to FinTech teaching and research, with a focus on market trading, artificial intelligence, and blockchain in finance. And then we introduce the articles appearing in this special section. We hope that our discussions of potential research directions and topics in FinTech will stimulate future research in the fields of information systems an

In [4]:
import csv
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ✅ Undetected ChromeDriver 설정
options = uc.ChromeOptions()
# options.add_argument('--headless')  # ❌ Headless 비활성화 (브라우저 창 띄움)
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-blink-features=AutomationControlled")

driver = uc.Chrome(options=options)
wait = WebDriverWait(driver, 15)

# ✅ Cloudflare 인증을 먼저 수동으로 통과할 논문 URL
sample_url = "https://pubsonline.informs.org/doi/abs/10.1287/isre.2021.0997"
driver.get(sample_url)
print("🛑 Cloudflare 인증이 보이면 수동으로 '사람 인증'을 완료해주세요.")
input("✅ 인증이 완료되면 Enter 키를 눌러주세요...")

# ✅ CSV 파일 준비 # 35-4부터
with open('informs_vol35to35.csv', mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['volume', 'issue', 'title', 'author', 'year', 'abstract', 'keywords', 'url'])

    # 🔁 volume 32~35, issue 1~4 반복
    for volume in range(34, 37):
        for issue in range(1, 5):
            toc_url = f"https://pubsonline.informs.org/toc/isre/{volume}/{issue}"
            print(f"\n📘 TOC 접속: {toc_url}")
            try:
                driver.get(toc_url)
                time.sleep(random.uniform(2, 4))

                # 논문 링크 수집
                links = driver.find_elements(By.CSS_SELECTOR, "h5.issue-item__title a")
                hrefs = [a.get_attribute("href") for a in links if a.get_attribute("href")]
                print(f"🔗 {volume}-{issue} 수집된 논문 수: {len(hrefs)}")

                for url in hrefs:
                    driver.get(url)
                    time.sleep(random.uniform(2, 5))

                    try:
                        title = wait.until(EC.presence_of_element_located(
                            (By.XPATH, '//*[@id="pb-page-content"]/div/main/article/div/div[2]/div[1]/h1'))).text

                        try:
                            abstract = driver.find_element(
                                By.XPATH, '//*[@id="pb-page-content"]/div/main/article/div/div[2]/div[3]/div/div[1]/div[2]/p'
                            ).text
                        except:
                            abstract = "No abstract"

                        try:
                            authors = ", ".join([
                                el.text for el in driver.find_elements(By.XPATH, '//*[@id="sb-1"]/div/div/a')
                            ])
                        except:
                            authors = "No author"

                        try:
                            year = driver.find_element(
                                By.XPATH, '//*[@id="pb-page-content"]/div/main/article/div/div[2]/div[2]/span[1]/span[2]'
                            ).text
                        except:
                            year = "Unknown"

                        try:
                            keywords = ", ".join([
                                el.text for el in driver.find_elements(By.XPATH, '//*[@id="pane-pcw-details"]/section[2]/div/ul/li')
                            ])
                        except:
                            keywords = "None"

                        writer.writerow([volume, issue, title, authors, year, abstract, keywords, url])
                        print(f"✅ 저장 완료: {title}")

                    except Exception as e:
                        print(f"❌ {url} 에서 오류 발생: {e}")

            except Exception as e:
                print(f"❌ TOC 접속 오류 ({toc_url}): {e}")

driver.quit()
print("\n📦 크롤링 완료! → informs_vol34to35.csv 저장됨 ✅")

🛑 Cloudflare 인증이 보이면 수동으로 '사람 인증'을 완료해주세요.

📘 TOC 접속: https://pubsonline.informs.org/toc/isre/34/1
🔗 34-1 수집된 논문 수: 22
✅ 저장 완료: Research Spotlights
✅ 저장 완료: Editorial: Continuing on an Inclusive Path to Scholarly Excellence with Renewed Vigor
✅ 저장 완료: Handling Missing Values in Information Systems Research: A Review of Methods and Assumptions
✅ 저장 완료: Could Gamification Designs Enhance Online Learning Through Personalization? Lessons from a Field Experiment
✅ 저장 완료: Self-Regulation and External Influence: The Relative Efficacy of Mobile Apps and Offline Channels for Personal Weight Management
✅ 저장 완료: Identification of Causal Mechanisms from Randomized Experiments: A Framework for Endogenous Mediation Analysis
✅ 저장 완료: The Path to Hedonic Information System Use Addiction: A Process Model in the Context of Social Networking Sites
✅ 저장 완료: Are You What You Tweet? The Impact of Sentiment on Digital News Consumption and Social Media Sharing
✅ 저장 완료: sDTM: A Supervised Bayesian Deep Topic M

In [6]:
import os
import csv
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 📁 저장 경로 설정
save_path = r"/Users/choihj/PycharmProjects/AI_news/data"
os.makedirs(save_path, exist_ok=True)
output_file = os.path.join(save_path, "informs_isre_vol35to36.csv")

# ✅ 드라이버 설정
options = uc.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-blink-features=AutomationControlled")
driver = uc.Chrome(options=options)
wait = WebDriverWait(driver, 15)

# ✅ 샘플 URL 접속 → 수동 Cloudflare 인증
sample_url = "https://pubsonline.informs.org/doi/abs/10.1287/isre.2021.0997"
driver.get(sample_url)
print("🛑 Cloudflare 인증이 보이면 수동으로 '사람 인증'을 완료해주세요.")
input("✅ 인증이 완료되면 Enter 키를 눌러주세요...")

# ✅ CSV 파일 준비
with open(output_file, mode='w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['volume', 'issue', 'title', 'authors', 'date', 'abstract', 'keywords', 'url'])

    # 🔁 Volume 34~36, Issue 1~4
    for volume in range(35, 37):
        for issue in range(1, 5):
            toc_url = f"https://pubsonline.informs.org/toc/isre/{volume}/{issue}"
            print(f"\n📘 TOC 접속: {toc_url}")

            try:
                driver.get(toc_url)
                time.sleep(random.uniform(2, 4))

                # 논문 링크 수집
                links = driver.find_elements(By.CSS_SELECTOR, "h5.issue-item__title a")
                hrefs = [a.get_attribute("href") for a in links if a.get_attribute("href")]
                print(f"🔗 수집된 논문 수: {len(hrefs)}")

                for url in hrefs:
                    try:
                        driver.get(url)
                        time.sleep(random.uniform(2, 4))

                        # 제목
                        title = wait.until(EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "h1.citation__title"))).text

                        # 저자
                        try:
                            authors = ", ".join([
                                el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a.entryAuthor')
                            ])
                        except:
                            authors = "No author"

                        # 날짜
                        try:
                            date = driver.find_element(By.CSS_SELECTOR, "span.epub-section__date").text
                        except:
                            date = "Unknown"

                        # 초록
                        try:
                            abstract = driver.find_element(
                                By.CSS_SELECTOR, 'div.abstractSection.abstractInFull > p'
                            ).text
                        except:
                            abstract = "No abstract"

                        # 키워드
                        try:
                            keywords = ", ".join([
                                el.text for el in driver.find_elements(
                                    By.CSS_SELECTOR, 'section.article__keyword ul.rlist li a')
                            ])
                        except:
                            keywords = "None"

                        writer.writerow([volume, issue, title, authors, date, abstract, keywords, url])
                        print(f"✅ 저장 완료: {title}")

                    except Exception as e:
                        print(f"❌ 논문 페이지 오류: {url} → {e}")
                        continue

            except Exception as e:
                print(f"❌ TOC 페이지 오류: {toc_url} → {e}")
                continue

# ✅ 마무리
driver.quit()
print(f"\n📦 크롤링 완료! → {output_file} 저장됨 ✅")

🛑 Cloudflare 인증이 보이면 수동으로 '사람 인증'을 완료해주세요.

📘 TOC 접속: https://pubsonline.informs.org/toc/isre/35/1
🔗 수집된 논문 수: 24
✅ 저장 완료: Research Spotlights
✅ 저장 완료: Motion Sensor–Based Fall Prevention for Senior Care: A Hidden Markov Model with Generative Adversarial Network Approach
✅ 저장 완료: Improving Convenience or Saving Face? An Empirical Analysis of the Use of Facial Recognition Payment Technology in Retail
✅ 저장 완료: Business Value of Information Technology Capabilities: An Institutional Governance Perspective
✅ 저장 완료: Impact of Telehealth and Process Virtualization on Healthcare Utilization
✅ 저장 완료: Atrophy in Aging Systems: Evidence, Dynamics, and Antidote
✅ 저장 완료: The Effect of Gender Expectations and Physical Attractiveness on Discussion of Weakness in Online Professional Recommendations
✅ 저장 완료: Effect of Online Professional Network Recommendations on the Likelihood of an Interview: A Field Study
✅ 저장 완료: Smart Testing with Vaccination: A Bandit Algorithm for Active Sampling for Managing C

In [15]:
import pandas as pd

df=pd.read_csv('data/ISR.csv',encoding='cp949')
df['journal'] = 'ISR'

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   volume    420 non-null    int64 
 1   issue     420 non-null    int64 
 2   title     420 non-null    object
 3   authors   368 non-null    object
 4   date      420 non-null    object
 5   abstract  363 non-null    object
 6   keywords  362 non-null    object
 7   url       420 non-null    object
 8   journal   420 non-null    object
dtypes: int64(2), object(7)
memory usage: 29.7+ KB


In [16]:
df

Unnamed: 0,volume,issue,title,authors,date,abstract,keywords,url,journal
0,32,1,Research Spotlights,,1 Mar 2021,,,https://pubsonline.informs.org/doi/abs/10.1287...,ISR
1,32,1,FinTech as a Game Changer: Overview of Researc...,"Terrence Hendershott, Xiaoquan (Michael) Zhang...",1 Mar 2021,Technologies have spawned finance innovations ...,"FinTech, financial service, blockchain, AI",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
2,32,1,Mobile Money and Mobile Technologies: A Struct...,"Yan Dong, Sining Song, Sriram Venkataraman, Yu...",23 Jul 2020,Using a data set on mobile technologies and mo...,"mobile money, mobile technology, demand estima...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
3,32,1,"Fake News, Investor Attention, and Market Reac...","Jonathan Clarke, Hailiang Chen, Ding Du, Yu Je...",23 Jul 2020,Does fake news in financial markets attract mo...,"fake news, investor attention, financial techn...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
4,32,1,Education Crowdfunding and Student Performance...,"Qiang Gao, Mingfeng Lin, D. J. Wu, Qiang Gao, ...",31 Dec 2020,Despite the growing popularity of online publi...,"donation-based crowdfunding, education crowdfu...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
...,...,...,...,...,...,...,...,...,...
415,36,1,Fast Forecasting of Unstable Data Streams for ...,"Yu Jeffrey Hu, Jeroen Rombouts, Ines Wilms, Yu...",30 May 2024,On-demand service platforms face a challenging...,"e-commerce, platform econometrics, streaming d...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
416,36,1,KETCH: A Knowledge-Enhanced Transformer-Based ...,"Dongsong Zhang, Lina Zhou, Jie Tao, Tingshao Z...",31 May 2024,"Suicidal ideation (SI), as a psychiatric emerg...","suicidal ideation detection, social media, dom...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
417,36,1,Monitoring and the Cold Start Problem in Digit...,"Chen Liang, Yili Hong, Bin Gu, Chen Li...",6 Mar 2024,Many online labor platforms employ reputation ...,"cold-start problem, online platforms, monitori...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR
418,36,1,A Nudge to Credible Information as a Counterme...,"Elina H. Hwang, Stephanie Lee, Elina H. Hwang,...",28 Feb 2024,"Fueled by social media, health misinformation ...","misinformation, diffusion, online platform, co...",https://pubsonline.informs.org/doi/abs/10.1287...,ISR


In [17]:
import pandas as pd

# 예시: 날짜가 '1 Mar 2021' 형식일 때
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # 문자열을 datetime으로 변환
df['date'] = df['date'].dt.year  # 연도만 추출

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 360 entries, 1 to 418
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   volume    360 non-null    int64 
 1   issue     360 non-null    int64 
 2   title     360 non-null    object
 3   authors   360 non-null    object
 4   date      360 non-null    int32 
 5   abstract  360 non-null    object
 6   keywords  360 non-null    object
 7   url       360 non-null    object
 8   journal   360 non-null    object
dtypes: int32(1), int64(2), object(6)
memory usage: 26.7+ KB


In [19]:
df = df.drop_duplicates().dropna()

In [22]:
# ✅ 병합
df2=pd.read_csv('data/01_combined_journal.csv')
combined_df = pd.concat([df, df2], ignore_index=True)
# ✅ 필요한 컬럼만 추출
columns_to_keep = ['title', 'date', 'abstract', 'keywords', 'authors', 'affiliations','journal']
combined_df = combined_df[columns_to_keep]
# ✅ 최종 확인
print("📌 최종 shape:", combined_df.shape)
print("📌 컬럼 확인:", combined_df.columns.tolist())

📌 최종 shape: (5662, 7)
📌 컬럼 확인: ['title', 'date', 'abstract', 'keywords', 'authors', 'affiliations', 'journal']


In [24]:
combined_df.to_csv('data/01_combined_journal.csv', index=False)

In [32]:
test=pd.read_csv('data/01_combined_journal.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5653 entries, 0 to 5652
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         5653 non-null   object
 1   date          5653 non-null   int64 
 2   abstract      5653 non-null   object
 3   keywords      5653 non-null   object
 4   authors       5653 non-null   object
 5   affiliations  1508 non-null   object
 6   journal       5653 non-null   object
dtypes: int64(1), object(6)
memory usage: 309.3+ KB


In [29]:
# 연도별 개수 확인 (내림차순 정렬)
test = test[(test['date'] >= 2021) & (test['date'] <= 2025)]
year_counts = test['date'].value_counts().sort_index()
print(year_counts)

date
2021    1222
2022    1222
2023    1210
2024    1275
2025     724
Name: count, dtype: int64


In [30]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5653 entries, 0 to 5661
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         5653 non-null   object
 1   date          5653 non-null   int64 
 2   abstract      5653 non-null   object
 3   keywords      5653 non-null   object
 4   authors       5653 non-null   object
 5   affiliations  1508 non-null   object
 6   journal       5653 non-null   object
dtypes: int64(1), object(6)
memory usage: 353.3+ KB


In [31]:
test.to_csv('data/01_combined_journal.csv', index=False)