# 1. Academia

## 1.1. 데이터 병합

In [48]:
import pandas as pd
from pandas import value_counts

# 파일 경로
files = [
    "DSS/dss_vol164to170.csv",
    "DSS/dss_vol170to180.csv",
    "DSS/dss_vol180to190.csv",
    "DSS/dss_vol183to190.csv",
    "DSS/dss_vol186to190.csv",
    "DSS/dss_vol189to197.csv"
]

# 모든 파일 읽어서 합치기
dfs = [pd.read_csv(f) for f in files]
merged_df = pd.concat(dfs, ignore_index=True)

# 중복 제거 (같은 논문 여러 번 수집된 경우)
merged_df = merged_df.drop_duplicates()

# CSV로 저장
merged_df.to_csv("dss_merged.csv", index=False)

print("CSV 병합 완료: dss_merged.csv")

CSV 병합 완료: dss_merged.csv


In [50]:
merged_df.to_csv('DSS/DSS.csv', index=False, encoding="utf-8-sig")

In [52]:
import pandas as pd
import glob
import os

# CSV 파일이 들어있는 폴더 경로
folder_path = "Crawler/IAM"

# 해당 폴더 안의 모든 csv 파일 불러오기
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

# 파일들을 DataFrame으로 읽어서 합치기
dfs = [pd.read_csv(f) for f in all_files]
merged_df = pd.concat(dfs, ignore_index=True)

# 중복 제거 (같은 논문 여러 번 수집된 경우)
merged_df = merged_df.drop_duplicates()

# 저장
output_file = os.path.join(folder_path, "IAM.csv")
merged_df.to_csv(output_file, index=False)

print("CSV 병합 완료:", output_file)

CSV 병합 완료: IAM/IAM.csv


In [53]:
import pandas as pd
import glob
import os

# CSV 폴더 경로
folder_path = "Crawler/Academia"

# 모든 csv 파일 읽기
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

# 저널 키워드별 분류
journals = {
    "isre": [f for f in all_files if "isre" in os.path.basename(f).lower()],
    "misq": [f for f in all_files if "misq" in os.path.basename(f).lower()],
    "dss":  [f for f in all_files if "dss"  in os.path.basename(f).lower()],
    "iam":  [f for f in all_files if "iam"  in os.path.basename(f).lower()],
}

# 저널별 병합
for journal, files in journals.items():
    if not files:
        continue
    dfs = [pd.read_csv(f) for f in files]
    merged = pd.concat(dfs, ignore_index=True).drop_duplicates()
    output_file = os.path.join(folder_path, f"{journal.upper()}.csv")
    merged.to_csv(output_file, index=False)
    print(f"{journal.upper()} 병합 완료: {output_file}")

ISRE 병합 완료: Academia/ISRE.csv
MISQ 병합 완료: Academia/MISQ.csv
DSS 병합 완료: Academia/DSS.csv
IAM 병합 완료: Academia/IAM.csv


## 1.2. 데이터 전처리

In [18]:
import pandas as pd
import glob
import os

# 1. 폴더 및 파일 경로 설정
folder_path = 'Data/Academia'  # CSV 파일들이 있는 폴더
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

df_list = []

for file in csv_files:
    # 파일명에서 저널명 추출 (예: 'MISQ.csv' → 'MISQ')
    journal_name = os.path.splitext(os.path.basename(file))[0].upper()

    # 파일 불러오기
    df = pd.read_csv(file)

    # journal 컬럼은 affiliations 대체용으로만 사용 (최종 저장에는 포함 X)
    df['journal'] = journal_name

    # affiliations 컬럼이 없으면 새로 생성
    if 'affiliations' not in df.columns:
        df['affiliations'] = journal_name
    else:
        df['affiliations'] = df['affiliations'].fillna(journal_name)

    # year 추출 (내부 전처리용)
    if journal_name == 'MISQ':
        def extract_year_from_doi(doi):
            if pd.isna(doi): return None
            doi_lower = doi.lower()
            if 'vol47' in doi_lower: return 2023
            if 'vol48' in doi_lower: return 2024
            if 'vol49' in doi_lower: return 2025
            return None
        df['date'] = df['url'].apply(extract_year_from_doi)
    else:
        df['date'] = pd.to_datetime(df['date'], errors='coerce').dt.year

    df_list.append(df)

# 2. 모든 데이터 통합
merged_df = pd.concat(df_list, ignore_index=True)

# 3. 중복 제거 (title + abstract 기준)
merged_df.drop_duplicates(subset=['title', 'abstract'], inplace=True)

# 4. Null 제거 (title, abstract)
merged_df.dropna(subset=['title', 'abstract'], inplace=True)

# 5. 최종 컬럼 순서 지정 (keyword 또는 affiliations이 없는 경우 대비)
final_cols = ['title', 'abstract', 'keywords', 'date', 'affiliations']
for col in final_cols:
    if col not in merged_df.columns:
        merged_df[col] = None  # 없으면 빈 컬럼 생성

final_df = merged_df[final_cols]  # 순서 맞춰서 정렬

# 6. 저장
final_df.to_csv('Data/Academia/merged_journals_preprocessed.csv', index=False)

# 7. 확인용 출력
print("✅ 최종 데이터 수:", final_df.shape)
print("✅ 결측치 확인:\n", final_df.isnull().sum())

✅ 최종 데이터 수: (4032, 5)
✅ 결측치 확인:
 title            0
abstract         0
keywords        47
date             0
affiliations     0
dtype: int64


In [19]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4032 entries, 1 to 4373
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         4032 non-null   object
 1   abstract      4032 non-null   object
 2   keywords      3985 non-null   object
 3   date          4032 non-null   int64 
 4   affiliations  4032 non-null   object
dtypes: int64(1), object(4)
memory usage: 189.0+ KB


In [20]:
print(final_df.value_counts(subset=['affiliations'],sort=False).sort_index())

affiliations
DSS              360
HICSS           2097
IAM              270
ICIS             822
ISR              268
MISQ             215
Name: count, dtype: int64


In [None]:
import pandas as pd
df = pd.read_csv('Data/Academia/03_journal_2023_2025.csv')
df.info()

In [None]:
print(df.value_counts(subset=['affiliations'],sort=False).sort_index())

## 1.3. ISR date 추가 전처리

In [7]:
import pandas as pd

isr=pd.read_csv('Data/Academia/ISR.csv')
isr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   volume    264 non-null    int64 
 1   issue     264 non-null    int64 
 2   title     264 non-null    object
 3   authors   235 non-null    object
 4   date      264 non-null    object
 5   abstract  232 non-null    object
 6   keywords  231 non-null    object
 7   url       264 non-null    object
dtypes: int64(2), object(6)
memory usage: 16.6+ KB


In [8]:
isr.head()

Unnamed: 0,volume,issue,title,authors,date,abstract,keywords,url
0,34,3,Research Spotlights,,28 Sep 2023,,,https://pubsonline.informs.org/doi/abs/10.1287...
1,34,3,Personalized Ranking at a Mobile App Distribut...,"Shengjun Mao, Sanjeev Dewan, Yi-Jen (Ian) Ho, ...",12 Aug 2022,The ease of customer data collection has enabl...,"mobile, ranking, app, platform revenue, hierar...",https://pubsonline.informs.org/doi/abs/10.1287...
2,34,3,Bystanders Join in Cyberbullying on Social Net...,"Tommy K. H. Chan, Christy M. K. Cheung, Izak B...",18 Aug 2022,Cyberbullying on social networking sites (SNSs...,"online harms, cyberbullying, social networking...",https://pubsonline.informs.org/doi/abs/10.1287...
3,34,3,Direct and Indirect Spillovers from Content Pr...,"Keran Zhao, Yingda Lu, Yuheng Hu, Yili Hong, K...",22 Aug 2022,Content providers in online social media platf...,"livestreaming, content switching, viewer behav...",https://pubsonline.informs.org/doi/abs/10.1287...
4,34,3,A Bitter Pill to Swallow? The Consequences of ...,"Chen Chen, Dylan Walker, Chen Chen, Dylan Walk...",24 Aug 2022,Online health question-and-answer (Q&A) platfo...,"online healthcare, patient evaluation, care av...",https://pubsonline.informs.org/doi/abs/10.1287...


In [9]:
merged_df = pd.read_csv('Data/Academia/merged_journals_preprocessed.csv')

In [10]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1056 non-null   object 
 1   abstract      1056 non-null   object 
 2   keyword       0 non-null      float64
 3   date          824 non-null    float64
 4   affiliations  1056 non-null   object 
dtypes: float64(2), object(3)
memory usage: 41.4+ KB


In [11]:
merged_df

Unnamed: 0,title,abstract,keyword,date,affiliations
0,Know Your Firm: Managing Social Media Engageme...,We examine the impact of firm social media eng...,,2023.0,MISQ
1,It Depends On When You Search,Existing studies have found that online search...,,2023.0,MISQ
2,Cyberslacking in the Workplace: Antecedents an...,Employees’ nonwork use of information technolo...,,2023.0,MISQ
3,Where is IT in Information Security? The Inter...,Data breaches can severely damage a firm’s rep...,,2023.0,MISQ
4,Special Issue Introduction: Building Digital R...,Major shocks such as the COVID-19 pandemic cre...,,2023.0,MISQ
...,...,...,...,...,...
1051,Stress from Digital Work: Toward a Unified Vie...,There are many models with various sets of hin...,,,ISR
1052,Dynamics of Shared Security in the Cloud,Cloud services exist under a shared security e...,,,ISR
1053,Beyond Risk: A Measure of Distribution Uncerta...,"Uncertainty, particularly distribution uncerta...",,,ISR
1054,Unveiling the Cost of Free: How an Ad-Sponsore...,The selection of a business model significantl...,,,ISR


In [12]:
import pandas as pd
import re
from datetime import datetime

# --- 입력 ---
merged = pd.read_csv("Data/Academia/merged_journals_preprocessed.csv")
isr = pd.read_csv("Data/Academia/ISR.csv")

# --- 유틸: 날짜 → 연도 ---
def to_year(x):
    if pd.isna(x): return pd.NA
    s = str(x).strip()
    m = re.search(r"(19|20)\d{2}", s)
    if m: return int(m.group())
    try:
        return pd.to_datetime(s, errors="raise").year
    except:
        return pd.NA

# --- 정규화 키 (title) ---
def norm(t):
    return re.sub(r"\s+", " ", str(t)).strip().lower()

merged["_k"] = merged["title"].map(norm)
isr["_k"] = isr["title"].map(norm)

# --- ISR 측 연도 맵 (+1 해주기) ---
isr_year_map = (isr.drop_duplicates("_k")[["_k","date"]]
                  .assign(year=lambda d: d["date"].map(to_year) + 1)  # ✅ +1
                  .set_index("_k")["year"])

# --- 채우기 ---
mask = (merged["affiliations"].eq("ISR")) & (merged["date"].isna())
merged.loc[mask, "date"] = merged.loc[mask, "_k"].map(isr_year_map)

# --- 숫자형 변환 ---
merged["date"] = pd.to_numeric(merged["date"], errors="coerce").astype("Int64")

# --- 저장 ---
merged.drop(columns=["_k"]).to_csv("Data/Academia/07_journal.csv", index=False)
print("Done. Saved t")

Done. Saved t


# 2. Industry

## 2.1. 데이터 병합

In [2]:
import pandas as pd

df=pd.read_csv('Data/Industry/05_industry_2023_2025.csv')
news=pd.read_csv('Data/Industry/nyt.csv')

print(df.info())
print(news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11672 entries, 0 to 11671
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         11672 non-null  object 
 1   content       11672 non-null  object 
 2   date          11672 non-null  float64
 3   affiliations  11672 non-null  object 
 4   keywords      6252 non-null   object 
dtypes: float64(1), object(4)
memory usage: 456.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    164 non-null    object
 1   link     164 non-null    object
 2   date     158 non-null    object
 3   media    158 non-null    object
 4   content  158 non-null    object
dtypes: object(5)
memory usage: 6.5+ KB
None


In [3]:
df

Unnamed: 0,title,content,date,affiliations,keywords
0,ArtificialIntelligence(A Special Report) --- H...,The current generation of college students is ...,2024.0,Wall Street Journal,
1,ArtificialIntelligence(A Special Report) --- T...,ChatGPT is barely two years old. And yet it's ...,2024.0,Wall Street Journal,
2,ArtificialIntelligence(A Special Report) --- F...,The race for AI dominance launched a stampede ...,2025.0,Wall Street Journal,
3,Crunchbase UsesArtificialIntelligenceTo Predic...,"Crunchbase, the firm best known for its startu...",2025.0,Wall Street Journal,
4,On the Clock: Bosses' Mental Fitness Set for A...,Bosses already live in fear that a verbal miss...,2024.0,Wall Street Journal,
...,...,...,...,...,...
11667,Imposter used AI to pose as Marco Rubio and co...,The US State Department said it is investigati...,2025.0,BBC,
11668,How King Charles will help rebuild the shaken ...,Listen to Paul read this article\nFew scenes c...,2025.0,BBC,
11669,Linda Yaccarino departs as boss of Musk's X,"Linda Yaccarino, the boss of Elon Musk's socia...",2025.0,BBC,
11670,Musk's Grok chatbot praises Hitler and insults...,Elon Musk's artificial intelligence start-up x...,2025.0,BBC,


In [4]:
news

Unnamed: 0,title,link,date,media,content
0,Bringing Art Back to Life WithArtificialIntell...,https://www.proquest.com/docview/3242402362/D2...,23 Aug 2025,New York Times,Alex Kachkine spends his days working on micro...
1,Trump Plans to Give AI Developers a Free Hand,https://www.proquest.com/docview/3232620020/D2...,. 24 July 2025.,New York Times,WASHINGTON — President Donald Trump said Wedne...
2,Trump Administration Plans to Give AI Develope...,https://www.proquest.com/docview/3232545176/D2...,. 23 July 2025.,New York Times,WASHINGTON — The Trump administration said Wed...
3,A.I. Changes Video Games And Alters An Industr...,https://www.proquest.com/docview/3234093762/D2...,. 29 July 2025: C.1.,New York Times,Game designers have used artificial intelligen...
4,Their Water Taps Ran Dry When Meta Built Next ...,https://www.proquest.com/docview/3231529523/D2...,. 19 July 2025.,New York Times,"NEWTON COUNTY, Ga. — After Meta broke ground o..."
...,...,...,...,...,...
159,They’re Stuffed Animals. They’re Also AI Chatb...,https://www.proquest.com/docview/3241025239/D2...,19 Aug 2025,New York Times,Curio is a company that describes itself as “a...
160,A.I.-Fueled Smear Attack Highlights Argentine ...,https://www.proquest.com/docview/3226481397/D2...,. 03 July 2025: A.6.,New York Times,Journalists face an increasing number of attac...
161,"If A.I. Outwits Phones, What's Next?: [Busines...",https://www.proquest.com/docview/3248171135/D2...,09 Sep 2025,New York Times,As Apple prepares to release new iPhones this ...
162,The Doctors Are Real. The Quack Cures Are A.I....,https://www.proquest.com/docview/3247571825/D2...,07 Sep 2025,New York Times,"Dr. Robert H. Lustig is an endocrinologist, a ..."


In [6]:
import os, re
import pandas as pd
from datetime import datetime, timedelta

BASE = "/home/dslab/choi/Journal/Data/Industry"

# === 날짜 → 연도만 남기기 ===
def to_year(s):
    if pd.isna(s):
        return None
    s = str(s).strip()
    now = datetime.today()

    # 1) 상대적 날짜 (X days ago / Xh ago)
    m = re.match(r"(\d+)\s*days?\s*ago", s.lower())
    if m:
        return (now - timedelta(days=int(m.group(1)))).year
    if re.match(r"\d+\s*h\s*ago", s.lower()):
        return now.year

    # 2) ISO 8601 with timezone
    try:
        return datetime.fromisoformat(s).year
    except:
        pass

    # 3) 흔한 날짜 포맷들
    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(s, fmt).year
        except:
            pass

    # 4) 문자열 중 "31 July 2025" 같은 패턴 추출
    m = re.search(r"\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4}", s)
    if m:
        for fmt in ("%d %B %Y", "%d %b %Y"):
            try:
                return datetime.strptime(m.group(0), fmt).year
            except:
                pass

    # 5) YYYY 단독
    if re.fullmatch(r"\d{4}", s):
        return int(s)

    return None

# === URL에서 연도 추출 ===
def year_from_url(u):
    if pd.isna(u):
        return None
    u = str(u)
    # case: /YYYY/MM/DD/
    m = re.search(r"/((?:19|20)\d{2})/(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12]\d|3[01])/", u)
    if m:
        return int(m.group(1))
    # case: /YYYY/
    m = re.search(r"/((?:19|20)\d{2})/", u)
    if m:
        return int(m.group(1))
    return None

# === affiliations 보정 ===
def ensure_affiliations(df, source_name):
    alias = next((c for c in ["affiliations","media","outlet","source","publisher"] if c in df.columns), None)
    if alias and alias != "affiliations":
        df = df.rename(columns={alias: "affiliations"})
    if "affiliations" not in df.columns:
        df["affiliations"] = source_name
    else:
        mask = df["affiliations"].isna() | (df["affiliations"].astype(str).str.strip() == "")
        df.loc[mask, "affiliations"] = source_name
    return df

# === 파일명 유연 매칭 (대소문자/부분일치) ===
def resolve_path(base, name):
    cand = os.path.join(base, name)
    if os.path.exists(cand):
        return cand
    low = name.lower()
    for f in os.listdir(base):
        if f.lower() == low:  # 대소문자 차이
            return os.path.join(base, f)
    stem = low.replace(".csv","")
    for f in os.listdir(base):
        if f.lower().endswith(".csv") and (stem in f.lower()):
            return os.path.join(base, f)
    return None

# === 처리할 파일 목록 ===
files = {
    "bbc.csv":"BBC",
    "cnn.csv":"CNN",
    "guardian.csv":"The Guardian",
    "nyt.csv":"New York Times",
    "techcrunch.csv":"TechCrunch",
    "wallstreet.csv":"Wall Street Journal",
    "TheVerge.csv":"The Verge",
}

# === 메인 루프 ===
for name, src in files.items():
    in_path = resolve_path(BASE, name)
    if not in_path:
        print(f"⚠️ 찾지 못함: {name}  (폴더: {BASE})")
        continue

    df = pd.read_csv(in_path)

    # date 컬럼 처리
    if "date" in df.columns:
        df["date"] = df["date"].apply(to_year)
    else:
        df["date"] = None

    # CNN 전용: url에서 연도 보충
    if src == "CNN" and "url" in df.columns:
        mask = df["date"].isna() | (df["date"].astype(str).str.strip() == "")
        df.loc[mask, "date"] = df.loc[mask, "url"].apply(year_from_url)

    # affiliations 보정
    df = ensure_affiliations(df, src)

    # 저장
    out_path = os.path.join(BASE, os.path.splitext(os.path.basename(in_path))[0] + "_fixed.csv")
    df.to_csv(out_path, index=False)
    print(f"✅ 처리 완료: {os.path.basename(in_path)} → {os.path.basename(out_path)}")

✅ 처리 완료: bbc.csv → bbc_fixed.csv
✅ 처리 완료: cnn.csv → cnn_fixed.csv
✅ 처리 완료: guardian.csv → guardian_fixed.csv
✅ 처리 완료: nyt.csv → nyt_fixed.csv
✅ 처리 완료: techcrunch.csv → techcrunch_fixed.csv
✅ 처리 완료: wallstreet.csv → wallstreet_fixed.csv
✅ 처리 완료: TheVerge.csv → TheVerge_fixed.csv


In [7]:
import os
import pandas as pd

BASE = "/home/dslab/choi/Journal/Data/Industry"

# ✅ fixed 파일 목록 확인
fixed_files = [f for f in os.listdir(BASE) if f.endswith("_fixed.csv")]
print("📂 변환된 파일 목록:", fixed_files)

# ✅ 각 파일 샘플 확인
for f in fixed_files:
    path = os.path.join(BASE, f)
    try:
        df = pd.read_csv(path)
        print(f"\n=== {f} ===")
        print(f"행(row) 수: {len(df)}, 컬럼: {list(df.columns)}")
        print(df.head(3))   # 상위 3행 미리보기
        print("-"*50)
    except Exception as e:
        print(f"❌ {f} 읽기 오류:", e)

📂 변환된 파일 목록: ['guardian_fixed.csv', 'TheVerge_fixed.csv', 'techcrunch_fixed.csv', 'nyt_fixed.csv', 'cnn_fixed.csv', 'wallstreet_fixed.csv', 'bbc_fixed.csv']

=== guardian_fixed.csv ===
행(row) 수: 396, 컬럼: ['title', 'date', 'content', 'url', 'affiliations']
                                               title    date  \
0  Chatbot site depicting child sexual abuse imag...  2025.0   
1  Labour cosies up to US tech firms with little ...  2025.0   
2  British AI startup beats humans in internation...  2025.0   

                                             content  \
0  A chatbot site offering explicit scenarios wit...   
1  Jensen Huang, the boss of the chipmakerNvidia,...   
2  An artificial intelligence system has beaten s...   

                                                 url  affiliations  
0  https://www.theguardian.com/technology/2025/se...  The Guardian  
1  https://www.theguardian.com/business/2025/sep/...  The Guardian  
2  https://www.theguardian.com/technology/2025/se...  T

In [None]:
import pandas as pd

cnn=pd.read_csv('/home/dslab/choi/Journal/Data/Industry/cnn_fixed.csv')
cnn=cnn.rename(columns={'article_name':'title'})
cnn.info()

# 3. 03이랑 07 journal 합치기 및 중복 제거

In [3]:
import pandas as pd

df2=pd.read_csv('home/dslab/choi/WITS/전처리완료된데이터/03_journal_2023_2025.csv')
journal=pd.read_csv('/home/dslab/choi/Journal/Data/07_journal.csv')
df2.info()
journal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3209 entries, 0 to 3208
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         3209 non-null   object
 1   date          3209 non-null   int64 
 2   abstract      3209 non-null   object
 3   keywords      3209 non-null   object
 4   authors       3209 non-null   object
 5   affiliations  3209 non-null   object
dtypes: int64(1), object(5)
memory usage: 150.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1056 non-null   object 
 1   abstract      1056 non-null   object 
 2   keyword       0 non-null      float64
 3   date          1056 non-null   int64  
 4   affiliations  1056 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 41.4+ KB


In [4]:
final_df=pd.concat([df2,journal])
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4265 entries, 0 to 1055
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         4265 non-null   object 
 1   date          4265 non-null   int64  
 2   abstract      4265 non-null   object 
 3   keywords      3209 non-null   object 
 4   authors       3209 non-null   object 
 5   affiliations  4265 non-null   object 
 6   keyword       0 non-null      float64
dtypes: float64(1), int64(1), object(5)
memory usage: 266.6+ KB


In [5]:
print(final_df.value_counts(subset=['affiliations']))
print(final_df.value_counts(subset=['date']))

affiliations
HICSS           2098
ICIS             822
ISR              381
DSS              360
MISQ             334
IAM              270
Name: count, dtype: int64
date
2024    1714
2023    1520
2025    1030
2026       1
Name: count, dtype: int64


In [6]:
final_df.drop(columns=['keyword'], inplace=True)

In [7]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4265 entries, 0 to 1055
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         4265 non-null   object
 1   date          4265 non-null   int64 
 2   abstract      4265 non-null   object
 3   keywords      3209 non-null   object
 4   authors       3209 non-null   object
 5   affiliations  4265 non-null   object
dtypes: int64(1), object(5)
memory usage: 233.2+ KB


In [9]:
import pandas as pd

# ✅ 중복 후보 찾기 (title+content 모두 동일)
dupes = final_df[final_df.duplicated(subset=["title", "abstract"], keep=False)] \
            .sort_values(by=["title", "abstract"])

print("🔎 중복된 행 목록:")
print(dupes)

# ✅ 실제로 제거 (첫 번째만 남기고 삭제)
df_dedup = final_df.drop_duplicates(subset=["title", "abstract"], keep="first").reset_index(drop=True)

print(df_dedup)
# 결과 저장 (선택)
# dupes.to_csv("Data/08_Academia.csv", index=False)   # 제거될 가능성이 있는 행 저장
df_dedup.to_csv("/home/dslab/choi/Journal/Data/08_Academia.csv", index=False)     # 최종 중복 제거된 데이터 저장
#
# print(f"총 {len(dupes)}개의 중복 행 발견, 제거 후 남은 행 수: {len(df_dedup)}")

🔎 중복된 행 목록:
                                                  title  date  \
13    A Complex Adaptive Systems Perspective of Soft...  2023   
866   A Complex Adaptive Systems Perspective of Soft...  2024   
45    A Computational Framework for Understanding Fi...  2023   
938   A Computational Framework for Understanding Fi...  2024   
62    A Design Theory for Transparency of Informatio...  2023   
...                                                 ...   ...   
948   When the Clock Strikes: A Multimethod Investig...  2024   
12    Which Enemy to Dance with? A New Role of Softw...  2023   
865   Which Enemy to Dance with? A New Role of Softw...  2024   
1270  iRepair or I Repair? A Dialectical Process Ana...  2024   
85    iRepair or I Repair? A Dialectical Process Ana...  2024   

                                               abstract  \
13    Software is instrumental to the accelerated pa...   
866   Software is instrumental to the accelerated pa...   
45    Large firms are leaders 

In [10]:
df_dedup

Unnamed: 0,title,date,abstract,keywords,authors,affiliations
0,Digital “x”?Charting a Path for Digital-Themed...,2023,"As of late, the use of “digital” as a qualifie...","digital x, IT x, digitalization, digitization,...","Abayomi Baiyere, Varun Grover, Kalle J. Lyytin...",ISR
1,"Law, Economics, and Privacy: Implications of G...",2023,Widespread abuse of internet users’ privacy on...,"data protection regulation, government policy,...","Ram D. Gopal, Hooman Hidaji, Sule Nur Kutlu, R...",ISR
2,Spoiled for Choice? Personalized Recommendatio...,2023,Online healthcare platforms provide users with...,"personal health management, online healthcare ...","Tongxin Zhou, Yingfei Wang, Lu (Lucy) Yan, Yon...",ISR
3,A Theory-Driven Deep Learning Method for Voice...,2023,As artificial intelligence and digitalization ...,"customer response prediction, voice chat, theo...","Gang Chen, Shuaiyong Xiao, Chenghong Zhang, Hu...",ISR
4,The Decoy Effect and Recommendation Systems,2023,"In this paper, we explore the decoy effect in ...","recommendation system, personalization, decoy ...","Nasim Mousavi, Panagiotis Adamopoulos, Jesse B...",ISR
...,...,...,...,...,...,...
4027,Stress from Digital Work: Toward a Unified Vie...,2025,There are many models with various sets of hin...,,,ISR
4028,Dynamics of Shared Security in the Cloud,2025,Cloud services exist under a shared security e...,,,ISR
4029,Beyond Risk: A Measure of Distribution Uncerta...,2025,"Uncertainty, particularly distribution uncerta...",,,ISR
4030,Unveiling the Cost of Free: How an Ad-Sponsore...,2025,The selection of a business model significantl...,,,ISR


# 4. 데이터 확인

In [14]:
# import pandas as pd
#
# # affiliations 정리 함수
# def clean_affil(s: str) -> str:
#     s = str(s).strip().lower()
#     mapping = {
#         "the verge": "The Verge",
#         "verge": "The Verge",
#         "the_verge": "The Verge",
#         "techcrunch": "TechCrunch",
#         "techcrunch ": "TechCrunch",
#         "bbc": "BBC",
#         "the guardian": "The Guardian",
#         "guardian": "The Guardian",
#         "cnn": "CNN",
#         "wall street journal": "Wall Street Journal",
#         "new york times": "New York Times",
#     }
#     return mapping.get(s, s.title())
#
# industry["affiliations"] = industry["affiliations"].map(clean_affil)
#
# # 새로 집계
# aff_counts = industry["affiliations"].value_counts().reset_index()
# aff_counts.columns = ["Affiliation", "Count"]
# print(aff_counts)

           Affiliation  Count
0            The Verge   6891
1           TechCrunch   3204
2         The Guardian   2341
3                  BBC   1431
4  Wall Street Journal    955
5       New York Times    900
6                  CNN    408


In [20]:
# industry.to_csv('/home/dslab/choi/Journal/Data/07_industry.csv', index=False)

In [2]:
import pandas as pd

industry=pd.read_csv('/home/dslab/choi/Journal/Data/07_industry.csv')
journal=pd.read_csv('/home/dslab/choi/Journal/Data/08_Academia.csv')

print(industry.info())
print(journal.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16130 entries, 0 to 16129
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         16130 non-null  object 
 1   content       16130 non-null  object 
 2   date          16130 non-null  float64
 3   affiliations  16130 non-null  object 
 4   keywords      10095 non-null  object 
dtypes: float64(1), object(4)
memory usage: 630.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4032 entries, 0 to 4031
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         4032 non-null   object
 1   date          4032 non-null   int64 
 2   abstract      4032 non-null   object
 3   keywords      3208 non-null   object
 4   authors       3208 non-null   object
 5   affiliations  4032 non-null   object
dtypes: int64(1), object(5)
memory usage: 189.1+ KB
None


In [26]:
print('소속별')
print(industry.value_counts(subset=['affiliations']))
print('-'*20)
print('년도별')
print(industry.value_counts(subset=['date']))

소속별
affiliations       
The Verge              6891
TechCrunch             3204
The Guardian           2341
BBC                    1431
Wall Street Journal     955
New York Times          900
CNN                     408
Name: count, dtype: int64
--------------------
년도별
date  
2024.0    6337
2023.0    4975
2025.0    4818
Name: count, dtype: int64


In [27]:
print('소속별')
print(journal.value_counts(subset=['affiliations']))
print('-'*20)
print('년도별')
print(journal.value_counts(subset=['date']))

소속별
affiliations
HICSS           2097
ICIS             822
DSS              360
IAM              270
ISR              268
MISQ             215
Name: count, dtype: int64
--------------------
년도별
date
2024    1580
2023    1470
2025     982
Name: count, dtype: int64


In [29]:
industry.drop_duplicates(subset=['title', 'content'], keep="first")
industry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16130 entries, 0 to 16129
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         16130 non-null  object 
 1   content       16130 non-null  object 
 2   date          16130 non-null  float64
 3   affiliations  16130 non-null  object 
 4   keywords      10095 non-null  object 
dtypes: float64(1), object(4)
memory usage: 630.2+ KB


In [3]:
industry[industry['affiliations']=='The Verge']

Unnamed: 0,title,content,date,affiliations,keywords
3667,ChatGPT’s creator made a free tool for detecti...,"OpenAI, the company behind DALL-E and ChatGPT,...",2023.0,The Verge,"AI, News, Tech"
3668,Instagram’s co-founders are back with Artifact...,Kevin Systrom and Mike Krieger are back.\nThe ...,2023.0,The Verge,"AI, Apps, Creators, Instagram, Meta, Mobile, T..."
3669,4chan users embrace AI voice clone tool to gen...,"Update, Wednesday 1st February, 5:40AM ET:Elev...",2023.0,The Verge,"AI, Creators, News, Tech"
3670,Google is freaking out about ChatGPT,The recent launch of OpenAI’s AI chatbot ChatG...,2023.0,The Verge,"AI, Business, Google, News, Tech"
3671,Microsoft to challenge Google by integrating C...,Microsoft is reportedly planning to launch a v...,2023.0,The Verge,"AI, Google, Microsoft, News, Tech"
...,...,...,...,...,...
15458,It’s Google’s turn to convince us to care abou...,Made by Google will surely feature a lot of ta...,2025.0,The Verge,"AI, Google, Google Pixel, Tech"
15459,Here’s how the Pixel’s AI zoom compares to a r...,We used the Google Pixel 10 Pro’s AI zoom alon...,2025.0,The Verge,"AI, Cameras, Gadgets, Google, Google Pixel, Ha..."
15460,Satya Nadella is haunted at the prospect of Mi...,Microsoft CEO Satya Nadella has revealed why t...,2025.0,The Verge,"AI, Microsoft, Notepad, Tech"
15461,Apparently you can pay your way out of nationa...,National Economic Council Director Kevin Hasse...,2025.0,The Verge,"AI, News, Nvidia, Policy, Politics, Tech"


In [3]:
import pandas as pd

verge_fixed=pd.read_csv('/home/dslab/choi/WITS/원본데이터/[2025.09.25]Industry/TheVerge_fixed.csv')
industry=pd.read_csv('/home/dslab/choi/Journal/Data/07_industry.csv')
verge_fixed.info()
industry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4532 entries, 0 to 4531
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          4530 non-null   float64
 1   title         4530 non-null   object 
 2   content       4530 non-null   object 
 3   keywords      4530 non-null   object 
 4   url           4532 non-null   object 
 5   affiliations  4532 non-null   object 
dtypes: float64(1), object(5)
memory usage: 212.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16130 entries, 0 to 16129
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         16130 non-null  object 
 1   content       16130 non-null  object 
 2   date          16130 non-null  float64
 3   affiliations  16130 non-null  object 
 4   keywords      10095 non-null  object 
dtypes: float64(1), object(4)
memory usage: 630.2+ KB


In [4]:
verge_fixed

Unnamed: 0,date,title,content,keywords,url,affiliations
0,2013.0,The best apps for your new Windows PC,The Verge is about technology and how it makes...,"Apps, Microsoft, Tech",https://www.theverge.com/2013/12/25/5231308/be...,The Verge
1,2013.0,Google’s ‘If I Had Glass’ winners list dominat...,The Verge is about technology and how it makes...,"Google, Tech, Web",https://www.theverge.com/2013/3/30/4162862/lis...,The Verge
2,2013.0,Inside Gaikai: how to make cloud gaming as eas...,We take a tour of Gaikai’s headquarters and le...,"Creators, Entertainment, Gaming, Tech, YouTube",https://www.theverge.com/2013/7/16/4442372/ins...,The Verge
3,2014.0,FTC finalizes privacy settlement with Snapchat...,The Verge is about technology and how it makes...,"Apps, Policy, Snapchat, Tech",https://www.theverge.com/2014/12/31/7476157/ft...,The Verge
4,2014.0,"Playing ‘Pokémon’ with 78,000 people is frustr...",The Verge is about technology and how it makes...,"Entertainment, Gaming, Play This, Pokemon",https://www.theverge.com/2014/2/17/5418690/pla...,The Verge
...,...,...,...,...,...,...
4527,2025.0,It’s Google’s turn to convince us to care abou...,Made by Google will surely feature a lot of ta...,"AI, Google, Google Pixel, Tech",https://www.theverge.com/tech/760372/made-by-g...,The Verge
4528,2025.0,Here’s how the Pixel’s AI zoom compares to a r...,We used the Google Pixel 10 Pro’s AI zoom alon...,"AI, Cameras, Gadgets, Google, Google Pixel, Ha...",https://www.theverge.com/tech/769360/google-pi...,The Verge
4529,2025.0,Satya Nadella is haunted at the prospect of Mi...,Microsoft CEO Satya Nadella has revealed why t...,"AI, Microsoft, Notepad, Tech",https://www.theverge.com/tech/780946/microsoft...,The Verge
4530,2025.0,Apparently you can pay your way out of nationa...,National Economic Council Director Kevin Hasse...,"AI, News, Nvidia, Policy, Politics, Tech",https://www.theverge.com/tech/781384/apparentl...,The Verge


In [None]:
import pandas as pd

# ✅ The Verge 제거
industry = industry[industry["affiliations"] != "The Verge"]

# ✅ 공통 컬럼만 선택
cols = ["title", "content", "date", "affiliations", "keywords"]

industry = industry[cols]
verge_fixed = verge_fixed[cols]

# ✅ 2023~2025년만 필터링
industry = industry[industry["date"].isin([2023, 2024, 2025])]
verge_fixed = verge_fixed[verge_fixed["date"].isin([2023, 2024, 2025])]

# ✅ 합치기
merged = pd.concat([verge_fixed, industry], ignore_index=True)

print(merged["date"].value_counts())
print(merged.info())

# 결과 저장
merged.to_csv("/home/dslab/choi/Journal/Data/08_Industry.csv", index=False)
merged.info()

In [None]:
print(merged.value_counts(subset=['affiliations']))
print(merged.value_counts(subset=['date']))

# 5. industry techcrunch 다시 병합...

In [8]:
import pandas as pd

techcrunch=pd.read_csv('/home/dslab/choi/Journal/Crawler/techcrunch_ai_articles.csv')
techcrunch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     753 non-null    object
 1   date      743 non-null    object
 2   content   753 non-null    object
 3   keywords  743 non-null    object
 4   url       753 non-null    object
dtypes: object(5)
memory usage: 29.5+ KB


In [9]:
techcrunch

Unnamed: 0,title,date,content,keywords,url
0,OpenAI launches ChatGPT Pulse to proactively w...,2025-09-25T10:00:00-07:00,OpenAI is launching a new feature inside of Ch...,"AI,ai agent,chatbot,ChatGPT,OpenAI",https://techcrunch.com/2025/09/25/openai-launc...
1,Steph Curry’s VC firm just backed an AI startu...,2025-09-25T09:30:04-07:00,Food supply chains are notoriously messy. Orde...,"AI,food supply chain,Startups,Steph Curry,Y Co...",https://techcrunch.com/2025/09/25/steph-currys...
2,OpenAI says GPT-5 stacks up to humans in a wid...,2025-09-25T09:11:34-07:00,OpenAI released a newbenchmarkon Thursday that...,"AI,Automation,ChatGPT,Claude,gpt-5,OpenAI",https://techcrunch.com/2025/09/25/openai-says-...
3,Clarifai’s new reasoning engine makes AI model...,2025-09-25T07:13:43-07:00,"On Thursday, the AI platformClarifaiannounced ...","AI,Clarifai,compute,inference",https://techcrunch.com/2025/09/25/clarifais-ne...
4,"Spotify to label AI music, filter spam and mor...",2025-09-25T05:00:00-07:00,Spotify on Thursday announced a series of upda...,"AI,AI,Apps,Media & Entertainment,Music,spam,Sp...",https://techcrunch.com/2025/09/25/spotify-upda...
...,...,...,...,...,...
748,Google launches new healthcare-related feature...,2025-03-18T06:00:00-07:00,Google on Tuesday announced new products and f...,"AI,Android,Apps,Google,Google Search,healthcare",https://techcrunch.com/2025/03/18/google-adds-...
749,People are using Google’s new AI model to remo...,2025-03-17T13:49:00-07:00,Users on social media have discovered a contro...,"AI,copyright,gemini,Google,Media & Entertainme...",https://techcrunch.com/2025/03/17/people-are-u...
750,OpenAI to start testing ChatGPT connectors for...,2025-03-17T11:29:25-07:00,OpenAI will soon begin testing a way for busin...,"AI,Apps,ChatGPT,Enterprise,Exclusive,OpenAI,Slack",https://techcrunch.com/2025/03/17/openai-to-st...
751,YC-backed ReactWise is applying AI to speed up...,2025-03-17T10:55:10-07:00,Artificial intelligence continues stirring thi...,"AI,Biotech & Health,chemical process automatio...",https://techcrunch.com/2025/03/17/yc-backed-re...


In [None]:
industry=pd.read_csv('/home/dslab/choi/Journal/Data/08_Industry.csv')
industry.info()

In [None]:
import pandas as pd
from datetime import datetime

# ✅ 날짜 → 연도만 추출하는 함수
def to_year(x):
    if pd.isna(x):
        return None
    try:
        return datetime.fromisoformat(str(x)).year
    except:
        return None

# ✅ affiliations 생성/수정
techcrunch["affiliations"] = "TechCrunch"

# ✅ 공통 컬럼만 선택
cols = ["title", "content", "date", "affiliations", "keywords"]
industry = industry[cols].copy()
techcrunch = techcrunch[cols].copy()

# ✅ 날짜 → 연도만 변환
techcrunch["date"] = techcrunch["date"].apply(to_year)

techcrunch.info()
print(techcrunch["affiliations"].value_counts())

In [None]:
# ✅ 합치기
merged = pd.concat([industry, techcrunch], ignore_index=True)

# ✅ 중복 제거 (title + content 기준)
merged = merged.drop_duplicates(subset=["title", "content"], keep="first")

# ✅ 2023 ~ 2025년만 남기기
merged = merged[merged["date"].isin([2023, 2024, 2025])]

print(merged["date"].value_counts())
print(merged.info())

# ✅ 결과 저장
merged.to_csv("/home/dslab/choi/Journal/Data/08_Industry.csv", index=False)

# 6. 2025.10.14 - 데이터 전처리


## EJIS.csv

In [7]:
import pandas as pd
import os
from datetime import datetime, timedelta
import glob

# 데이터 폴더 경로 설정 (사용자의 실제 경로로 변경 필요)
data_folder = "/home/dslab/choi/Journal/Data/EJIS"  # 예: "./EJIS" 또는 실제 폴더 경로

# 모든 CSV 파일 찾기
csv_files = glob.glob(os.path.join(data_folder, "*.csv"))

# 파일명에서 vol 번호와 affiliations 추출하는 함수
def extract_info(filename):
    """파일명에서 affiliations와 vol 번호를 추출합니다."""
    basename = os.path.basename(filename)
    # EJIS_vol32_iss1.csv 형식
    parts = basename.replace('.csv', '').split('_')
    affiliations = parts[0]  # EJIS
    vol = int(parts[1].replace('vol', ''))
    return affiliations, vol

# 모든 CSV 파일을 읽고 affiliations, vol 정보와 함께 저장
data_list = []

for file in csv_files:
    affiliations, vol = extract_info(file)
    df = pd.read_csv(file)
    df['affiliations'] = affiliations
    df['volume'] = vol
    data_list.append(df)

# 모든 데이터 합치기
merged_df = pd.concat(data_list, ignore_index=True)

# vol 기준으로 정렬
merged_df = merged_df.sort_values('volume').reset_index(drop=True)

# 가장 낮은 vol 찾기
min_vol = merged_df['volume'].min()

print(f"가장 낮은 vol: {min_vol}")

# 각 volume에 날짜 할당
# volume 번호와 연도를 직접 매핑
unique_vols = sorted(merged_df['volume'].unique())

print(f"\n발견된 volume 목록: {unique_vols}")

# 가장 낮은 volume을 2023년으로 설정
min_vol = min(unique_vols)
base_year = 2023

# volume에 날짜 매핑
date_mapping = {}
for vol in unique_vols:
    # 각 volume을 연도로 변환 (min_vol이 2023년)
    year = base_year + (vol - min_vol)
    date_mapping[vol] = year  # 연도만 저장
    print(f"  Volume {vol} → {year}")

# date 컬럼 추가
merged_df['date'] = merged_df['volume'].map(date_mapping)

# 결과를 날짜 기준으로 정렬
merged_df = merged_df.sort_values('date').reset_index(drop=True)

# 결과 저장
output_file = "/home/dslab/choi/Journal/Data/EJIS.csv"
merged_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n데이터 합치기 완료!")
print(f"총 {len(csv_files)}개의 파일을 합쳤습니다.")
print(f"총 {len(merged_df)}개의 행이 있습니다.")
print(f"날짜 범위: {merged_df['date'].min()} ~ {merged_df['date'].max()}")
print(f"결과 파일: {output_file}")

# 처음 몇 행 미리보기
print("\n데이터 미리보기:")
print(merged_df.head(10))

# 컬럼 정보
print("\n컬럼 목록:")
print(merged_df.columns.tolist())


가장 낮은 vol: 32

발견된 volume 목록: [32, 33, 34]
  Volume 32 → 2023
  Volume 33 → 2024
  Volume 34 → 2025

데이터 합치기 완료!
총 18개의 파일을 합쳤습니다.
총 165개의 행이 있습니다.
날짜 범위: 2023 ~ 2025
결과 파일: /home/dslab/choi/Journal/Data/EJIS.csv

데이터 미리보기:
                                               title  \
0  Clinical research from information systems pra...   
1  Developing human/AI interactions for chat-base...   
2  Unpacking digital options thinking for innovat...   
3  Adopting and integrating cyber-threat intellig...   
4  Patient health locus of control: the design of...   
5  Rethinking time: ubichronic time and its impac...   
6  Personal use of technology at work: a literatu...   
7  PUBLIC MANAGEMENT CHALLENGES IN THE DIGITAL RI...   
8  The social fabric framework: steps to elicitin...   
9  The effects of cyber regulations and security ...   

                                            abstract  \
0  An increasing presence of practitioners with d...   
1  Advancements in human/AI interactions led to

In [8]:
merged_df

Unnamed: 0,title,abstract,keywords,url,affiliations,volume,date
0,Clinical research from information systems pra...,An increasing presence of practitioners with d...,"clinical research, information systems, practice",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,32,2023
1,Developing human/AI interactions for chat-base...,Advancements in human/AI interactions led to s...,"Affordances, augmentation, automation, chatbot...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,32,2023
2,Unpacking digital options thinking for innovat...,Options thinking is a powerful approach for ma...,"Digital options, digital innovation, clinical ...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,32,2023
3,Adopting and integrating cyber-threat intellig...,Cyber-attacks are increasingly perpetrated by ...,"Cybersecurity, information security management...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,32,2023
4,Patient health locus of control: the design of...,Patient locus of control (LOC) is a strong det...,"Healthcare information systems, locus of contr...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,32,2023
...,...,...,...,...,...,...,...
160,Measuring perceived security in FinTech servic...,The last ten years have witnessed the rapid in...,"Fintech security, perceived security, scale de...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,34,2025
161,Exploring IT identity claims: toward a value-c...,The ubiquity and integration of technology int...,"IT identity claim, personal value, social desi...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,34,2025
162,Artificial intelligence boundary resources: a ...,In response to the increasing relevance of art...,"Boundary resources, application programming in...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,34,2025
163,Mobile ICT outages and public safety: is there...,The digital divide remains one of the pressing...,"Digital divide, digital outcomes inequality, m...",https://www.tandfonline.com/doi/full/10.1080/0...,EJIS,34,2025


## JAIS

In [9]:
import pandas as pd
import os
from datetime import datetime, timedelta
import glob

# 데이터 폴더 경로 설정 (사용자의 실제 경로로 변경 필요)
data_folder = "/home/dslab/choi/Journal/Data/JAIS"  # 예: "./JAIS" 또는 실제 폴더 경로

# 모든 CSV 파일 찾기
csv_files = glob.glob(os.path.join(data_folder, "*.csv"))

# 파일명에서 vol 번호와 affiliations 추출하는 함수
def extract_info(filename):
    """파일명에서 affiliations와 vol 번호를 추출합니다."""
    basename = os.path.basename(filename)
    # JAIS_vol32_iss1.csv 형식
    parts = basename.replace('.csv', '').split('_')
    affiliations = parts[0]  # JAIS
    vol = int(parts[1].replace('vol', ''))
    return affiliations, vol

# 모든 CSV 파일을 읽고 affiliations, vol 정보와 함께 저장
data_list = []

for file in csv_files:
    affiliations, vol = extract_info(file)
    df = pd.read_csv(file)
    df['affiliations'] = affiliations
    df['volume'] = vol
    data_list.append(df)

# 모든 데이터 합치기
merged_df = pd.concat(data_list, ignore_index=True)

# vol 기준으로 정렬
merged_df = merged_df.sort_values('volume').reset_index(drop=True)

# 가장 낮은 vol 찾기
min_vol = merged_df['volume'].min()

print(f"가장 낮은 vol: {min_vol}")

# 각 volume에 날짜 할당
# volume 번호와 연도를 직접 매핑
unique_vols = sorted(merged_df['volume'].unique())

print(f"\n발견된 volume 목록: {unique_vols}")

# 가장 낮은 volume을 2023년으로 설정
min_vol = min(unique_vols)
base_year = 2023

# volume에 날짜 매핑
date_mapping = {}
for vol in unique_vols:
    # 각 volume을 연도로 변환 (min_vol이 2023년)
    year = base_year + (vol - min_vol)
    date_mapping[vol] = year  # 연도만 저장
    print(f"  Volume {vol} → {year}")

# date 컬럼 추가
merged_df['date'] = merged_df['volume'].map(date_mapping)

# 결과를 날짜 기준으로 정렬
merged_df = merged_df.sort_values('date').reset_index(drop=True)

# 결과 저장
output_file = "/home/dslab/choi/Journal/Data/JAIS.csv"
merged_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n데이터 합치기 완료!")
print(f"총 {len(csv_files)}개의 파일을 합쳤습니다.")
print(f"총 {len(merged_df)}개의 행이 있습니다.")
print(f"날짜 범위: {merged_df['date'].min()} ~ {merged_df['date'].max()}")
print(f"결과 파일: {output_file}")

# 처음 몇 행 미리보기
print("\n데이터 미리보기:")
print(merged_df.head(10))

# 컬럼 정보
print("\n컬럼 목록:")
print(merged_df.columns.tolist())


가장 낮은 vol: 24

발견된 volume 목록: [24, 25, 26]
  Volume 24 → 2023
  Volume 25 → 2024
  Volume 26 → 2025

데이터 합치기 완료!
총 17개의 파일을 합쳤습니다.
총 176개의 행이 있습니다.
날짜 범위: 2023 ~ 2025
결과 파일: /home/dslab/choi/Journal/Data/JAIS.csv

데이터 미리보기:
                                               title  \
0  A Practical Guide for Successful Revisions and...   
1  On the Effectiveness of Smart Metering Technol...   
2  Be Together, Run More: Enhancing Group Partici...   
3  The Effect of Community Managers on Online Ide...   
4  An Adversarial Dance: Toward an Understanding ...   
5  The Effects of Digitally Delivered Nudges in a...   
6  Assessing the Connections among Top Management...   
7  Guidelines for the Development of Three-Level ...   
8  Algorithm Sensemaking: How Platform Workers Ma...   
9  Same but Different: Variations in Reactions to...   

                                            abstract  \
0  Revising a manuscript after receiving a revise...   
1  In response to the burgeoning threats of cli

In [10]:
merged_df

Unnamed: 0,title,abstract,keywords,url,affiliations,volume,date
0,A Practical Guide for Successful Revisions and...,Revising a manuscript after receiving a revise...,,https://aisel.aisnet.org/jais/vol24/iss2/11,JAIS,24,2023
1,On the Effectiveness of Smart Metering Technol...,In response to the burgeoning threats of clima...,"Smart Meter Technology, Household Technology A...",https://aisel.aisnet.org/jais/vol24/iss2/2,JAIS,24,2023
2,"Be Together, Run More: Enhancing Group Partici...",Individuals are increasingly using novel fitne...,"Fitness Technology, Running Group, Running Spo...",https://aisel.aisnet.org/jais/vol24/iss2/3,JAIS,24,2023
3,The Effect of Community Managers on Online Ide...,"In this study, we investigate whether and to w...","Crowdsourcing, Crowdsourced Innovation, Ideati...",https://aisel.aisnet.org/jais/vol24/iss1/3,JAIS,24,2023
4,An Adversarial Dance: Toward an Understanding ...,Despite the increased focus on organizational ...,"Information System (IS) Security, Security Bel...",https://aisel.aisnet.org/jais/vol24/iss1/4,JAIS,24,2023
...,...,...,...,...,...,...,...
171,When Everyone Is Visible No One Is: Qualificat...,The visibility of qualifications is of central...,"Qualifications, Visibility, Qualifications Vis...",https://aisel.aisnet.org/jais/vol26/iss2/5,JAIS,26,2025
172,Technocognitive Structuration: Modeling the Ro...,The way we use technology both shapes and is s...,"Adaptive Structuration Theory for Individuals,...",https://aisel.aisnet.org/jais/vol26/iss2/6,JAIS,26,2025
173,From Links to Likes: Evidence From a Social Co...,Product-related content created by social medi...,"Social Media, Influencer Marketing, Product Li...",https://aisel.aisnet.org/jais/vol26/iss5/2,JAIS,26,2025
174,Legal Compliance and the Open Texture of Law,"The law is often vague and ambiguous, especial...","Open Texture, Vagueness, Policy, Law, Complian...",https://aisel.aisnet.org/jais/vol26/iss1/10,JAIS,26,2025


## JIT

In [11]:
import pandas as pd
import os
from datetime import datetime, timedelta
import glob

# 데이터 폴더 경로 설정 (사용자의 실제 경로로 변경 필요)
data_folder = "/home/dslab/choi/Journal/Data/JIT"  # 예: "./JIT" 또는 실제 폴더 경로

# 모든 CSV 파일 찾기
csv_files = glob.glob(os.path.join(data_folder, "*.csv"))

# 파일명에서 vol 번호와 affiliations 추출하는 함수
def extract_info(filename):
    """파일명에서 affiliations와 vol 번호를 추출합니다."""
    basename = os.path.basename(filename)
    # JIT_vol32_iss1.csv 형식
    parts = basename.replace('.csv', '').split('_')
    affiliations = parts[0]  # JIT
    vol = int(parts[1].replace('vol', ''))
    return affiliations, vol

# 모든 CSV 파일을 읽고 affiliations, vol 정보와 함께 저장
data_list = []

for file in csv_files:
    affiliations, vol = extract_info(file)
    df = pd.read_csv(file)
    df['affiliations'] = affiliations
    df['volume'] = vol
    data_list.append(df)

# 모든 데이터 합치기
merged_df = pd.concat(data_list, ignore_index=True)

# vol 기준으로 정렬
merged_df = merged_df.sort_values('volume').reset_index(drop=True)

# 가장 낮은 vol 찾기
min_vol = merged_df['volume'].min()

print(f"가장 낮은 vol: {min_vol}")

# 각 volume에 날짜 할당
# volume 번호와 연도를 직접 매핑
unique_vols = sorted(merged_df['volume'].unique())

print(f"\n발견된 volume 목록: {unique_vols}")

# 가장 낮은 volume을 2023년으로 설정
min_vol = min(unique_vols)
base_year = 2023

# volume에 날짜 매핑
date_mapping = {}
for vol in unique_vols:
    # 각 volume을 연도로 변환 (min_vol이 2023년)
    year = base_year + (vol - min_vol)
    date_mapping[vol] = year  # 연도만 저장
    print(f"  Volume {vol} → {year}")

# date 컬럼 추가
merged_df['date'] = merged_df['volume'].map(date_mapping)

# 결과를 날짜 기준으로 정렬
merged_df = merged_df.sort_values('date').reset_index(drop=True)

# 결과 저장
output_file = "/home/dslab/choi/Journal/Data/JIT.csv"
merged_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n데이터 합치기 완료!")
print(f"총 {len(csv_files)}개의 파일을 합쳤습니다.")
print(f"총 {len(merged_df)}개의 행이 있습니다.")
print(f"날짜 범위: {merged_df['date'].min()} ~ {merged_df['date'].max()}")
print(f"결과 파일: {output_file}")

# 처음 몇 행 미리보기
print("\n데이터 미리보기:")
print(merged_df.head(10))

# 컬럼 정보
print("\n컬럼 목록:")
merged_df


가장 낮은 vol: 38

발견된 volume 목록: [38, 39, 40]
  Volume 38 → 2023
  Volume 39 → 2024
  Volume 40 → 2025

데이터 합치기 완료!
총 11개의 파일을 합쳤습니다.
총 75개의 행이 있습니다.
날짜 범위: 2023 ~ 2025
결과 파일: /home/dslab/choi/Journal/Data/JIT.csv

데이터 미리보기:
                                               title  \
0  Products of theorizing—towards native theories...   
1                                                NaN   
2  Rethinking online friction in the information ...   
3  Reconceptualizing users: The roles and activit...   
4                                                NaN   
5  Big Tech’s power, political corporate social r...   
6                                                NaN   
7                                                NaN   
8  Regulation of data-driven market power in the ...   
9  What is the Metaverse and who seeks to define ...   

                                            abstract  \
0                                                NaN   
1  Information security (InfoSec)–related behavio

Unnamed: 0,title,abstract,keywords,url,affiliations,volume,date
0,Products of theorizing—towards native theories...,,,https://aisel.aisnet.org/jit/vol38/iss4/1,JIT,38,2023
1,,Information security (InfoSec)–related behavio...,"Behavioral information systems security, Fouca...",https://aisel.aisnet.org/jit/vol38/iss4/2,JIT,38,2023
2,Rethinking online friction in the information ...,A recurrent mantra of the technology industry ...,"friction, ICTs and society, information techno...",https://aisel.aisnet.org/jit/vol38/iss1/1,JIT,38,2023
3,Reconceptualizing users: The roles and activit...,The concept of the user has persisted in infor...,"user, people, roles, activities, information t...",https://aisel.aisnet.org/jit/vol38/iss4/6,JIT,38,2023
4,,"In response to their growing importance, digit...","Digital infrastructure, digital innovation, re...",https://aisel.aisnet.org/jit/vol38/iss2/3,JIT,38,2023
...,...,...,...,...,...,...,...
70,A critical realist approach to agent-based mod...,Information systems (IS) scholarship and pract...,"Critical realism, agent-based model, complex a...",https://aisel.aisnet.org/jit/vol40/iss2/2,JIT,40,2025
71,Guiding computationally intensive theory devel...,This study advances the field of Computational...,"computationally intensive theory development, ...",https://aisel.aisnet.org/jit/vol40/iss2/5,JIT,40,2025
72,The group mind of hybrid teams with humans and...,Studies regularly demonstrate how well intelli...,"Transactive memory systems, hybrid teams, inte...",https://aisel.aisnet.org/jit/vol40/iss1/2,JIT,40,2025
73,"Digital futures: Definition (what), importance...",“Digital futures” as a research field that exa...,"Digital futures, future studies, future orient...",https://aisel.aisnet.org/jit/vol40/iss1/1,JIT,40,2025


## JMIS

In [12]:
import pandas as pd
import os
from datetime import datetime, timedelta
import glob

# 데이터 폴더 경로 설정 (사용자의 실제 경로로 변경 필요)
data_folder = "/home/dslab/choi/Journal/Data/JMIS"  # 예: "./JMIS" 또는 실제 폴더 경로

# 모든 CSV 파일 찾기
csv_files = glob.glob(os.path.join(data_folder, "*.csv"))

# 파일명에서 vol 번호와 affiliations 추출하는 함수
def extract_info(filename):
    """파일명에서 affiliations와 vol 번호를 추출합니다."""
    basename = os.path.basename(filename)
    # JMIS_vol32_iss1.csv 형식
    parts = basename.replace('.csv', '').split('_')
    affiliations = parts[0]  # JMIS
    vol = int(parts[1].replace('vol', ''))
    return affiliations, vol

# 모든 CSV 파일을 읽고 affiliations, vol 정보와 함께 저장
data_list = []

for file in csv_files:
    affiliations, vol = extract_info(file)
    df = pd.read_csv(file)
    df['affiliations'] = affiliations
    df['volume'] = vol
    data_list.append(df)

# 모든 데이터 합치기
merged_df = pd.concat(data_list, ignore_index=True)

# vol 기준으로 정렬
merged_df = merged_df.sort_values('volume').reset_index(drop=True)

# 가장 낮은 vol 찾기
min_vol = merged_df['volume'].min()

print(f"가장 낮은 vol: {min_vol}")

# 각 volume에 날짜 할당
# volume 번호와 연도를 직접 매핑
unique_vols = sorted(merged_df['volume'].unique())

print(f"\n발견된 volume 목록: {unique_vols}")

# 가장 낮은 volume을 2023년으로 설정
min_vol = min(unique_vols)
base_year = 2023

# volume에 날짜 매핑
date_mapping = {}
for vol in unique_vols:
    # 각 volume을 연도로 변환 (min_vol이 2023년)
    year = base_year + (vol - min_vol)
    date_mapping[vol] = year  # 연도만 저장
    print(f"  Volume {vol} → {year}")

# date 컬럼 추가
merged_df['date'] = merged_df['volume'].map(date_mapping)

# 결과를 날짜 기준으로 정렬
merged_df = merged_df.sort_values('date').reset_index(drop=True)

# 결과 저장
output_file = "/home/dslab/choi/Journal/Data/JMIS.csv"
merged_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n데이터 합치기 완료!")
print(f"총 {len(csv_files)}개의 파일을 합쳤습니다.")
print(f"총 {len(merged_df)}개의 행이 있습니다.")
print(f"날짜 범위: {merged_df['date'].min()} ~ {merged_df['date'].max()}")
print(f"결과 파일: {output_file}")

# 처음 몇 행 미리보기
print("\n데이터 미리보기:")
print(merged_df.head(10))

# 컬럼 정보
print("\n컬럼 목록:")
print(merged_df.columns.tolist())


가장 낮은 vol: 40

발견된 volume 목록: [40, 41, 42]
  Volume 40 → 2023
  Volume 41 → 2024
  Volume 42 → 2025

데이터 합치기 완료!
총 11개의 파일을 합쳤습니다.
총 128개의 행이 있습니다.
날짜 범위: 2023 ~ 2025
결과 파일: /home/dslab/choi/Journal/Data/JMIS.csv

데이터 미리보기:
                                               title  \
0         Product Recommendation and Consumer Search   
1  Impact of Bot Involvement in an Incentivized B...   
2  Strategic Investments for Platform Launch and ...   
3  Differential Impacts of Technology-Network Str...   
4  The Paradoxical Role of Humanness in Aggressio...   
5  How Lending Experience and Borrower Credit Inf...   
6  Influence of Media Capabilities on Trust in th...   
7  Stigmergy in Open Collaboration: An Empirical ...   
8  Task Conflict Resolution in Designing Legacy R...   
9  Foreignness Liability of Mobile App Startups: ...   

                                            abstract  \
0  We study an online environment where a firm pr...   
1  Incentivized blockchain-based online social 

In [13]:
merged_df

Unnamed: 0,title,abstract,keywords,url,affiliations,volume,date
0,Product Recommendation and Consumer Search,We study an online environment where a firm pr...,"Recommendation bias, consumer search, digital ...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,40,2023
1,Impact of Bot Involvement in an Incentivized B...,Incentivized blockchain-based online social me...,"Blockchain, social media networks, bots, user ...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,40,2023
2,Strategic Investments for Platform Launch and ...,Multi-sided platforms must make decisions on b...,"Multi-sided platforms, two-sided markets, netw...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,40,2023
3,Differential Impacts of Technology-Network Str...,We examine how hospital cost efficiency can im...,"Healthcare costs, knowledge networks, electron...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,40,2023
4,The Paradoxical Role of Humanness in Aggressio...,Conversational Agents (CAs) are becoming part ...,"Conversational agent, chatbot, humanlike desig...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,40,2023
...,...,...,...,...,...,...,...
123,Impact of Non-Diagnostic Digital Services on O...,The persistent preference of patients for well...,"Online healthcare, healthcare platforms, onlin...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,42,2025
124,Digital Skill Visibility and Job Promotion of ...,Material to inform the decisions on the job pr...,"Open source software, OSS, job promotions, dig...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,42,2025
125,Editorial Introduction,"In closing, it is my pleasure and privilege to...",,https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,42,2025
126,Everyday Metaverse: The Metaverse as an Integr...,This paper explores a paradigm shift from view...,"Metaverse, everyday computing, correspondence,...",https://www.tandfonline.com/doi/full/10.1080/0...,JMIS,42,2025


## JSIS

In [14]:
import pandas as pd
import os
from datetime import datetime, timedelta
import glob

# 데이터 폴더 경로 설정 (사용자의 실제 경로로 변경 필요)
data_folder = "/home/dslab/choi/Journal/Data/JSIS"  # 예: "./JSIS" 또는 실제 폴더 경로

# 모든 CSV 파일 찾기
csv_files = glob.glob(os.path.join(data_folder, "*.csv"))

# 파일명에서 vol 번호와 affiliations 추출하는 함수
def extract_info(filename):
    """파일명에서 affiliations와 vol 번호를 추출합니다."""
    basename = os.path.basename(filename)
    # JSIS_vol32_iss1.csv 형식
    parts = basename.replace('.csv', '').split('_')
    affiliations = parts[0]  # JSIS
    vol = int(parts[1].replace('vol', ''))
    return affiliations, vol

# 모든 CSV 파일을 읽고 affiliations, vol 정보와 함께 저장
data_list = []

for file in csv_files:
    affiliations, vol = extract_info(file)
    df = pd.read_csv(file)
    df['affiliations'] = affiliations
    df['volume'] = vol
    data_list.append(df)

# 모든 데이터 합치기
merged_df = pd.concat(data_list, ignore_index=True)

# vol 기준으로 정렬
merged_df = merged_df.sort_values('volume').reset_index(drop=True)

# 가장 낮은 vol 찾기
min_vol = merged_df['volume'].min()

print(f"가장 낮은 vol: {min_vol}")

# 각 volume에 날짜 할당
# volume 번호와 연도를 직접 매핑
unique_vols = sorted(merged_df['volume'].unique())

print(f"\n발견된 volume 목록: {unique_vols}")

# 가장 낮은 volume을 2023년으로 설정
min_vol = min(unique_vols)
base_year = 2023

# volume에 날짜 매핑
date_mapping = {}
for vol in unique_vols:
    # 각 volume을 연도로 변환 (min_vol이 2023년)
    year = base_year + (vol - min_vol)
    date_mapping[vol] = year  # 연도만 저장
    print(f"  Volume {vol} → {year}")

# date 컬럼 추가
merged_df['date'] = merged_df['volume'].map(date_mapping)

# 결과를 날짜 기준으로 정렬
merged_df = merged_df.sort_values('date').reset_index(drop=True)

# 결과 저장
output_file = "/home/dslab/choi/Journal/Data/JSIS.csv"
merged_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n데이터 합치기 완료!")
print(f"총 {len(csv_files)}개의 파일을 합쳤습니다.")
print(f"총 {len(merged_df)}개의 행이 있습니다.")
print(f"날짜 범위: {merged_df['date'].min()} ~ {merged_df['date'].max()}")
print(f"결과 파일: {output_file}")

# 처음 몇 행 미리보기
print("\n데이터 미리보기:")
print(merged_df.head(10))

# 컬럼 정보
print("\n컬럼 목록:")
merged_df


가장 낮은 vol: 32

발견된 volume 목록: [32, 33, 34]
  Volume 32 → 2023
  Volume 33 → 2024
  Volume 34 → 2025

데이터 합치기 완료!
총 4개의 파일을 합쳤습니다.
총 90개의 행이 있습니다.
날짜 범위: 2023 ~ 2025
결과 파일: /home/dslab/choi/Journal/Data/JSIS.csv

데이터 미리보기:
   volume  issue                                              title  \
0      32      4  Doing good by going digital: A taxonomy of dig...   
1      32      4  Machine learning advice in managerial decision...   
2      32      2  The influence of project initiators’ person-to...   
3      32      2  Business-IT alignment as a coevolution process...   
4      32      2  Can ICT enhance workplace inclusion? ICT-enabl...   
5      32      2  Responsibly strategizing with the metaverse: B...   
6      32      3                                    Editorial Board   
7      32      3  Welcome to the third issue of Volume 32 of the...   
8      32      3  Literature review in the generative AI era - h...   
9      32      3  Ethical management of human-AI interaction: Th... 

Unnamed: 0,volume,issue,title,authors,abstract,date,keywords,url,affiliations
0,32,4,Doing good by going digital: A taxonomy of dig...,"ChristophBuck, AnnaKrombacher, MaximilianRögli...",Digital social innovation (DSI) offers incumbe...,2023,"Digital Innovation, Social Innovation, Digital...",https://www.sciencedirect.com/science/article/...,JSIS
1,32,4,Machine learning advice in managerial decision...,"TimoSturm, LuisaPumplun, Jin P.Gerlach, Martin...",Machine learning (ML) analyses offer great pot...,2023,"Machine learning advice, Transparency, Informa...",https://www.sciencedirect.com/science/article/...,JSIS
2,32,2,The influence of project initiators’ person-to...,"QinWeng, FranckSoh",Person-to-person (P2P) followership is an impo...,2023,"Open source software project, Project initiato...",https://www.sciencedirect.com/science/article/...,JSIS
3,32,2,Business-IT alignment as a coevolution process...,"FabrizioAmarilli, Bartvan den Hooff, Mariovan ...","In this paper, we provide a detailed insight i...",2023,"Business-IT alignment, Coevolution theory, Sys...",https://www.sciencedirect.com/science/article/...,JSIS
4,32,2,Can ICT enhance workplace inclusion? ICT-enabl...,"MonideepaTarafdar, IrinaRets, YangHu",Workplace inclusion is a strategic concern for...,2023,"ICT-enabled workplace inclusion practices, ICT...",https://www.sciencedirect.com/science/article/...,JSIS
...,...,...,...,...,...,...,...,...,...
85,34,2,The role of use for the business value of big ...,"Yi-TingYeh, RebekahEden, ErwinFielt, RehanSyed",Big data analytics (BDA) has attracted signifi...,2025,"Big data analytics, System use, Business value...",https://www.sciencedirect.com/science/article/...,JSIS
86,34,2,Where are the processes in IS research on digi...,"MartinWiener, SusanneStrahringer, JuliaKotlarsky",Digital transformation (DT) has emerged as a c...,2025,"Digital transformation, Organizational level, ...",https://www.sciencedirect.com/science/article/...,JSIS
87,34,2,Process-level value creation from business ana...,"Pascal C.Kunz, KaiSpohrer, ArminHeinzl",Research onbusiness analytics(BA) has made var...,2025,"Business Analytics, Machine Learning, IS Value...",https://www.sciencedirect.com/science/article/...,JSIS
88,34,2,Socio-technical phenomena involving blockchain...,"ShaoxinWang, DanielSchlagwein, MikeSeymour",This paper reviews the emerging literature on ...,2025,"Blockchain use, Socio-technical systems, Infor...",https://www.sciencedirect.com/science/article/...,JSIS


# 7. 2025.10.14 - 데이터 병합

In [20]:
import pandas as pd

ejis=pd.read_csv('/home/dslab/choi/Journal/Data/EJIS.csv')
jais=pd.read_csv('/home/dslab/choi/Journal/Data/JAIS.csv')
jit=pd.read_csv('/home/dslab/choi/Journal/Data/JIT.csv')
jmis=pd.read_csv('/home/dslab/choi/Journal/Data/JMIS.csv')
jsis=pd.read_csv('/home/dslab/choi/Journal/Data/JSIS.csv')
data=pd.read_csv('/home/dslab/choi/Journal/Data/08_Academia.csv')


In [21]:
df=pd.concat([ejis,jais,jit, jmis, jsis,data])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4666 entries, 0 to 4031
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         4645 non-null   object 
 1   abstract      4627 non-null   object 
 2   keywords      3774 non-null   object 
 3   url           634 non-null    object 
 4   affiliations  4666 non-null   object 
 5   volume        634 non-null    float64
 6   date          4666 non-null   int64  
 7   issue         90 non-null     float64
 8   authors       3286 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 364.5+ KB


In [22]:
df.drop(columns=['url','volume','issue'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4666 entries, 0 to 4031
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         4645 non-null   object
 1   abstract      4627 non-null   object
 2   keywords      3774 non-null   object
 3   affiliations  4666 non-null   object
 4   date          4666 non-null   int64 
 5   authors       3286 non-null   object
dtypes: int64(1), object(5)
memory usage: 255.2+ KB


In [23]:
import pandas as pd

# 4. Null 제거 (title, abstract)
df.dropna(subset=['title', 'abstract'], inplace=True)

# ✅ 중복 후보 찾기 (title+content 모두 동일)
dupes = df[df.duplicated(subset=["title", "abstract"], keep=False)] \
            .sort_values(by=["title", "abstract"])

print("🔎 중복된 행 목록:")
print(dupes)

# ✅ 실제로 제거 (첫 번째만 남기고 삭제)
df_dedup = df.drop_duplicates(subset=["title", "abstract"], keep="first").reset_index(drop=True)

print(df_dedup)
# 결과 저장 (선택)
# dupes.to_csv("Data/08_Academia.csv", index=False)   # 제거될 가능성이 있는 행 저장
df_dedup.to_csv("/home/dslab/choi/Journal/Data/09_Academia.csv", index=False)     # 최종 중복 제거된 데이터 저장
#
# print(f"총 {len(dupes)}개의 중복 행 발견, 제거 후 남은 행 수: {len(df_dedup)}")

🔎 중복된 행 목록:
Empty DataFrame
Columns: [title, abstract, keywords, affiliations, date, authors]
Index: []
                                                  title  \
0     Clinical research from information systems pra...   
1     Developing human/AI interactions for chat-base...   
2     Unpacking digital options thinking for innovat...   
3     Adopting and integrating cyber-threat intellig...   
4     Patient health locus of control: the design of...   
...                                                 ...   
4601  Stress from Digital Work: Toward a Unified Vie...   
4602           Dynamics of Shared Security in the Cloud   
4603  Beyond Risk: A Measure of Distribution Uncerta...   
4604  Unveiling the Cost of Free: How an Ad-Sponsore...   
4605  The Impact of Situational Achievement Goals on...   

                                               abstract  \
0     An increasing presence of practitioners with d...   
1     Advancements in human/AI interactions led to s...   
2     Opti

In [24]:
df_dedup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4606 entries, 0 to 4605
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         4606 non-null   object
 1   abstract      4606 non-null   object
 2   keywords      3752 non-null   object
 3   affiliations  4606 non-null   object
 4   date          4606 non-null   int64 
 5   authors       3272 non-null   object
dtypes: int64(1), object(5)
memory usage: 216.0+ KB


In [25]:
print(df.value_counts(subset=['date']))
print(df.value_counts(subset=['affiliations']))

date
2024    1776
2023    1668
2025    1162
Name: count, dtype: int64
affiliations
HICSS           2097
ICIS             822
DSS              360
IAM              270
ISR              268
MISQ             215
JAIS             166
EJIS             165
JMIS             128
JSIS              65
JIT               50
Name: count, dtype: int64
