#### DOI Mapping with raw data

In [12]:
import pandas as pd
import re

# 1. Load the .dta files
paper_230130 = pd.read_stata('3.Paper_230130.dta')
paper_bio = pd.read_stata('Paper_Dataset_Bio.dta')

# 2. Normalization function
def normalize_text(text):
    if pd.isna(text):
        return ''
    return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

# Apply normalization
paper_230130['Paper_norm'] = paper_230130['pa030'].apply(normalize_text)
paper_bio['Title_norm'] = paper_bio['Title'].apply(normalize_text)

# 3. Create a dictionary for quick lookup
doi_mapping = paper_230130.set_index('Paper_norm')['pa140'].to_dict()

# 4. Map DOI based on Title_norm
paper_bio['DOI'] = paper_bio['Title_norm'].map(doi_mapping)

# Save the updated dataset
paper_bio.to_csv('Updated_doi.csv', index=False, encoding='utf-8-sig')


#### Fetch DOI using Semantic Scholar

In [23]:
import requests
import time

# Semantic Scholar API settings
API_KEY = "d9uN8uXwqK2FkU7VYFGJr9ECqcyAYh3Gx0QRyOy7"
HEADERS = {"x-api-key": API_KEY}
BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

# Paper title to search
title = "Population genetic study of 10 short tandem repeat loci from 600 domestic dogs in Korea"

# Function to fetch DOI from Semantic Scholar
def fetch_doi(title):
    params = {"query": title, "fields": "externalIds", "limit": 1}
    response = requests.get(BASE_URL, headers=HEADERS, params=params)
    if response.status_code == 200:
        data = response.json()
        if "data" in data and data["data"]:
            external_ids = data["data"][0].get("externalIds", {})
            return external_ids.get("DOI", None)
    return None

# Fetch DOI
doi = fetch_doi(title)
print(f"DOI for '{title}': {doi}")

time.sleep(1)  # Avoid rate limiting

DOI for 'Population genetic study of 10 short tandem repeat loci from 600 domestic dogs in Korea': None


#### Fetching DOI using Crossref

In [6]:
from habanero import Crossref
cr = Crossref()
titles = ["Enhanced sensitivity of CpG island search and primer design based on predicted CpG island position"]  # Replace with your list of titles
dois = {}
for title in titles:
    result = cr.works(query=title)
    if result['message']['items']:
        dois[title] = result['message']['items'][0]['DOI']
    else:
        dois[title] = None
print(dois)


{'Enhanced sensitivity of CpG island search and primer design based on predicted CpG island position': '10.1016/j.fsigen.2018.02.013'}


In [11]:
import pandas as pd
from habanero import Crossref

# Load dataset
df = pd.read_csv("Paper_Dataset_Bio.csv")

# Filter papers with missing DOIs
missing_doi_df = df[df['DOI'].isna()]  # Select first 10 missing DOIs

# Initialize Crossref API
cr = Crossref()

# Function to fetch DOI
def get_doi(title):
    try:
        result = cr.works(query=title)
        if result['message']['items']:
            return result['message']['items'][0]['DOI']
    except Exception as e:
        print(f"Error fetching DOI for {title}: {e}")
    return None

# Fetch DOIs sequentially
for idx, row in missing_doi_df.iterrows():
    title = row['Title']
    doi = get_doi(title)
    df.at[idx, 'DOI'] = doi  # Update original DataFrame

# Save updated dataset
df.to_csv("Paper_Dataset_Bio_Updated.csv", index=False)

print("DOI collection completed for first 10 missing entries.")


Error fetching DOI for A lectin-coupled, multiple reaction monitoring based quantitative analysis of human plasma glycoproteins by mass spectrometry: The read operation timed out
DOI collection completed for first 10 missing entries.


In [13]:
import time
import pandas as pd
from habanero import Crossref

# Load dataset
df = pd.read_csv("Paper_Dataset_Bio.csv")

# Filter papers with missing DOIs
missing_doi_df = df[df['DOI'].isna()]  # Only papers where DOI is missing

# Initialize Crossref API
cr = Crossref()

# Function to fetch DOI
def get_doi(title):
    try:
        result = cr.works(query=title)
        if result and 'message' in result and 'items' in result['message'] and result['message']['items']:
            return result['message']['items'][0].get('DOI', None)
    except Exception as e:
        print(f"Error fetching DOI for {title}: {e}")
    return None

# Fetch DOIs sequentially
for i, (idx, row) in enumerate(missing_doi_df.iterrows(), start=1):
    title = row['Title']
    doi = get_doi(title)
    df.at[idx, 'DOI'] = doi  # Update original DataFrame

    # Print progress every 50 papers
    if i % 50 == 0:
        print(f"Processed {i} papers...")

# Save updated dataset with only 'Title' and 'DOI'
df[['Title', 'DOI']].to_csv("Paper_Dataset_Bio_DOI.csv", index=False)

print("DOI collection completed and saved to 'Paper_Dataset_Bio_DOI.csv'.")


Error fetching DOI for Kinetic studies on the formation of various II-VI semiconductor nanocrystals and synthesis of gradient alloy quantum dots emitting in the entire visible range: The read operation timed out
Error fetching DOI for Globoside promotes activation of ERK by interaction with the epidermal growth factor receptor: The read operation timed out
Error fetching DOI for Scanometric analysis of DNA microarrays using DNA intercalator-conjugated gold nanoparticles: The read operation timed out
Error fetching DOI for A label-free fluorescence immunoassay system for the sensitive detection of the mycotoxin, ochratoxin A: The read operation timed out
Error fetching DOI for Highly Efficient Enzyme Immobilization and Stabilization within Meso-Structured Onion-Like Silica for Biodiesel Production: The read operation timed out
Error fetching DOI for Mutational complex genotype of the hepatitis B virus X/precore regions as a novel predictive marker for hepatocellular carcinoma: The read 

KeyboardInterrupt: 

In [14]:
# Save the current state of the DataFrame (before interrupting the process)
df[['Title', 'DOI']].to_csv("Paper_Dataset_Bio_DOI.csv", index=False)

In [18]:
import time
import pandas as pd
from habanero import Crossref

# Load dataset
df = pd.read_csv("Paper_Dataset_Bio.csv")

# Filter papers with missing DOIs 
missing_doi_df = df[df['DOI'].isna()]  

# Initialize Crossref API
cr = Crossref()

# Function to fetch DOI
def get_doi(title):
    try:
        result = cr.works(query=title)
        if result and 'message' in result and 'items' in result['message'] and result['message']['items']:
            return result['message']['items'][0].get('DOI', None)
    except Exception as e:
        print(f"Error fetching DOI for {title}: {e}")
    return None

# Fetch DOIs sequentially
for i, (idx, row) in enumerate(missing_doi_df.iterrows(), start=1):
    title = row['Title']
    doi = get_doi(title)
    df.at[idx, 'DOI'] = doi  # Update original DataFrame

    # Print progress every 50 papers
    if i % 50 == 0:
        print(f"Processed {i} papers...")

# Save updated dataset with only 'Title' and 'DOI'
df[['Title', 'DOI']].to_csv("Paper_Dataset_Bio_DOI.csv", index=False)

print("DOI collection completed and saved to 'Paper_Dataset_Bio_DOI.csv'.")


Error fetching DOI for Enhanced therapeutic efficacy of an adenovirus-PEI-bile-acid complex in tumors with low coxsackie and adenovirus receptor expression: The read operation timed out
Error fetching DOI for New indoles from the roots of Brassica rapa ssp campestris: The read operation timed out
Error fetching DOI for Flavonoid Glycosides from the Fruit of Rhus parviflora and Inhibition of Cyclin Dependent Kinases by Hyperin: The read operation timed out
Error fetching DOI for Pancreatic Islet-Like Three-Dimensional Aggregates Derived From Human Embryonic Stem Cells Ameliorate Hyperglycemia in Streptozotocin-Induced Diabetic Mice: The read operation timed out
DOI collection completed and saved to 'Paper_Dataset_Bio_DOI.csv'.


#### Citation using Semantic Scholar

In [2]:
import requests

def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    response = requests.get(url)
    
    if response.status_code != 200:
        print("Error fetching data")
        return None
    
    data = response.json()
    
    def extract_info(citation_list):
        extracted = []
        for paper in citation_list:
            if 'doi' in paper and 'year' in paper and 'title' in paper:
                extracted.append(f"{{{paper['doi']}; {paper['year']}; {paper['title']}}}")
        return extracted
    
    forward_citations = extract_info(data.get("citations", []))
    backward_references = extract_info(data.get("references", []))
    
    print("Forward_Citation")
    print(", ".join(forward_citations) if forward_citations else "No Forward Citations Found")
    
    print("\nBackward_Citation")
    print(", ".join(backward_references) if backward_references else "No Backward Citations Found")
    
# Example usage
doi = "10.4142/jvs.2016.17.3.391"
get_citation_info(doi)


Forward_Citation
{10.1186/s44342-024-00013-4; 2024; Shared alleles and genetic structures in different Thai domestic cat breeds: the possible influence of common racial origins}, {10.1007/s13258-024-01510-0; 2024; Optimizing Bangkaew dog breed identification using DNA technology.}, {10.1016/j.fsigen.2024.103056; 2024; Development and validation of a novel 30-plex STR assay for canine individual identification and parentage testing.}, {10.1007/s11033-019-04601-4; 2019; Polymorphism analyses of 19 STRs in Labrador Retriever population from China and its heterozygosity comparisons with other retriever breeds}, {10.1021/acs.analchem.8b05318; 2018; Forensic DNA Analysis.}, {10.1186/s13104-017-2722-6; 2017; The use of genetic markers to estimate relationships between dogs in the course of criminal investigations}

Backward_Citation
{10.1016/j.fsigen.2011.04.015; 2012; Genetic data from 15 STR loci for forensic individual identification and parentage analyses in UK domestic dogs (Canis lupus 

In [3]:
import requests
import pandas as pd

def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error fetching data for DOI: {doi}")
        return None
    
    data = response.json()
    paper_year = data.get("year", "Unknown")  # 원 논문 출판 연도
    
    citation_data = []
    
    def extract_info(citation_list, citation_type):
        for paper in citation_list:
            if 'doi' in paper and 'year' in paper and 'title' in paper:
                citation_data.append({
                    "Paper_Doi": doi,
                    "Publish_Year": paper_year,
                    "Citation_Type": citation_type,
                    "Cited_Doi": paper["doi"],
                    "Year": paper["year"],
                    "Title": paper["title"]
                })
    
    # Forward citations (이 논문을 인용한 논문들)
    extract_info(data.get("citations", []), "Forward")
    
    # Backward citations (이 논문이 참고한 논문들)
    extract_info(data.get("references", []), "Backward")
    
    return pd.DataFrame(citation_data)

# Example usage
doi = "10.4142/jvs.2016.17.3.391"
df = get_citation_info(doi)

# 결과 출력
if df is not None and not df.empty:
    print(df)
    # CSV로 저장 (선택)
    df.to_csv("citation_data.csv", index=False)
else:
    print("No citations found.")


                    Paper_Doi  Publish_Year Citation_Type  \
0   10.4142/jvs.2016.17.3.391          2016       Forward   
1   10.4142/jvs.2016.17.3.391          2016       Forward   
2   10.4142/jvs.2016.17.3.391          2016       Forward   
3   10.4142/jvs.2016.17.3.391          2016       Forward   
4   10.4142/jvs.2016.17.3.391          2016       Forward   
5   10.4142/jvs.2016.17.3.391          2016       Forward   
6   10.4142/jvs.2016.17.3.391          2016      Backward   
7   10.4142/jvs.2016.17.3.391          2016      Backward   
8   10.4142/jvs.2016.17.3.391          2016      Backward   
9   10.4142/jvs.2016.17.3.391          2016      Backward   
10  10.4142/jvs.2016.17.3.391          2016      Backward   
11  10.4142/jvs.2016.17.3.391          2016      Backward   
12  10.4142/jvs.2016.17.3.391          2016      Backward   
13  10.4142/jvs.2016.17.3.391          2016      Backward   
14  10.4142/jvs.2016.17.3.391          2016      Backward   
15  10.4142/jvs.2016.17.

In [None]:
import requests
import pandas as pd

def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error fetching data for DOI: {doi}")
        return None
    
    data = response.json()
    paper_year = data.get("year", "Unknown")  # 원 논문 출판 연도
    
    citation_data = []
    
    def extract_info(citation_list, citation_type):
        for paper in citation_list:
            if 'doi' in paper and 'year' in paper and 'title' in paper:
                citation_data.append({
                    "Paper_Doi": doi,
                    "Publish_Year": paper_year,
                    "Citation_Type": citation_type,
                    "Cited_Doi": paper["doi"],
                    "Year": paper["year"],
                    "Title": paper["title"]
                })
    
    # Forward citations (이 논문을 인용한 논문들)
    extract_info(data.get("citations", []), "Forward")
    
    # Backward citations (이 논문이 참고한 논문들)
    extract_info(data.get("references", []), "Backward")
    
    return citation_data

# CSV 파일 로드
input_file = "Paper_Dataset_Bio.csv"
output_file = "citation_data.csv"

df = pd.read_csv(input_file, encoding="utf-8")

# 결과 저장용 리스트
all_citations = []

# DOI 기준으로 데이터 수집
for idx, doi in enumerate(df["DOI"].dropna().unique()):  # 중복 제거 후 수집
    citation_info = get_citation_info(doi)
    if citation_info:
        all_citations.extend(citation_info)
    
    # 50개마다 진행 상황 출력
    if (idx + 1) % 50 == 0:
        print(f"Processed {idx + 1} papers...")

# DataFrame으로 변환 후 CSV 저장
citation_df = pd.DataFrame(all_citations)
citation_df.to_csv(output_file, index=False, encoding="utf-8")

print("✅ Citation data collection complete!")


In [None]:
# Delay 추가

import requests
import pandas as pd
import time

# API 요청 함수
def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    retries = 5  # 최대 재시도 횟수
    delay = 5  # 초기 대기 시간 (초)
    
    for attempt in range(retries):
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            paper_year = data.get("year", "Unknown")  # 원 논문 출판 연도
            citation_data = []
            
            def extract_info(citation_list, citation_type):
                for paper in citation_list:
                    if 'doi' in paper and 'year' in paper and 'title' in paper:
                        citation_data.append({
                            "Paper_Doi": doi,
                            "Publish_Year": paper_year,
                            "Citation_Type": citation_type,
                            "Cited_Doi": paper["doi"],
                            "Year": paper["year"],
                            "Title": paper["title"]
                        })
            
            extract_info(data.get("citations", []), "Forward")
            extract_info(data.get("references", []), "Backward")
            
            return citation_data
        
        elif response.status_code == 429:  # Too Many Requests
            print(f"Rate limit exceeded. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # 대기 시간 증가 (지수적 백오프)
        else:
            print(f"Error {response.status_code} for DOI: {doi}")
            return None
    
    print(f"Failed to fetch data for DOI: {doi} after {retries} attempts.")
    return None

# CSV 파일 로드
input_file = "Paper_Dataset_Bio.csv"
output_file = "citation_data.csv"
error_file = "error_log.txt"

# 기존 데이터 로드 (이미 수집된 DOI 중복 방지)
try:
    existing_data = pd.read_csv(output_file, encoding="utf-8")
    collected_dois = set(existing_data["Paper_Doi"].unique())
except FileNotFoundError:
    collected_dois = set()

df = pd.read_csv(input_file, encoding="utf-8")
all_citations = []
errors = []

total_papers = len(df["DOI"].dropna().unique())

for idx, doi in enumerate(df["DOI"].dropna().unique()):
    if doi in collected_dois:
        continue  # 이미 수집된 DOI는 건너뛰기
    
    citation_info = get_citation_info(doi)
    if citation_info:
        all_citations.extend(citation_info)
    else:
        errors.append(doi)
    
    if (idx + 1) % 50 == 0:
        print(f"Processed {idx + 1}/{total_papers} papers...")
        pd.DataFrame(all_citations).to_csv(output_file, mode='a', header=not bool(collected_dois), index=False, encoding="utf-8")
        all_citations = []
    
    time.sleep(1)  # 요청 간격 조정

# 마지막 데이터 저장
if all_citations:
    pd.DataFrame(all_citations).to_csv(output_file, mode='a', header=not bool(collected_dois), index=False, encoding="utf-8")

# 오류 로그 저장
if errors:
    with open(error_file, "w") as f:
        for doi in errors:
            f.write(doi + "\n")

print("✅ Citation data collection complete!")

Error 404 for DOI: 10.1021/ac0618730.s002
Error 404 for DOI: 10.1021/acs.langmuir.0c00729.s001
Error 404 for DOI: 10.1039/c2jm16448e
Error 404 for DOI: 10.4141/cjps2011-165
Error 404 for DOI: 10.1002/ange.201204989
Error 404 for DOI: 10.1093/infdis/jir731
Error 404 for DOI: 10.1016/j.imr.2015.04.053
Error 404 for DOI: 10.20944/preprints202307.1316.v1
Error 404 for DOI: 10.26226/morressier.578f37f9d462b8028d88f59d
Error 404 for DOI: 10.1201/9781003220329-20
Error 404 for DOI: 10.1021/acsanm.0c00474.s001
Error 404 for DOI: 10.1002/ange.201106758
Error 404 for DOI: 10.1161/blog.20200612.193056
Error 404 for DOI: 10.1016/j.ygyno.2013.04.368
Error 404 for DOI: 10.26226/morressier.599bdc78d462b80296ca0b33
Error 404 for DOI: 10.1093/jxb/erw002
Processed 350/6243 papers...
Error 404 for DOI: 10.1021/acs.langmuir.5b03945.s001
Processed 400/6243 papers...
Error 404 for DOI: 10.1002/ange.201302881
Processed 450/6243 papers...
Error 404 for DOI: 10.1002/ange.201108977
Error 404 for DOI: 10.1021/ac

: 

In [None]:
import requests
import pandas as pd
import time

# 논문 DOI를 이용해 인용 정보 가져오기
def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    retries = 5  # 최대 재시도 횟수
    delay = 5  # 초기 대기 시간 (초)
    
    for attempt in range(retries):
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            citation_data = []
            
            def extract_info(citation_list, citation_type):
                for paper in citation_list:
                    if "doi" in paper and "year" in paper and "title" in paper:
                        citation_data.append({
                            "Paper_Doi": doi,
                            "Citation_Type": citation_type,
                            "Cited_Doi": paper["doi"],
                            "Year": paper["year"],
                            "Title": paper["title"]
                        })
            
            extract_info(data.get("citations", []), "Forward")
            extract_info(data.get("references", []), "Backward")
            
            return citation_data
        
        elif response.status_code == 429:
            time.sleep(delay)
            delay *= 2  # 지수적 백오프 적용
        else:
            return None
    
    return None

# CSV 파일 로드
input_file = "Paper_Dataset_Bio.csv"
output_file = "Citation_Dataset_Bio.csv"
error_file = "error_log.txt"

df = pd.read_csv(input_file, encoding="utf-8-sig").head(100)
dois = df["DOI"].dropna().unique()

all_citations = []
errors = []

total_papers = len(dois)
for idx, doi in enumerate(dois, 1):
    citation_info = get_citation_info(doi)
    if citation_info:
        all_citations.extend(citation_info)
    else:
        errors.append(doi)
    
    if idx % 50 == 0:
        print(f"Processed {idx}/{total_papers} papers...")
        pd.DataFrame(all_citations).to_csv(output_file, mode='a', index=False, encoding="utf-8-sig")
        all_citations = []
    
    time.sleep(1)  # API 요청 간격 조정

# 마지막 데이터 저장
if all_citations:
    pd.DataFrame(all_citations).to_csv(output_file, mode='a', index=False, encoding="utf-8-sig")

# 오류 로그 저장
if errors:
    with open(error_file, "w") as f:
        for doi in errors:
            f.write(doi + "\n")

print("✅ Citation data collection complete!")


In [5]:
import requests
import time

def get_paper_doi(title, max_retries=5):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        "query": title,
        "fields": "externalIds,title"
    }
    
    for attempt in range(max_retries):
        response = requests.get(url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            if "data" in data and len(data["data"]) > 0:
                for paper in data["data"]:
                    if "externalIds" in paper and "DOI" in paper["externalIds"]:
                        return paper["externalIds"]["DOI"]
            return "DOI not found"
        elif response.status_code == 429:
            print(f"Rate limit exceeded. Retrying in {2 ** attempt} seconds...")
            time.sleep(2 ** attempt)  # 지수적 백오프 (1s, 2s, 4s, ...)
        else:
            return f"Error: {response.status_code}"
    
    return "Failed after multiple retries"

# 예제 논문 제목
title = "The auditory and speech performance of children with intellectual disability after cochlear implantation"
doi = get_paper_doi(title)
print("DOI:", doi)


DOI: 10.3109/00016489.2012.720031


In [35]:
#1차

import pandas as pd
import requests
import time

# 파일 경로 설정
paper_dataset_path = "Paper_Dataset_Bio.csv"
error_log_path = "error_log.txt"
output_path = "Updated_DOI_Dataset.csv"

# 1. Paper_Dataset_Bio.csv에서 DOI와 Title 로드
df = pd.read_csv(paper_dataset_path, dtype=str)  # 문자열로 읽기
doi_to_title = dict(zip(df["DOI"], df["Title"]))  # DOI -> Title 매핑

# 2. error_log.txt에서 잘못된 DOI 목록 로드
with open(error_log_path, "r") as file:
    error_dois = [line.strip() for line in file.readlines() if line.strip()]

# Semantic Scholar API에서 논문 제목으로 새로운 DOI 찾기
def get_paper_doi(title, max_retries=5):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": title, "fields": "externalIds,title"}

    for attempt in range(max_retries):
        response = requests.get(url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            if "data" in data and len(data["data"]) > 0:
                for paper in data["data"]:
                    if "externalIds" in paper and "DOI" in paper["externalIds"]:
                        return paper["externalIds"]["DOI"]
            return "DOI not found"
        elif response.status_code == 429:
            print(f"Rate limit exceeded. Retrying in {2 ** attempt} seconds...")
            time.sleep(2 ** attempt)  # 지수적 백오프 (1s, 2s, 4s, ...)
        else:
            return f"Error: {response.status_code}"

    return "Failed after multiple retries"

# 새로운 DOI 검색 및 저장
updated_data = []

for doi in error_dois:
    title = doi_to_title.get(doi, "Title not found")
    if title == "Title not found":
        print(f"Skipping DOI {doi} - No matching title found in dataset.")
        continue

    new_doi = get_paper_doi(title)
    updated_data.append({"Title": title, "Original_DOI": doi, "New_DOI": new_doi})
    print(f"Updated DOI for '{title}': {new_doi}")

# 새로운 데이터프레임 생성 및 저장
updated_df = pd.DataFrame(updated_data, encoding="utf-8")
updated_df.to_csv(output_path, index=False)

print(f"Updated DOI dataset saved to {output_path}")


FileNotFoundError: [Errno 2] No such file or directory: 'error_log.txt'

In [16]:
import requests
import time

# 'New_DOI'가 "Failed after multiple retries"인 경우 필터링
retry_df = updated_df[updated_df["New_DOI"] == "Failed after multiple retries"].copy()

# Semantic Scholar API에서 논문 제목으로 새로운 DOI 찾기
def get_paper_doi(title, max_retries=5):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": title, "fields": "externalIds,title"}

    for attempt in range(max_retries):
        response = requests.get(url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            if "data" in data and len(data["data"]) > 0:
                for paper in data["data"]:
                    if "externalIds" in paper and "DOI" in paper["externalIds"]:
                        return paper["externalIds"]["DOI"]
            return "DOI not found"
        elif response.status_code == 429:
            print(f"Rate limit exceeded. Retrying in {2 ** attempt} seconds...")
            time.sleep(2 ** attempt)  # 지수적 백오프 (1s, 2s, 4s, ...)
        else:
            return f"Error: {response.status_code}"

    return "Failed after multiple retries"

# 새로운 DOI 검색 및 업데이트
retry_df["New_DOI"] = retry_df["Title"].apply(get_paper_doi)

# 원본 updated_df에 반영
updated_df.loc[updated_df["New_DOI"] == "Failed after multiple retries", "New_DOI"] = retry_df["New_DOI"]

print("DOI 업데이트 완료!")


Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 2 seconds...
Rate limit exceeded. Retrying in 4 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 2 seconds...
Rate limit exceeded. Retrying in 4 seconds...
Rate limit exceeded. Retrying in 1 seconds...
DOI 업데이트 완료!


In [23]:
import pandas as pd

# 1. Load Updated_DOI_Dataset.csv
updated_df = pd.read_csv("Updated_DOI_Dataset.csv", encoding="ISO-8859-1")

# 2. Load Paper_Dataset_Bio.csv
paper_df = pd.read_csv("Paper_Dataset_Bio.csv")

# 3. Merge the two dataframes on the 'Title' column to match the records
merged_df = pd.merge(paper_df, updated_df[['Title', 'New_DOI']], on='Title', how='left')

# 4. Update the DOI column with the New_DOI from the merged data
merged_df['DOI'] = merged_df['New_DOI'].combine_first(merged_df['DOI'])

# 5. Drop the 'New_DOI' column as it's no longer needed
merged_df.drop(columns=['New_DOI'], inplace=True)

# 6. Save the updated Paper_Dataset_Bio.csv
merged_df.to_csv("Updated_Paper_Dataset_Bio.csv", index=False, encoding="utf-8-sig")


In [26]:
import requests
import pandas as pd
import time

# API 요청 함수
def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    retries = 5  # 최대 재시도 횟수
    delay = 5  # 초기 대기 시간 (초)
    
    for attempt in range(retries):
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            paper_year = data.get("year", "Unknown")  # 원 논문 출판 연도
            citation_data = []
            
            def extract_info(citation_list, citation_type):
                for paper in citation_list:
                    if 'doi' in paper and 'year' in paper and 'title' in paper:
                        citation_data.append({
                            "Paper_Doi": doi,
                            "Publish_Year": paper_year,
                            "Citation_Type": citation_type,
                            "Cited_Doi": paper["doi"],
                            "Year": paper["year"],
                            "Title": paper["title"]
                        })
            
            extract_info(data.get("citations", []), "Forward")
            extract_info(data.get("references", []), "Backward")
            
            return citation_data
        
        elif response.status_code == 429:  # Too Many Requests
            print(f"Rate limit exceeded. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # 대기 시간 증가 (지수적 백오프)
        else:
            print(f"Error {response.status_code} for DOI: {doi}")
            return None
    
    print(f"Failed to fetch data for DOI: {doi} after {retries} attempts.")
    return None

# CSV 파일 로드
input_file = "Updated_DOI_Dataset.csv"
output_file = "citation_data.csv"
error_file = "error_log.txt"

# 기존 데이터 로드 (이미 수집된 DOI 중복 방지)
try:
    existing_data = pd.read_csv(output_file, encoding="utf-8-sig")
    collected_dois = set(existing_data["Paper_Doi"].unique())
except FileNotFoundError:
    collected_dois = set()

# Updated_DOI_Dataset 로드
df = pd.read_csv(input_file, encoding="utf-8-sig")
all_citations = []
errors = []

total_papers = len(df["DOI"].dropna().unique())

for idx, doi in enumerate(df["DOI"].dropna().unique()):
    if doi in collected_dois:
        continue  # 이미 수집된 DOI는 건너뛰기
    
    citation_info = get_citation_info(doi)
    if citation_info:
        all_citations.extend(citation_info)
    else:
        errors.append(doi)
    
    if (idx + 1) % 50 == 0:
        print(f"Processed {idx + 1}/{total_papers} papers...")
        pd.DataFrame(all_citations).to_csv(output_file, mode='a', header=not bool(collected_dois), index=False, encoding="utf-8-sig")
        all_citations = []
    
    time.sleep(1)  # 요청 간격 조정

# 마지막 데이터 저장
if all_citations:
    pd.DataFrame(all_citations).to_csv(output_file, mode='a', header=not bool(collected_dois), index=False, encoding="utf-8-sig")

# 오류 로그 저장
if errors:
    with open(error_file, "w") as f:
        for doi in errors:
            f.write(doi + "\n")

print("✅ Citation data collection complete!")


Error 404 for DOI: 10.1039/c2jm16448e
Error 404 for DOI: 10.1093/infdis/jir731
Error 404 for DOI: 10.1093/jxb/erw002
Error 404 for DOI: 10.1016/j.biomaterials.2020.120412
Processed 50/407 papers...


UnicodeEncodeError: 'latin-1' codec can't encode character '\u2018' in position 94: ordinal not in range(256)

In [27]:
pd.DataFrame(all_citations).to_csv(output_file, mode='a', header=not bool(collected_dois), index=False, encoding="utf-8")


In [39]:
import requests
import pandas as pd
import time

# 논문 DOI를 이용해 인용 정보 가져오기
def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    retries = 5  # 최대 재시도 횟수
    delay = 5  # 초기 대기 시간 (초)
    
    for attempt in range(retries):
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            citation_data = []
            
            def extract_info(citation_list, citation_type):
                for paper in citation_list:
                    if "doi" in paper and "year" in paper and "title" in paper:
                        citation_data.append({
                            "Paper_Doi": doi,
                            "Citation_Type": citation_type,
                            "Cited_Doi": paper["doi"],
                            "Year": paper["year"],
                            "Title": paper["title"]
                        })
            
            extract_info(data.get("citations", []), "Forward")
            extract_info(data.get("references", []), "Backward")
            
            return citation_data
        
        elif response.status_code == 429:
            time.sleep(delay)
            delay *= 2  # 지수적 백오프 적용
        else:
            return None
    
    return None

# CSV 파일 로드
input_file = "Paper_Dataset_Bio.csv"
output_file = "Citation_Dataset_Bio.csv"
error_file = "error_log.txt"

df = pd.read_csv(input_file, encoding="utf-8-sig")
dois = df["DOI"].dropna().unique()

all_citations = []
errors = []

total_papers = len(dois)
for idx, doi in enumerate(dois, 1):
    citation_info = get_citation_info(doi)
    if citation_info:
        all_citations.extend(citation_info)
    else:
        errors.append(doi)
    
    if idx % 50 == 0:
        print(f"Processed {idx}/{total_papers} papers...")
        pd.DataFrame(all_citations).to_csv(output_file, mode='a', index=False, encoding="utf-8-sig")
        all_citations = []
    
    time.sleep(1)  # API 요청 간격 조정

# 마지막 데이터 저장
if all_citations:
    pd.DataFrame(all_citations).to_csv(output_file, mode='a', index=False, encoding="utf-8-sig")

# 오류 로그 저장
if errors:
    with open(error_file, "w") as f:
        for doi in errors:
            f.write(doi + "\n")

print("✅ Citation data collection complete!")


Processed 50/6244 papers...
Processed 100/6244 papers...
Processed 150/6244 papers...
Processed 200/6244 papers...
Processed 250/6244 papers...
Processed 300/6244 papers...
Processed 350/6244 papers...
Processed 400/6244 papers...
Processed 450/6244 papers...
Processed 500/6244 papers...
Processed 550/6244 papers...
Processed 600/6244 papers...
Processed 650/6244 papers...
Processed 700/6244 papers...
Processed 750/6244 papers...
Processed 800/6244 papers...
Processed 850/6244 papers...
Processed 900/6244 papers...
Processed 950/6244 papers...
Processed 1000/6244 papers...
Processed 1050/6244 papers...
Processed 1100/6244 papers...
Processed 1150/6244 papers...
Processed 1200/6244 papers...
Processed 1250/6244 papers...
Processed 1300/6244 papers...
Processed 1350/6244 papers...
Processed 1400/6244 papers...
Processed 1450/6244 papers...
Processed 1500/6244 papers...
Processed 1550/6244 papers...
Processed 1600/6244 papers...
Processed 1650/6244 papers...
Processed 1700/6244 papers...


ConnectionError: HTTPSConnectionPool(host='api.semanticscholar.org', port=443): Max retries exceeded with url: /v1/paper/10.1186/s12934-019-1213-y (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002D4361A5760>: Failed to resolve 'api.semanticscholar.org' ([Errno 11001] getaddrinfo failed)"))

In [41]:
import pandas as pd

# CSV로 저장
df_citation.to_csv(output_file, index=False, encoding="utf-8-sig")


In [6]:
import pandas as pd

# Load the datasets
paper_df = pd.read_csv('Paper_Dataset_Bio.csv')
citation_df = pd.read_csv('Citation_Dataset_Bio.csv')

# Extract the 'DOI' from Paper_Dataset_Bio.csv and 'Paper_DOI' from Citation_Dataset_Bio.csv
paper_dois = set(paper_df['DOI'])
citation_paper_dois = set(citation_df['Paper_Doi'])

# Filter rows in paper_df where the DOI is not in citation_paper_dois
filtered_paper_df = paper_df[~paper_df['DOI'].isin(citation_paper_dois)]

# Save the filtered result to a new CSV file if you want
filtered_paper_df.to_csv('Filtered_Paper_Dataset_Bio.csv', index=False, encoding="utf-8-sig")

# Display the filtered dataframe
print(filtered_paper_df)


      ID_B                  ID_B_Org  ID_Project_Unique  ID_Paper  Year  \
28     172          201107011390401C         1345171157       165  2012   
32     229          201207011580401A         1345171791       180  2012   
56     135          200807020340401A         1345194033       322  2012   
69     135          200807020340401A         1345194033       329  2012   
71     135          200807020340401A         1345194033       339  2012   
...    ...                       ...                ...       ...   ...   
6271    75                  1.31E+22         1711105377     40735  2020   
6285    75                  1.31E+22         1711105377     40751  2020   
6297    85                  1.31E+22         1711113344     40842  2020   
6361    67  13111015015222002241428A         1711121217     41666  2020   
6486   270                  2.11E+22         9991006004     42135  2015   

      Year_Perf  Department  \
28         2012          15   
32         2012          15   
56    

  citation_df = pd.read_csv('Citation_Dataset_Bio.csv')


In [2]:
# 400개 누락만
import requests
import pandas as pd
import time

# 논문 DOI를 이용해 인용 정보 가져오기
def get_citation_info(doi):
    url = f"https://api.semanticscholar.org/v1/paper/{doi}"
    retries = 5  # 최대 재시도 횟수
    delay = 5  # 초기 대기 시간 (초)
    
    for attempt in range(retries):
        response = requests.get(url)
            
        if response.status_code == 200:
            data = response.json()
            paper_year = data.get("year", "Unknown")  # 원 논문 출판 연도
            citation_data = []
            
            def extract_info(citation_list, citation_type):
                for paper in citation_list:
                    if 'doi' in paper and 'year' in paper and 'title' in paper:
                        citation_data.append({
                            "Paper_Doi": doi,
                            "Publish_Year": paper_year,
                            "Citation_Type": citation_type,
                            "Cited_Doi": paper["doi"],
                            "Year": paper["year"],
                            "Title": paper["title"]
                        })
            
            extract_info(data.get("citations", []), "Forward")
            extract_info(data.get("references", []), "Backward")
            
            return citation_data
        
        elif response.status_code == 429:
            time.sleep(delay)
            delay *= 2  # 지수적 백오프 적용
        else:
            return None
    
    return None

# CSV 파일 로드
input_file = "Filtered_Paper_Dataset_Bio.csv"
output_file = "Add_Citation_Dataset_Bio.csv"
error_file = "error_log.txt"

df = pd.read_csv(input_file, encoding="utf-8-sig")
dois = df["DOI"].dropna().unique()

all_citations = []
errors = []

total_papers = len(dois)
for idx, doi in enumerate(dois, 1):
    citation_info = get_citation_info(doi)
    if citation_info:
        all_citations.extend(citation_info)
    else:
        errors.append(doi)
    
    if idx % 50 == 0:
        print(f"Processed {idx}/{total_papers} papers...")
        pd.DataFrame(all_citations).to_csv(output_file, mode='a', index=False, encoding="utf-8-sig")
        all_citations = []
    
    time.sleep(1)  # API 요청 간격 조정

# 마지막 데이터 저장
if all_citations:
    pd.DataFrame(all_citations).to_csv(output_file, mode='a', index=False, encoding="utf-8-sig")

# 오류 로그 저장
if errors:
    with open(error_file, "w") as f:
        for doi in errors:
            f.write(doi + "\n")

print("✅ Citation data collection complete!")


Processed 50/289 papers...
Processed 100/289 papers...
Processed 150/289 papers...
Processed 200/289 papers...
Processed 250/289 papers...
✅ Citation data collection complete!


In [16]:
import requests
import pandas as pd

# 논문의 Semantic Scholar ID (URL에서 추출)
paper_id = "3670a4ee1bf2acf18e55646ec0d927b25b2caddd"

def fetch_citation_info(paper_id):
    """Semantic Scholar API를 이용하여 논문의 인용 및 참고문헌 정보 가져오기"""
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    params = {
        "fields": "title,year,citations.paperId,citations.title,citations.year,references.paperId,references.title,references.year"
    }
    
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        
        # Citation 정보
        citations = data.get("citations", [])
        citation_list = [
            {"Paper_ID": paper_id, "Citation_Type": "Forward", "Cited_PaperId": c["paperId"], "Year": c.get("year"), "Title": c.get("title")}
            for c in citations
        ]

        # Reference 정보
        references = data.get("references", [])
        reference_list = [
            {"Paper_ID": paper_id, "Citation_Type": "Backward", "Cited_PaperId": r["paperId"], "Year": r.get("year"), "Title": r.get("title")}
            for r in references
        ]
        
        return citation_list + reference_list
    else:
        print("❌ API 요청 실패:", response.status_code)
        return None

# 논문의 Citation 및 Reference 정보 가져오기
citation_data = fetch_citation_info(paper_id)

# 데이터 출력
if citation_data:
    df = pd.DataFrame(citation_data)
    print(df)
    # CSV 파일 저장 (원하면 주석 해제)
    # df.to_csv("Citation_Info.csv", index=False, encoding="utf-8-sig")
else:
    print("❌ 데이터 없음")


                                    Paper_ID Citation_Type  \
0   3670a4ee1bf2acf18e55646ec0d927b25b2caddd       Forward   
1   3670a4ee1bf2acf18e55646ec0d927b25b2caddd       Forward   
2   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
3   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
4   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
5   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
6   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
7   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
8   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
9   3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
10  3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
11  3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
12  3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
13  3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
14  3670a4ee1bf2acf18e55646ec0d927b25b2caddd      Backward   
15  3670

In [None]:
# 이 코드는 ID를 가져옴.. DOI를 가져오도록 수정하기

import requests
import pandas as pd
import re
import time

# 파일 로드
df = pd.read_csv('Filtered_Paper_Dataset_Bio.csv')

def fetch_citation_info(paper_id):
    """Semantic Scholar API를 이용하여 논문의 인용 및 참고문헌 정보 가져오기"""
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    params = {
        "fields": "title,year,externalIds.doi,citations.paperId,citations.title,citations.year,citations.externalIds.doi,references.paperId,references.title,references.year,references.externalIds.doi"
    }
    
    retries = 5  # 최대 재시도 횟수
    delay = 2  # 초기 대기 시간 (초)
    
    for attempt in range(retries):
        response = requests.get(url, params=params)
        if response.status_code == 200:
            data = response.json()
            
            paper_doi = data.get("externalIds", {}).get("DOI", paper_id)  # DOI가 없으면 기존 ID 사용
            
            # Citation 정보
            citations = data.get("citations", [])
            citation_list = [
                {"Paper_ID": paper_doi, "Citation_Type": "Forward", "Cited_PaperId": c.get("externalIds", {}).get("DOI", c["paperId"]), "Year": c.get("year"), "Title": c.get("title")}
                for c in citations
            ]
    
            # Reference 정보
            references = data.get("references", [])
            reference_list = [
                {"Paper_ID": paper_doi, "Citation_Type": "Backward", "Cited_PaperId": r.get("externalIds", {}).get("DOI", r["paperId"]), "Year": r.get("year"), "Title": r.get("title")}
                for r in references
            ]
            
            return citation_list + reference_list
        elif response.status_code == 429:
            print(f"❌ API 요청이 429 오류로 실패했습니다. {attempt + 1}/{retries}번 재시도 중... {delay}초 대기")
            time.sleep(delay)
            delay *= 2  # 지수적 백오프 적용
        elif response.status_code == 404:
            print(f"❌ 404 오류: 논문 ID {paper_id}에 대한 정보가 없습니다.")
            return None  # 404 오류 발생 시 바로 None 반환
        else:
            print(f"❌ API 요청 실패: {response.status_code}")
            return None

    print(f"❌ API 요청이 {retries}번 실패하여 논문 ID {paper_id}를 건너뜁니다.")
    return None  # 모든 시도가 실패하면 None 반환

# 논문의 Citation 및 Reference 정보 수집
all_citation_data = []
missing_papers = []  # 404 오류로 누락된 논문 리스트

# 'Sem_URL'에서 paperId 추출 후 인용 정보 가져오기
for index, row in df.iterrows():
    sem_url = row.get('Sem_URL')
    if pd.notna(sem_url):  # 'Sem_URL'이 비어있지 않은 경우에만 처리
        # URL에서 paperId 추출 (예: https://www.semanticscholar.org/paper/Current-state-and-perspectives-on-erythropoietin-Lee-Ha/1aaa70f955841f3eb70d928964e388d66ba1197d)
        match = re.search(r"/paper/([^/]+)$", sem_url)
        if match:
            paper_id = match.group(1)
            print(f"Processing paper ID: {paper_id}")
            citation_data = fetch_citation_info(paper_id)
            if citation_data:
                all_citation_data.extend(citation_data)
            else:
                missing_papers.append(paper_id)  # 404 오류로 누락된 논문 기록

# 데이터 출력 및 CSV 저장
if all_citation_data:
    df_citations = pd.DataFrame(all_citation_data)
    print(df_citations)
    # 결과를 CSV로 저장 (원하면 주석 해제)
    df_citations.to_csv('Citation_Info_Updated.csv', index=False, encoding="utf-8-sig")
else:
    print("❌ No citation data found")

# 누락된 논문 ID 로그 파일로 저장
if missing_papers:
    with open("missing_papers.txt", "w") as f:
        for paper_id in missing_papers:
            f.write(paper_id + "\n")
    print("❌ 일부 논문은 404 오류로 누락되었습니다. missing_papers.txt에 기록되었습니다.")

In [12]:
import pandas as pd
import requests
import time

# 파일 로드
df = pd.read_csv('Filtered_Paper_Dataset_Bio.csv')

# Semantic Scholar 검색 URL
SEARCH_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

# API 호출 함수
def get_semantic_scholar_url(title):
    params = {"query": title, "limit": 1}
    response = requests.get(SEARCH_URL, params=params)
    
    if response.status_code == 200:
        data = response.json()
        papers = data.get("data", [])
        if papers:
            return f"https://www.semanticscholar.org/paper/{papers[0]['paperId']}"
    return ""

# 진행 상황을 출력하는 함수
def print_progress(idx, total, title):
    print(f"Processing ({idx}/{total}): {title[:50]}...")  # 제목이 너무 길면 잘라서 출력

# 'Sem_URL'이 비어있는 경우 검색 수행
total_papers = len(df)
for idx, row in df.iterrows():
    if pd.isna(row['Sem_URL']):
        title = row['Title']
        if pd.notna(title):
            # 진행 상황 출력
            print_progress(idx + 1, total_papers, title)
            
            # Semantic Scholar URL을 얻어 업데이트
            df.at[idx, 'Sem_URL'] = get_semantic_scholar_url(title)
            
            # 요청 간 딜레이 추가
            time.sleep(1)
    
    # 진행 상황을 주기적으로 출력
    if (idx + 1) % 50 == 0:
        print(f"{idx + 1}/{total_papers} papers processed...")

# 결과 저장
df.to_csv('Filtered_Paper_Dataset_Bio_Updated.csv', index=False, encoding="utf-8-sig")

print("✅ Data processing complete!")


Processing (1/252): Kinetic studies on the formation of various II-VI ...
Processing (5/252): Transcriptional analysis of the rho-coumarate 3-hy...
Processing (7/252): Diet-Induced Obesity Dramatically Reduces the Effi...
Processing (21/252): Loose Plant Architecture1 (LPA1) determines lamina...
Processing (29/252): Diet-Induced Obesity Dramatically Reduces the Effi...
Processing (31/252): Development of a Novel Technique for Scaffold Fabr...
50/252 papers processed...
Processing (75/252): Combined Biocatalytic and Chemical Transformations...
Processing (89/252): STUDY ON TEMPERATURE ELEVATION OF PHOTOACOUSTIC TO...
Processing (90/252): DETECTION OF THE MORPHOLOGY OF THE CORONARY ARTERY...
Processing (95/252): Effects of Particulate Matter on the Developments ...
100/252 papers processed...
Processing (106/252): Comparison of 2D and 3D cell-based models using hu...
Processing (135/252): Optimization of adipose tissue-derived mesenchymal...
Processing (136/252): Amelioration of autoimmu

In [1]:
import pandas as pd

# Load the datasets
paper_df = pd.read_csv('Paper_Dataset_Bio.csv')
citation_df = pd.read_csv('Citation_Dataset_Bio.csv')

# Merge the dataframes on the 'DOI' and 'Paper_Doi' columns to compare the 'Year' and 'Publish_Year'
merged_df = pd.merge(paper_df, citation_df, left_on='DOI', right_on='Paper_Doi', how='inner')

# Filter the rows where the 'Year_Perf' in Paper Dataset does not match the 'Publish_Year' in Citation Dataset
mismatched_data = merged_df[merged_df['Year_Perf'] != merged_df['Publish_Year']]

# Remove duplicates based on 'DOI'
unique_mismatched_data = mismatched_data.drop_duplicates(subset='DOI')

# Select only the relevant columns
unique_mismatched_data = unique_mismatched_data[['DOI', 'Year_Perf', 'Paper_Doi', 'Publish_Year']]

# Display the unique mismatched data
print(unique_mismatched_data)

# If you want to save this data to a new CSV file
unique_mismatched_data.to_csv('Unique_Mismatched_Paper_Citation.csv', index=False)

  citation_df = pd.read_csv('Citation_Dataset_Bio.csv')


                               DOI  Year_Perf                   Paper_Doi  \
1295         10.3892/ijo.2011.1319       2012       10.3892/ijo.2011.1319   
2503     10.1007/s00253-011-3758-5       2012   10.1007/s00253-011-3758-5   
3416           10.1093/nar/gkr1021       2012         10.1093/nar/gkr1021   
3470           10.1093/nar/gkr1127       2012         10.1093/nar/gkr1127   
4784     10.1007/s00449-011-0610-3       2012   10.1007/s00449-011-0610-3   
...                            ...        ...                         ...   
429092     10.1538/expanim.19-0065       2020     10.1538/expanim.19-0065   
429546          10.1111/bcpt.13342       2020          10.1111/bcpt.13342   
429829  10.1016/j.bbrc.2019.10.079       2020  10.1016/j.bbrc.2019.10.079   
430340     10.1074/jbc.M116.737940       2017     10.1074/jbc.M116.737940   
430598         10.1038/mp.2017.113       2018         10.1038/mp.2017.113   

       Publish_Year  
1295           2011  
2503           2011  
3416     

In [8]:
import pandas as pd

# Load the CSV files into DataFrames
unique_mismatched_df = pd.read_csv('Unique_Mismatched_Paper_Citation.csv')
paper_bio_df = pd.read_csv('Paper_Dataset_Bio.csv')

# Ensure the column names are as expected
# Assuming the DOI column is named 'DOI' in both datasets and 'Title' is the column name in Paper_Dataset_Bio.csv
# If the column names are different, adjust the names accordingly

# Merge the dataframes on 'DOI' to get the title corresponding to each DOI in Unique_Mismatched_Paper_Citation
merged_df = pd.merge(unique_mismatched_df, paper_bio_df[['DOI', 'Title']], on='DOI', how='left')

# Save the result back to a CSV file
merged_df.to_csv('Updated_Unique_Mismatched_Paper_Citation.csv', index=False)

print("Title has been successfully added based on matching DOI and saved to 'Updated_Unique_Mismatched_Paper_Citation.csv'.")


Title has been successfully added based on matching DOI and saved to 'Updated_Unique_Mismatched_Paper_Citation.csv'.
