# In [2]:
import pandas as pd
import re

# Load the spreadsheet of combined results into a DataFrame
df = pd.read_excel('combined_results.xlsx')

# Add the 'DOI_LINK' column when the sheet does not have one yet, filling
# every row with the 'NOT_FOUND' placeholder
if 'DOI_LINK' not in df:
    df['DOI_LINK'] = 'NOT_FOUND'

# Boolean mask: True for rows whose 'Database' value is 'ScienceDirect'
condition = df['Database'].eq('ScienceDirect')

# Regex pattern to extract DOIs: the "10." prefix, a 4-9 digit registrant
# code, a slash, then every following non-whitespace character
doi_pattern = r"10\.\d{4,9}/[^\s]+"

# Function to extract DOI from URL
def extract_doi(url):
    """Return the DOI found in *url*, or 'NOT_FOUND' when there is none.

    The value is converted with ``str()`` first, so non-string cells (for
    example NaN or None) simply fail to match and yield 'NOT_FOUND'.

    Args:
        url: Cell value expected to hold a DOI link or a bare DOI.

    Returns:
        The DOI text (starting at ``10.``) with any URL query string,
        fragment and trailing sentence punctuation removed, or the string
        'NOT_FOUND'.
    """
    match = re.search(doi_pattern, str(url))
    if not match:
        return 'NOT_FOUND'

    # The pattern runs to the next whitespace, so for a link such as
    # ".../10.1016/j.x.2020.01.001?via%3Dihub" it also swallows the query
    # string. A literal '?' or '#' in a URL starts the query/fragment (inside
    # a DOI they would be percent-encoded), so cut the match there.
    doi = re.split(r"[?#]", match.group(), maxsplit=1)[0]
    # Drop punctuation that belongs to the surrounding text, not the DOI.
    doi = doi.rstrip(".,;")

    # Trimming can leave only the "10.xxxx/" prefix; that is not a DOI.
    if not re.fullmatch(doi_pattern, doi):
        return 'NOT_FOUND'
    return doi

# Handle the ScienceDirect rows: copy the current 'DOI' cell (the DOI URL)
# into 'DOI_LINK', then overwrite 'DOI' with whatever extract_doi returns
# NOTE(review): the copy is verbatim, so a row whose 'DOI' cell is empty gets
# an empty 'DOI_LINK' rather than 'NOT_FOUND' - confirm this is intended
df.loc[condition, 'DOI_LINK'] = df.loc[condition, 'DOI']
df.loc[condition, 'DOI'] = df.loc[condition, 'DOI'].map(extract_doi)

# Make 'REVIEW_STAGE' the leading column, keeping the others in their order
columns = df.columns.tolist()
if 'REVIEW_STAGE' not in columns:
    print("'REVIEW_STAGE' column not found in the DataFrame.")
else:
    columns.remove('REVIEW_STAGE')
    columns.insert(0, 'REVIEW_STAGE')
    df = df[columns]

# Write the result to a separate Excel file (the input file is left
# untouched), without the index column
df.to_excel('combined_results_doi_mapped.xlsx', index=False)
