In [6]:
import pandas as pd

# Load the Excel file
file_path = "combined_results_doi_mapped.xlsx"
df = pd.read_excel(file_path)

# Identify duplicates based on DOI and Title columns
duplicates_doi = df[df.duplicated(subset='DOI', keep=False)]
duplicates_title = df[df.duplicated(subset='Title', keep=False)]

# Combine DOI and Title duplicates for a unified summary
combined_duplicates = pd.concat([duplicates_doi, duplicates_title]).drop_duplicates()

# Group duplicates to summarize occurrences
duplicate_summary = combined_duplicates.groupby(['DOI', 'Title']).agg(
    Count=('Database', 'size'),
    Databases=('Database', lambda x: ', '.join(x.unique()))
).reset_index()

# Save the summary of duplicates to an Excel file
summary_file = "duplicate_summary_combined.xlsx"
duplicate_summary.to_excel(summary_file, index=False)

# Remove duplicates from the original dataset (keep the first occurrence)
df_cleaned = df.drop_duplicates(subset=['DOI', 'Title'], keep='first')

# Save the cleaned dataset to an Excel file
cleaned_file = "combined_results_cleaned.xlsx"
df_cleaned.to_excel(cleaned_file, index=False)

# Display results
print(f"Number of duplicate entries: {duplicate_summary.shape[0]}")
print(f"Duplicate summary saved to: {summary_file}")
print(f"Cleaned dataset saved to: {cleaned_file}")


Number of duplicate entries: 158
Duplicate summary saved to: duplicate_summary_combined.xlsx
Cleaned dataset saved to: combined_results_cleaned.xlsx
