In [1]:
#Author: Christian Hernandez
#Date: 01/04/2024
#Description: Converting Distiller SR PDFs conventional names to Ref IDs. This will facilitate NID
#creation in the future. ALso for vetting purposes.

#%pip install openpyxl PyPDF2

In [3]:
# Removes PDF Duplicates

import os
import hashlib

def calculate_file_hash(file_path):
    """
    Calculate the MD5 hash of a file's content.
    """
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as file:
        while True:
            data = file.read(8192)
            if not data:
                break
            hash_md5.update(data)
    return hash_md5.hexdigest()

def find_duplicate_pdfs(directory):
    """
    Find and remove duplicate PDF files in a directory.
    """
    file_hash_dict = {}
    duplicates = []

    # Traverse the directory and calculate hashes
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if filename.lower().endswith(".pdf"):
                file_path = os.path.join(root, filename)
                file_hash = calculate_file_hash(file_path)

                # Check if the same hash already exists
                if file_hash in file_hash_dict:
                    duplicates.append((file_path, file_hash_dict[file_hash]))
                else:
                    file_hash_dict[file_hash] = file_path

    return duplicates

def remove_duplicate_pdfs(directory):
    """
    Remove duplicate PDF files from a directory.
    """
    duplicates = find_duplicate_pdfs(directory)

    for duplicate in duplicates:
        file_path, original_path = duplicate
        os.remove(file_path)
        print(f"Removed duplicate: {file_path}")

if __name__ == "__main__":
    directory = "/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/PDFs/"
    remove_duplicate_pdfs(directory)
print("Done")

Done


In [6]:
### Unstacking the vertical stacked characters in Column B

import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel("/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/bench.xlsx")

# Initialize an empty DataFrame to store the separated rows
new_rows = []

# Iterate through each row in the original DataFrame
for _, row in df.iterrows():
    # Split the vertically stacked values in Column B by newline character '\n'
    stacked_values = row["Column B"].split("\n")

    # Create a new row for each separated value, with the same value in Column A
    for value in stacked_values:
        new_row = {"Column A": row["Column A"], "Column B": value.strip()}
        new_rows.append(new_row)

# Create a new DataFrame from the list of separated rows
new_df = pd.DataFrame(new_rows)

# Save the separated data to a new Excel file
new_df.to_excel("/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/bench.xlsx", index=False)
print("Done")

# Rename the RefID column 'Column A'
# Rename the Attachment column "Column B"

Done


In [8]:
# Deleting .pdf, .docx, () & , from Column B characters.

import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel("/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/bench.xlsx")

# Remove '.pdf' and '.docx' from values in Column B
df["Column B"] = df["Column B"].str.replace(".pdf", "").str.replace(".docx", "").str.replace(",", "").str.replace("?", "")

# Save the cleaned data to a new Excel file
df.to_excel("/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/bench.xlsx", index=False)


  df["Column B"] = df["Column B"].str.replace(".pdf", "").str.replace(".docx", "").str.replace(",", "").str.replace("?", "")
  df["Column B"] = df["Column B"].str.replace(".pdf", "").str.replace(".docx", "").str.replace(",", "").str.replace("?", "")


In [9]:
import os
import openpyxl

# Load the Excel file
excel_file = "/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/bench.xlsx"  # Replace with your Excel file name
workbook = openpyxl.load_workbook(excel_file)
worksheet = workbook.active

# Create a dictionary to map old names to new names
name_mapping = {}
for row in worksheet.iter_rows(min_row=2, values_only=True):
    new_name, old_name = row
    if old_name and new_name:
        name_mapping[old_name] = new_name

# Directory where the PDF files are located
pdf_directory = "/snfs1/Project/birds/team_documents/maternal_morbidity/high_FPG_SR/Extraction/NIDs/PDFs/"  # Replace with the path to your PDF files directory

# Iterate through the PDF files in the directory
for pdf_file in os.listdir(pdf_directory):
    if pdf_file.endswith(".pdf"):
        old_name = pdf_file[:-4]  # Remove ".pdf" extension
        if old_name in name_mapping:
            new_name = name_mapping[old_name]
            old_pdf_path = os.path.join(pdf_directory, pdf_file)
            new_pdf_path = os.path.join(pdf_directory, f"{new_name}.pdf")
            
            # Rename the PDF file
            os.rename(old_pdf_path, new_pdf_path)
            print(f"Renamed '{old_name}.pdf' to '{new_name}.pdf'")

# Close the Excel file
workbook.close()
print("Done")

Renamed '17392.pdf' to '17392.pdf'
Renamed 'bmjopen-2018-025084.PMC6500205.pdf' to '20885.pdf'
Renamed '26883.pdf' to '26883.pdf'
Renamed '35914203.pdf' to '35914203.pdf'
Renamed '24239Yong-High early pregnancy serum 25-hydroxy vit.pdf' to '24239.pdf'
Renamed '18810Satodiya-Comparison of One-Step Versus Two-Ste.pdf' to '18810.pdf'
Renamed '15634Alsaedi-Prevalence and risk factors of gestati.pdf' to '15634.pdf'
Renamed 'Glasser et al. 2011.pdf' to '35914562.pdf'
Renamed 'Yin et al. 2022.pdf' to '23072.pdf'
Renamed 'fendo-12-644770.pdf' to '25264.pdf'
Renamed '24139Nigatu-Prevalence of Gestational Diabetes Mell.pdf' to '24139.pdf'
Renamed 'JFMPC-11-4545.PMC9638613.pdf' to '22886.pdf'
Renamed 'je-32-069.pdf' to '26126.pdf'
Renamed '18257.pdf' to '18257.pdf'
Renamed '16322Dong-Chocolate consumption and risk of gestati.pdf' to '16322.pdf'
Renamed 'Lee et al. 1996.pdf' to '1161.pdf'
Renamed '17606.pdf' to '17606.pdf'
Renamed '618.PMC3579359.pdf' to '35914593.pdf'
Renamed '24639Nwali-Universa

Renamed 'Beischer et al. 1991.pdf' to '2131.pdf'
Renamed 'rmhp-14-1025.PMC7966303.pdf' to '25653.pdf'
Renamed '26980Hassan-Blood Groups and Hematological Paramete.pdf' to '26980.pdf'
Renamed '12884_2019_Article_2406.pdf' to '16353.pdf'
Renamed 'Ogunyemi et al. 1998.pdf' to '662.pdf'
Renamed '41598_2019_Article_51861.pdf' to '16003.pdf'
Renamed 'PAMJ-36-182.pdf' to '14794.pdf'
Renamed '15618.pdf' to '15618.pdf'
Renamed 'kcj-41-23.PMC3040399.pdf' to '7464.pdf'
Renamed '592_2018_Article_1162.pdf' to '35914586.pdf'
Renamed '18972Hanna-Diagnosing gestational diabetes mellitus.pdf' to '18972.pdf'
Renamed '12884_2015_Article_682.pdf' to '13955.pdf'
Renamed 'bmjopen-2019-033296.PMC7045252.pdf' to '15637.pdf'
Renamed 'Whitaker et al. 1998.pdf' to '35914329.pdf'
Renamed 'PJMS-37-1625.pdf' to '24640.pdf'
Renamed '23671Chelu-Prevalence of Gestational Diabetes in pr.pdf' to '23671.pdf'
Renamed 'nutrients-08-00574.pdf' to '19506.pdf'
Renamed 'Biodun et al. 2015.pdf' to '25855324.pdf'
Renamed '22229P