In [None]:
import pandas as pd
from data_pipeline.harvester import WosHarvester, ScopusHarvester
from data_pipeline.deduplicator import DataFrameProcessor
from dspace.client import DSpaceClient
import logging

In [2]:
# Harvest configuration shared by the cells below: one search query per
# bibliographic source, plus the date window passed to both harvesters.
default_queries = dict(
    wos="AI=(GCM-6397-2022)",  # Web of Science query string
    scopus="AU-ID ( 6602190457 )",  # Scopus query string
)
start_date, end_date = "2000-01-01", "2024-09-05"

In [3]:
# Harvest the Web of Science records matching the configured query and date
# window. `wos_publications` is reused by the merge cell further down.
# NOTE(review): harvest() presumably hits the remote WoS API — confirm in
# data_pipeline.harvester before re-running this cell repeatedly.
wos_harvester = WosHarvester(start_date, end_date, default_queries["wos"])
wos_publications = wos_harvester.harvest()

In [None]:
# Show the harvested WoS records as the cell output for a quick visual check.
wos_publications

In [4]:
# Harvest the Scopus records matching the configured query and date window.
# `scopus_publications` is reused by the merge cell below.
# NOTE(review): harvest() presumably hits the remote Scopus API — confirm in
# data_pipeline.harvester before re-running this cell repeatedly.
scopus_harvester = ScopusHarvester(start_date, end_date, default_queries["scopus"])
scopus_publications = scopus_harvester.harvest()

In [None]:
# Merge the WoS and Scopus harvests and deduplicate them against each other.
# `deduplicator` is kept because a later cell calls its
# deduplicate_infoscience() method on the merged result.
deduplicator = DataFrameProcessor(wos_publications, scopus_publications)
deduplicated_sources_df = deduplicator.deduplicate_dataframes()
# Bare expression: display the merged frame as the cell output.
deduplicated_sources_df

In [7]:
# Export the merged, cross-source-deduplicated records to
# ./data/merged_items.xlsx (without the index column).
# Note: `file_path` is rebound by the later export cells, so it only holds this
# value until the next export cell runs.
file_path = "./data/merged_items"
deduplicated_sources_df.to_excel(f"{file_path}.xlsx", index=False)

In [None]:
# Second deduplication pass, against Infoscience going by the method name
# (semantics live in data_pipeline.deduplicator — TODO confirm).
# It returns two frames: `df_final` is exported below as
# ./data/import_items.xlsx and `df_unloaded` as ./data/unloaded_items.xlsx.
df_final, df_unloaded = deduplicator.deduplicate_infoscience(deduplicated_sources_df)

In [None]:
# Show the records that were set aside by the Infoscience deduplication step.
df_unloaded

In [10]:
# Export the set-aside records to ./data/unloaded_items.xlsx (index omitted).
file_path = "./data/unloaded_items"
df_unloaded.to_excel(f"{file_path}.xlsx", index=False)

In [11]:
# Export the records to import to ./data/import_items.xlsx (index omitted).
# import_to_dspace() reads this exact file back, so this cell must run first.
file_path = "./data/import_items"
df_final.to_excel(f"{file_path}.xlsx", index=False)

In [None]:
# Collection UUID -> name of the submission-form section that receives the
# descriptive metadata patch for items of that collection.
_FORM_SECTION_BY_COLLECTION = {
    "8a8d3310-6535-4d3a-90b6-2a4428097b5b": "article_details",
    "e91ecd9f-56a2-4b2f-b7cc-f03e03d2643d": "conference_details",
}


def _build_item_patch(form_section):
    """Return the PATCH operations that set the default item metadata.

    Args:
        form_section: Name of the submission-form section to patch
            (e.g. "article_details").

    Returns:
        A list of "add" operations for language, sponsorship, peer-review
        status, written-at flag and license grant.
    """
    return [
        {
            "op": "add",
            "path": f"/sections/{form_section}/dc.language.iso",
            "value": [
                {
                    "value": "en",
                    "language": None,
                    "authority": None,
                    "display": "English",
                    "securityLevel": 0,
                    "confidence": -1,
                    "place": 0,
                    "otherInformation": None,
                }
            ],
        },
        {
            "op": "add",
            "path": f"/sections/{form_section}/dc.description.sponsorship",
            "value": [
                {
                    "value": "Non-EPFL",
                    "language": None,
                    "authority": "will be referenced::ACRONYM::Non-EPFL",
                    "securityLevel": 0,
                    "confidence": 600,
                }
            ],
        },
        {
            "op": "add",
            "path": f"/sections/{form_section}/epfl.peerreviewed",
            "value": [
                {
                    "value": "REVIEWED",
                    "language": None,
                    "authority": None,
                    "display": "REVIEWED",
                    "securityLevel": 0,
                    "confidence": -1,
                    "place": 0,
                    "otherInformation": None,
                }
            ],
        },
        {
            "op": "add",
            "path": f"/sections/{form_section}/epfl.writtenAt",
            "value": [
                {
                    "value": "OTHER",
                    "language": None,
                    "authority": None,
                    "securityLevel": 0,
                    "confidence": -1,
                    "place": 0,
                    "otherInformation": None,
                }
            ],
        },
        {"op": "add", "path": "/sections/license/granted", "value": "true"},
    ]


def _build_file_patch():
    """Return the PATCH operations that describe the first uploaded file."""
    return [
        {
            "op": "add",
            "path": "/sections/upload-publication/files/0/metadata/dc.type",
            "value": [
                {
                    "value": "main document",
                    "language": None,
                    "authority": None,
                    "display": "Main document",
                    "securityLevel": 0,
                    "confidence": -1,
                    "place": 0,
                    "otherInformation": None,
                }
            ],
        },
        {
            "op": "add",
            "path": "/sections/upload-publication/files/0/accessConditions",
            "value": [{"name": "openaccess"}],
        },
    ]


def _create_workspace_item(d, row):
    """Create a workspace item for one row; return its ID, or None on failure."""
    try:
        response = d.create_workspaceitem_from_external_source(
            row["source"], row["internal_id"], row["ifs3_collection_id"]
        )
    except Exception as e:
        logging.error(f"An error occurred while creating workspace item: {str(e)}")
        return None

    workspace_id = response.get("id") if response else None
    if workspace_id is None:
        logging.error(
            f"No workspace item ID returned for internal_id: {row['internal_id']}"
        )
    return workspace_id


def _attach_unpaywall_fulltext(d, workspace_id):
    """Import the Unpaywall full text and, on success, patch the file metadata."""
    if not d.import_unpaywall_fulltext(workspace_id):
        logging.error("Échec de l'import unpaywall.")
        return

    logging.info("import unpaywall réussie")
    try:
        d.update_workspaceitem(workspace_id, _build_file_patch())
    except Exception as e:
        logging.error(
            f"An error occurred while updating workspace item's file metadata: {str(e)}"
        )


def _submit_workspace_item(d, workspace_id, form_section):
    """Patch a workspace item, try to attach its full text, then start the workflow."""
    try:
        update_response = d.update_workspaceitem(
            workspace_id, _build_item_patch(form_section)
        )
        if not update_response:
            logging.error(f"Failed to update workspace item with ID: {workspace_id}")
            return
        # A missing full text is logged but does not block the workflow;
        # an exception raised by the import does (handled just below).
        _attach_unpaywall_fulltext(d, workspace_id)
    except Exception as e:
        logging.error(f"An error occurred while updating workspace item: {str(e)}")
        return

    try:
        d.create_workflowitem(workspace_id)
        logging.info(f"Workflow item #{workspace_id} created")
    except Exception as e:
        logging.error(f"An error occurred while creating workflow item: {str(e)}")


def import_to_dspace(
    input_path="./data/import_items.xlsx",
    output_path="./data/imported_records.xlsx",
    client=None,
):
    """Create a DSpace workspace item and workflow item for every input row.

    Reads the Excel sheet at ``input_path`` (columns ``source``,
    ``internal_id`` and ``ifs3_collection_id`` are required), creates one
    workspace item per row from the external source, patches default metadata,
    tries to attach the Unpaywall full text and finally starts the workflow.
    The ID of every created workspace item is stored in the
    ``workspace_item_id`` column and the sheet is written to ``output_path``.

    Rows are processed independently: a row whose collection ID is not mapped
    to a form section, or whose remote calls fail, is logged and skipped
    without affecting the other rows.

    Args:
        input_path: Excel file listing the records to import.
        output_path: Excel file receiving the rows plus their workspace IDs.
        client: Already-built DSpace client to use; a new ``DSpaceClient`` is
            created when omitted. ``authenticate()`` is called on it either way.

    Returns:
        None. Results are reported through the output file and the log.
    """
    df = pd.read_excel(input_path)

    # Add the tracking column only when the sheet does not carry it already.
    if "workspace_item_id" not in df.columns:
        df["workspace_item_id"] = None

    d = client if client is not None else DSpaceClient()
    d.authenticate()

    try:
        for index, row in df.iterrows():
            ifs3_collection_id = row["ifs3_collection_id"]

            # Resolve the form section first: without it the metadata patch
            # paths cannot be built, so no item is created for this row.
            form_section = _FORM_SECTION_BY_COLLECTION.get(ifs3_collection_id)
            if form_section is None:
                logging.error(
                    f"Unknown collection ID {ifs3_collection_id!r} for "
                    f"internal_id {row['internal_id']!r}: row skipped"
                )
                continue

            workspace_id = _create_workspace_item(d, row)
            if workspace_id is None:
                continue

            # Record the ID right away so it is kept even if later steps fail.
            df.at[index, "workspace_item_id"] = workspace_id

            _submit_workspace_item(d, workspace_id, form_section)
    finally:
        # Always persist progress, including after an interruption, so that
        # already-created workspace items can be traced.
        df.to_excel(output_path, index=False)

In [None]:
# Create workspace and workflow items in DSpace for every row of
# ./data/import_items.xlsx; the rows plus their workspace item IDs are written
# to ./data/imported_records.xlsx. This performs remote writes, so re-running
# the cell creates the items again.
import_to_dspace()