# Import Web of Science (WOS) publications into Infoscience

In [3]:
import pandas as pd
from wos_client import WosClient
from csv import DictWriter
import os, base64
import logging
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, Markdown, HTML
from dspace.client import DSpaceClient
import config
import utils
from dotenv import load_dotenv
import plotly.express as px
import plotly.io as pio
#pio.renderers.default = "vscode"

In [5]:
### Workflow metadata #####################
# Every run archives its files in a sub-folder of "harvested-data" named after
# config.CURRENT_DATE, with dashes replaced by underscores.
current_date = config.CURRENT_DATE
folder_path = "harvested-data"
path = os.path.join(folder_path, str(current_date).replace("-", "_"))
# makedirs (unlike mkdir) also creates the parent "harvested-data" folder when
# it does not exist yet, and exist_ok removes the need for an exists() check.
os.makedirs(path, exist_ok=True)
# Document types kept by the harvest: the keys of the doctype mapping.
doctypes = list(utils.get_doctype_mapping().keys())

databaseId = "WOS"
epfl_query = "OG=(Ecole Polytechnique Federale de Lausanne) AND DT=article"
# Alternative query / time span kept as examples:
# epfl_query = "AI=(GCM-6397-2022) AND DT=article"
# createdTimeSpan = "2023-10-01+2024-04-01"

### global vars ###############################
# Shared between the notebook steps: WOS hit count, harvested records and the
# WOS ids that remain to be imported after deduplication.
total = 0
recs = []
ids_to_load = []

### env vars #################################
load_dotenv()

### ipywidgets config #########################
style = {'description_width': 'initial'}
spinner_output = widgets.Output()

# All steps log to execute.log inside the run folder.
logging.basicConfig(
    filename=os.path.join(path, "execute.log"),
    format="%(asctime)s %(message)s",
    encoding="utf-8",
    level=logging.INFO,
)

In [6]:
# common functions ###############################
def create_download_link(filename, title = "Cliquer ici pour télécharger le fichier : "):
    """Build an HTML anchor that lets the user download *filename*.

    The file content is embedded in the link itself as a base64 ``data:`` URI
    declared as ``text/csv``.

    Args:
        filename: path of the file to embed; also used as the ``download``
            attribute and appended to the link text.
        title: text displayed before the file name.

    Returns:
        An ``HTML`` display object wrapping the anchor element.

    Raises:
        OSError: if *filename* cannot be opened or read.
    """
    # The context manager closes the handle; the previous open(...).read()
    # left it to the garbage collector.
    with open(filename, "rb") as file_handle:
        payload = base64.b64encode(file_handle.read()).decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title+f' {filename}',filename=filename)
    return HTML(html)

***

## Etape 1 : Récupération des WOS IDs des publications EPFL

In [7]:
def wos_harvesting(start_date, end_date):
    """Harvest the WOS records matching ``epfl_query`` for a creation time span.

    The number of hits is stored in the global ``total``; records are fetched
    page by page, those whose ``doctype`` is listed in ``doctypes`` are stored
    in the global ``recs`` and archived as ``AllWosHarvestedPublications.csv``
    in the run folder.

    Args:
        start_date: first day of the time span (date-like object or string).
        end_date: last day of the time span (date-like object or string).

    Returns:
        The number of records kept after the doctype filter (0 when nothing
        was kept), so that callers can test the result for truthiness.
    """
    global total, recs

    def _as_day(value):
        # Date-like values may carry a time component (the DatePicker defaults
        # are built with pd.to_datetime); the time span is written as plain
        # YYYY-MM-DD days, as in the example kept next to epfl_query.
        return value.strftime("%Y-%m-%d") if hasattr(value, "strftime") else str(value)

    createdTimeSpan = f"{_as_day(start_date)}+{_as_day(end_date)}"
    total = WosClient.count_results(
        databaseId=databaseId, usrQuery=epfl_query, createdTimeSpan=createdTimeSpan
    )
    print(f"- Nombre de publications trouvées dans le WOS: {total}")

    count = 50
    last_record = int(total)
    # Start from an empty list: `total` is overwritten on every call, so
    # records left over from a previous harvest would no longer match it.
    recs.clear()
    # firstRecord is 1-based, so the stop value must be total + 1; otherwise
    # the last page is skipped whenever total % count == 1 (e.g. total == 1).
    for first in range(1, last_record + 1, count):
        last = min(first + count - 1, last_record)
        message = f"Harvest publications {first} to {last} on a total of {total} publications"
        print(message)
        logging.info(message)
        h_recs = WosClient.get_wos_digest(
            databaseId=databaseId,
            usrQuery=epfl_query,
            count=count,
            firstRecord=first,
            createdTimeSpan=createdTimeSpan,
        )
        recs.extend(item for item in (h_recs or []) if item.get("doctype") in doctypes)

    csv_path = os.path.join(path, "AllWosHarvestedPublications.csv")
    pd.DataFrame(recs).dropna(how='all').to_csv(csv_path, index=False, encoding="utf-8")
    print(f"- Fichier csv archivé dans {csv_path}")
    return len(recs)

In [None]:
# Step 1 controls: an output area, the trigger button and the two date pickers
# whose values are handed to wos_harvesting by the click handler.
harvest_output = widgets.Output()
harvest_button = widgets.Button(description="Harvest the WOS")

start_date = widgets.DatePicker(
    value=pd.to_datetime("2024-07-22"),
    description="Date de publication postérieure au : ",
    layout=Layout(display="flex", width="50%", height="30px"),
    style=style,
    disabled=False,
)

end_date = widgets.DatePicker(
    value=pd.to_datetime("2024-08-01"),
    description="Date de publication antérieure au : ",
    layout=Layout(display="flex", width="50%", height="30px"),
    style=style,
    disabled=False,
)

# Render the controls, followed by the shared spinner area.
display(start_date, end_date, harvest_button, harvest_output, spinner_output)

def harvest_button_clicked(b):
    """Run the WOS harvest for the selected dates and offer the CSV for download.

    Args:
        b: the object passed by ``on_click`` to its callback; unused.
    """
    csv_path = os.path.join(path, "AllWosHarvestedPublications.csv")
    with spinner_output:
        display(widgets.HTML("<p>Chargement en cours...</p>"))
    try:
        with harvest_output:
            wos_harvesting(start_date.value, end_date.value)
            # The link is keyed on the archived file itself rather than on the
            # function's return value, so it appears whenever the harvest
            # completed and wrote its CSV.
            if os.path.exists(csv_path):
                display(create_download_link(csv_path))
    finally:
        # Always remove the "loading" message, even when the harvest fails.
        spinner_output.clear_output()

harvest_button.on_click(harvest_button_clicked)

***

## Etape 2 : Dédoublonnage sur les publications déjà présentes dans Infoscience 

In [None]:
def _found_in_infoscience(client, query):
    """Tell whether *query* matches at least one object in Infoscience.

    The ``researchoutputs`` configuration is searched first; the
    ``supervision`` configuration is only queried when the first search
    returns nothing, which saves one API call per already-matched query.
    """
    researchoutputs_hits = client.search_objects(
        query=query,
        page=0,
        size=1,
        dso_type="item",
        configuration="researchoutputs",
    )
    if len(researchoutputs_hits) > 0:
        return True
    supervision_hits = client.search_objects(
        query=query,
        page=0,
        size=1,
        configuration="supervision",
    )
    return len(supervision_hits) > 0


def wos_ifs_dedup(recs):
    """Split harvested records into Infoscience duplicates and records to import.

    Each record is looked up by WOS id, then by cleaned title restricted to
    its publication year +/- 1, then by DOI when one is available. The first
    matching rule marks the record as a duplicate. Duplicates are archived in
    ``UnloadedDuplicatedPublications.csv``; the ``wos_id`` of every other
    record is stored in the global ``ids_to_load``.

    Args:
        recs: iterable of record dicts with ``title``, ``pubyear``, ``wos_id``
            and optionally ``doi`` keys.

    Returns:
        The list of records found in Infoscience (empty when there is none).

    Raises:
        ValueError: if a record's ``pubyear`` is neither an int nor a string
            of digits.
    """
    global ids_to_load

    # Instantiate and authenticate the DSpace client
    d = DSpaceClient()
    if d.authenticate() is False:
        logging.warning("DSpace authentication failed")

    # Start from a clean list so that running the step twice does not queue
    # the same WOS ids several times.
    ids_to_load.clear()
    unloaded_duplicated_publications = []
    cache = {}  # query -> bool, avoids repeating identical API calls

    for x in recs:
        cleaned_title = utils.clean_title(x["title"])
        pubyear = x["pubyear"]
        if isinstance(pubyear, str) and pubyear.isdigit():
            pubyear = int(pubyear)
        elif not isinstance(pubyear, int):
            raise ValueError("pubyear doit être numérique")
        previous_year = pubyear - 1
        next_year = pubyear + 1

        # Build one query per matching rule, in priority order.
        # The first four characters of the WOS id are dropped before searching.
        queries = [
            f"(itemidentifier:{str(x['wos_id'][4:]).strip()})",
            f"(title:({cleaned_title}) AND (dateIssued:{pubyear} OR dateIssued:{previous_year} OR dateIssued:{next_year}))",
        ]
        # Only search by DOI when the record really carries one: a missing or
        # empty value would otherwise be searched as the literal text "None".
        raw_doi = x.get("doi")
        doi = "" if raw_doi is None else str(raw_doi).strip()
        if doi and doi.lower() not in ("none", "nan"):
            queries.append(f"(itemidentifier:{doi})")

        for query in queries:
            if query not in cache:
                cache[query] = _found_in_infoscience(d, query)
            if cache[query]:
                unloaded_duplicated_publications.append(x)
                break
        else:
            # No rule matched: the record is new for Infoscience
            ids_to_load.append(x["wos_id"])

    # Archive the duplicates. The counts below come from the in-memory lists:
    # reading the CSV back raised EmptyDataError when no duplicate was found.
    csv_path = os.path.join(path, "UnloadedDuplicatedPublications.csv")
    pd.DataFrame(unloaded_duplicated_publications).dropna(how="all").to_csv(
        csv_path,
        index=False,
        encoding="utf-8",
    )
    print(f"- Nombre de publications déjà présentes dans Infoscience : {len(unloaded_duplicated_publications)}")
    print(f"- Fichier csv des publications doublons archivé dans {csv_path}")
    # Report what will actually be imported; the WOS total also counts records
    # that were filtered out before deduplication.
    print(f"- Nombre de publications à importer : {len(ids_to_load)}")
    return unloaded_duplicated_publications

In [None]:
# Step 2 controls: output area, a fresh spinner area and the trigger button.
# NOTE: this rebinds the module-level `spinner_output` created in the
# configuration cell, so later handlers write to this new widget.
dedup_output = widgets.Output()
spinner_output = widgets.Output()
dedup_button = widgets.Button(description="Lancer le dédoublonnage")

display(dedup_button, dedup_output, spinner_output)

def dedup_button_clicked(b):
    """Run the Infoscience deduplication and offer the duplicates CSV for download.

    Args:
        b: the object passed by ``on_click`` to its callback; unused.
    """
    csv_path = os.path.join(path, "UnloadedDuplicatedPublications.csv")
    with spinner_output:
        display(widgets.HTML("<p>Chargement en cours...</p>"))

    try:
        with dedup_output:
            wos_ifs_dedup(recs)
            # The link is keyed on the archived file itself rather than on the
            # function's return value, so it appears whenever the step
            # completed and wrote its CSV.
            if os.path.exists(csv_path):
                display(create_download_link(csv_path))
    finally:
        # Always remove the "loading" message, even when the step fails.
        spinner_output.clear_output()

dedup_button.on_click(dedup_button_clicked)

***

## Etape 3 : Moissonnage des métadonnées du WOS

In [None]:
def _select_epfl_authors(df_auth_final):
    """Return the rows of *df_auth_final* whose organizations point to EPFL.

    A row is kept when it has the ``author`` role and its organizations
    contain "EPF", or when its organizations contain one of the two spelled
    out EPFL names, whatever the role. Matching is case-insensitive.
    """
    organizations = df_auth_final["organizations"]

    def _mentions(text):
        return organizations.str.contains(text, case=False, regex=False, na=False)

    is_epf_author = (
        df_auth_final["role"].isin(["author"])
        & organizations.notna()
        & _mentions("EPF")
    )
    return df_auth_final.loc[
        is_epf_author
        | _mentions("Ecole Polytech Fed Lausanne")
        | _mentions("Ecole Polytechnique Federale de Lausanne")
    ]


def wos_metadata_retrieval(ids_to_load):
    """Fetch the full WOS metadata of *ids_to_load* and archive it as CSV files.

    Files written in the run folder:
      - ``EmptyWosRecords.csv``: ids for which WOS returned nothing;
      - ``ErrorProcessingWosRecords.csv``: ids whose processing raised;
      - ``ResearchOutput.csv``: one flattened row per publication;
      - ``AddressesAndNames.csv``: one row per author entry, completed with
        the ``author_infos`` returned by ``utils.enrich_author`` for EPFL
        authors;
      - ``epfl_authors_and_labs_metadata.csv``: the enrichment results;
      - ``controle_epfl_authors_and_labs_metadata.csv``: column names
        normalized when the file is present.

    Args:
        ids_to_load: iterable of WOS ids to retrieve.

    Returns:
        The author DataFrame (before enrichment), or ``None`` when no record
        could be retrieved.
    """
    # Information log
    logging.info("Etape 3 : Moissonnage des métadonnées du WOS")
    logging.info("Starting...")

    appended_data = []
    appended_auth = []
    no_record_ids = []
    error_ids = []

    for wos_id in ids_to_load:
        try:
            result = WosClient.query_unique_id(wos_id, infoscience_format=True)
            if result is None:
                logging.info(f"None value for {wos_id}")
                no_record_ids.append(wos_id)
                continue
            # One row per entry of the "authors" list, each tagged with the id
            df_authors = pd.DataFrame(
                {"wos_id": result["wos_id"], "authors": result["authors"]}
            )
            record = result.copy()
            del record["authors"]
            df_record = pd.json_normalize(record, max_level=3)
            appended_data.append(df_record)
            appended_auth.append(df_authors)
        except Exception as e:
            # Keep going: failures are reported in ErrorProcessingWosRecords.csv
            logging.error(f"Error while processing {wos_id}: {e}")
            error_ids.append({"wos_id": wos_id, "error": str(e)})

    # Log processing results
    pd.DataFrame(data={"wos_id": no_record_ids}).dropna(how="all").to_csv(
        os.path.join(path, "EmptyWosRecords.csv"),
        index=False,
        encoding="utf-8",
    )
    pd.DataFrame(error_ids).dropna(how="all").to_csv(
        os.path.join(path, "ErrorProcessingWosRecords.csv"),
        index=False,
        encoding="utf-8",
    )

    # pd.concat raises on an empty list, so stop here when nothing came back.
    if not appended_data:
        logging.warning("No WOS record could be retrieved; nothing to archive")
        return None

    # Concatenate and save all data
    appended_data = pd.concat(appended_data).dropna(how="all")
    # ignore_index gives every author row a unique label, which keeps the
    # column-wise concat below well defined.
    appended_auth = pd.concat(appended_auth, ignore_index=True).dropna(how="all")
    df_auth_final = pd.concat(
        [appended_auth["wos_id"], appended_auth["authors"].apply(pd.Series)], axis=1
    )

    research_output_csv = os.path.join(path, "ResearchOutput.csv")
    addresses_csv = os.path.join(path, "AddressesAndNames.csv")

    appended_data.dropna(how="all").to_csv(
        research_output_csv,
        index=False,
        encoding="utf-8",
    )
    logging.info(
        f"Fichier csv des métadonnées des publications archivé dans {research_output_csv}"
    )

    df_auth_final.dropna(how="all").to_csv(
        addresses_csv,
        index=False,
        encoding="utf-8",
    )
    logging.info(
        f"Fichier csv des informations auteurs avec affiliations et corporate auteurs archivé dans {addresses_csv}"
    )

    # Complete authors & labs
    logging.info("Enrichissement des métadonnées auteurs & laboratoires")

    df_auth_epfl_only = _select_epfl_authors(df_auth_final)
    # Drop missing names (they have no .strip()) and de-duplicate after
    # stripping; sorting makes the processing order reproducible.
    authors_list = sorted(
        {str(name).strip() for name in df_auth_epfl_only["full_name"].dropna()}
    )
    field_names = ["full_name", "author_infos"]

    authors_csv = os.path.join(path, "epfl_authors_and_labs_metadata.csv")
    # The file is opened in append mode; write the header only when the file
    # is new. Without it, read_csv below consumed the first author row as the
    # header and that author was lost.
    needs_header = not os.path.exists(authors_csv) or os.path.getsize(authors_csv) == 0
    with open(
        authors_csv,
        "a",
        encoding="utf-8",
        errors="ignore",
        newline="",
    ) as f_object:
        dictwriter_object = DictWriter(f_object, fieldnames=field_names)
        if needs_header:
            dictwriter_object.writeheader()
        for full_name in authors_list:
            dictwriter_object.writerow(
                {
                    "full_name": full_name,
                    "author_infos": utils.enrich_author(path, full_name),
                }
            )

    df_auth_epfl_only = pd.read_csv(
        authors_csv,
        sep=",",
        encoding="utf-8",
    ).dropna(how="all")
    df_auth_epfl_only.columns = field_names
    # Appending on a re-run can repeat an author; keep the latest row so the
    # left merge below does not multiply the author rows.
    df_auth_epfl_only = df_auth_epfl_only.drop_duplicates(
        subset="full_name", keep="last"
    )
    df_auth_epfl_only.to_csv(
        authors_csv,
        index=False,
        encoding="utf-8",
    )

    df = pd.merge(
        df_auth_final,
        df_auth_epfl_only,
        how="left",
        on="full_name",
    ).dropna(how="all")
    df.to_csv(
        addresses_csv,
        index=False,
        encoding="utf-8",
    )

    # Adjust column names for control file
    control_csv = os.path.join(path, "controle_epfl_authors_and_labs_metadata.csv")
    if os.path.exists(control_csv) and os.path.getsize(control_csv) > 0:
        # NOTE(review): read_csv takes the first line as the header before the
        # columns are renamed. This file is not written in this function; if
        # its writer emits no header row, the first record is lost here.
        # TODO confirm against utils.
        df_controle_epfl_authors_and_labs_metadata = pd.read_csv(
            control_csv,
            sep=",",
            encoding="utf-8",
        ).dropna(how="all")
        df_controle_epfl_authors_and_labs_metadata.columns = [
            "authority_type",
            "label",
            "source_metadata",
            "harvested_metadata",
        ]
        df_controle_epfl_authors_and_labs_metadata.to_csv(
            control_csv,
            index=False,
            encoding="utf-8",
        )
    else:
        logging.warning(f"Control file not found or empty: {control_csv}")

    logging.info("Etape 3 terminée")
    logging.info("----------------------------------------------------------------")
    return df_auth_final

In [None]:
# Step 3 controls: output area and trigger button, rendered together with the
# current spinner area.
wos_retrieval_output = widgets.Output()
wos_retrieval_button = widgets.Button(description="Moissonner le WOS")

display(wos_retrieval_button, wos_retrieval_output, spinner_output)

def wos_retrieval_button_clicked(b):
    """Retrieve the WOS metadata of ``ids_to_load`` and offer the CSVs for download.

    Args:
        b: the object passed by ``on_click`` to its callback; unused.
    """
    with spinner_output:
        display(widgets.HTML("<p>Chargement en cours...</p>"))
    try:
        with wos_retrieval_output:
            wos_metadata_retrieval(ids_to_load)
            display(Markdown('''### Publications'''))
            for csv_name in ("ResearchOutput.csv", "AddressesAndNames.csv"):
                csv_path = os.path.join(path, csv_name)
                # Only link files that were written; building a link for a
                # missing file raises while reading it.
                if os.path.exists(csv_path):
                    display(create_download_link(csv_path))
    finally:
        # Always remove the "loading" message, even when the step fails.
        spinner_output.clear_output()

wos_retrieval_button.on_click(wos_retrieval_button_clicked)

***

## Etape 4 : Importation en tant que workspace items dans DSpace

In [None]:
def _single_metadata_value(value, display):
    """Return the one-element value list used for the fixed submission fields."""
    return [
        {
            "value": value,
            "language": None,
            "authority": None,
            "display": display,
            "securityLevel": 0,
            "confidence": -1,
            "place": 0,
            "otherInformation": None,
        }
    ]


def wos_to_dspace(collection_id="8a8d3310-6535-4d3a-90b6-2a4428097b5b"):
    """Create a DSpace workspace item, then a workflow item, for each harvested record.

    Reads ``ResearchOutput.csv`` from the run folder, imports every ``wos_id``
    from the ``wos`` external source, patches the submission with the fixed
    fields (language, peer review, written at, license) and the units returned
    by ``utils.get_units_for_id``, then creates the workflow item. The id of
    each created workspace item is written back to the ``workspace_item_id``
    column of the CSV.

    Args:
        collection_id: UUID of the target DSpace collection; defaults to the
            collection previously hard-coded here.
    """
    csv_path = os.path.join(path, "ResearchOutput.csv")
    df = pd.read_csv(csv_path, sep=",", encoding="utf-8")
    if "workspace_item_id" not in df.columns:
        df["workspace_item_id"] = None
    # Object dtype accepts any id type next to still-missing values.
    df["workspace_item_id"] = df["workspace_item_id"].astype(object)

    # Instantiate and authenticate the DSpace client
    d = DSpaceClient()
    if d.authenticate() is False:
        logging.warning("DSpace authentication failed")

    try:
        for index, wos_id in df["wos_id"].items():
            # A row that already carries a workspace item id was imported by a
            # previous run; importing it again would create a duplicate item.
            if pd.notna(df.at[index, "workspace_item_id"]):
                logging.info(f"{wos_id} already imported, skipped")
                continue

            try:
                response = d.create_workspaceitem_from_external_source(
                    "wos", wos_id, collection_id
                )
            except Exception as e:
                logging.error(
                    f"An error occurred while creating workspace item for {wos_id}: {str(e)}"
                )
                continue
            workspace_id = response.get("id") if response else None
            if workspace_id is None:
                logging.error(f"No workspace item created for {wos_id}")
                continue
            df.at[index, "workspace_item_id"] = workspace_id

            units = utils.get_units_for_id(path, wos_id) or []
            sponsorships = [
                {
                    "value": unit.get("acro"),
                    "language": None,
                    "authority": f"will be referenced::ACRONYM::{unit.get('acro')}",
                    "securityLevel": 0,
                    "confidence": 400,
                    "place": 0,
                }
                for unit in units
            ]

            patch_operations = [
                {
                    "op": "add",
                    "path": "/sections/article_details/dc.language.iso",
                    "value": _single_metadata_value("en", "English"),
                },
                {
                    "op": "add",
                    "path": "/sections/article_details/dc.description.sponsorship",
                    "value": sponsorships,
                },
                {
                    "op": "add",
                    "path": "/sections/article_details/epfl.peerreviewed",
                    "value": _single_metadata_value("REVIEWED", "REVIEWED"),
                },
                {
                    "op": "add",
                    "path": "/sections/article_details/epfl.writtenAt",
                    "value": _single_metadata_value("EPFL", "EPFL"),
                },
                {"op": "add", "path": "/sections/license/granted", "value": "true"},
            ]

            try:
                update_response = d.update_workspaceitem(workspace_id, patch_operations)
            except Exception as e:
                logging.error(f"An error occurred while updating workspace item: {str(e)}")
                continue
            if not update_response:
                logging.error(
                    f"Failed to update workspace item with ID: {workspace_id}"
                )
                continue

            try:
                d.create_workflowitem(workspace_id)
                logging.info(f"Workflow item #{workspace_id} created")
            except Exception as e:
                logging.error(
                    f"An error occurred while creating workflow item: {str(e)}"
                )
    finally:
        # Save the ids collected so far even if the loop is interrupted, so
        # that items already created in DSpace stay traceable.
        df.to_csv(csv_path, sep=",", encoding="utf-8", index=False)

# Create the workspace items and their workflow items in DSpace for every row
# of ResearchOutput.csv. Unlike the previous steps, no button guards this call:
# it runs as soon as the cell is executed.
wos_to_dspace()