# Import WOS & Scopus to Infoscience

In [1]:
from data_pipeline.main import main
import pandas as pd
import logging
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, Markdown, HTML
from datetime import date
import os, base64

In [2]:
# common functions ###############################
def create_download_link(filename, title = "Cliquer ici pour télécharger le fichier : "):
    """Return an HTML anchor that lets the user download ``filename``.

    The whole file is read into memory, base64-encoded and embedded in a
    ``data:`` URI, so the link works without any file server. The MIME type
    is hard-coded to ``text/csv``.

    Args:
        filename: Path of the file to expose; also used as the suggested
            download name and appended to the link text.
        title: Text shown before the file name in the link.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<a>`` element.
    """
    # Context manager so the file handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(filename, "rb") as file_handle:
        data = file_handle.read()
    payload = base64.b64encode(data).decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title+f' {filename}',filename=filename)
    return HTML(html)

***

## Etape 1 : Récupération des publications EPFL (métadonnées, auteurs, affiliations) depuis les bases biblios externes, dédoublonnage entre sources et dédoublonnage sur publications déjà présentes dans Infoscience

In [3]:
### Workflow metadata #####################
# Harvested files are grouped in one sub-folder per run date,
# e.g. harvested-data/2024_08_01.
current_date = str(date.today())
folder_path = "harvested-data"
path = os.path.join(folder_path, current_date.replace("-", "_"))
# makedirs (not mkdir) so the parent "harvested-data" folder is created too
# when it does not exist yet; exist_ok makes re-running the cell a no-op.
os.makedirs(path, exist_ok=True)

### ipywidgets config #########################
# Shared widget style: size the description label from its content.
style = dict(description_width="initial")
# Output area used to show the "loading" message while a harvest is running.
spinner_output = widgets.Output()

**`main.py`** : The default queries are the ones that identify and harvest the publications of the EPFL institutions

- WoS : "OG=(Ecole Polytechnique Federale de Lausanne)"
- Scopus : "AF-ID(60028186) OR AF-ID(60210159) OR AF-ID(60070536) OR AF-ID(60204330) OR AF-ID(60070531) OR AF-ID(60070534) OR AF-ID(60070538) OR AF-ID(60014951) OR AF-ID(60070529) OR AF-ID(60070532) OR AF-ID(60070535) OR AF-ID(60122563) OR AF-ID(60210160) OR AF-ID(60204331)"

These queries are completed with the selected date range, passed as parameters

**notebook** : To overwrite and customize the default queries:
```
custom_queries = {
    "wos": "OG=(Your Custom Query for WOS)",
    "scopus": "AF-ID(Your Custom Scopus ID)",
    "openalex": "YOUR_CUSTOM_OPENALEX_QUERY",
    "zenodo": "YOUR_CUSTOM_ZENODO_QUERY"
}
df_metadata, df_authors, df_unloaded = main(start_date="2023-01-01", end_date="2023-12-31", queries=custom_queries)
```

In [4]:
# Lower bound of the publication-date range handed to the harvester.
start_date = widgets.DatePicker(
    value=pd.to_datetime("2024-07-22"),
    description="Date de publication postérieure au : ",
    style=style,
    layout=Layout(display="flex", width="50%", height="30px"),
    disabled=False,
)

# Upper bound of the publication-date range handed to the harvester.
end_date = widgets.DatePicker(
    value=pd.to_datetime("2024-08-01"),
    description="Date de publication antérieure au : ",
    style=style,
    layout=Layout(display="flex", width="50%", height="30px"),
    disabled=False,
)

# Button that triggers the harvest, and the area receiving its results.
harvest_output = widgets.Output()
harvest_button = widgets.Button(description="Harvest WOS and Scopus")

# Render the form: pickers, button, results area, then the loading area.
display(
    start_date,
    end_date,
    harvest_button,
    harvest_output,
    spinner_output,
)

def harvest_button_clicked(b):
    """Harvest publications for the selected dates and offer the CSV exports.

    Click handler for ``harvest_button``. Shows a loading message in
    ``spinner_output``, calls ``main`` with the two selected dates, writes
    the three returned DataFrames as CSV files under ``path`` and displays
    one download link per file in ``harvest_output``.

    Args:
        b: The clicked button, as passed by ipywidgets (unused).
    """
    with spinner_output:
        display(widgets.HTML("<p>Chargement en cours...</p>"))
    try:
        with harvest_output:
            if start_date.value is None or end_date.value is None:
                # A cleared DatePicker holds None; report it instead of
                # failing on None.strftime below.
                display(widgets.HTML(
                    "<p>Veuillez sélectionner une date de début et une date de fin.</p>"
                ))
                return
            start = start_date.value.strftime('%Y-%m-%d')
            end = end_date.value.strftime('%Y-%m-%d')
            df_metadata, df_authors, df_unloaded = main(start_date=start, end_date=end)
            exports = [
                (df_metadata, os.path.join(path, "ResearchOutput.csv")),
                (df_authors, os.path.join(path, "AddressesAndNames.csv")),
                (df_unloaded, os.path.join(path, "UnloadedDuplicatedPublications.csv")),
            ]
            # Write every file first, then show the links, as before.
            for frame, csv_path in exports:
                frame.to_csv(csv_path, index=False, encoding="utf-8")
            # Download links
            for _, csv_path in exports:
                display(create_download_link(csv_path))
    finally:
        # Remove the loading message on every exit path, so it does not stay
        # on screen when the harvest fails or returns early.
        spinner_output.clear_output()


harvest_button.on_click(harvest_button_clicked)

DatePicker(value=Timestamp('2024-07-22 00:00:00'), description='Date de publication postérieure au : ', layout…

DatePicker(value=Timestamp('2024-08-01 00:00:00'), description='Date de publication antérieure au : ', layout=…

Button(description='Harvest WOS and Scopus', style=ButtonStyle())

Output()

Output()

***

## Etape 2 : Enrichissements des entités auteurs à partir de api.epfl.ch

***

## Etape 3 : Importation en tant que workspace items dans DSpace

In [None]:
def _metadata_value(value, display_label):
    """Build one metadata value entry without authority for a patch payload."""
    return {
        "value": value,
        "language": None,
        "authority": None,
        "display": display_label,
        "securityLevel": 0,
        "confidence": -1,
        "place": 0,
        "otherInformation": None,
    }


def _sponsorship_values(units):
    """Build the dc.description.sponsorship entries from a list of unit dicts."""
    return [
        {
            "value": unit.get("acro"),
            "language": None,
            "authority": f"will be referenced::ACRONYM::{unit.get('acro')}",
            "securityLevel": 0,
            "confidence": 400,
            "place": 0,
        }
        # Tolerate a helper that returns None when no unit is found.
        for unit in units or []
    ]


def _build_patch_operations(sponsorships):
    """Build the list of "add" operations applied to each new workspace item."""
    return [
        {
            "op": "add",
            "path": "/sections/article_details/dc.language.iso",
            "value": [_metadata_value("en", "English")],
        },
        {
            "op": "add",
            "path": "/sections/article_details/dc.description.sponsorship",
            "value": sponsorships,
        },
        {
            "op": "add",
            "path": "/sections/article_details/epfl.peerreviewed",
            "value": [_metadata_value("REVIEWED", "REVIEWED")],
        },
        {
            "op": "add",
            "path": "/sections/article_details/epfl.writtenAt",
            "value": [_metadata_value("EPFL", "EPFL")],
        },
        {"op": "add", "path": "/sections/license/granted", "value": "true"},
    ]


def wos_to_dspace():
    """Create a DSpace workspace item, then a workflow item, per harvested record.

    Reads ``ResearchOutput.csv`` from ``path``. For each value of its
    ``wos_id`` column it creates a workspace item from the "wos" external
    source, patches it with language, sponsorship, peer-review, "written at"
    and license metadata, and then creates the workflow item. The workspace
    item ids are stored in the ``workspace_item_id`` column and the CSV is
    written back to the same file.

    Failures on one record are logged and do not stop the others.

    NOTE(review): ``DSpaceClient`` and ``utils`` are not imported in the
    notebook cells shown here; they must be in scope before this runs.
    """
    csv_path = os.path.join(path, "ResearchOutput.csv")
    df = pd.read_csv(csv_path, sep=",", encoding="utf-8")
    if "workspace_item_id" not in df.columns:
        df["workspace_item_id"] = None

    # Instantiate DSpace client
    d = DSpaceClient()

    # Authenticate against the DSpace client.
    # NOTE(review): assumes authenticate() returns a truthy value on
    # success - confirm against the client implementation.
    authenticated = d.authenticate()
    if not authenticated:
        logging.error("DSpace authentication failed: no workspace item created")
        return
    collection_id = "8a8d3310-6535-4d3a-90b6-2a4428097b5b"

    try:
        # read_csv yields a default 0..n-1 index, so the enumerate position
        # is also the row label used by df.at below.
        for index, wos_id in enumerate(df["wos_id"].tolist()):
            if pd.isna(wos_id):
                # Nothing to look up in the external source for this row.
                logging.warning(f"Row {index} has no wos_id, skipped")
                continue

            response = d.create_workspaceitem_from_external_source(
                "wos", wos_id, collection_id
            )
            workspace_id = response.get("id") if response else None
            if workspace_id is None:
                logging.error(
                    f"Failed to create workspace item for WoS record: {wos_id}"
                )
                continue
            df.at[index, "workspace_item_id"] = workspace_id

            units = utils.get_units_for_id(path, wos_id)
            patch_operations = _build_patch_operations(_sponsorship_values(units))

            try:
                update_response = d.update_workspaceitem(
                    workspace_id, patch_operations
                )
            except Exception as e:
                logging.error(
                    f"An error occurred while updating workspace item: {str(e)}"
                )
                continue
            if not update_response:
                logging.error(
                    f"Failed to update workspace item with ID: {workspace_id}"
                )
                continue

            try:
                d.create_workflowitem(workspace_id)
            except Exception as e:
                logging.error(
                    f"An error occurred while creating workflow item: {str(e)}"
                )
            else:
                logging.info(f"Workflow item #{workspace_id} created")
    finally:
        # Persist the ids of the items already created even if an unexpected
        # error aborts the loop, so they can be traced in DSpace.
        df.to_csv(csv_path, sep=",", encoding="utf-8", index=False)

# Create the workspace items, then the workflow item for each one that was
# successfully updated.
wos_to_dspace()