# Import WOS & Scopus to Infoscience

In [1]:
from data_pipeline.main import main
import pandas as pd
import logging
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, Markdown, HTML
from datetime import date
import os, base64
import utils

In [2]:
# common functions ###############################
def create_download_link(filename, title = "Cliquer ici pour télécharger le fichier : "):
    """Build a clickable link that downloads ``filename`` from an inline data URI.

    The file content is read in binary mode, base64-encoded and embedded in the
    ``href`` of an ``<a>`` element, always declared as ``text/csv``.

    Parameters
    ----------
    filename : str
        Path of the file to embed. It is used both as the ``download``
        attribute and as the suffix of the visible link text.
    title : str
        Prefix of the visible link text. It is inserted verbatim (not escaped),
        followed by a space and the file name.

    Returns
    -------
    IPython.display.HTML
        Displayable object wrapping the generated anchor markup.

    Raises
    ------
    OSError
        If ``filename`` cannot be opened or read.
    """
    # Function-scope import keeps this helper self-contained.
    from html import escape

    # The context manager closes the handle even if reading fails.
    with open(filename, "rb") as source:
        payload = base64.b64encode(source.read()).decode()

    # Escape the file name so quotes or angle brackets in a path cannot break
    # out of the attribute value or the anchor text.
    safe_name = escape(str(filename), quote=True)

    markup = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    markup = markup.format(payload=payload, title=f"{title} {safe_name}", filename=safe_name)
    return HTML(markup)

***

## Etape 1 : Récupération des publications EPFL (métadonnées, auteurs, affiliations) depuis les bases biblios externes, dédoublonnage entre sources et dédoublonnage sur publications déjà présentes dans Infoscience

In [3]:
### Workflow metadata #####################
# Day of the run as an ISO string (YYYY-MM-DD).
current_date = str(date.today())
folder_path = "harvested-data"
# One output sub-folder per run day, named YYYY_MM_DD.
path = os.path.join(folder_path, current_date.replace("-", "_"))
# makedirs also creates the parent folder when it is missing (os.mkdir would
# raise FileNotFoundError on a fresh checkout); exist_ok makes re-runs a no-op.
os.makedirs(path, exist_ok=True)

### ipywidgets config #########################
# Output area that hosts the temporary "loading" message during a harvest.
spinner_output = widgets.Output()
# Style shared by the date pickers below.
style = dict(description_width='initial')

**`main.py`** : The default queries are the ones that identify and harvest the publications of the EPFL institutions:

- WoS : "OG=(Ecole Polytechnique Federale de Lausanne)"
- Scopus : "AF-ID(60028186) OR AF-ID(60210159) OR AF-ID(60070536) OR AF-ID(60204330) OR AF-ID(60070531) OR AF-ID(60070534) OR AF-ID(60070538) OR AF-ID(60014951) OR AF-ID(60070529) OR AF-ID(60070532) OR AF-ID(60070535) OR AF-ID(60122563) OR AF-ID(60210160) OR AF-ID(60204331)"

These queries are supplemented with the selected date range, passed as parameters.

**notebook** : To overwrite and customize the default queries:
```
custom_queries = {
    "wos": "OG=(Your Custom Query for WOS)",
    "scopus": "AF-ID(Your Custom Scopus ID)",
    "openalex": "YOUR_CUSTOM_OPENALEX_QUERY",
    "zenodo": "YOUR_CUSTOM_ZENODO_QUERY"
}
df_metadata, df_authors, df_epfl_authors, df_unloaded = main(start_date="2023-01-01", end_date="2023-12-31", queries=custom_queries)
```

In [None]:
def _make_date_picker(description, default_day):
    """Return an enabled DatePicker using the shared style and a half-width flex layout."""
    return widgets.DatePicker(
        description=description,
        value=pd.to_datetime(default_day),
        disabled=False,
        style=style,
        layout=Layout(width="50%", height="30px", display="flex"),
    )


# Bounds of the publication-date window to harvest (labels are user-facing).
start_date = _make_date_picker('Date de publication postérieure au : ', '2024-09-01')
end_date = _make_date_picker("Date de publication antérieure au : ", "2024-09-05")

# Trigger button and the output area that receives the harvest results.
harvest_button = widgets.Button(description="Harvest WOS and Scopus")
harvest_output = widgets.Output()

# Render the controls, then the result area, then the loading indicator.
display(start_date, end_date, harvest_button, harvest_output, spinner_output)

def harvest_button_clicked(b):
    """Run the harvest for the selected date range and publish the resulting CSV files.

    Shows a loading message in ``spinner_output``, calls ``main`` with the two
    picker values formatted as ``YYYY-MM-DD``, writes the four returned
    DataFrames as UTF-8 CSV files under ``path`` and displays one download
    link per file in ``harvest_output``.

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked button, passed by ``on_click``; not used.
    """
    # Start from a clean slate so links and messages from a previous click
    # do not pile up below the button.
    harvest_output.clear_output()
    spinner_output.clear_output()
    with spinner_output:
        display(widgets.HTML("<p>Chargement en cours...</p>"))
    try:
        with harvest_output:
            # A picker without a selected date holds None; calling strftime
            # on it would fail, so report it to the user instead.
            if start_date.value is None or end_date.value is None:
                display(widgets.HTML("<p>Veuillez sélectionner une date de début et une date de fin.</p>"))
                return
            start = start_date.value.strftime('%Y-%m-%d')
            end = end_date.value.strftime('%Y-%m-%d')
            df_metadata, df_authors, df_epfl_authors, df_unloaded = main(start_date=start, end_date=end)

            # Output file name -> DataFrame, in the order the links are shown.
            exports = {
                "ResearchOutput.csv": df_metadata,
                "AddressesAndNames.csv": df_authors,
                "EpflAuthors.csv": df_epfl_authors,
                "UnloadedDuplicatedPublications.csv": df_unloaded,
            }
            for csv_name, frame in exports.items():
                frame.to_csv(os.path.join(path, csv_name), index=False, encoding="utf-8")
            # Download links, shown only once every file has been written.
            for csv_name in exports:
                display(create_download_link(os.path.join(path, csv_name)))
    finally:
        # Always remove the loading message, including when the harvest or a
        # file write fails; otherwise it would stay on screen indefinitely.
        spinner_output.clear_output()

harvest_button.on_click(harvest_button_clicked)

***

## Etape 3 (WIP) : Importation en tant que workspace items dans DSpace