# Import Scopus to Infoscience

In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [2]:
from data_pipeline.harvester import ScopusHarvester
from data_pipeline.deduplicator import DataFrameProcessor
from data_pipeline.enricher import AuthorProcessor, PublicationProcessor
from data_pipeline.loader import Loader
from config import default_queries
from datetime import datetime

***

In [3]:
# Create a timestamped output folder for this run under "harvested-data".
# The timestamp has one-second resolution, so each run normally gets its own folder.
current_datetime = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
folder_path = "harvested-data"
path = os.path.join(folder_path, current_datetime)

# makedirs creates the parent folder when needed, and exist_ok=True makes
# re-running the cell safe without a separate, race-prone existence check.
os.makedirs(path, exist_ok=True)

***

## Init query and date intervals

In [4]:
# Harvesting window, written as YYYY-MM-DD strings.
start_date, end_date = "2024-03-01", "2024-04-01"

# Alternative override, kept for reference: restricts on document type
# (DOCTYPE) and on a list of Scopus affiliation IDs (AF-ID).
# queries = {
#     "scopus": "(DOCTYPE(ar) OR DOCTYPE(dp)) AND (AF-ID(60028186) OR AF-ID(60210159) OR AF-ID(60070536) OR AF-ID(60204330) OR AF-ID(60070531) OR AF-ID(60070534) OR AF-ID(60070538) OR AF-ID(60014951) OR AF-ID(60070529) OR AF-ID(60070532) OR AF-ID(60070535) OR AF-ID(60122563) OR AF-ID(60210160) OR AF-ID(60204331))"
# }

# Active override: target a single record by its Scopus EID.
# Use `queries = None` instead to keep the defaults imported from `config`.
queries = {"scopus": "EID(2-s2.0-85188450496)"}

# Overlay the overrides, if any, onto the imported defaults (in place).
default_queries.update(queries or {})

***

## Harvest data from Scopus

In [5]:
# Configure the Scopus harvester with the date window and the active "scopus" query.
scopus_harvester = ScopusHarvester(start_date, end_date, default_queries["scopus"])

In [None]:
# Run the harvest; the result is reused by the CSV export and deduplication cells below.
scopus_publications = scopus_harvester.harvest()

In [7]:
# Persist the raw Scopus harvest in this run's output folder.
scopus_publications.to_csv(
    os.path.join(path, "ScopusHarvestedData.csv"),
    index=False,
    encoding="utf-8",
)

***

## Deduplicate with Infoscience

In [8]:
# Hand the harvested publications to the deduplication processor.
deduplicator = DataFrameProcessor(scopus_publications)

In [None]:
# Step 1: deduplicate the harvested records across sources.
deduplicated_sources_df = deduplicator.deduplicate_dataframes()
# Step 2: filter out the records already found in Infoscience. df_final moves
# on through the pipeline; df_unloaded is exported separately in the next cell.
df_final, df_unloaded = deduplicator.deduplicate_infoscience(deduplicated_sources_df)

In [10]:
# Keep a record of the publications set aside by the Infoscience deduplication step.
df_unloaded.to_csv(
    os.path.join(path, "UnloadedDuplicatedPublications.csv"),
    index=False,
    encoding="utf-8",
)

***

## Generate Main Dataframes

In [11]:
# Derive the two working DataFrames from the retained publications:
# publication metadata and authors (as their names suggest — confirm in data_pipeline.deduplicator).
df_metadata, df_authors = deduplicator.generate_main_dataframes(df_final)

In [12]:
# Save both main tables as CSV files in this run's output folder.
df_metadata.to_csv(
    os.path.join(path, "ResearchOutput.csv"),
    index=False,
    encoding="utf-8",
)
df_authors.to_csv(
    os.path.join(path, "AddressesAndNames.csv"),
    index=False,
    encoding="utf-8",
)

***

## EPFL Authors Reconciliation

In [None]:
# Feed the authors table into the author-processing pipeline.
author_processor = AuthorProcessor(df_authors)

# Fluent chain: each call's result is the receiver of the next call.
# Step purposes are only readable from the method names here — confirm the
# details in data_pipeline.enricher before relying on them.
df_epfl_authors = (
    author_processor.process()
    .filter_epfl_authors()
    .clean_authors()
    .nameparse_authors()
    # NOTE(review): the name suggests a remote EPFL API call — confirm network/credentials needs.
    .api_epfl_reconciliation()
    # return_df=True is passed on the last step; its result is exported with .to_csv in the next cell.
    .generate_dspace_uuid(return_df=True)
)

In [14]:
# Save the reconciled EPFL authors in this run's output folder.
df_epfl_authors.to_csv(
    os.path.join(path, "EpflAuthors.csv"),
    index=False,
    encoding="utf-8",
)

***

## Get OA fulltexts from Unpaywall/Crossref

In [None]:
# Generate the publications DataFrame enriched with open-access (OA) attributes
publication_processor = PublicationProcessor(df_metadata)
# return_df=True is passed so that the result can be kept in df_oa_metadata
# and exported with .to_csv in the next cell.
df_oa_metadata = publication_processor.process(return_df=True)

In [16]:
# Save the OA-enriched publication metadata in this run's output folder.
df_oa_metadata.to_csv(
    os.path.join(path, "ResearchOutputsWithOA.csv"),
    index=False,
    encoding="utf-8",
)

***

## Upload data to DSpace

In [None]:
# Build the loader from the OA-enriched metadata, the reconciled EPFL authors
# and the full authors table, then create the publications.
# NOTE(review): per the section title this presumably writes to a DSpace
# instance — confirm the target environment before running this cell.
loader_instance = Loader(df_oa_metadata, df_epfl_authors, df_authors)
loaded_items = loader_instance.create_complete_publication()

In [18]:
# Generated report: save what create_complete_publication() returned.
loaded_items.to_csv(
    os.path.join(path, "ImportedResearchOutputs.csv"),
    index=False,
    encoding="utf-8",
)