# Scopus Scrapper

In [153]:
queryText = "data"
counts = 200
# Your API key obtained from Elsevier Developer Portal
API_KEY = '4ed47d847e4bf40990b3535919df219a'

In [154]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

## Get all researches, authors, and affiliations id from provided data

In [155]:
researches_id_set = set()
authors_id_set = set()
affiliations_id_set = set()

In [156]:
provided_researches_df = pd.read_csv("../data/researches.csv", index_col=0)
provided_authors_df = pd.read_csv("../data/authors.csv", index_col=0)
provided_affiliations_df = pd.read_csv("../data/affiliations.csv", index_col=0)

In [157]:
researches_id_set = set(provided_researches_df['id'])
authors_id_set = set(provided_authors_df['id'])
affiliations_id_set = set(provided_affiliations_df['id'])

## Scraping from Scopus API

In [158]:
# Scopus API endpoint for document search
SCOPUS_API_URL = 'https://api.elsevier.com/content/search/scopus'

def query_scopus(query, count, year, start):
    params = {
        'apiKey': API_KEY,  # Include your API key here
        'query': query,
        'count': count,
        'date': year,
        'start': start,
    }

    response = requests.get(SCOPUS_API_URL, params=params)

    if response.status_code == 200:
        data = response.json()
        return data
    else:
        # print(f"Failed to query Scopus API. Status code: {response.status_code}")
        # print(response.text)
        return None

In [159]:
def scopus_finder(queryText, counts, year, start=0):
    results = query_scopus(queryText, counts, year, start)
    
    if results:
        # print("Search Successful")
        return [e["dc:identifier"].split(":")[1] for e in results["search-results"]["entry"]]
    else:
        # print("Search Failed")
        return []
    

In [168]:
def get_full_text_id(id, api_key):
    crossref_api_url = f'https://api.elsevier.com/content/abstract/scopus_id/{id}'
    response = requests.get(crossref_api_url ,headers={"Accept" : "application/json","X-ELS-APIKey" : api_key },params={"view" : "FULL"})
    if response.status_code == 200:
        data = json.loads(response.text)
        return data
    else:
        print(f"Failed to fetch full-text. Status code: {response.status_code}")
    return None

In [161]:
fetched_id_list = []

for year in tqdm(range(2014, 2024)):
    for start in range(0, 100000, counts):
        cur_list = scopus_finder(queryText, counts, year, int(start))
        if len(cur_list) == 0:
            break
        fetched_id_list += cur_list

100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [24:04<00:00, 160.45s/it]


## Get Abstracts Retrieval

In [165]:
authors_list = []
affiliations_list = []
papers_list = []

In [166]:
def extract_json_from_data(data):
    # Check if there is key in data
    assert "abstracts-retrieval-response" in data

    data = data["abstracts-retrieval-response"]

    if data.get("item").get("bibrecord").get("tail") is None:
        new_ref_id_list = []
    else:
        if type(data.get("item").get("bibrecord").get("tail").get("bibliography").get("reference")) is dict:
            ref_id_list = [data.get("item").get("bibrecord").get("tail").get("bibliography").get("reference").get("ref-info").get("refd-itemidlist").get("itemid")]
        else:
            ref_id_list = [field.get("ref-info").get("refd-itemidlist").get("itemid") for field in data.get("item").get("bibrecord").get("tail").get("bibliography").get("reference")] if data.get("item").get("bibrecord").get("tail") else []
        new_ref_id_list = []
        for r in ref_id_list:
            if type(r) is list:
                new_ref = [g.get("$") for g in r if g.get("@idtype") in "SGR"][0]
            else:
                new_ref = r.get("$")
            new_ref_id_list.append(new_ref)

    # Authors
    if data.get("authors").get("author"):
        for author in data.get("authors").get("author"):
            author_id = author.get("@auid")
            if author_id in authors_id_set:
                continue

            if type(author.get("affiliation")) is dict:
                aff_list = [author.get("affiliation")]
            else :
                aff_list = author.get("affiliation") if author.get("affiliation") else []

            authors_id_set.add(author_id)
            authors_list.append({
                "id": author_id,
                "given_name": author.get("ce:given-name"),
                "initials": author.get("ce:initials"),
                "surname": author.get("ce:surname"),
                "indexed_name": author.get("ce:indexed-name"),
                "affiliations_id": "|".join(set([a.get("@id") for a in aff_list]))
            })

    # Affiliations
    if data.get("affiliation"):
        if type(data.get("affiliation")) is dict:
            aff_list = [data.get("affiliation")]
        else:
            aff_list = data.get("affiliation") if data.get("affiliation") else []
            
        for aff in aff_list:
            
            if aff.get("@id") in affiliations_id_set:
                continue
            

            affiliations_id_set.add(aff.get("@id"))
            affiliations_list.append({
                "id": aff.get("@id"),
                "name": aff.get("affilname"),
                "city": aff.get("affiliation-city"),
                "country": aff.get("affiliation-country"),
            })

    # Research
    return {
        "id": data.get("coredata").get("dc:identifier").split(":")[1],
        "doi": data.get("coredata").get("prism:doi"),
        "eid": data.get("coredata").get("eid"),
        "cover_date": data.get("coredata").get("prism:coverDate"),
        "title": data.get("item").get("bibrecord").get("head").get("citation-title"),
        "abstract": data.get("item").get("bibrecord").get("head").get("abstracts"),
        "subject_areas": "|".join(set([field.get("@abbrev") for field in data.get("subject-areas").get("subject-area")])) if data.get("subject-areas").get("subject-area") else "",
        "auth_keywords": "|".join(set([field.get("$") for field in data.get("auth-keywords")])) if data.get("auth-keywords") else "",
        "authors_id": "|".join(set([field.get("@auid") for field in data.get("authors").get("author")])) if data.get("authors").get("author") else "",
        "citedby_count": data.get("coredata").get("citedby-count"),
        "ref_count": data.get("item").get("bibrecord").get("tail").get("bibliography").get("@refcount") if data.get("item").get("bibrecord").get("tail") else 0,
        "ref_ids": "|".join(new_ref_id_list),
        "published_year": data.get("coredata").get("prism:coverDate").split("-")[0],
        "published_month": data.get("coredata").get("prism:coverDate").split("-")[1],
        "published_day": data.get("coredata").get("prism:coverDate").split("-")[2],
    }

In [174]:
api_key_list = ['49e301ab27670d477a58dbda7017afaa', '4d63f2fb8818ed78e56f87f827e63814', '02cdad2f7107da8afd15dd0d6d60d576', '8d8d49bdc4f2a5da0ce3f6fc9c2db2f7', 'e25c586349486b09dad4e824158cc752', '888f9cb63ff0518476ec776ca4d6116f']
cnt = 1500
idx = -1
errcnt = 0

for research_id in tqdm(fetched_id_list):
    # Skip some research because of too long time to process
    idx += 1
    if idx%2 == 0:
        continue
    if research_id in researches_id_set:
        continue
        
    try:
        data = get_full_text_id(research_id, api_key_list[(cnt//9000)%6])
        json_obj = extract_json_from_data(data)
        papers_list.append(json_obj)
        researches_id_set.add(research_id)
    except Exception as err:
        errcnt += 1
    cnt += 1

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [7:17:25<00:00,  1.91it/s]


## Additional Researches from Provided Data

In [175]:
research_df = pd.json_normalize(papers_list) 
research_df.head()

Unnamed: 0,id,doi,eid,cover_date,title,abstract,subject_areas,auth_keywords,authors_id,citedby_count,ref_count,ref_ids,published_year,published_month,published_day
0,85160777146,10.32871/rmrj1402.02.02,2-s2.0-85160777146,2014-12-31,Life Expectancy Simulation Model among HIV (Hu...,"© 2014, University of San Jose-Recoletos. All ...",MULT,,58297949800|58296000800|58295018000,0,34,47649115323|77955145227|84874147989|3375120946...,2014,12,31
1,85160751303,10.32871/rmrj1402.02.04,2-s2.0-85160751303,2014-12-31,Philippine Enhanced Basic Education (K to12) D...,"© 2014, University of San Jose-Recoletos. All ...",MULT,,58295510100|58297507100,0,22,51649092717|84883034321|33644510693|8516074256...,2014,12,31
2,85160675925,10.32871/rmrj1402.02.06,2-s2.0-85160675925,2014-12-31,2014 Classroom Practices of Teacher Interns in...,"© 2014, University of San Jose-Recoletos. All ...",MULT,,57193817095|56946110200,0,44,85013222320|1942482427|57649112222|23844469406...,2014,12,31
3,85160631510,10.32871/rmrj1402.02.19,2-s2.0-85160631510,2014-12-31,Financial Assessment of A Credit Union In West...,"© 2014, University of San Jose-Recoletos. All ...",MULT,,58161348400|58295187500,0,13,10244246384|85129791155|85160648394|8516063619...,2014,12,31
4,85160623655,10.32871/rmrj1402.02.16,2-s2.0-85160623655,2014-12-31,Factors Affecting Business Stability among ASE...,"© 2014, University of San Jose-Recoletos. All ...",MULT,,58296000700|58296172100|58295675200|58246298400,0,23,85160610888|85160643134|77953581116|8516067355...,2014,12,31


In [176]:
research_df.shape

(24951, 15)

## Additional Authors from Provided Data

In [177]:
author_df = pd.json_normalize(authors_list)
author_df.head()

Unnamed: 0,id,given_name,initials,surname,indexed_name,affiliations_id
0,58295018000,Melvin R.,M.R.,de Castro,de Castro M.R.,60089610
1,58296000800,Tonette M.,T.M.,Villanueva,Villanueva T.M.,60089610
2,58297949800,Ronnie B.,R.B.,Gonzalve,Gonzalve R.B.,60089610
3,58295510100,Helmae N.,H.N.,Etulle,Etulle H.N.,60089610
4,58297507100,Cindy M.,C.M.,Carmelotes,Carmelotes C.M.,60089610


In [178]:
author_df.shape

(123452, 6)

## Additional Affiliations from Provided Data

In [179]:
affiliation_df = pd.json_normalize(affiliations_list)
affiliation_df.head()

Unnamed: 0,id,name,city,country
0,60089610,University of San Jose-Recoletos,Cebu,Philippines
1,60134705,Cebu Normal University,Cebu,Philippines
2,60018181,Aletheia University,Tamsui,Taiwan
3,60071487,University of San Carlos,Cebu,Philippines
4,116553639,Widya Mandira Catholic University,"Kupang, Timor",Indonesia


In [180]:
affiliation_df.shape

(18776, 4)

## Export to CSV

In [181]:
research_df.to_csv("scraped_researches.csv")
author_df.to_csv("scraped_authors.csv")
affiliation_df.to_csv("scraped_affiliations.csv")