# SLR Data Pipeline

In [4]:
import pybliometrics

pybliometrics.scopus.init()

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval

## Search Strings 

In [2]:
search_string = """TITLE-ABS("environmental justice" OR "environmental injustice")"""


In [5]:
results = ScopusSearch(search_string, verbose = True, download = True)

In [6]:
results.get_results_size()

8688

In [7]:
pd.set_option('display.max_columns', None) 

In [21]:
search_df = pd.DataFrame(results.results)

In [24]:
search_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8688 entries, 0 to 8687
Data columns (total 36 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   eid                  8688 non-null   object
 1   doi                  7643 non-null   object
 2   pii                  1254 non-null   object
 3   pubmed_id            1069 non-null   object
 4   title                8688 non-null   object
 5   subtype              8688 non-null   object
 6   subtypeDescription   8688 non-null   object
 7   creator              8587 non-null   object
 8   afid                 8077 non-null   object
 9   affilname            8077 non-null   object
 10  affiliation_city     8031 non-null   object
 11  affiliation_country  8073 non-null   object
 12  author_count         8587 non-null   object
 13  author_names         8587 non-null   object
 14  author_ids           8587 non-null   object
 15  author_afids         8077 non-null   object
 16  coverD

In [23]:
eids = search_df["eid"].tolist()

## Abstract Retrieval

The code cells below are for retrieving the articles information and filter them by type and language

In [9]:
articles = []
for eid in tqdm(eids):
    articles.append(AbstractRetrieval(eid, view='FULL'))

100%|██████████████████████████████████████████████████████████████████████████████████| 8688/8688 [45:44<00:00,  3.17it/s]


In [10]:
len(articles)

8688

In [11]:
import pickle

#with open('articles_raw_list.pkl', 'wb') as file:
#    pickle.dump(articles, file)


with open('articles_raw_list.pkl', 'rb') as file:
    articles = pickle.load(file)

In [38]:
filtered_df = search_df.copy()

In [39]:
filtered_df["language"] = ""
filtered_df["type"] = ""
for i, article in enumerate(articles):
    filtered_df.loc[i,"language"] = article.language
    filtered_df.loc[i,"type"] = article.subtype
    filtered_df.loc[i,"abstract"] = article.abstract

In [40]:
df = filtered_df.loc[((filtered_df["type"] == "ar") | (filtered_df["type"] == "re")) 
                    & (filtered_df["language"] == "eng"),]
df = df.reset_index(drop=True)

In [42]:
df["year1"] = df["coverDate"].str[:4]
df["year2"] = df["coverDisplayDate"].str[:4]

In [50]:
df = df.loc[df["year1"] != "2025",]
df = df.reset_index(drop=True)

In [51]:
df.head(10)

Unnamed: 0,eid,doi,pii,pubmed_id,title,subtype,subtypeDescription,creator,afid,affilname,affiliation_city,affiliation_country,author_count,author_names,author_ids,author_afids,coverDate,coverDisplayDate,publicationName,issn,source_id,eIssn,aggregationType,volume,issueIdentifier,article_number,pageRange,description,authkeywords,citedby_count,openaccess,freetoread,freetoreadLabel,fund_acr,fund_no,fund_sponsor,language,type,abstract,year1,year2
0,2-s2.0-85210625076,10.1016/j.scitotenv.2024.177767,S0048969724079245,,Toxic layering and compound extremes: Per- and...,ar,Article,Chukwuonye G.N.,60010065,The University of Arizona,Tucson,United States,6,"Chukwuonye, God'sgift N.;Alqattan, Zain Alabda...",59152713400;58691548900;57732589300;5768892380...,60010065;60010065;60010065;60010065;60010065;6...,2024-12-20,20 December 2024,Science of the Total Environment,489697.0,25349,18791026,Journal,957,,177767,,Per- and polyfluoroalkyl substances (PFAS) are...,Compound climate extremes | Long- and short-ch...,0,0,,,NIEHS,R21ES034591,National Institute of Environmental Health Sci...,eng,ar,© 2024 Elsevier B.V.Per- and polyfluoroalkyl s...,2024,20 D
1,2-s2.0-85206236373,10.1016/j.envres.2024.120070,S0013935124019777,39406285.0,Air quality and wheeze symptoms in a rural chi...,ar,Article,Johnston J.E.,60015481;60015183;119723570,University of Washington;Keck School of Medici...,Seattle;Los Angeles;Brawley,United States;United States;United States,10,"Johnston, Jill E.;Kamai, Elizabeth;Duenas Bara...",36959656600;55207254500;58926934500;5719123346...,60015183;60015183;60015183;119723570;119723570...,2024-12-15,15 December 2024,Environmental Research,139351.0,21524,10960953,Journal,263,,120070,,"Background: In California, climate change and ...",Air pollution | Children's respiratory health ...,0,0,,,NIEHS,undefined,National Institute of Environmental Health Sci...,eng,ar,"© 2024Background: In California, climate chang...",2024,15 D
2,2-s2.0-85208480484,10.1016/j.jhazmat.2024.136460,S0304389424030395,,Global air quality enhancement pathways to hea...,ar,Article,Xu Y.,60276981;60019499;60017060;60006356;131910455;...,The Hong Kong University of Science and Techno...,Guangzhou;Beijing;Changsha;Guilin;Jiangmen;Wuhan,China;China;China;China;China;China,5,"Xu, Yong;Zhou, Shang Chen;Feng, Yu Xi;Zou, Bin...",57223006644;59400443600;58380195900;5568445370...,60006356-60017060;60006356;131910455-60276981;...,2024-12-05,5 December 2024,Journal of Hazardous Materials,3043894.0,25858,18733336,Journal,480,,136460,,"Nowadays, air pollution has emerged as a criti...",Environmental justice | Governance strategies ...,0,0,,,MOE,2023YSJS08,Open Research Fund Program of Key Laboratory o...,eng,ar,"© 2024Nowadays, air pollution has emerged as a...",2024,5 De
3,2-s2.0-85190870529,10.1108/MHSI-02-2024-0025,,,Navigating the labyrinth of social exclusion: ...,re,Review,Ahmad Izhan F.F.,60212344;60004351,"College of Business, Universiti Utara Malaysia...",Sintok;Shah Alam,Malaysia;Malaysia,4,"Ahmad Izhan, Farsha Farahana;Ahmi, Aidi;Othman...",58994427200;55512419100;58248098600;59138466300,60004351;60212344;60004351;60004351,2024-12-03,3 December 2024,Mental Health and Social Inclusion,,19700175275,20428308,Journal,28,6,,1183-1203,Purpose: This study aims to provide a comprehe...,Academic trends | Bibliometric analysis | Digi...,0,0,,,MOHE,FRGS/1/2022/SS02/UITM/02/13,"Ministry of Higher Education, Malaysia",eng,re,"© 2024, Emerald Publishing Limited.Purpose: Th...",2024,3 De
4,2-s2.0-85210170584,10.1111/cobi.14387,,39587020.0,Youth engagement in global conservation govern...,ar,Article,Sithole S.S.,60000356;60000239,University of Cape Town;Université de Lausanne...,Cape Town;Lausanne,South Africa;Switzerland,4,"Sithole, Samantha S.;Walters, Gretchen M.;Mbat...",57375886900;8969511700;57205283928;6507518631,60000239;60000239;60000356;60000356,2024-12-01,December 2024,Conservation Biology,8888892.0,17822,15231739,Journal,38,6,e14387,,Youth are increasingly recognized for their im...,conservación | conservation | environmental ju...,0,1,all,All Open Access,UNIL,undefined,Université de Lausanne,eng,ar,© 2024 The Author(s). Conservation Biology pub...,2024,Dece
5,2-s2.0-85210021948,10.1016/j.chest.2024.07.143,S0012369224048451,39059578.0,Recommendations for Clinicians to Combat Envir...,re,Review,Patti M.A.,60030521;60018701;60014662;60002746,Boston Children's Hospital;School of Public He...,Boston;Boston;Philadelphia;Boston,United States;United States;United States;Unit...,4,"Patti, Marisa A.;Henderson, Noelle B.;Phipatan...",57200082012;57210734349;6701453005;57198497447,60014662;60018701;60030521-60002746;60030521,2024-12-01,December 2024,Chest,123692.0,18429,19313543,Journal,166,6,,1309-1318,Topic Importance: Asthma is a common and compl...,asthma | children | disparities | environmenta...,0,0,,,NHLBI,K01HL171354,"National Heart, Lung, and Blood Institute",eng,re,© 2024 American College of Chest PhysiciansTop...,2024,Dece
6,2-s2.0-85209814335,10.1177/20426445241274848,,,"Concrete cracks, wood burns: Competing narrati...",ar,Article,Larasatie P.,60026956;60013402,University of Arkansas at Monticello;Oregon St...,Monticello;Corvallis,United States;United States,3,"Larasatie, Pipiet;Young, Kathy;Hansen, Eric",57204967863;59419517800;7402448400,60026956;60013402;60013402,2024-12-01,December 2024,International Wood Products Journal,20426445.0,19900192130,20426453,Journal,15,2-4,,110-118,Innate to the human condition are rules of thu...,Concrete advantage | Construction material | C...,0,0,,,,undefined,,eng,ar,© The Author(s) 2024.Innate to the human condi...,2024,Dece
7,2-s2.0-85209657547,10.1016/j.jenvman.2024.123414,S0301479724034005,,Supporting knowledge justice through community...,ar,Article,Serrano-Salomón V.,60154476;60023908;60016338;60010307;60000221,College of Engineering and Applied Science;Por...,Boulder;Portland;Greeley;Denver;Boulder,United States;United States;United States;Unit...,9,"Serrano-Salomón, Valentina;Westbrook, Marisa;P...",58247633900;57202285776;59203218800;5920209040...,60000221;60023908;60016338;60010307;60154476;6...,2024-12-01,December 2024,Journal of Environmental Management,3014797.0,23371,10958630,Journal,372,,123414,,"In community science on air quality, low-cost ...",Air monitoring | Air quality | Community scien...,0,1,all,All Open Access,CU,1952223,University of Colorado,eng,ar,© 2024 The AuthorsIn community science on air ...,2024,Dece
8,2-s2.0-85209650872,10.1016/j.envint.2024.109091,S0160412024006779,,A systematic evidence map protocol for mapping...,ar,Article,Seewoo B.J.,60031806;60026489;127365629,The University of Western Australia;Royal Chil...,Perth;Melbourne;Perth,Australia;Australia;Australia,7,"Seewoo, Bhedita J.;Wong, Enoch V.S.;Mulders, Y...",57201450644;57216413579;6505756024;57202090820...,127365629-60031806;127365629-60031806;12736562...,2024-12-01,December 2024,Environment International,1604120.0,20912,18736750,Journal,194,,109091,,Background: Bisphenol A (BPA) is one of the hi...,Biomonitoring | Bisphenol | Exposure | Human |...,0,1,,,MOE,undefined,Ministry of Environment,eng,ar,© 2024 The AuthorsBackground: Bisphenol A (BPA...,2024,Dece
9,2-s2.0-85209402182,10.1016/j.envsci.2024.103951,S1462901124002855,,What is equitable urban forest governance? A s...,re,Review,Pike K.,60016849;60010365;130379873,University of Toronto;The University of Britis...,Toronto;Vancouver;Zeist,Canada;Canada;Netherlands,5,"Pike, Kaitlyn;Nesbitt, Lorien;Conway, Tenley;D...",57220177415;57190954149;10239007800;7203058966...,60010365;60010365;60016849;60010365;130379873,2024-12-01,December 2024,Environmental Science and Policy,14629011.0,21536,18736416,Journal,162,,103951,,Urban forest governance comprises the formal a...,Decision making | Distribution | Environmental...,0,1,all,All Open Access,,undefined,,eng,re,© 2024Urban forest governance comprises the fo...,2024,Dece


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5920 entries, 0 to 5919
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   eid                  5920 non-null   object
 1   doi                  5561 non-null   object
 2   pii                  1144 non-null   object
 3   pubmed_id            923 non-null    object
 4   title                5920 non-null   object
 5   subtype              5920 non-null   object
 6   subtypeDescription   5920 non-null   object
 7   creator              5896 non-null   object
 8   afid                 5662 non-null   object
 9   affilname            5662 non-null   object
 10  affiliation_city     5645 non-null   object
 11  affiliation_country  5661 non-null   object
 12  author_count         5896 non-null   object
 13  author_names         5896 non-null   object
 14  author_ids           5896 non-null   object
 15  author_afids         5662 non-null   object
 16  coverD

In [13]:
dir(articles[1])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cache_file_path',
 '_confevent',
 '_head',
 '_json',
 '_mdate',
 '_ref',
 '_refresh',
 '_view',
 'abstract',
 'affiliation',
 'aggregationType',
 'authkeywords',
 'authorgroup',
 'authors',
 'chemicals',
 'citedby_count',
 'citedby_link',
 'confcode',
 'confdate',
 'conflocation',
 'confname',
 'confsponsor',
 'contributor_group',
 'copyright',
 'copyright_type',
 'correspondence',
 'coverDate',
 'date_created',
 'description',
 'document_entitlement_status',
 'doi',
 'eid',
 'endingPage',
 'funding',
 'funding_text',
 'get_bibtex',
 'get_cache_file_age',
 'get_cache_file_mdate',
 'get_html',
 'get_key_