In [1]:
from pyzotero import zotero
import os
import sys
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()

# Put the parent of the current working directory on sys.path so the local code_utils package can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
from code_utils.zotero import get_data_from_zotero
from code_utils.glutton import get_doi_glutton
from code_utils.utils import get_doi_cleaned,aplatir,wg_chap_to_dict,get_year_ipbes,check_doi_glutton
from code_utils.pickle import load_cache,write_cache
from code_utils.enriching_data_OpenAlex import get_countries_concepts_sdg,get_open_alex_data,get_open_alex_data_not_in_references

In [2]:
# In-memory OpenAlex caches. Both start empty; the next cell tries to replace them with the pickled versions.
# cached_openalex_data_ipbes is handed to get_open_alex_data together with a DOI;
# cached_openalex_data_not_ipbes is later indexed by publication year.
cached_openalex_data_ipbes = {}
cached_openalex_data_not_ipbes = {}

In [3]:
# Reload the OpenAlex caches persisted by a previous run.
# NOTE(review): load_cache is assumed to take (current_cache, pickle_path) and return the loaded cache — confirm in code_utils.pickle.
try:
    cached_openalex_data_ipbes = load_cache(cached_openalex_data_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\caches\\cached_openalex_data_ipbes.pkl')
    # Each cache is seeded with its own dict; the non-IPBES cache was previously seeded with the IPBES one.
    cached_openalex_data_not_ipbes = load_cache(cached_openalex_data_not_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\caches\\cached_openalex_data_not_ipbes.pkl')
except Exception as cache_error:
    # Best effort: carry on with whatever caches are in memory, but report why loading failed
    # instead of swallowing every exception (a bare except also traps KeyboardInterrupt).
    print('oups')
    print(f'cache loading failed: {cache_error!r}')

5690 data in cached openalex data
58 data in cached openalex data


Read the IPBES references from the Zotero collections

In [None]:
# Zotero API key, read from the environment (os.getenv returns None when the variable is not set).
ZOTERO_KEY = os.getenv('ZOTERO_KEY')
# Identifier of the Zotero library that is queried.
IPBES_ZOTERO_ID = '2333077'
# Zotero collection keys and the chapter label of each one; the two lists are paired by position.
COLLECTION_IDS = ['8DQ8YFJI','JR5LKU4U','LBFNF62B','7IMYH9U3','DRZLUY9K','KTPTCAVF','UWNSUNXQ','BJGWRJAK','JT8FJXP6']
CHAPTER_IDS=['1','2.1','2.2','2.3','3','4','5','6','glossary']
# Accumulator handed to get_data_from_zotero; it is later read with keys of the form 'collection_chapter_<chapter id>'.
ALL_COLLECTIONS={}

In [None]:
# Download the items of every configured collection (network call to the Zotero API).
# NOTE(review): presumably returns the accumulator filled with one entry per chapter — confirm in code_utils.zotero.
ALL_COLLECTIONS=get_data_from_zotero(ZOTERO_KEY,IPBES_ZOTERO_ID,COLLECTION_IDS,CHAPTER_IDS,ALL_COLLECTIONS)

DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/JR5LKU4U/items?start=300&limit=100&format=json HTTP/1.1" 200 19681
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443


In [None]:
# Build one DataFrame per chapter from the downloaded Zotero items and print its DOI coverage.
df_ipbes_chap = {}
kept_fields = ['DOI','title','date','creators','tags']
for position in range(len(COLLECTION_IDS)):
    chapter_id = CHAPTER_IDS[position]
    zotero_items = pd.DataFrame(ALL_COLLECTIONS[f'collection_chapter_{chapter_id}'])
    # Each item's 'data' entry is expanded into columns, then only the bibliographic fields are kept.
    chapter_frame = pd.DataFrame(list(zotero_items.data))[kept_fields]
    df_ipbes_chap[chapter_id] = chapter_frame
    chapter_frame['chap'] = f'chap_{chapter_id}'
    with_doi = chapter_frame[~pd.isna(chapter_frame.DOI)]
    print(f'{len(with_doi)/len(chapter_frame)*100} % doi available for chapter {chapter_id}')

Clean and enrich data 

In [None]:
# Stack the per-chapter frames into one table and renumber the rows 0..n-1 (the old index is discarded).
chapter_frames = list(df_ipbes_chap.values())
df_ipbes = pd.concat(chapter_frames).reset_index(drop=True)

In [None]:
# Percentage of references that have a DOI (display only).
len(df_ipbes[~pd.isna(df_ipbes.DOI)])/len(df_ipbes)*100

In [None]:
# Distinct DOI values (display only; the result is not stored).
df_ipbes.DOI.drop_duplicates()

In [None]:
# Spot-check get_doi_glutton on the second row of the table (display only).
get_doi_glutton(df_ipbes.iloc[1,:])

In [None]:
# Call get_doi_glutton only on the rows without a DOI and write the result back into those rows.
missing_doi = pd.isna(df_ipbes.DOI)
rows_without_doi = df_ipbes.loc[missing_doi,:]
df_ipbes.loc[missing_doi,'DOI'] = rows_without_doi.progress_apply(get_doi_glutton, axis=1)

In [None]:
# Percentage of references that have a DOI at this point of the notebook (display only).
len(df_ipbes[~pd.isna(df_ipbes.DOI)])/len(df_ipbes)*100

In [None]:
# Normalise every DOI with get_doi_cleaned, then blank out values that still look like a URL.
df_ipbes.loc[:,'DOI'] = df_ipbes.loc[:,'DOI'].apply(get_doi_cleaned)
df_ipbes.loc[:,'DOI'] = df_ipbes.loc[:,'DOI'].apply(
    lambda cleaned: None if str(cleaned).startswith('http') else cleaned
)
# Switch to the lower-case column name used from here on, and give each reference a weight of 1
# (summed later when grouping by DOI).
df_ipbes = df_ipbes.rename(columns={'DOI':'doi'})
df_ipbes['freq'] = 1

In [None]:
# Persist the cleaned references as a JSON array of records.
# NOTE(review): the path is built with Windows separators, so it only resolves as a directory tree on Windows.
df_ipbes.to_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes.json', orient='records')

In [None]:
# List the column names of the references table (display only).
df_ipbes.columns

Enriching data through OpenAlex API integration

In [4]:
# Reload the cleaned references from data_ipbes.json so this section can start from the saved file.
df_ipbes=pd.read_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes.json', orient='records')

In [5]:
# One row per DOI (rows without a DOI are dropped): the chapters citing it as a list, and the summed frequency.
df_unique_doi = (
    df_ipbes
    .groupby(by=['doi'], dropna=True)
    .agg({'chap': list, 'freq': 'sum'})
    .reset_index()
)

In [6]:
# Query OpenAlex for every unique DOI; get_open_alex_data receives the cache dict and the DOI.
# NOTE(review): the returned Series is only displayed — the call appears to be made for its effect on the cache; confirm.
df_unique_doi.apply(
    lambda reference: get_open_alex_data(cached_openalex_data_ipbes, reference['doi']),
    axis=1,
)

0       [{'id': 'https://openalex.org/W2077652067', 'd...
1       [{'id': 'https://openalex.org/W2320183287', 'd...
2       [{'id': 'https://openalex.org/W1487177596', 'd...
3       [{'id': 'https://openalex.org/W1546680881', 'd...
4       [{'id': 'https://openalex.org/W1795688558', 'd...
                              ...                        
5685    [{'id': 'https://openalex.org/W2752653997', 'd...
5686    [{'id': 'https://openalex.org/W2754023309', 'd...
5687    [{'id': 'https://openalex.org/W2029219906', 'd...
5688    [{'id': 'https://openalex.org/W2221413160', 'd...
5689    [{'id': 'https://openalex.org/W3124428460', 'd...
Length: 5690, dtype: object

In [7]:
# Number of entries currently held in the IPBES OpenAlex cache (display only).
len(cached_openalex_data_ipbes)

5690

In [8]:
# Save the IPBES OpenAlex cache to its pickle file so later runs can reload it.
write_cache(cached_openalex_data_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\caches\\cached_openalex_data_ipbes.pkl')

In [9]:
# Read the cleaned references again from data_ipbes.json (same file as before, default orient).
df_ipbes=pd.read_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes.json')

In [10]:
# Rebuild df_unique_doi as a single-column frame holding each non-missing DOI once.
# This replaces the grouped frame built earlier, so its 'chap' and 'freq' columns are not carried over.
distinct_dois = df_ipbes['doi'].drop_duplicates().dropna()
df_unique_doi = pd.DataFrame({'doi': list(distinct_dois)})

In [13]:
# Extract the OpenAlex metadata of every unique DOI and attach it to df_unique_doi.
# Target columns, listed in the order in which they are added to the frame.
oa_columns = (
    'countries', 'concepts', 'sdg', 'topics', 'is_OA_available', 'year_OA',
    'authors_name', 'rors', 'institutions_names', 'locations_names',
    'locations_ids', 'type', 'type_crossref',
)
oa_values = {column: [] for column in oa_columns}

for _, reference in df_unique_doi.iterrows():
    # With ipcc=True the helper is expected to return exactly these 15 values, in this order.
    (countries, concepts, sdg, year, topics, doi, bool_OA, title, name, rors,
     institutions_names, locations_names, locations_ids, type_OA,
     type_crossref) = get_countries_concepts_sdg(cached_openalex_data_ipbes, reference, ipcc=True, i=0)

    # doi and title are returned by the helper but are not written to df_unique_doi.
    extracted = (
        countries, concepts, sdg, topics, bool_OA, year,
        name, rors, institutions_names, locations_names,
        locations_ids, type_OA, type_crossref,
    )
    for column, value in zip(oa_columns, extracted):
        oa_values[column].append(value)

for column in oa_columns:
    df_unique_doi[column] = oa_values[column]

In [14]:
# How many DOIs have the is_OA_available flag set to True vs False (display only).
df_unique_doi.is_OA_available.value_counts()

is_OA_available
True     5668
False      22
Name: count, dtype: int64

In [15]:
# Keep only the DOIs whose is_OA_available flag is True.
has_openalex_record = df_unique_doi['is_OA_available']
df_unique_doi = df_unique_doi.loc[has_openalex_record]

In [16]:
# Renumber the remaining rows 0..n-1; the previous index is discarded rather than kept as a column.
df_unique_doi = df_unique_doi.reset_index(drop=True)

In [17]:
# Persist the enriched unique-DOI table as a JSON array of records.
df_unique_doi.to_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes_OA.json', orient='records')

IPBES references data

In [18]:
# Reload the enriched unique-DOI table from data_ipbes_OA.json.
df_unique_doi=pd.read_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes_OA.json', orient='records')

In [19]:
# Reload the cleaned references (one row per citation, so a DOI may appear several times).
df_ipbes=pd.read_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes.json')

In [20]:
# Attach the OpenAlex metadata to each reference; the inner join drops references whose DOI is not in df_unique_doi.
# Only doi/title/date/creators/tags are taken from the references table, so 'chap' and 'freq' are not carried over.
df_ipbes=pd.merge(df_ipbes[['doi', 'title', 'date', 'creators', 'tags']], df_unique_doi, on='doi', how='inner')

In [21]:
# Derive the publication year from the Zotero date with get_year_ipbes; missing dates give None.
df_ipbes['year'] = df_ipbes['date'].apply(
    lambda raw_date: None if pd.isna(raw_date) else get_year_ipbes(raw_date)
)

In [22]:
# Flag every reference with the result of check_doi_glutton, then show how many pass and fail.
# NOTE(review): presumably the check compares the Zotero record with the OpenAlex one — confirm in code_utils.utils.
df_ipbes['check'] = df_ipbes.apply(check_doi_glutton, axis=1)
df_ipbes.check.value_counts()

check
True     6363
False      80
Name: count, dtype: int64

In [23]:
# Inspect the references whose check flag is False (display only).
df_ipbes[df_ipbes.check==False]

Unnamed: 0,doi,title,date,creators,tags,countries,concepts,sdg,topics,is_OA_available,year_OA,authors_name,rors,institutions_names,locations_names,locations_ids,type,type_crossref,year,check
98,10.1007/s11027-013-9479-6,Combining backcasting and exploratory scenario...,2015,"[{'creatorType': 'author', 'firstName': 'Mathi...","[{'tag': 'ipbes-global_chapter1'}, {'tag': 'ip...",[NL],"[Backcasting, Computer science, Exploratory re...",,"[Water resources management and optimization, ...",True,2013,"[[M. van Vliet, [NL]], [Kasper Kok, [NL]]]","[[https://ror.org/04qw24q55, NL], [https://ror...","[[Wageningen University & Research, NL], [Wage...",Mitigation and Adaptation Strategies for Globa...,1381-2386,article,journal-article,2015,False
99,10.1007/s11027-013-9479-6,Combining backcasting and exploratory scenario...,2015,"[{'creatorType': 'author', 'firstName': 'Mathi...","[{'tag': 'ipbes-global_chapter1'}, {'tag': 'ip...",[NL],"[Backcasting, Computer science, Exploratory re...",,"[Water resources management and optimization, ...",True,2013,"[[M. van Vliet, [NL]], [Kasper Kok, [NL]]]","[[https://ror.org/04qw24q55, NL], [https://ror...","[[Wageningen University & Research, NL], [Wage...",Mitigation and Adaptation Strategies for Globa...,1381-2386,article,journal-article,2015,False
421,10.1016/j.enpol.2010.03.012,Distributional effects of taxing transport fuel,2012,"[{'creatorType': 'author', 'firstName': 'Thoma...",[{'tag': 'ipbes-global_chapter2.1_drivers'}],[SE],"[Opposition (politics), Economics, Surprise, C...","[{'id': '13', 'name': 'Climate action'}]","[Energy, Environment, and Transportation Polic...",True,2010,"[[Thomas Sterner, [SE]]]","[[https://ror.org/01tm6cn81, SE]]","[[University of Gothenburg, SE]]",Energy Policy,0301-4215,article,journal-article,2012,False
652,10.1073/pnas.1220362110,The material footprint of nations,2015-05-19,"[{'creatorType': 'author', 'firstName': 'Thoma...","[{'tag': 'ipbes-global_chapter2.1_drivers'}, {...","[AU, NO, JP, US]","[Decoupling (probability), Natural resource, G...","[{'id': '8', 'name': 'Decent work and economic...","[Environmental Impact and Sustainability, Glob...",True,2013,"[[Thomas Wiedmann, [AU]], [Heinz Schandl, [AU]...","[[https://ror.org/03qn8fb07, AU], [https://ror...",[[Commonwealth Scientific and Industrial Resea...,Proceedings of the National Academy of Sciences,0027-8424,article,journal-article,2015,False
653,10.1073/pnas.1220362110,The material footprint of nations,2015-05-19,"[{'creatorType': 'author', 'firstName': 'Thoma...","[{'tag': 'ipbes-global_chapter2.1_drivers'}, {...","[AU, NO, JP, US]","[Decoupling (probability), Natural resource, G...","[{'id': '8', 'name': 'Decent work and economic...","[Environmental Impact and Sustainability, Glob...",True,2013,"[[Thomas Wiedmann, [AU]], [Heinz Schandl, [AU]...","[[https://ror.org/03qn8fb07, AU], [https://ror...",[[Commonwealth Scientific and Industrial Resea...,Proceedings of the National Academy of Sciences,0027-8424,article,journal-article,2015,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6289,10.1111/cobi.13056,Dynamics in the global protected‐area estate s...,06/2019,"[{'creatorType': 'author', 'firstName': 'Edwar...",[{'tag': 'ipbes-global_chapter6'}],"[CH, GB, DK]","[Protected area, Realm, Environmental resource...","[{'id': '14', 'name': 'Life below water'}]","[Conservation, Biodiversity, and Resource Mana...",True,2017,"[[Edward Lewis, [GB]], [Brian MacSharry, [GB]]...","[[https://ror.org/04570b518, GB], [https://ror...","[[World Conservation Monitoring Centre, GB], [...",Conservation Biology,0888-8892,article,journal-article,2019,False
6291,10.1111/faf.12044,Evaluating the relative conservation value of ...,03/2015,"[{'creatorType': 'author', 'firstName': 'Marij...",[{'tag': 'ipbes-global_chapter6'}],[GB],"[Marine protected area, Fishing, Biomass (ecol...","[{'id': '14', 'name': 'Life below water'}]","[Coral and Marine Ecosystems Studies, Marine a...",True,2013,"[[Marija Sciberras, [GB]], [Stuart R. Jenkins,...","[[https://ror.org/006jb1a24, GB], [https://ror...","[[Bangor University, GB], [Bangor University, ...",Fish and Fisheries,1467-2960,article,journal-article,2015,False
6344,10.1177/0013916510383238,"The Effect of Trees on Crime in Portland, Oregon",01/2012,"[{'creatorType': 'author', 'firstName': 'Geoff...",[{'tag': 'ipbes-global_chapter6'}],[US],"[Criminology, Violent crime, Property crime, P...","[{'id': '16', 'name': 'Peace, justice, and str...","[Urban Green Space and Health, Land Use and Ec...",True,2010,"[[Geoffrey H. Donovan, [US]], [Jeffrey P. Pres...","[[https://ror.org/03zmjc935, US], [https://ror...","[[US Forest Service, US], [Southern Research S...",Environment and Behavior,0013-9165,article,journal-article,2012,False
6375,10.1108/17561451011087337,Land and rivers can own themselves,2017,"[{'creatorType': 'author', 'firstName': 'M.', ...",[{'tag': 'ipbes-global_chapter6'}],[GB],"[Procurement, Building information modeling, O...",,"[BIM and Construction Integration, Constructio...",True,2010,"[[Brodie McAdam, [GB]]]","[[https://ror.org/01tmqtf75, GB]]","[[University of Salford, GB]]",International Journal of Law in the Built Envi...,1756-1450,article,journal-article,2017,False


In [24]:
# Keep the references that passed the check and have a non-missing is_OA_available flag.
passed_check = df_ipbes['check']
has_oa_flag = df_ipbes['is_OA_available'].notna()
df_ipbes = df_ipbes[passed_check & has_oa_flag]

In [25]:
# The check column has served its purpose as a filter; remove it before exporting.
del df_ipbes['check']

In [26]:
def _as_country_list(countries):
    """Return ['None'] for a missing or empty country value, otherwise the value unchanged."""
    # Missing values show up as nan, [None] or []; the literal string 'None' is treated the same way.
    if str(countries) in ('nan', '[None]', '[]') or countries == 'None':
        return ['None']
    return countries


# Every reference ends up with a list of countries, ['None'] standing for "unknown".
df_ipbes.loc[:,'countries'] = df_ipbes.loc[:,'countries'].apply(_as_country_list)

In [28]:
# Export the filtered references as JSON Lines (one record per line) to data_ipbes_visualization.jsonl.
# NOTE(review): the export to data_model_ipbes\data_ipbes.jsonl below is commented out, yet a later cell
# reads that file — confirm it is produced elsewhere or re-enable this line.
#df_ipbes.to_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipbes\\data_ipbes.jsonl', orient='records',lines=True)
df_ipbes.to_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes_visualization.jsonl', orient='records',lines=True)

Gather the data used to build the learning model (IPBES-related or not)

In [None]:
# Load the IPBES references prepared for the model (JSON Lines) and drop the rows without a year.
df_ipbes=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipbes\\data_ipbes.jsonl', lines= True).dropna(subset=['year'])

In [None]:
# Keep one row per DOI with a numeric year and every field the model needs, then count references per year.
df_ipbes['year'] = pd.to_numeric(df_ipbes['year'], errors='coerce')
# The enrichment cell names the column 'locations_ids' whereas the non-IPBES table uses 'locations_id'.
# Align on the latter when only the plural form is present, so the dropna below cannot fail with a KeyError.
if 'locations_id' not in df_ipbes.columns and 'locations_ids' in df_ipbes.columns:
    df_ipbes = df_ipbes.rename(columns={'locations_ids': 'locations_id'})
df_ipbes=df_ipbes.dropna(subset=['year','doi', 'title','topics','locations_names','locations_id']).drop_duplicates(subset=['doi'])
# Rows are already unique per DOI and free of missing doi/year here, so no further de-duplication is needed.
year=df_ipbes['year'].sort_values().apply(lambda x: int(x))
# Mapping year -> number of IPBES references published that year.
year_counts = pd.Series(list(year)).value_counts().to_dict()

In [None]:
# Distinct, non-missing DOIs of the IPBES references.
ipbes_doi_column = df_ipbes['doi']
dois = list(ipbes_doi_column.dropna().drop_duplicates())

In [None]:
# Total number of IPBES references across all years (display only).
sum(list(year_counts.values()))

In [None]:
# Per-year counter for the non-IPBES works; it is initialised to 0 per year in the sampling loop.
year_counts_not_ipbes={}

In [None]:
# The 50 most frequent values of the topics column (display only).
df_ipbes.topics.value_counts()[:50]

In [None]:
# For every publication year, collect as many non-IPBES works as there are IPBES references for that year.
# Each year's cache entry is reset to an empty list first, so entries loaded from the pickle are not reused here.
for year in list(year_counts.keys()):   
    cached_openalex_data_not_ipbes[year]=[]
    year_counts_not_ipbes[year]=0
    # NOTE(review): the loop only ends if the helper increases year_counts_not_ipbes[year];
    # presumably it also appends works to cached_openalex_data_not_ipbes[year] and skips the DOIs in `dois` — confirm.
    while year_counts_not_ipbes[year]<year_counts[year]:
        get_open_alex_data_not_in_references(dois,cached_openalex_data_not_ipbes,year_counts,year_counts_not_ipbes,year,ipcc="ipbes")
    # Trim any surplus so the year holds at most year_counts[year] works.
    cached_openalex_data_not_ipbes[year]=cached_openalex_data_not_ipbes[year][:year_counts[year]]

In [None]:
# Total number of IPBES references, to compare with the number of sampled non-IPBES works (display only).
sum(list(year_counts.values()))

In [None]:
# Number of items once the per-year lists are passed through aplatir (presumably a flatten helper) — display only.
len(aplatir(list(cached_openalex_data_not_ipbes.values())))


In [None]:
# Number of keys (years) in the non-IPBES cache (display only).
len(cached_openalex_data_not_ipbes)

In [None]:
# Save the non-IPBES OpenAlex cache to its pickle file so later runs can reload it.
write_cache(cached_openalex_data_not_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\caches\\cached_openalex_data_not_ipbes.pkl')

In [None]:
# Build the table of non-IPBES works from the per-year OpenAlex cache.

# Flatten the per-year lists once. The previous version rebuilt this list for the range() call and again
# on every iteration, which made the loop quadratic in the number of works.
not_ipbes_works = aplatir(list(cached_openalex_data_not_ipbes.values()))

# Target columns, listed in the order in which they are added to the frame.
not_ipbes_columns = (
    'doi', 'title', 'countries', 'concepts', 'sdg', 'topics', 'is_OA_available',
    'year', 'authors_name', 'rors', 'institutions_names', 'locations_names',
    'locations_id',
)
not_ipbes_values = {column: [] for column in not_ipbes_columns}

for k in range(len(not_ipbes_works)):
    # NOTE(review): 13 values are unpacked here, while the IPBES cell unpacks 15 from the same helper
    # with ipcc=True — confirm the helper returns 13 values when ipcc=False.
    (countries, concepts, sdg, year, topics, doi, bool_OA, title, name, rors,
     institutions_names, locations_names,
     locations_ids) = get_countries_concepts_sdg(cached_openalex_data=not_ipbes_works, ipcc=False, i=k)

    extracted = (
        doi, title, countries, concepts, sdg, topics, bool_OA,
        year, name, rors, institutions_names, locations_names,
        locations_ids,
    )
    for column, value in zip(not_ipbes_columns, extracted):
        not_ipbes_values[column].append(value)

# Columns are added one by one to an empty frame so their order matches not_ipbes_columns.
df_not_ipbes = pd.DataFrame()
for column in not_ipbes_columns:
    df_not_ipbes[column] = not_ipbes_values[column]

In [None]:
# Display the assembled non-IPBES table.
df_not_ipbes

In [None]:
# Export the non-IPBES works as JSON Lines (one record per line) next to the IPBES model data.
df_not_ipbes.to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipbes\\data_not_ipbes.jsonl', orient='records', lines= True)