In [1]:
from pyzotero import zotero
import os
import sys
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()

# Make the parent directory importable so that the `code_utils` package
# (imported just below) can be resolved from this notebook.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
from code_utils.zotero import get_data_from_zotero
from code_utils.glutton import get_doi_glutton
from code_utils.utils import get_doi_cleaned,aplatir,wg_chap_to_dict,get_year_ipbes,check_doi_glutton
from code_utils.pickle import load_cache,write_cache
from code_utils.enriching_data_OpenAlex import get_countries_concepts_sdg,get_open_alex_data,get_open_alex_data_not_in_references

In [2]:
# Two independent in-memory caches for OpenAlex records: one for works that
# are cited in the IPBES references, one for works that are not.
cached_openalex_data_ipbes, cached_openalex_data_not_ipbes = {}, {}

In [3]:
# Load both OpenAlex caches from disk (best effort: the notebook keeps
# running with the caches as they are when a file cannot be read).
# Paths are built with os.path.join so the notebook also runs on
# non-Windows systems; on Windows the resulting paths are unchanged.
structured_data_dir = os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data')
try:
    cached_openalex_data_ipbes = load_cache(
        cached_openalex_data_ipbes,
        os.path.join(structured_data_dir, 'cached_openalex_data_ipbes.pkl'),
    )
    # The "not IPBES" cache is loaded into its own dict; the IPBES cache was
    # previously passed here by mistake.
    cached_openalex_data_not_ipbes = load_cache(
        cached_openalex_data_not_ipbes,
        os.path.join(structured_data_dir, 'cached_openalex_data_not_ipbes.pkl'),
    )
except Exception as err:
    #write_cache(cached_openalex_data_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_ipbes.pkl')
    #write_cache(cached_openalex_data_not_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipbes.pkl')
    # Report why loading failed instead of hiding it; a bare `except:` would
    # also swallow KeyboardInterrupt.
    print(f'oups: could not load the OpenAlex caches ({err!r})')

6608 data in cached openalex data
70 data in cached openalex data


Read the IPBES references from the Zotero collections

In [4]:
# Zotero access settings. The API key is read from the environment (see the
# load_dotenv() call in the import cell) so it never appears in the notebook.
ZOTERO_KEY = os.environ.get('ZOTERO_KEY')
IPBES_ZOTERO_ID = '2333077'

# One Zotero collection per chapter label.
# NOTE(review): the two lists are presumably paired by position
# (COLLECTION_IDS[i] <-> CHAPTER_IDS[i]) -- confirm against get_data_from_zotero.
COLLECTION_IDS = [
    '8DQ8YFJI',
    'JR5LKU4U',
    'LBFNF62B',
    '7IMYH9U3',
    'DRZLUY9K',
    'KTPTCAVF',
    'UWNSUNXQ',
    'BJGWRJAK',
    'JT8FJXP6',
]
CHAPTER_IDS = [
    '1',
    '2.1',
    '2.2',
    '2.3',
    '3',
    '4',
    '5',
    '6',
    'glossary',
]

# Filled by the next cell.
ALL_COLLECTIONS = dict()

In [5]:
# Download the items of every configured Zotero collection.
# The next cell reads the result under keys of the form
# 'collection_chapter_<chapter id>'.
# NOTE(review): this performs network calls to api.zotero.org (see the
# captured log below) and needs ZOTERO_KEY to be set.
ALL_COLLECTIONS=get_data_from_zotero(ZOTERO_KEY,IPBES_ZOTERO_ID,COLLECTION_IDS,CHAPTER_IDS,ALL_COLLECTIONS)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/8DQ8YFJI/items?start=0&limit=100&format=json HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/8DQ8YFJI/items?start=100&limit=100&format=json HTTP/1.1" 200 31551
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/8DQ8YFJI/items?start=200&limit=100&format=json HTTP/1.1" 200 2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/JR5LKU4U/items?start=0&limit=100&format=json HTTP/1.1" 200 30948
DEBUG:urllib3.connectionpool:Starting new HTTPS conne

In [6]:
# Build one references table per chapter, keyed by chapter id, and report
# which share of each chapter's references already carries a DOI.
df_ipbes_chap = {}
for position, _collection_id in enumerate(COLLECTION_IDS):
    chapter_id = CHAPTER_IDS[position]

    # Each downloaded item keeps its bibliographic fields under 'data'.
    zotero_items = pd.DataFrame(ALL_COLLECTIONS[f'collection_chapter_{chapter_id}'])
    chapter_frame = pd.DataFrame(list(zotero_items['data']))[['DOI', 'title', 'date', 'creators', 'tags']]

    df_ipbes_chap[chapter_id] = chapter_frame
    df_ipbes_chap[chapter_id]['chap'] = f'chap_{chapter_id}'

    with_doi = chapter_frame[chapter_frame['DOI'].notna()]
    print(f'{len(with_doi)/len(chapter_frame)*100} % doi available for chapter {chapter_id}')

63.2258064516129 % doi available for chapter 1
71.26436781609196 % doi available for chapter 2.1
79.26829268292683 % doi available for chapter 2.2
79.60199004975125 % doi available for chapter 2.3
81.64682539682539 % doi available for chapter 3
85.9103385178408 % doi available for chapter 4
73.2587064676617 % doi available for chapter 5
81.23682361208714 % doi available for chapter 6
37.90613718411552 % doi available for chapter glossary


Clean and enrich data 

In [7]:
# Stack all chapter tables into a single frame with a fresh 0..n-1 index
# (the per-chapter index values are discarded).
chapter_frames = list(df_ipbes_chap.values())
df_ipbes = pd.concat(chapter_frames).reset_index(drop=True)

In [8]:
# Share (in %) of all references that carry a DOI.
references_with_doi = df_ipbes[df_ipbes['DOI'].notna()]
len(references_with_doi) / len(df_ipbes) * 100

77.36948241093435

In [11]:
# Preview the distinct DOI values.
# The column is named 'DOI' until the cleaning cell renames it to 'doi', so
# pick whichever name exists: this keeps the cell working both when the
# notebook is run top to bottom and when it is run after the rename.
doi_column = 'doi' if 'doi' in df_ipbes.columns else 'DOI'
df_ipbes[doi_column].drop_duplicates()

0       10.1890/1051-0761(2000)010[1251:roteka]2.0.co;2
1                                                  None
2                       10.1016/j.gloenvcha.2013.12.012
3                               10.5751/es-07868-200344
9                              10.1177/0959683607087927
                             ...                       
8907                         10.1016/j.tree.2017.06.014
8910                                    10.2307/2388682
8913                                    10.2307/3060213
8914                        10.1016/bs.apar.2016.02.009
8915                         10.2993/0278-0771-34.3.294
Name: doi, Length: 5691, dtype: object

In [None]:
# Try the Glutton DOI lookup on a single reference (second row of the frame).
sample_reference = df_ipbes.iloc[1]
get_doi_glutton(sample_reference)

In [None]:
# For every reference without a DOI, ask Glutton for one (row by row, with a
# progress bar) and write the answer back into the 'DOI' column.
missing_doi = df_ipbes['DOI'].isna()
df_ipbes.loc[missing_doi, 'DOI'] = df_ipbes.loc[missing_doi, :].progress_apply(get_doi_glutton, axis=1)

In [None]:
# Share (in %) of all references that carry a DOI after the Glutton lookup.
references_with_doi = df_ipbes[df_ipbes['DOI'].notna()]
len(references_with_doi) / len(df_ipbes) * 100

In [9]:
def _drop_url_like(doi_value):
    """Return None when the value starts with 'http', else the value unchanged."""
    if str(doi_value).startswith('http'):
        return None
    return doi_value


# Normalise every DOI with the project helper, discard values that are URLs
# rather than DOIs, then add a unit frequency column and rename 'DOI' -> 'doi'.
df_ipbes.loc[:, 'DOI'] = df_ipbes.loc[:, 'DOI'].apply(get_doi_cleaned)
df_ipbes.loc[:, 'DOI'] = df_ipbes.loc[:, 'DOI'].apply(_drop_url_like)
df_ipbes = df_ipbes.assign(freq=1).rename(columns={'DOI': 'doi'})

Enriching data through OpenAlex API integration

In [None]:
# One row per DOI (rows without a DOI are dropped): the list of chapters
# citing it and the number of times it is cited.
df_unique_doi = (
    df_ipbes
    .groupby(by=['doi'], dropna=True)
    .agg({'chap': list, 'freq': 'sum'})
    .reset_index()
)

In [None]:
def _fetch_openalex(reference):
    """Query OpenAlex for the DOI of one row, using the IPBES cache."""
    return get_open_alex_data(cached_openalex_data_ipbes, reference['doi'])


# Fetch the OpenAlex record of every unique DOI from the API.
df_unique_doi.apply(_fetch_openalex, axis=1)

In [None]:
# Number of entries currently held in the IPBES OpenAlex cache.
len(cached_openalex_data_ipbes)

In [None]:
#write_cache(cached_openalex_data_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_ipbes.pkl')

In [12]:
# Reload the references table from the JSON export.
# The path is built with os.path.join so it resolves on every OS; on Windows
# it is identical to the previous backslash-concatenated path.
# NOTE(review): this file is written by a later cell of this notebook, so a
# fresh run needs an existing export on disk -- confirm this is intended.
ipbes_visualization_path = os.path.join(
    module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_ipbes_visualization.json'
)
df_ipbes = pd.read_json(ipbes_visualization_path)

In [13]:
# Single-column frame holding every distinct, non-missing DOI.
distinct_dois = df_ipbes['doi'].drop_duplicates().dropna().tolist()
df_unique_doi = pd.DataFrame({'doi': distinct_dois})

In [14]:
# Collect, for every unique DOI, the fields extracted from the cached
# OpenAlex record, then attach them to df_unique_doi as new columns.
# Keys are the target column names, in the order the columns are created.
enriched_columns = {
    'countries': [],
    'concepts': [],
    'sdg': [],
    'topics': [],
    'is_OA_available': [],
    'year_OA': [],
    'authors_name': [],
    'rors': [],
    'institutions_names': [],
    'locations_names': [],
}
# Titles are collected as well but are not written to the frame.
title_list = []

for _, doi_row in df_unique_doi.iterrows():
    (countries, concepts, sdg, year, topics, _doi, bool_OA, title,
     name, rors, institutions_names, locations_names) = get_countries_concepts_sdg(
        cached_openalex_data_ipbes, doi_row, ipcc=True, i=0)

    title_list.append(title)
    # Same order as the keys of enriched_columns.
    extracted = (countries, concepts, sdg, topics, bool_OA, year,
                 name, rors, institutions_names, locations_names)
    for column, value in zip(enriched_columns, extracted):
        enriched_columns[column].append(value)

for column, values in enriched_columns.items():
    df_unique_doi[column] = values

In [17]:
# How many unique DOIs have the is_OA_available flag set, per flag value.
df_unique_doi['is_OA_available'].value_counts()

is_OA_available
True    6106
Name: count, dtype: int64

In [18]:
# Keep only the DOIs whose is_OA_available flag is True.
available_mask = df_unique_doi['is_OA_available']
df_unique_doi = df_unique_doi[available_mask]

In [19]:
# Renumber the rows 0..n-1 after the filtering step, discarding the old index.
df_unique_doi = df_unique_doi.reset_index(drop=True)

In [12]:
# Save the enriched unique-DOI table.
# The path is built with os.path.join so it resolves on every OS; on Windows
# it is identical to the previous backslash-concatenated path.
unique_doi_export_path = os.path.join(
    module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_ipbes_visualization2.json'
)
df_unique_doi.to_json(unique_doi_export_path)

IPBES references data

In [20]:
# Attach the OpenAlex enrichment to every reference; the inner join drops
# references whose DOI is not present in df_unique_doi.
reference_fields = ['doi', 'title', 'date', 'creators', 'tags']
df_ipbes = df_ipbes[reference_fields].merge(df_unique_doi, on='doi', how='inner')

In [22]:
def _year_or_none(raw_date):
    """Return get_year_ipbes(raw_date), or None when the date is missing."""
    if pd.notna(raw_date):
        return get_year_ipbes(raw_date)
    return None


# Derive the publication year from the Zotero 'date' field.
df_ipbes['year'] = df_ipbes['date'].apply(_year_or_none)

In [23]:
# Run the project's DOI check on every row and show how many rows pass.
df_ipbes['check'] = df_ipbes.apply(check_doi_glutton, axis=1)
df_ipbes['check'].value_counts()

check
True    6938
Name: count, dtype: int64

In [14]:
# Keep the rows that passed the check and have a non-missing
# is_OA_available value.
passed_check = df_ipbes['check']
has_oa_flag = df_ipbes['is_OA_available'].notna()
df_ipbes = df_ipbes[passed_check & has_oa_flag]

In [15]:
# The helper 'check' column has served its purpose (rows were filtered on it
# in the previous cell); remove it in place.
del df_ipbes['check']

In [16]:
# Textual forms under which a missing country list can show up.
_EMPTY_COUNTRY_REPRS = ('nan', '[None]', '[]')


def _blank_to_none_string(value):
    """Map any empty/missing representation to the string 'None'."""
    if str(value) in _EMPTY_COUNTRY_REPRS:
        return 'None'
    return value


def _wrap_none_string(value):
    """Turn the string 'None' into the one-element list ['None']."""
    return ['None'] if value == 'None' else value


# Two passes: first collapse every "no country" variant to 'None', then wrap
# it in a list so that every cell of the column holds a list.
df_ipbes.loc[:, 'countries'] = df_ipbes.loc[:, 'countries'].apply(_blank_to_none_string)
df_ipbes.loc[:, 'countries'] = df_ipbes.loc[:, 'countries'].apply(_wrap_none_string)

In [17]:
# Save the cleaned and enriched references table, one JSON object per row.
# The path is built with os.path.join so it resolves on every OS; on Windows
# it is identical to the previous backslash-concatenated path.
ipbes_export_path = os.path.join(
    module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_ipbes_visualization.json'
)
df_ipbes.to_json(ipbes_export_path, orient='records')

Find the data for building the learning model (IPBES-related or not)

In [None]:
# Count how many distinct IPBES references were published each year.
# Non-numeric years become NaN and are ignored.
df_ipbes['year'] = pd.to_numeric(df_ipbes['year'], errors='coerce')
unique_references = df_ipbes.drop_duplicates(subset='doi')
year = unique_references['year'].dropna().sort_values().apply(int)
year_counts = pd.Series(year.tolist()).value_counts().to_dict()

In [None]:
# Plain Python list of the DOIs already present in the IPBES references.
dois = df_unique_doi['doi'].tolist()

In [None]:
# Per-year counter of non-IPBES works collected so far (filled by the
# sampling loop in the next cell).
year_counts_not_ipbes = dict()
# Total number of IPBES DOIs.
target_iterations = len(dois)

In [None]:
# For every publication year seen in the IPBES references, collect OpenAlex
# works that are NOT in the IPBES references until their number reaches the
# number of IPBES references for that year.
for year in list(year_counts.keys()):   
    # Start from an empty list and a zero counter for this year; anything
    # already stored under this year in the cache is discarded.
    cached_openalex_data_not_ipbes[year]=[]
    year_counts_not_ipbes[year]=0
    # NOTE(review): presumably the helper appends works to
    # cached_openalex_data_not_ipbes[year] and increments
    # year_counts_not_ipbes[year]; if it ever stops making progress this loop
    # never terminates -- confirm against the helper.
    while year_counts_not_ipbes[year]<year_counts[year]:
        get_open_alex_data_not_in_references(dois,cached_openalex_data_not_ipbes,year_counts,year_counts_not_ipbes,year)
    # NOTE(review): the slice keeps year_counts[year]+1 items, i.e. one more
    # than the loop target -- confirm whether the extra item is intended.
    cached_openalex_data_not_ipbes[year]=cached_openalex_data_not_ipbes[year][:year_counts[year]+1]

In [None]:
# Number of keys in the non-IPBES cache (the sampling loop above stores one
# entry per year).
len(cached_openalex_data_not_ipbes)

In [None]:
# Persist the non-IPBES cache so the sampling does not have to be repeated.
# The path is built with os.path.join so it resolves on every OS; on Windows
# it is identical to the previous backslash-concatenated path.
not_ipbes_cache_path = os.path.join(
    module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'cached_openalex_data_not_ipbes.pkl'
)
write_cache(cached_openalex_data_not_ipbes, not_ipbes_cache_path)

In [None]:
# Build the table of non-IPBES works from the per-year cache.
#
# The per-year lists are flattened ONCE here. The flattening used to be
# recomputed in range() and again on every loop iteration, which made the
# cell quadratic in the number of cached works.
# NOTE(review): assumes get_countries_concepts_sdg only reads the list it
# receives and does not modify it -- confirm against the helper.
not_ipbes_records = aplatir(list(cached_openalex_data_not_ipbes.values()))

df_not_ipbes=pd.DataFrame()
year_list = []
countries_list = []
concepts_list = []
sdg_list = []
topics_list = []
dois_list = []
for k in range(len(not_ipbes_records)):
    # With ipcc=False the helper is given the flat list and the position k
    # of the record to extract.
    countries,concepts,sdg,year,topics,doi=get_countries_concepts_sdg(cached_openalex_data=not_ipbes_records,ipcc=False,i=k)

    countries_list.append(countries)
    concepts_list.append(concepts)
    sdg_list.append(sdg)
    year_list.append(year)
    topics_list.append(topics)
    dois_list.append(doi)

df_not_ipbes['countries'] = countries_list
df_not_ipbes['concepts'] = concepts_list
df_not_ipbes['sdg'] = sdg_list
df_not_ipbes['year'] = year_list
df_not_ipbes['topics'] = topics_list
df_not_ipbes['doi'] = dois_list

In [None]:
# Display the table of non-IPBES works built in the previous cell.
df_not_ipbes

In [None]:
#df_not_ipbes.to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_not_ipbes.jsonl', orient='records', lines= True)