In [1]:
from pyzotero import zotero
import os
import sys
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()

# Make the parent of the current working directory importable, so that the
# project-local `code_utils` package imported just below can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from code_utils.zotero import get_data_from_zotero
from code_utils.glutton import get_doi_glutton
from code_utils.utils import get_doi_cleaned,aplatir,wg_chap_to_dict
from code_utils.pickle import load_cache,write_cache
from code_utils.enriching_data_OpenAlex import get_countries_concepts_sdg,get_open_alex_data,get_open_alex_data_not_in_references

In [2]:
# In-memory OpenAlex caches, created empty here and (re)filled by the cells below:
# one for the IPBES references, one for the works that are not IPBES references.
cached_openalex_data_ipbes = dict()
cached_openalex_data_not_ipbes = dict()

In [3]:
# Load both OpenAlex caches from disk. If loading fails (for example on a first
# run, before the cache files exist), write the current in-memory caches out
# instead so that the files exist for the later cells.
# NOTE(review): the .pkl files are presumably unpickled by `load_cache`; only
# load cache files that were produced by this project.
# NOTE(review): the backslash-separated paths are Windows-specific.
try:
    cached_openalex_data_ipbes = load_cache(
        cached_openalex_data_ipbes,
        module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_ipbes.pkl',
    )
    # Each cache is loaded into its own dictionary: the non-IPBES file must be
    # paired with the non-IPBES dict, not with the IPBES one.
    cached_openalex_data_not_ipbes = load_cache(
        cached_openalex_data_not_ipbes,
        module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipbes.pkl',
    )
except Exception as load_error:
    # `except Exception` rather than a bare `except`, so that a kernel
    # interrupt (KeyboardInterrupt) is not silently turned into a cache rewrite.
    print(f'Could not load the OpenAlex caches ({load_error!r}); writing the in-memory caches to disk instead.')
    write_cache(
        cached_openalex_data_ipbes,
        module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_ipbes.pkl',
    )
    write_cache(
        cached_openalex_data_not_ipbes,
        module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipbes.pkl',
    )

6608 data in cached openalex data
70 data in cached openalex data


Read the IPBES references from the Zotero collections

In [4]:
# Zotero API key, read from the environment (populated from a .env file by
# load_dotenv() in the first cell); None when the variable is not set.
ZOTERO_KEY = os.environ.get('ZOTERO_KEY')

# Identifier of the Zotero group library that is queried.
IPBES_ZOTERO_ID = '2333077'

# Zotero collection keys and the chapter label of each one.
# The two lists are parallel: COLLECTION_IDS[i] goes with CHAPTER_IDS[i].
COLLECTION_IDS = [
    '8DQ8YFJI',
    'JR5LKU4U',
    'LBFNF62B',
    '7IMYH9U3',
    'DRZLUY9K',
    'KTPTCAVF',
    'UWNSUNXQ',
    'BJGWRJAK',
    'JT8FJXP6',
]
CHAPTER_IDS = [
    '1',
    '2.1',
    '2.2',
    '2.3',
    '3',
    '4',
    '5',
    '6',
    'glossary',
]

# Container that receives the downloaded collections.
ALL_COLLECTIONS = dict()

In [5]:
# Download the items of every configured Zotero collection (network access to
# api.zotero.org). The next cell reads the result under the keys
# 'collection_chapter_<chapter id>'.
ALL_COLLECTIONS=get_data_from_zotero(ZOTERO_KEY,IPBES_ZOTERO_ID,COLLECTION_IDS,CHAPTER_IDS,ALL_COLLECTIONS)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/8DQ8YFJI/items?start=0&limit=100&format=json HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/8DQ8YFJI/items?start=100&limit=100&format=json HTTP/1.1" 200 31537
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/8DQ8YFJI/items?start=200&limit=100&format=json HTTP/1.1" 200 2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.zotero.org:443
DEBUG:urllib3.connectionpool:https://api.zotero.org:443 "GET /groups/2333077/collections/JR5LKU4U/items?start=0&limit=100&format=json HTTP/1.1" 200 30984
DEBUG:urllib3.connectionpool:Starting new HTTPS conne

In [6]:
# Build one table of references per chapter and report, for each chapter,
# the percentage of references that already come with a DOI.
df_ipbes_chap = {}
kept_fields = ['DOI', 'title', 'date', 'creators', 'tags']
for position in range(len(COLLECTION_IDS)):
    chapter_id = CHAPTER_IDS[position]

    # The bibliographic fields of each downloaded item sit in its `data` entry.
    raw_items = pd.DataFrame(ALL_COLLECTIONS[f'collection_chapter_{chapter_id}'])
    chapter_refs = pd.DataFrame(list(raw_items['data']))[kept_fields]
    df_ipbes_chap[chapter_id] = chapter_refs
    chapter_refs['chap'] = f'chap_{chapter_id}'

    refs_with_doi = chapter_refs[~pd.isna(chapter_refs['DOI'])]
    print(f'{len(refs_with_doi)/len(chapter_refs)*100} % doi available for chapter {chapter_id}')

63.2258064516129 % doi available for chapter 1
71.26436781609196 % doi available for chapter 2.1
79.26829268292683 % doi available for chapter 2.2
79.60199004975125 % doi available for chapter 2.3
81.64682539682539 % doi available for chapter 3
85.9103385178408 % doi available for chapter 4
73.2587064676617 % doi available for chapter 5
81.23682361208714 % doi available for chapter 6
37.90613718411552 % doi available for chapter glossary


Clean and enrich data 

In [7]:
# Stack the per-chapter tables into one table with a fresh 0..n-1 index
# (the per-chapter row numbers are not kept).
df_ipbes = pd.concat(
    [df_ipbes_chap[chapter] for chapter in df_ipbes_chap],
    ignore_index=True,
)

In [8]:
# Percentage of all references that have a DOI, before the missing ones are looked up.
len(df_ipbes[~pd.isna(df_ipbes.DOI)])/len(df_ipbes)*100

77.36948241093435

In [9]:
# For every reference without a DOI, ask the Glutton lookup (network access)
# for one; each such row is passed whole to `get_doi_glutton`.
missing_doi = df_ipbes['DOI'].isna()
df_ipbes.loc[missing_doi, 'DOI'] = (
    df_ipbes
    .loc[missing_doi]
    .progress_apply(get_doi_glutton, axis=1)
)

  0%|          | 0/2020 [00:00<?, ?it/s]

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): cloud.science-miner.com:443
DEBUG:urllib3.connectionpool:https://cloud.science-miner.com:443 "GET /glutton/service/lookup?atitle=World%20Social%20Science%20Report%202016,%20Challenging%20Inequalities:%20Pathways%20to%20a%20Just%20World&firstAuthor=None HTTP/1.1" 404 76
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): cloud.science-miner.com:443
DEBUG:urllib3.connectionpool:https://cloud.science-miner.com:443 "GET /glutton/service/lookup?atitle=The%20assessment%20report%20of%20the%20Intergovernmental%20Science-Policy%20Platform%20on%20Biodiversity%20and%20Ecosystem%20Services%20on%20pollinators,%20pollination%20and%20food%20production&firstAuthor=Potts HTTP/1.1" 404 76
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): cloud.science-miner.com:443
DEBUG:urllib3.connectionpool:https://cloud.science-miner.com:443 "GET /glutton/service/lookup?atitle=The%20Environmentalism%20of%20the%20Poor:%20A%20Stu

In [10]:
# Percentage of all references that have a DOI once the lookup above has filled in missing ones.
len(df_ipbes[~pd.isna(df_ipbes.DOI)])/len(df_ipbes)*100

89.27851221151691

In [11]:
# Normalise every DOI with the project helper.
df_ipbes.loc[:, 'DOI'] = df_ipbes.loc[:, 'DOI'].apply(get_doi_cleaned)

# A value that still starts with "http" after cleaning is discarded (set to None).
df_ipbes.loc[:, 'DOI'] = df_ipbes.loc[:, 'DOI'].apply(
    lambda value: None if str(value).startswith('http') else value
)

# Lower-case column name used by the later cells, plus a per-row counter
# that is summed when rows are grouped by DOI.
# NOTE: after this cell the 'DOI' column no longer exists, so the cell
# cannot be re-run without re-running the cells above it.
df_ipbes['freq'] = 1
df_ipbes = df_ipbes.rename(columns={'DOI': 'doi'})

Enriching data through OpenAlex API integration

In [12]:
# One row per distinct DOI (rows without a DOI are dropped): the list of
# chapters citing it and the number of times it is cited.
df_unique_doi = (
    df_ipbes
    .groupby(by=['doi'], dropna=True)
    .agg({'chap': list, 'freq': 'sum'})
    .reset_index()
)

In [13]:
# Get the OpenAlex data of every unique DOI from the API.
# The returned Series is only displayed, not stored; the call is presumably made
# for its effect on `cached_openalex_data_ipbes`, whose size is checked and which
# is written to disk in the next cells — TODO confirm against the helper.
df_unique_doi.apply(lambda row: get_open_alex_data(cached_openalex_data_ipbes,row['doi']), axis= 1)

0       [{'id': 'https://openalex.org/W1970666665', 'd...
1       [{'id': 'https://openalex.org/W2077652067', 'd...
2       [{'id': 'https://openalex.org/W2320183287', 'd...
3       [{'id': 'https://openalex.org/W1487177596', 'd...
4       [{'id': 'https://openalex.org/W1546680881', 'd...
                              ...                        
6599    [{'id': 'https://openalex.org/W2890993837', 'd...
6600    [{'id': 'https://openalex.org/W2029219906', 'd...
6601    [{'id': 'https://openalex.org/W2221413160', 'd...
6602    [{'id': 'https://openalex.org/W3124428460', 'd...
6603    [{'id': 'https://openalex.org/W3137011924', 'd...
Length: 6604, dtype: object

In [14]:
# Number of entries currently held in the IPBES OpenAlex cache.
len(cached_openalex_data_ipbes)

6608

In [None]:
# Persist the IPBES OpenAlex cache to the same file it is loaded from at the
# top of the notebook. NOTE: the backslash-separated path is Windows-specific.
write_cache(cached_openalex_data_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_ipbes.pkl')

In [15]:
# Values extracted from the cached OpenAlex data, collected per target column.
# The key order is the order in which the columns are added to the table.
enrichment = {
    'countries': [],
    'concepts': [],
    'sdg': [],
    'year': [],
    'topics': [],
    'is_OA_available': [],
}

for _, doi_row in df_unique_doi.iterrows():
    # The helper also returns the DOI (6th value), which is not needed here.
    countries, concepts, sdg, year, topics, _doi, bool_OA = get_countries_concepts_sdg(
        cached_openalex_data_ipbes, doi_row, ipcc=True, i=0
    )
    enrichment['countries'].append(countries)
    enrichment['concepts'].append(concepts)
    enrichment['sdg'].append(sdg)
    enrichment['year'].append(year)
    enrichment['topics'].append(topics)
    enrichment['is_OA_available'].append(bool_OA)

# Attach one new column per collected field.
for column_name, column_values in enrichment.items():
    df_unique_doi[column_name] = column_values

In [20]:
# Keep only the DOIs flagged `is_OA_available`, then reduce to one row per DOI
# holding the flattened chapter list and the first countries / topics value.
oa_available_rows = df_unique_doi[df_unique_doi['is_OA_available']]
dd = (
    oa_available_rows
    .dropna(subset=['doi'])
    .groupby(['doi'])
    .agg({
        'chap': lambda chapters: aplatir(list(chapters)),
        'countries': lambda values: list(values)[0],
        'topics': lambda values: list(values)[0],
    })
    .reset_index()
)

Visualization of the contribution of each country to the IPBES references

In [17]:
# `dd` carries its own 'chap' column (the chapter list per DOI); removing the
# per-row one here means the merge below produces a single 'chap' column.
# NOTE: not re-runnable on its own — a second run raises KeyError.
del df_ipbes['chap']

In [21]:
# Preview the columns that the merge of the references with `dd` will produce.
pd.merge(df_ipbes, dd, on='doi', how='inner').columns

Index(['doi', 'title', 'date', 'creators', 'tags', 'freq', 'chap', 'countries',
       'topics'],
      dtype='object')

In [23]:
# Attach chap / countries / topics from `dd` to each reference. The inner join
# keeps only the references whose DOI is present in `dd`.
# NOTE: `df_ipbes` is overwritten, so running this cell twice merges twice.
df_ipbes=pd.merge(df_ipbes, dd, on='doi', how='inner')

In [24]:
def _countries_or_placeholder(value):
    """Return ['None'] for an empty country value, else the value unchanged.

    A value counts as empty when its string form is 'nan', '[None]' or '[]',
    or when it is the string 'None'.
    """
    if str(value) in ('nan', '[None]', '[]'):
        return ['None']
    if value == 'None':
        return ['None']
    return value


# Replace empty country values with the ['None'] placeholder list.
df_ipbes.loc[:, 'countries'] = df_ipbes.loc[:, 'countries'].apply(_countries_or_placeholder)

In [25]:
# Export the enriched references as a JSON array of records.
# NOTE: the backslash-separated path is Windows-specific.
df_ipbes.to_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipbes_visualization.json', orient='records')

In [None]:
# Columns of the enriched reference table.
df_ipbes.columns

In [None]:
# Number of references per country, most frequent first. The 'None' placeholder
# (references without country information) is left out; errors='ignore' keeps
# the cell from raising KeyError when no reference carries the placeholder.
data_counts = (
    pd.Series(aplatir(list(df_ipbes['countries'])))
    .value_counts()
    .drop('None', errors='ignore')
)

In [None]:
# Bar chart of the 20 most frequent countries, each bar annotated with its count.
# The bars are drawn once, on an explicit Axes, so the annotations and labels
# are guaranteed to land on the same plot.
top_countries = data_counts[:20]
fig, ax = plt.subplots(figsize=(10, 6))
top_countries.plot(kind='bar', ax=ax)
for position, count in enumerate(top_countries):
    # Put the count just above the top of its bar.
    ax.text(position, count + 0.1, str(count), ha='center', va='bottom')
ax.set_xlabel('Country')
ax.set_ylabel('Number of references')
ax.set_title('Top 20 countries in the IPBES references')
plt.show()

Find the data for constructing the learning model (IPBES-related or not)

In [None]:
# Number of distinct IPBES references per publication year.
#
# The merge that builds `df_ipbes` only brings in chap / countries / topics, so
# the table has no 'year' column at this point; the publication year lives in
# `df_unique_doi` (one row per DOI). Bring it over by DOI when it is missing,
# otherwise the lines below raise KeyError: 'year'.
if 'year' not in df_ipbes.columns:
    year_by_doi = df_unique_doi.set_index('doi')['year']
    df_ipbes['year'] = df_ipbes['doi'].map(year_by_doi)

# Non-numeric years become NaN and are dropped before counting.
df_ipbes['year'] = pd.to_numeric(df_ipbes['year'], errors='coerce')
year=df_ipbes.drop_duplicates(subset='doi')['year'].dropna().sort_values().apply(lambda x: int(x))
year_counts = pd.Series(list(year)).value_counts().to_dict()

In [None]:
# All distinct DOIs of the IPBES references, as a plain list.
dois = df_unique_doi['doi'].tolist()

In [None]:
# Per-year counter of the non-IPBES works collected so far (filled by the next cell).
year_counts_not_ipbes={}
# Number of distinct IPBES DOIs. NOTE(review): not used by any later cell shown here.
target_iterations = len(dois)

In [None]:
# For every publication year seen in the IPBES references, collect works that
# are not IPBES references until that year has as many as the IPBES set.
# NOTE(review): each year starts from an empty list, so entries loaded from the
# cache file for that year are discarded and fetched again.
# NOTE(review): the loop relies on the helper advancing
# `year_counts_not_ipbes[year]`; it never ends if the helper stops adding works.
for year in list(year_counts.keys()):
    cached_openalex_data_not_ipbes[year] = []
    year_counts_not_ipbes[year] = 0
    while year_counts_not_ipbes[year] < year_counts[year]:
        get_open_alex_data_not_in_references(dois,cached_openalex_data_not_ipbes,year_counts,year_counts_not_ipbes,year)
    # Trim to exactly the IPBES count of the year. A slice end is exclusive, so
    # `[:n]` keeps n items; `[:n+1]` would keep one work too many per year.
    cached_openalex_data_not_ipbes[year] = cached_openalex_data_not_ipbes[year][:year_counts[year]]

In [None]:
# Number of keys (publication years) in the non-IPBES cache.
len(cached_openalex_data_not_ipbes)

In [None]:
# Persist the non-IPBES OpenAlex cache to the same file it is loaded from at the
# top of the notebook. NOTE: the backslash-separated path is Windows-specific.
write_cache(cached_openalex_data_not_ipbes,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipbes.pkl')

In [None]:
# Build the table of works that are NOT IPBES references, one row per cached work.

# Flatten the per-year cache into one list a single time. Recomputing the
# flattened list for the loop bound and again on every iteration would make the
# loop quadratic in the number of cached works.
not_ipbes_works = aplatir(list(cached_openalex_data_not_ipbes.values()))

df_not_ipbes = pd.DataFrame()
year_list = []
countries_list = []
concepts_list = []
sdg_list = []
topics_list = []
dois_list = []
for k in range(len(not_ipbes_works)):
    # With ipcc=False the helper is given the flat list and the position `k`
    # of the work to read, and returns six values (no availability flag).
    countries,concepts,sdg,year,topics,doi=get_countries_concepts_sdg(cached_openalex_data=not_ipbes_works,ipcc=False,i=k)

    countries_list.append(countries)
    concepts_list.append(concepts)
    sdg_list.append(sdg)
    year_list.append(year)
    topics_list.append(topics)
    dois_list.append(doi)

df_not_ipbes['countries'] = countries_list
df_not_ipbes['concepts'] = concepts_list
df_not_ipbes['sdg'] = sdg_list
df_not_ipbes['year'] = year_list
df_not_ipbes['topics'] = topics_list
df_not_ipbes['doi'] = dois_list

In [None]:
# Display the table of non-IPBES works built above.
df_not_ipbes

In [None]:
#df_not_ipbes.to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_not_ipbes.jsonl', orient='records', lines= True)