In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import requests

# Absolute path of the parent directory; it is added to sys.path (once) so the
# project-local imports that follow can be resolved.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
from code_utils.read_ipcc_bib import read_bib_wg
from code_utils.enriching_data_OpenAlex import get_countries_concepts_sdg,get_publi_not_in_ipcc,get_open_alex_data
from code_utils.utils import aplatir,wg_chap_to_dict,get_doi_cleaned
from code_utils.glutton import get_doi_glutton

Read IPCC references in .bib files

In [2]:
# Folder holding one sub-folder of .bib files per AR6 working group. Built with
# os.path.join so the notebook is not tied to Windows path separators.
ar6_bib_dir = os.path.join(module_path, 'IPCC_bibliography', 'AR6')

# Per-working-group state handed to read_bib_wg: empty lists and dicts to be
# filled in, plus the file listing of the matching folder.
wgs = {
    '1': {
        'wg1': [],
        'dataframes_1': {},
        'listdir1': os.listdir(os.path.join(ar6_bib_dir, 'WG1')),
    },
    '2': {
        'wg2': [],
        'wg2_update': [],
        'dataframes_2': {},
        'listdir2': os.listdir(os.path.join(ar6_bib_dir, 'WG2')),
    },
    '2_CROSS': {
        'wg2_CROSS': [],
        'wg2_CROSS_update': [],
        'dataframes_2_CROSS': {},
        'listdir2_CROSS': os.listdir(os.path.join(ar6_bib_dir, 'WG2_CROSS')),
    },
    '3': {
        # This key used to be 'wg1' (copy-paste from the first entry) while every
        # other entry follows the 'wg<k>' pattern.
        # NOTE(review): confirm read_bib_wg looks up 'wg3' here.
        'wg3': [],
        'dataframes_3': {},
        'listdir3': os.listdir(os.path.join(ar6_bib_dir, 'WG3')),
    },
}

In [None]:
# Parse the .bib files of every working group with read_bib_wg, keep the
# returned object under wgs[k]['df_<k>'], then show the group's
# 'wg<k>_update' entry.
for k in ['1', '2', '2_CROSS', '3']:
    wgs[k][f'df_{k}'] = read_bib_wg(wgs, k, verbose=True)
    # Groups '1' and '3' are declared without a 'wg<k>_update' key, so read it
    # with .get: this prints None instead of raising KeyError when
    # read_bib_wg did not create the entry.
    print(wgs[k].get(f'wg{k}_update'))
    # Export step, left disabled:
    #wgs[k][f'df_{k}'].to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_wg{k}.jsonl', orient='records', lines=True)

Read and clean the data from the 'structured_data' folder

In [3]:
# Container for the per-working-group dataframes, keyed 'df_wg<k>'.
data_all = dict()

In [4]:
# Load one JSON-lines file per working group into data_all['df_wg<k>'].
# The path is assembled with os.path.join so it also resolves on systems that
# do not use '\\' as the path separator.
for k in ['1', '2', '2_CROSS', '3']:
    data_all[f'df_wg{k}'] = pd.read_json(
        os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', f'data_wg{k}.jsonl'),
        lines=True,
    )

In [5]:
# Stack every per-working-group frame into one table with a fresh 0..n-1 index.
df_ipcc = pd.concat(objs=list(data_all.values()), axis=0, ignore_index=True)

In [6]:
# Run every DOI through the project helper get_doi_cleaned.
df_ipcc['doi'] = df_ipcc['doi'].apply(get_doi_cleaned)

# Values whose text still starts with 'http' are replaced by None.
df_ipcc.loc[:, 'doi'] = df_ipcc['doi'].apply(
    lambda value: value if not str(value).startswith('http') else None
)

# Unit weight per reference row, summed later when grouping by DOI.
df_ipcc['freq'] = 1

Enriching data through OpenAlex API integration

In [7]:
# One row per distinct DOI (rows without a DOI are dropped): collect the working
# groups and chapters citing it as lists and sum the per-row weight.
df_unique_doi = (
    df_ipcc
    .groupby('doi', dropna=True)
    .agg({'wg': list, 'chap': list, 'freq': 'sum'})
)

In [8]:
# Turn the group key back into a regular 'doi' column.
df_unique_doi = df_unique_doi.reset_index(drop=False)

In [20]:
# Get data_OpenAlex from the API (6h).
# get_open_alex_data receives the accumulator list and one DOI per call; the
# collected records are then turned into a dataframe.
json_OA = []
for doi in df_unique_doi['doi']:
    get_open_alex_data(json_OA, doi)
df_OA = pd.DataFrame(json_OA)

In [None]:
# Rows of df_OA whose `results` entry is an empty list. Emptiness is tested per
# row: `df_OA.results == []` asks pandas to compare the whole column against a
# zero-length sequence, which is not an element-wise "is empty" test.
no_result_mask = df_OA['results'].apply(
    lambda res: isinstance(res, (list, tuple)) and len(res) == 0
).astype(bool)
dois_without_result = df_OA.loc[no_result_mask, 'doi'].tolist()

# Select on the `doi` column: isin on the whole frame returns a boolean frame,
# which cannot serve as a row selector. Copy so the assignment below writes to
# an independent frame rather than a slice of df_unique_doi.
df_doi_bizarre = df_unique_doi.loc[df_unique_doi['doi'].isin(dois_without_result)].copy()

# Retry those rows with get_doi_glutton. Plain `apply` is used because
# `progress_apply` only exists once tqdm.pandas() has been registered, and
# nothing visible in this notebook does that.
# Skip when nothing needs a retry: a row-wise apply on an empty frame does not
# return a column that can be assigned.
if not df_doi_bizarre.empty:
    df_doi_bizarre['doi'] = df_doi_bizarre.apply(get_doi_glutton, axis=1)

In [None]:
# Keep only the OpenAlex records whose DOI never came back with an empty
# `results` list. Emptiness is tested per row because `df_OA.results == []`
# compares the whole column against a zero-length sequence rather than checking
# each cell.
empty_results_mask = df_OA['results'].apply(
    lambda res: isinstance(res, (list, tuple)) and len(res) == 0
).astype(bool)
df_OA1 = df_OA[~df_OA['doi'].isin(df_OA.loc[empty_results_mask, 'doi'].tolist())]

In [None]:
# Second OpenAlex pass, over the DOIs held in df_doi_bizarre.
json_OA2 = []
for doi in df_doi_bizarre['doi']:
    get_open_alex_data(json_OA2, doi)
df_OA2 = pd.DataFrame(json_OA2)

In [None]:
# Stack the records kept from the first pass on top of those from the retry.
df_OA_complet = pd.concat(objs=[df_OA1, df_OA2], axis=0)

In [None]:
# Persist the combined OpenAlex records as a JSON array of row objects.
# os.path.join keeps the path valid on systems that do not use '\\' separators.
df_OA_complet.to_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_OpenAlex.json'),
    orient='records',
)

In [None]:
# Get data_OpenAlex from the file instead of calling the API again.
# os.path.join keeps the path valid on systems that do not use '\\' separators.
data_OpenAlex_all = pd.read_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_OpenAlex.json')
)

In [None]:
# For every unique-DOI row, ask get_countries_concepts_sdg for its
# (countries, concepts, sdg) triple, looked up in the OpenAlex table.
enriched = [
    get_countries_concepts_sdg(data_OpenAlex_all, record)
    for _, record in df_unique_doi.iterrows()
]

# Split the triples into three parallel lists, one per new column.
countries_list = [countries for countries, _concepts, _sdg in enriched]
concepts_list = [concepts for _countries, concepts, _sdg in enriched]
sdg_list = [sdg for _countries, _concepts, sdg in enriched]

df_unique_doi['countries'] = countries_list
df_unique_doi['concepts'] = concepts_list
df_unique_doi['sdg'] = sdg_list

In [None]:
# Row-wise call of the project helper wg_chap_to_dict; its result is stored in
# the new 'wg_chap' column.
df_unique_doi['wg_chap'] = df_unique_doi.apply(wg_chap_to_dict, axis=1)

In [None]:
# Export of the enriched unique-DOI table as JSON lines, currently disabled.
# The visualization section reloads data_unique_doi.jsonl from this location.
# df_unique_doi.to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_unique_doi.jsonl', orient= 'records', lines= True)

Visualization of the contribution of each country to the IPCC references

In [None]:
# Reload the enriched unique-DOI table from its JSON-lines export.
# os.path.join keeps the path valid on systems that do not use '\\' separators.
df_unique_doi = pd.read_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_unique_doi.jsonl'),
    lines=True,
)

In [None]:
# Keep only the DOI key and the enrichment columns used further down.
enrichment_columns = ['doi', 'countries', 'concepts', 'sdg', 'wg_chap']
df_unique_doi = df_unique_doi.loc[:, enrichment_columns]

In [None]:
# Attach the enrichment columns to every reference row (left join on DOI, so
# references without a match are kept).
df_ipcc = df_ipcc.merge(df_unique_doi, how='left', on='doi')


def _countries_or_placeholder(value):
    """Return ['None'] for a missing or empty countries entry, else the entry unchanged.

    An entry counts as missing when its text form is 'nan', '[None]' or '[]',
    or when it is already the string 'None'.
    """
    if str(value) in ('nan', '[None]', '[]'):
        value = 'None'
    return ['None'] if value == 'None' else value


df_ipcc.loc[:, 'countries'] = df_ipcc.loc[:, 'countries'].apply(_countries_or_placeholder)

In [None]:
# Count how often each country appears across all references. `aplatir`
# presumably flattens the per-reference lists into one list (verify in
# code_utils.utils).
# The 'None' placeholder is removed; errors='ignore' keeps the cell working
# when no reference carries that placeholder.
data_counts = (
    pd.Series(aplatir(list(df_ipcc['countries'])))
    .value_counts()
    .drop(labels='None', errors='ignore')
)

In [None]:
# Bar chart of the 20 most frequent countries (value_counts already sorts in
# descending order). The series is drawn exactly once: a second .plot call on
# the same axes would overlay a duplicate set of bars.
top_countries = data_counts.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
top_countries.plot(kind='bar', ax=ax)

# Write each count just above its bar.
for position, count in enumerate(top_countries):
    ax.text(position, count + 0.1, str(count), ha='center', va='bottom')

ax.set_xlabel('Country')
ax.set_ylabel('Number of references')
ax.set_title('Top 20 countries in IPCC AR6 references')
plt.show()

Find the data for building the learning model (IPCC-related or not)

In [None]:
# Largest 'year' value among the rows whose working group is 'wg1'.
df_ipcc.loc[df_ipcc['wg'] == 'wg1', 'year'].max()

In [None]:
# Coerce non-numeric years to NaN so they can be dropped below.
df_ipcc['year'] = pd.to_numeric(df_ipcc['year'], errors='coerce')

# One year per distinct DOI, sorted and converted to integers.
year = (
    df_ipcc.drop_duplicates(subset='doi')['year']
    .dropna()
    .sort_values()
    .apply(int)
)

# Number of distinct-DOI references per publication year.
year_counts = pd.Series(list(year)).value_counts().to_dict()

# Discard the entry for year 202 (presumably a truncated year in the source
# data — TODO confirm). pop with a default so the cell does not raise when the
# key is absent or the cell is run a second time.
year_counts.pop(202, None)

In [None]:
# Plain Python list of the DOIs held in df_unique_doi.
dois = df_unique_doi['doi'].tolist()

In [None]:
# Two independent, initially empty accumulators passed to get_publi_not_in_ipcc.
dict_year, year_counts_not_ipcc = {}, {}

In [None]:
# Keep calling get_publi_not_in_ipcc until the counts accumulated in
# year_counts_not_ipcc add up to at least len(dois). len(dois) is re-evaluated
# on every pass because `dois` is handed to the helper.
while True:
    collected = sum(year_counts_not_ipcc.values())
    if not collected < len(dois):
        break
    get_publi_not_in_ipcc(dois, dict_year, year_counts, year_counts_not_ipcc)

In [None]:
# Combine the values collected in dict_year into one table (via the project
# helper `aplatir`) and save it as JSON lines.
# os.path.join keeps the path valid on systems that do not use '\\' separators.
pd.DataFrame(aplatir(list(dict_year.values()))).to_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_not_ipcc.jsonl'),
    orient='records',
    lines=True,
)