In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt

# Put the parent directory of the notebook on sys.path so the project-local
# `code_utils` package imported just below can be resolved.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
from code_utils.read_ipcc_bib import read_bib_wg
from code_utils.enriching_data_OpenAlex import get_countries_concepts_sdg,get_open_alex_data,get_open_alex_data_not_in_references
from code_utils.utils import aplatir,wg_chap_to_dict,get_doi_cleaned
from code_utils.pickle import load_cache,write_cache

In [2]:
# Empty in-memory caches; they are used as-is when no cache file can be loaded.
cached_openalex_data = dict()
cached_openalex_data_not_ipcc = dict()

In [3]:
# Load both OpenAlex caches from the structured_data folder.
# Paths are built with os.path.join so the notebook also runs outside Windows.
# NOTE(review): the files are .pkl, presumably pickles; only load caches produced
# by this project, since unpickling untrusted files can execute arbitrary code.
structured_data_dir = os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data')
try:
    cached_openalex_data = load_cache(
        cached_openalex_data,
        os.path.join(structured_data_dir, 'cached_openalex_data.pkl'),
    )
    cached_openalex_data_not_ipcc = load_cache(
        cached_openalex_data_not_ipcc,
        os.path.join(structured_data_dir, 'cached_openalex_data_not_ipcc.pkl'),
    )
except Exception as error:
    # Best effort: keep the (possibly empty) in-memory caches and carry on.
    # `Exception` rather than a bare `except` so that Ctrl-C / SystemExit still
    # propagate, and the actual cause is reported instead of being hidden.
    print('oups not in bibliography folder')
    print(f'cache loading failed: {error!r}')

53358 data in cached openalex data
98 data in cached openalex data


In [None]:
#write_cache(cached_openalex_data,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data.pkl')
#write_cache(cached_openalex_data_not_ipcc,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipcc.pkl')

Read IPCC references in .bib files

In [None]:
# Per-working-group containers handed to read_bib_wg: an accumulator list
# ('wg<key>'), a dict of dataframes and the listing of the group's folder.
# Paths are built with os.path.join so the notebook also runs outside Windows.
ar6_dir = os.path.join(module_path, 'IPCC_bibliography', 'AR6')
wgs = {
    '1': {
        'wg1': [],
        'dataframes_1': {},
        'listdir1': os.listdir(os.path.join(ar6_dir, 'WG1')),
    },
    '2': {
        'wg2': [],
        'wg2_update': [],
        'dataframes_2': {},
        'listdir2': os.listdir(os.path.join(ar6_dir, 'WG2')),
    },
    '2_CROSS': {
        'wg2_CROSS': [],
        'wg2_CROSS_update': [],
        'dataframes_2_CROSS': {},
        'listdir2_CROSS': os.listdir(os.path.join(ar6_dir, 'WG2_CROSS')),
    },
    # The accumulator key used to be 'wg1' here (copy-paste slip); every other
    # group follows the 'wg<key>' naming.
    # NOTE(review): confirm read_bib_wg looks the list up as f'wg{k}'.
    '3': {
        'wg3': [],
        'dataframes_3': {},
        'listdir3': os.listdir(os.path.join(ar6_dir, 'WG3')),
    },
}

In [None]:
# Read the references of each working group and store the resulting frame
# under 'df_<key>' next to that group's inputs.
for wg_key in ('1', '2', '2_CROSS', '3'):
    wgs[wg_key]['df_' + wg_key] = read_bib_wg(wgs, wg_key, verbose=True)
    # Optional export, kept disabled:
    #wgs[k][f'df_{k}'].to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_wg{k}.jsonl', orient='records', lines=True)

Read and clean the data from the 'structured_data' folder

In [4]:
# Will map 'df_wg<key>' to the dataframe loaded for that working group.
data_all = dict()

In [5]:
# Load the JSON-lines export of every working group into data_all.
# The path is built with os.path.join so the notebook also runs outside Windows.
structured_data_dir = os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data')
for k in ['1', '2', '2_CROSS', '3']:
    data_all[f'df_wg{k}'] = pd.read_json(
        os.path.join(structured_data_dir, f'data_wg{k}.jsonl'),
        lines=True,
    )

In [6]:
# Stack the per-working-group frames into a single table with a fresh index.
df_ipcc = pd.concat(
    [wg_frame for wg_frame in data_all.values()],
    ignore_index=True,
)

In [7]:
# Normalise every DOI with the project helper, then discard values that are
# still plain URLs (anything whose text starts with 'http').
df_ipcc.loc[:, 'doi'] = df_ipcc['doi'].apply(get_doi_cleaned)
df_ipcc.loc[:, 'doi'] = df_ipcc['doi'].apply(
    lambda doi: None if str(doi).startswith('http') else doi
)
# One occurrence per reference row; summed later when grouping by DOI.
df_ipcc['freq'] = 1

Enriching data through OpenAlex API integration

In [8]:
# One row per DOI (rows without a DOI are dropped): collect the working groups
# and chapters citing it, and sum the occurrence counter.
df_unique_doi = df_ipcc.groupby(by=['doi'], dropna=True).agg(
    {
        'wg': lambda citing_wgs: list(citing_wgs),
        'chap': lambda citing_chapters: list(citing_chapters),
        'freq': 'sum',
    }
)

In [9]:
# Turn the 'doi' group key back into a regular column.
df_unique_doi = df_unique_doi.reset_index(drop=False)

In [10]:
# Ask OpenAlex about every unique DOI, going through the shared cache dict.
# NOTE(review): presumably the helper stores each answer in cached_openalex_data
# so that already-known DOIs are not fetched again - confirm in code_utils.
df_unique_doi.apply(
    lambda record: get_open_alex_data(cached_openalex_data, record['doi']),
    axis=1,
)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openalex.org:443
DEBUG:urllib3.connectionpool:https://api.openalex.org:443 "GET /works?filter=doi:10.00098 HTTP/1.1" 200 116
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openalex.org:443
DEBUG:urllib3.connectionpool:https://api.openalex.org:443 "GET /works?filter=doi:10.1002/0471743984.vse8966 HTTP/1.1" 200 2069
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openalex.org:443
DEBUG:urllib3.connectionpool:https://api.openalex.org:443 "GET /works?filter=doi:10.1002/0471743984.vse9437 HTTP/1.1" 200 2063
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openalex.org:443
DEBUG:urllib3.connectionpool:https://api.openalex.org:443 "GET /works?filter=doi:10.1002/9781118603048.oth1 HTTP/1.1" 200 1619
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openalex.org:443
DEBUG:urllib3.connectionpool:https://api.openalex.org:443 "GET /works?filter=doi:10.10

0                                                       []
1                                                     None
2                                                       []
3        [{'id': 'https://openalex.org/W2154789424', 'd...
4        [{'id': 'https://openalex.org/W2136833125', 'd...
                               ...                        
54816    [{'id': 'https://openalex.org/W2747424513', 'd...
54817                                                 None
54818                                                 None
54819                                                 None
54820                                                   []
Length: 54821, dtype: object

In [11]:
# Number of entries currently held in the OpenAlex cache.
len(cached_openalex_data)

54821

In [None]:
# Persist the OpenAlex cache so later runs can reload it instead of refetching.
# The path is built with os.path.join so the notebook also runs outside Windows.
write_cache(
    cached_openalex_data,
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'cached_openalex_data.pkl'),
)

In [16]:
# Run the enrichment helper once per unique DOI, in row order.
# Each result is the tuple
# (countries, concepts, sdg, year, topics, doi, bool_OA, title).
enrichment = [
    get_countries_concepts_sdg(cached_openalex_data, row)
    for _, row in df_unique_doi.iterrows()
]

# Split the tuples into one list per output column (the doi item is not used).
countries_list = [result[0] for result in enrichment]
concepts_list = [result[1] for result in enrichment]
sdg_list = [result[2] for result in enrichment]
year_list = [result[3] for result in enrichment]
topics_list = [result[4] for result in enrichment]
is_OA_available_list = [result[6] for result in enrichment]
title_list = [result[7] for result in enrichment]

# Attach the new columns in the same order as before.
df_unique_doi['countries'] = countries_list
df_unique_doi['concepts'] = concepts_list
df_unique_doi['sdg'] = sdg_list
df_unique_doi['topics'] = topics_list
df_unique_doi['is_OA_available'] = is_OA_available_list
df_unique_doi['title_OA'] = title_list
df_unique_doi['year'] = year_list

In [17]:
# Build the per-row 'wg_chap' value with the project helper (called once per row).
df_unique_doi['wg_chap'] = df_unique_doi.apply(wg_chap_to_dict, axis=1)

In [19]:
# Keep only rows flagged by 'is_OA_available', then collapse to one row per DOI:
# wg_chap values are merged with aplatir, the other columns keep their first value.
dd = (
    df_unique_doi[df_unique_doi['is_OA_available']]
    .dropna(subset=['doi'])
    .groupby(['doi'])
    .agg(
        {
            'wg_chap': lambda grouped: aplatir(list(grouped)),
            'countries': lambda grouped: list(grouped)[0],
            'topics': lambda grouped: list(grouped)[0],
            'title_OA': lambda grouped: list(grouped)[0],
        }
    )
    .reset_index()
)

In [21]:
# Export the rows flagged by 'is_OA_available' as the IPCC side of the model data.
# The path is built with os.path.join so the notebook also runs outside Windows.
df_unique_doi[df_unique_doi.is_OA_available].to_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_ipcc_model.json'),
    orient='records',
)

In [None]:
# Keep only the reference-level columns needed before merging the enrichment.
kept_columns = ['year', 'url', 'doi', 'author', 'freq']
df_ipcc = df_ipcc[kept_columns]

In [None]:
def _countries_or_placeholder(value):
    """Return ['None'] for missing or empty country values, else the value unchanged.

    A value counts as missing when its text form is 'nan', '[None]' or '[]',
    or when it is the string 'None'.
    """
    if str(value) in ('nan', '[None]', '[]') or value == 'None':
        return ['None']
    return value


# Attach the per-DOI enrichment to every reference (references without a match
# keep NaN), then give missing country values a uniform ['None'] placeholder.
df_ipcc = pd.merge(df_ipcc, df_unique_doi, on='doi', how='left')
df_ipcc.loc[:, 'countries'] = df_ipcc.loc[:, 'countries'].apply(_countries_or_placeholder)

In [None]:
# Export the merged reference table used for visualisation.
# The path is built with os.path.join so the notebook also runs outside Windows.
df_ipcc.to_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_ipcc_visualization.json'),
    orient='records',
)

Find the data for building the learning model (IPCC-related or not)

In [22]:
# Reload the IPCC model data exported earlier; note that this rebinds df_ipcc.
# The path is built with os.path.join so the notebook also runs outside Windows.
df_ipcc = pd.read_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_ipcc_model.json')
)

In [23]:
# Number of rows in the reloaded IPCC model data.
len(df_ipcc)

54528

In [24]:
# Coerce years to numbers (unparseable values become NaN), then count how many
# distinct-DOI rows fall in each year.
df_ipcc['year'] = pd.to_numeric(df_ipcc['year'], errors='coerce')
year = (
    df_ipcc.drop_duplicates(subset='doi')['year']
    .dropna()
    .sort_values()
    .apply(lambda value: int(value))
)
# Mapping {year: number of rows for that year}.
year_counts = pd.Series(year.tolist()).value_counts().to_dict()

In [25]:
# Total number of items across all values of the non-IPCC cache
# (aplatir presumably flattens the list of per-key lists - confirm in code_utils).
len(aplatir(list(cached_openalex_data_not_ipcc.values())))

54296

In [27]:
# Per-year counter of collected works, filled in by the sampling loop.
year_counts_not_ipcc = dict()
# Distinct, non-null DOIs of the IPCC data.
dois = df_ipcc['doi'].dropna().drop_duplicates().tolist()

In [31]:
# For every year present in year_counts, gather entries in the non-IPCC cache
# until there are as many as there are IPCC rows for that year.
for year in list(year_counts.keys()):   
    # Start the year from scratch: any entries already stored for it are replaced.
    cached_openalex_data_not_ipcc[year]=[]
    year_counts_not_ipcc[year]=0
    # NOTE(review): presumably the helper appends to cached_openalex_data_not_ipcc[year]
    # and increases year_counts_not_ipcc[year]; if it ever stops making progress
    # this loop never terminates - confirm in code_utils.
    while year_counts_not_ipcc[year]<year_counts[year]:
        get_open_alex_data_not_in_references(dois,cached_openalex_data_not_ipcc,year_counts,year_counts_not_ipcc,year)
    # Trim any overshoot so the year keeps at most year_counts[year] entries.
    cached_openalex_data_not_ipcc[year]=cached_openalex_data_not_ipcc[year][:year_counts[year]]

In [32]:
# Spot check: number of IPCC rows counted for 2018.
year_counts[2018]

7486

In [33]:
# Spot check: number of non-IPCC entries kept for 2018 (to compare with year_counts[2018]).
len(cached_openalex_data_not_ipcc[2018])

7486

In [35]:
# Persist the non-IPCC cache so later runs can reload it instead of refetching.
# The path is built with os.path.join so the notebook also runs outside Windows.
write_cache(
    cached_openalex_data_not_ipcc,
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'cached_openalex_data_not_ipcc.pkl'),
)

In [36]:
# Build the table of non-IPCC works from the cached OpenAlex entries.
# Flatten the cache once: it used to be re-flattened on every iteration, which
# made the loop quadratic in the number of cached entries.
# NOTE(review): assumes get_countries_concepts_sdg does not modify the list it
# receives - confirm in code_utils.
works_not_ipcc = aplatir(list(cached_openalex_data_not_ipcc.values()))

df_not_ipcc = pd.DataFrame()
year_list = []
countries_list = []
concepts_list = []
sdg_list = []
topics_list = []
dois_list = []
title_list = []
for k in range(len(works_not_ipcc)):
    # With ipcc=False the helper is given the flat list and the position i to read.
    countries, concepts, sdg, year, topics, doi, bool_OA, title = get_countries_concepts_sdg(
        cached_openalex_data=works_not_ipcc, ipcc=False, i=k
    )

    countries_list.append(countries)
    concepts_list.append(concepts)
    sdg_list.append(sdg)
    year_list.append(year)
    topics_list.append(topics)
    dois_list.append(doi)
    title_list.append(title)

df_not_ipcc['countries'] = countries_list
df_not_ipcc['concepts'] = concepts_list
df_not_ipcc['sdg'] = sdg_list
df_not_ipcc['year'] = year_list
df_not_ipcc['topics'] = topics_list
df_not_ipcc['doi'] = dois_list
df_not_ipcc['title_OA'] = title_list

In [37]:
# Number of rows in the non-IPCC table.
len(df_not_ipcc)

54532

In [38]:
# Export the non-IPCC table as JSON lines (one record per line).
# The path is built with os.path.join so the notebook also runs outside Windows.
df_not_ipcc.to_json(
    os.path.join(module_path, 'IPCC_bibliography', 'AR6', 'structured_data', 'data_not_ipcc.jsonl'),
    orient='records',
    lines=True,
)