In [15]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from code_utils.read_ipcc_bib import read_bib_wg
from code_utils.enriching_data_OpenAlex import get_countries_concepts_sdg,get_open_alex_data,get_open_alex_data_not_in_references
from code_utils.utils import aplatir,wg_chap_to_dict,get_doi_cleaned
from code_utils.pickle import load_cache,write_cache

In [16]:
cached_openalex_data = {}
cached_openalex_data_not_ipcc = {}

In [17]:
try:
    cached_openalex_data = load_cache(cached_openalex_data,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data.pkl')
    cached_openalex_data_not_ipcc = load_cache(cached_openalex_data_not_ipcc,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipcc.pkl')
except:
    write_cache(cached_openalex_data,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data.pkl')
    write_cache(cached_openalex_data_not_ipcc,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipcc.pkl')

0 data in cached openalex data
0 data in cached openalex data


Read IPCC references in .bib files

In [None]:
wgs={'1':{'wg1':[], 'dataframes_1':{}, 'listdir1':os.listdir(module_path+f"\\IPCC_bibliography\\AR6\\WG1")},
     '2':{'wg2':[], 'wg2_update':[], 'dataframes_2':{}, 'listdir2':os.listdir(module_path+f"\\IPCC_bibliography\\AR6\\WG2")},
     '2_CROSS':{'wg2_CROSS':[], 'wg2_CROSS_update':[], 'dataframes_2_CROSS':{}, 'listdir2_CROSS':os.listdir(module_path+f"\\IPCC_bibliography\\AR6\\WG2_CROSS")},
     '3':{'wg1':[], 'dataframes_3':{}, 'listdir3':os.listdir(module_path+f"\\IPCC_bibliography\\AR6\\WG3")}}

In [None]:
for k in ['1','2','2_CROSS','3']:
    wgs[k][f'df_{k}'] = read_bib_wg(wgs,k,verbose=True)
    print(wgs[k][f'wg{k}_update'])
    #wgs[k][f'df_{k}'].to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_wg{k}.jsonl', orient='records', lines=True)

Read and clean the data from 'structured_data' folder

In [None]:
data_all={}

In [None]:
for k in ['1','2','2_CROSS','3']:
   data_all[f'df_wg{k}']=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_wg{k}.jsonl', lines= True)

In [None]:
#all data 
df_ipcc=pd.concat(list(data_all.values()), ignore_index=True)

In [None]:
df_ipcc.loc[:,'doi']=df_ipcc.loc[:,'doi'].apply(lambda x: get_doi_cleaned(x))
df_ipcc.loc[:,'doi']=df_ipcc.loc[:,'doi'].apply(lambda x: None if str(x)[:4]=='http' else x)
df_ipcc['freq']=1

Enriching data through OpenAlex API integration

In [None]:
df_unique_doi=df_ipcc.groupby(by=['year','doi','title','author'], dropna=True).agg({'wg': lambda x: list(x),'chap': lambda x: list(x), 'freq': 'sum'})

In [None]:
df_unique_doi=df_unique_doi.reset_index()

In [None]:
#get data_OpenAlex from the API 
df_unique_doi.apply(lambda row: get_open_alex_data(cached_openalex_data,row['doi']), axis= 1)
write_cache(cached_openalex_data,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data.pkl')

In [None]:
countries_list = []
concepts_list = []
sdg_list = []
topics_list = []

for i,row in df_unique_doi.iterrows():
    countries,concepts,sdg,year,topics,doi=get_countries_concepts_sdg(cached_openalex_data,row)

    countries_list.append(countries)
    concepts_list.append(concepts)
    sdg_list.append(sdg)
    topics_list.append(topics)
    

df_unique_doi['countries'] = countries_list
df_unique_doi['concepts'] = concepts_list
df_unique_doi['sdg'] = sdg_list
df_unique_doi['topics'] = topics_list

In [None]:
df_unique_doi['wg_chap']=df_unique_doi.apply(lambda row: wg_chap_to_dict(row), axis=1)

Visualization of the contribution of each countries in ipcc references

In [None]:
df_ipcc=pd.merge(df_ipcc, df_unique_doi[['doi','countries','concepts','sdg','wg_chap']], on='doi', how='left')
df_ipcc.loc[:,'countries']=df_ipcc.loc[:,'countries'].apply(lambda x: str(x).replace('nan','None').replace('[]','None').replace('[None]','None') if ((str(x)=='nan')|(str(x)=='[None]')|((str(x)=='[]'))) else x )
df_ipcc.loc[:,'countries']=df_ipcc.loc[:,'countries'].apply(lambda x: ['None'] if x=='None' else x)

In [None]:
# df_ipcc.to_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipcc_visualization.json', orient='records')

In [None]:
data_counts = pd.Series(aplatir(list(df_ipcc['countries']))).value_counts().drop('None')

In [None]:
plt.figure(figsize=(10, 6))
data_counts[:20].plot(kind='bar')
ax = data_counts[:20].plot(kind='bar')
for i, v in enumerate(data_counts[:20]):
    ax.text(i, v + 0.1, str(v), ha='center', va='bottom')
plt.show()

Find the data for constructing the learning model ( IPCC related or not )

In [18]:
df_ipcc=pd.read_json(module_path+'\\IPCC_bibliography\\AR6\\structured_data\\data_ipcc_visualization.json')

In [19]:
df_ipcc['year'] = pd.to_numeric(df_ipcc['year'], errors='coerce')
year=df_ipcc.drop_duplicates(subset='doi')['year'].dropna().sort_values().apply(lambda x: int(x))
year_counts = pd.Series(list(year)).value_counts().to_dict()
del year_counts[202]

In [24]:
dois=list(df_ipcc.doi.dropna().drop_duplicates())
year_counts_not_ipcc={}

In [32]:
write_cache(cached_openalex_data_not_ipcc,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipcc.pkl')

In [25]:
for year in list(year_counts.keys()):   
    cached_openalex_data_not_ipcc[year]=[]
    year_counts_not_ipcc[year]=0
    while year_counts_not_ipcc[year]<year_counts[year]:
        get_open_alex_data_not_in_references(dois,cached_openalex_data_not_ipcc,year_counts,year_counts_not_ipcc,year)
    cached_openalex_data_not_ipcc[year]=cached_openalex_data_not_ipcc[year][:year_counts[year]+1]
write_cache(cached_openalex_data_not_ipcc,module_path+'\\IPCC_bibliography\\AR6\\structured_data\\cached_openalex_data_not_ipcc.pkl')


plus que 7559 publications pour completer l'année 2018
plus que 7473 publications pour completer l'année 2018
plus que 7397 publications pour completer l'année 2018
plus que 7301 publications pour completer l'année 2018
plus que 7202 publications pour completer l'année 2018
plus que 7116 publications pour completer l'année 2018
plus que 7014 publications pour completer l'année 2018
plus que 6920 publications pour completer l'année 2018
plus que 6830 publications pour completer l'année 2018
plus que 6741 publications pour completer l'année 2018
plus que 6644 publications pour completer l'année 2018
plus que 6550 publications pour completer l'année 2018
plus que 6463 publications pour completer l'année 2018
plus que 6375 publications pour completer l'année 2018
plus que 6287 publications pour completer l'année 2018
plus que 6185 publications pour completer l'année 2018
plus que 6096 publications pour completer l'année 2018
plus que 5998 publications pour completer l'année 2018
plus que 5

In [30]:
df_not_ipcc=pd.DataFrame()
year_list = []
countries_list = []
concepts_list = []
sdg_list = []
topics_list = []
dois_list = []
for k in range(len(aplatir(list(cached_openalex_data_not_ipcc.values())))):
    countries,concepts,sdg,year,topics,doi=get_countries_concepts_sdg(cached_openalex_data=aplatir(list(cached_openalex_data_not_ipcc.values())),ipcc=False,i=k)

    countries_list.append(countries)
    concepts_list.append(concepts)
    sdg_list.append(sdg)
    year_list.append(year)   
    topics_list.append(topics)   
    dois_list.append(doi)   

df_not_ipcc['countries'] = countries_list
df_not_ipcc['concepts'] = concepts_list
df_not_ipcc['sdg'] = sdg_list
df_not_ipcc['year'] = year_list
df_not_ipcc['topics'] = topics_list
df_not_ipcc['doi'] = dois_list

https://doi.org/10.1016/j.dld.2018.04.011
https://doi.org/10.1093/annonc/mdy282.069
https://doi.org/10.4000/rccs.7663
https://doi.org/10.1109/jssc.2018.2863946
https://doi.org/10.1007/s40472-018-0217-6
https://doi.org/10.1162/jocn_a_01301
https://doi.org/10.1161/atvb.38.suppl_1.662
https://doi.org/10.5272/jimab.2018241.1891
https://doi.org/10.7774/cevr.2018.7.2.111
https://doi.org/10.1103/physrevd.97.095033
https://doi.org/10.4274/uob.975
https://doi.org/10.14419/ijet.v7i4.7.20538
https://doi.org/10.1016/j.ijcard.2018.07.054
https://doi.org/10.1101/369538
https://doi.org/10.1007/978-3-319-99954-8_3
https://doi.org/10.30699/sjhnmf.26.4.247
https://doi.org/10.1016/j.jaip.2017.07.030
https://doi.org/10.1117/12.2311369
https://doi.org/10.1007/978-3-658-21169-1_6
https://doi.org/10.1093/oso/9780198791492.003.0014
https://doi.org/10.30928/2527-2039e-2018692
https://doi.org/10.1007/s00339-018-1758-3
https://doi.org/10.18172/brocar.3752
https://doi.org/10.1176/appi.ps.201800197
https://doi.org

In [31]:
df_not_ipcc.to_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_not_ipcc.jsonl', orient='records', lines= True)

IPCC learning model (fastext)

In [34]:
df_not_ipcc=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_not_ipcc.jsonl', lines= True)

In [35]:
df_not_ipcc

Unnamed: 0,countries,concepts,sdg,year,topics,doi
0,"[CZ, FR]","[{'name': 'Medicine'}, {'name': 'Ulcerative co...","[{'id': '3', 'name': 'Good health and well-bei...",2018,[{'name': 'Genetics and Treatment of Inflammat...,https://doi.org/10.1016/j.dld.2018.04.011
1,[JP],"[{'name': 'Medicine'}, {'name': 'Internal medi...","[{'id': '3', 'name': 'Good health and well-bei...",2018,[{'name': 'Gastric Cancer Research and Treatme...,https://doi.org/10.1093/annonc/mdy282.069
2,[],"[{'name': 'Humanities'}, {'name': 'Philosophy'}]","[{'id': '10', 'name': 'Reduced inequalities'}]",2018,[{'name': 'Impact of International Migration o...,https://doi.org/10.4000/rccs.7663
3,[SG],"[{'name': 'Flip-flop'}, {'name': 'CMOS'}, {'na...","[{'id': '7', 'name': 'Affordable and clean ene...",2018,[{'name': 'Low-Power VLSI Circuit Design and O...,https://doi.org/10.1109/jssc.2018.2863946
4,[US],"[{'name': 'Medicine'}, {'name': 'Immunosuppres...","[{'id': '2', 'name': 'Zero hunger'}]",2018,[{'name': 'Advancements in Transplantation Pro...,https://doi.org/10.1007/s40472-018-0217-6
...,...,...,...,...,...,...
54291,[],"[{'name': 'Icon'}, {'name': 'Citation'}, {'nam...","[{'id': '17', 'name': 'Partnerships for the go...",1952,[{'name': 'Cultural Dynamics in Japan and East...,https://doi.org/10.2307/3024133
54292,[US],"[{'name': 'Ovule'}, {'name': 'Embryo'}, {'name...","[{'id': '2', 'name': 'Zero hunger'}]",1954,[{'name': 'Formation and Health Effects of Acr...,https://doi.org/10.1007/bf02861636
54293,[CA],"[{'name': 'Missile'}, {'name': 'Elasticity (ph...","[{'id': '7', 'name': 'Affordable and clean ene...",1954,[{'name': 'Missile Guidance and Control Strate...,https://doi.org/10.2514/8.3086
54294,[],"[{'name': 'Diplomacy'}, {'name': 'Politics'}, ...","[{'id': '10', 'name': 'Reduced inequalities'}]",1967,[{'name': 'Historical and Social Dynamics in C...,https://doi.org/10.2307/2204495
