In [None]:
import requests
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
tqdm.pandas()

Clean DOI

In [None]:
# Directory holding the structured AR6 bibliography files (relative to this notebook).
path = '../IPCC_bibliography/AR6/structured_data/'

In [None]:
# One table per working-group file; each file is JSON Lines (one record per line).
df_wgi, df_wgii, df_cross_wgii, df_wgiii = [
    pd.read_json(path + file_name, lines=True)
    for file_name in (
        'data_wg1.jsonl',
        'data_wg2.jsonl',
        'data_cross_wg2.jsonl',
        'data_wg3.jsonl',
    )
]

In [None]:
# Stack the four working-group tables into one frame with a fresh 0..n-1 index.
df_ipcc = pd.concat(
    [df_wgi, df_wgii, df_cross_wgii, df_wgiii],
    ignore_index=True,
)

In [None]:
def get_doi_cleaned(x):
    """Return a lower-cased DOI with the 'https://doi.org/' resolver prefix removed.

    Parameters
    ----------
    x : object
        Raw value of a 'doi' cell (a string, or a missing value such as None/NaN).

    Returns
    -------
    str or None
        None when `x` is missing (``pd.isna``); otherwise ``str(x)`` lower-cased
        with every occurrence of 'https://doi.org/' removed.
    """
    if pd.isna(x):
        return None
    # str.find() returns 0 when the prefix is at the very start of the string,
    # so the former `find(...) > 0` test skipped exactly the URLs it was meant
    # to clean. An unconditional replace handles the prefix at any position.
    return str(x).lower().replace('https://doi.org/', '')

In [None]:
# Normalise every DOI of the combined table with get_doi_cleaned.
df_ipcc['doi'] = df_ipcc['doi'].apply(get_doi_cleaned)

Get open_access, sdg, countries from OpenAlex

In [None]:
def aplatir(conteneurs):
    """Flatten one level of nesting ("aplatir" is French for "flatten").

    Parameters
    ----------
    conteneurs : iterable of iterables
        For example a list of lists.

    Returns
    -------
    list
        The elements of every inner container, in their original order.
    """
    # Iterating directly instead of indexing through range(len(...)) gives the
    # same result for lists/tuples and also accepts containers that cannot be
    # indexed (sets, generators).
    return [element for conteneur in conteneurs for element in conteneur]

In [None]:
# Accumulator filled by get_open_alex_data(): one {"doi", "results"} record per queried DOI.
json_OA=[]
def get_open_alex_data(row):
    """Query OpenAlex for the DOI of `row` and append the answer to `json_OA`.

    Parameters
    ----------
    row : object with a `doi` attribute (e.g. a DataFrame row).
        Rows whose DOI is missing are skipped without any request.

    Returns
    -------
    None
        The outcome is the side effect on the module-level list `json_OA`:
        ``{"doi": doi, "results": [...]}`` is appended, with an empty list
        when the response carries no 'results' key.
    """
    doi=row.doi
    if pd.isna(doi):
        return
    url=f"https://api.openalex.org/works?filter=doi:{doi}"
    # Without a timeout a single stalled connection blocks the whole batch.
    response = requests.get(url, timeout=60)
    data = response.json()
    json_OA.append({"doi": doi, "results": data.get('results', [])})

In [None]:
def get_status_sdg_coutries(df,row):
    """Extract open-access status, SDGs and author countries for the DOI of `row`.

    The information is read from the first entry of the 'results' list stored
    in the first row of `df` whose 'doi' equals ``row.doi``. (The misspelling
    "coutries" in the function name is kept so existing callers still work.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'doi' column and a 'results' column of lists of dicts.
    row : object with a `doi` attribute.

    Returns
    -------
    tuple
        ``(status, sdgs, countries)`` where
        * status is ``{'is_oa': ..., 'oa_status': ...}`` or None,
        * sdgs is a list of ``{'id': ..., 'name': ...}`` dicts or None,
        * countries is a list of distinct country values in arbitrary order
          (``[None]`` when there are no authorships).
        ``(None, None, [None])`` is returned when 'results' is empty.

    Raises
    ------
    IndexError
        If no row of `df` carries the DOI.
    """
    doi=row.doi
    # One lookup instead of filtering the frame twice.
    results=df.loc[df.doi==doi,'results'].iloc[0]
    if not results:
        return None,None,[None]
    work=results[0]

    # Truthiness checks treat a missing key / null value (None) like an empty
    # container; the former `!= {}` / `!= []` comparisons let None through and
    # crashed on the following .get() or iteration.
    open_access=work.get('open_access')
    if open_access:
        status={'is_oa':open_access.get('is_oa'),'oa_status':open_access.get('oa_status')}
    else:
        status=None

    sdgs=work.get('sustainable_development_goals')
    if sdgs:
        # The id is kept as the last two characters of the identifier with any
        # '/' removed (e.g. '.../sdg/3' -> '3', '.../sdg/13' -> '13').
        sdgs_id_name=[{'id': str(sdg.get('id'))[-2:].replace("/",""), 'name': sdg.get('display_name')} for sdg in sdgs]
    else:
        sdgs_id_name=None

    authors=work.get('authorships')
    if authors:
        # Distinct countries over all authors; an author without a
        # 'countries' list contributes nothing.
        countries=list({country for author in authors for country in (author.get('countries') or [])})
    else:
        countries=[None]

    return status, sdgs_id_name, countries

In [None]:
# Fetch the OpenAlex data from the API (takes about 6 h according to the original note).
# The code is wrapped in a string literal, so this cell does not execute it.
# As written it would: keep one row per DOI, blank out DOIs that still start with 'http',
# drop rows without a DOI, query OpenAlex row by row and save json_OA to data_OpenAlex.json.
""" df_unique_doi=df_ipcc.drop_duplicates(subset=['doi'])
df_unique_doi.loc[:,'doi']=df_unique_doi.loc[:,'doi'].apply(lambda x: None if str(x)[:4]=='http' else x)
df_unique_doi=df_unique_doi.dropna(subset=['doi'])
df_unique_doi.progress_apply(get_open_alex_data, axis= 1)
pd.DataFrame(json_OA).to_json(path+'data_OpenAlex.json', orient='records') """

In [None]:
# Load the cached OpenAlex responses from disk instead of querying the API again.
data_OpenAlex_all = pd.read_json(path + 'data_OpenAlex.json')

In [None]:
# Extract (status, sdg, countries) for every cached record, then store each
# series of values as a new column of data_OpenAlex_all.
status_list, sdg_list, countries_list = [], [], []
for i, row in data_OpenAlex_all.iterrows():
    status, sdg, countries = get_status_sdg_coutries(data_OpenAlex_all, row)
    status_list.append(status)
    sdg_list.append(sdg)
    countries_list.append(countries)

data_OpenAlex_all['status'] = status_list
data_OpenAlex_all['sdg'] = sdg_list
data_OpenAlex_all['countries'] = countries_list

In [None]:
# Keep only the DOI and the three extracted fields (the raw 'results' are left out).
data_OpenAlex = data_OpenAlex_all.loc[:, ['doi', 'status', 'sdg', 'countries']]

Share of each country in the IPCC references

In [None]:
# Attach the OpenAlex fields to every reference (left join: unmatched DOIs get NaN).
df_ipcc = pd.merge(df_ipcc, data_OpenAlex, on='doi', how='left')
# Collapse every "no country" marker (NaN, [], [None] or the string 'None')
# into the one-item list ['None'] so that each cell is a list.
df_ipcc.loc[:, 'countries'] = df_ipcc.loc[:, 'countries'].apply(
    lambda x: ['None'] if str(x) in ('nan', '[None]', '[]') or x == 'None' else x
)

In [None]:
# Flatten the per-reference country lists and count the occurrences of each country.
data_counts = pd.Series(aplatir(df_ipcc['countries'].tolist())).value_counts()

In [None]:
# Bar chart of the first 20 entries of data_counts, each bar annotated with its count.
# The chart was previously drawn twice on the same axes; it is now drawn once
# on an explicit Axes object.
top_counts = data_counts[:20]
fig, ax = plt.subplots(figsize=(10, 6))
top_counts.plot(kind='bar', ax=ax)
for i, v in enumerate(top_counts):
    # Place the count just above the top of bar i.
    ax.text(i, v + 0.1, str(v), ha='center', va='bottom')
ax.set_xlabel('Country')
ax.set_ylabel('Number of references')
plt.show()

Share of each country in the WG3 references (the cells below use `df_wgiii`)

In [None]:
# Attach the OpenAlex fields to the df_wgiii references (left join: unmatched DOIs get NaN).
df_wgiii = pd.merge(df_wgiii, data_OpenAlex, on='doi', how='left')
# Collapse every "no country" marker (NaN, [], [None] or the string 'None')
# into the one-item list ['None'] so that each cell is a list.
df_wgiii.loc[:, 'countries'] = df_wgiii.loc[:, 'countries'].apply(
    lambda x: ['None'] if str(x) in ('nan', '[None]', '[]') or x == 'None' else x
)

In [None]:
# TODO (original note, translated from French): remove the 'None' entries -
# this line does not remove them yet.
data_counts = pd.Series(aplatir(df_wgiii['countries'].tolist())).value_counts()

In [None]:
# Bar chart of the first 20 entries of data_counts, each bar annotated with its count.
# The chart was previously drawn twice on the same axes; it is now drawn once
# on an explicit Axes object.
top_counts = data_counts[:20]
fig, ax = plt.subplots(figsize=(10, 6))
top_counts.plot(kind='bar', ax=ax)
for i, v in enumerate(top_counts):
    # Place the count just above the top of bar i.
    ax.text(i, v + 0.1, str(v), ha='center', va='bottom')
ax.set_xlabel('Country')
ax.set_ylabel('Number of references')
plt.show()

SDG in IPCC references

In [None]:
# Name of the first SDG listed for each work, lower-cased and with commas removed.
# Works without any SDG are skipped.
sdg_names = data_OpenAlex['sdg'].dropna().apply(
    lambda x: str(x[0].get('name')).lower().replace(',', '')
)

In [None]:
# Number of works per (first-listed) SDG name.
data_counts = pd.Series(sdg_names.tolist()).value_counts()

In [None]:
# Pie chart of the SDG counts, with each slice labelled by name and percentage.
plt.figure(figsize=(6, 10))
plt.pie(
    data_counts,
    labels=data_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
plt.subplots_adjust(wspace=0.5, hspace=1)
plt.show()

Get concepts from OpenAlex

In [None]:
def get_concepts(df,row):
    """Return the OpenAlex concept names recorded for the DOI of `row`.

    The concepts are read from the first entry of the 'results' list stored in
    the first row of `df` whose 'doi' equals ``row.doi``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'doi' column and a 'results' column of lists of dicts.
    row : object with a `doi` attribute.

    Returns
    -------
    list of dict or None
        ``[{'name': display_name}, ...]`` in the order given by OpenAlex, or
        None when 'results' is empty or the work has no concepts.

    Raises
    ------
    IndexError
        If no row of `df` carries the DOI.
    """
    doi=row.doi
    # One lookup instead of filtering the frame twice.
    results=df.loc[df.doi==doi,'results'].iloc[0]
    if not results:
        return None
    concepts=results[0].get('concepts')
    # A missing key / null value is treated like an empty list; the former
    # `!= []` comparison let None through and crashed on iteration.
    if not concepts:
        return None
    return [{'name': concept.get('display_name')} for concept in concepts]

In [None]:
# Extract the concept names of every cached record and store them as a new column.
concepts_list = [
    get_concepts(data_OpenAlex_all, record)
    for _, record in data_OpenAlex_all.iterrows()
]

data_OpenAlex_all['concepts'] = concepts_list

In [None]:
# Name of the first concept listed for each work, lower-cased and with commas removed.
# Works without concepts are skipped.
concepts_names = data_OpenAlex_all['concepts'].dropna().apply(
    lambda x: str(x[0].get('name')).lower().replace(',', '')
)

In [None]:
# Number of works per (first-listed) concept name.
data_counts = pd.Series(concepts_names.tolist()).value_counts()

In [None]:
# Display the counts held in data_counts.
data_counts

In [None]:
# Pie chart of the first 30 entries of data_counts, labelled by name and percentage.
plt.figure(figsize=(6, 10))
plt.pie(
    data_counts[:30],
    labels=data_counts[:30].index,
    autopct='%1.1f%%',
    startangle=140,
)
plt.subplots_adjust(wspace=0.5, hspace=1)
plt.show()

In [None]:
# Exploratory check: display the record at position 8 of the cached OpenAlex table.
data_OpenAlex_all.iloc[8]

In [None]:
# Exploratory check: display the concepts stored for the row labelled 16.
data_OpenAlex_all['concepts'][16]

In [None]:
# Display the first 30 entries of data_counts.
data_counts[:30]

In [None]:
# Concept names treated as "green".
# NOTE(review): 'climatology' and 'greenhouse gas' each appear twice; the duplicates
# are kept because the way this list is consumed is not visible here.
list_green_concepts = [
    'climate change',
    'environmental science',
    'climatology',
    'greenhouse gas',
    'climatology',
    'ecology',
    'climate model',
    'greenhouse gas',
]

Training data for a model: is a publication part of the IPCC references or not?

In [None]:
# Largest 'year' value among the rows whose 'wg' column equals 'wg1'.
df_ipcc.loc[df_ipcc['wg'] == 'wg1', 'year'].max()

In [None]:
# Coerce the publication years to numbers; values that cannot be parsed become NaN.
df_ipcc['year'] = pd.to_numeric(df_ipcc['year'], errors='coerce')
# Years of the references after keeping one row per DOI, as sorted integers.
year = (
    df_ipcc.drop_duplicates(subset='doi')['year']
    .dropna()
    .sort_values()
    .apply(int)
)
# Number of such references per publication year.
data_counts = pd.Series(list(year)).value_counts()

In [None]:
# Turn the per-year counts into a plain {year: count} dict.
data_counts=data_counts.to_dict()
# Drop the entry for year 202 (presumably a truncated year in the source data - TODO confirm).
# pop() with a default does not raise KeyError when the key is absent, unlike `del`.
data_counts.pop(202, None)

In [None]:
# Preview only: the result is displayed, not assigned to a variable.
# Rows whose DOI still starts with 'http' are mapped to None by the lambda;
# rows without a DOI and repeated DOIs are then dropped.
df_ipcc.apply(lambda x: None if str(x.doi)[:4]=='http' else x, axis=1).dropna(subset=['doi']).drop_duplicates(subset='doi')

In [None]:
# Load the OpenAlex data saved in data_OA_concepts.json.
data_OA = pd.read_json(
    '../IPCC_bibliography/AR6/structured_data/data_OA_concepts.json'
)

In [None]:
# DOIs present in data_OA, used below to exclude works already in the IPCC list.
dois = list(data_OA['doi'])

In [None]:
# Exploratory request: first cursor page (200 works) of OpenAlex works published in 2018.
url = f"https://api.openalex.org/works?filter=publication_year:{2018}&per-page={200}&cursor=*"
response = requests.get(url)
data = response.json()

In [None]:
def get_publi_not_in_ipcc(json_not_in_IPCC,year,nb_publi_page,nb_publi,cursor):
    """Fetch one OpenAlex page for `year` and keep the works absent from the IPCC list.

    A work is kept when its DOI is not in the module-level `dois`, it has a
    title, and both its 'sustainable_development_goals' and 'concepts' are
    non-empty. Kept works are appended to ``json_not_in_IPCC[year]`` until
    that list holds `nb_publi` entries. The page's ``meta.next_cursor`` is
    appended exactly once to the module-level `list_cursor`, whether or not
    any work of the page was kept, so the caller always moves on to the next
    page.

    Parameters
    ----------
    json_not_in_IPCC : dict
        Maps a year to the list of works collected so far; mutated in place.
    year : int
        Publication year used as the OpenAlex filter.
    nb_publi_page : int
        Page size requested from OpenAlex (`per-page`).
    nb_publi : int
        Number of works wanted for `year`.
    cursor : str
        OpenAlex paging cursor ('*' for the first page).

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If `cursor` is None (no page left) or the response has no 'results'.
    """
    if cursor is None:
        raise RuntimeError(f"no OpenAlex page left for publication year {year}")
    url=f"https://api.openalex.org/works?filter=publication_year:{year}&per-page={nb_publi_page}&cursor={cursor}"
    # Without a timeout a single stalled connection blocks the whole sampling loop.
    response = requests.get(url, timeout=60)
    data = response.json()
    results = data.get('results')
    if results is None:
        raise RuntimeError(f"OpenAlex returned no 'results' for {url}: {data}")

    # Build the lookup once per page instead of scanning the list for every work.
    known_dois = set(dois)
    # Iterate over the works actually returned: a page can hold fewer than
    # nb_publi_page entries, which made index-based access raise IndexError.
    for work in results:
        if len(json_not_in_IPCC[year]) >= nb_publi:
            break
        if not work:
            continue
        doi = work.get('doi')
        # OpenAlex reports DOIs as 'https://doi.org/...' URLs, whereas `dois`
        # may hold bare lower-case DOIs (its file format is not visible here),
        # so both spellings are checked.
        bare_doi = doi.lower().replace('https://doi.org/', '') if isinstance(doi, str) else doi
        if doi in known_dois or bare_doi in known_dois:
            continue
        if pd.isna(work.get('title')):
            continue
        if not work.get('sustainable_development_goals') or not work.get('concepts'):
            continue
        json_not_in_IPCC[year].append({"doi": doi, "year": year, "title": work.get('title'), "sdg": work.get('sustainable_development_goals'), "concepts": work.get('concepts')})

    # Record the next cursor once per page (it is None after the last page).
    list_cursor.append((data.get('meta') or {}).get('next_cursor'))

In [None]:
# For every publication year, collect as many OpenAlex works outside the IPCC
# bibliography as data_counts holds references for that year.
json_not_in_IPCC={}
list_cursor=['*']
for year,nb_publi in data_counts.items():
    print(year)
    json_not_in_IPCC[year]=[]
    # A cursor belongs to the query that produced it: restart paging at '*'
    # for each year instead of reusing the last cursor of the previous year.
    list_cursor=['*']
    while (nb_publi > len(json_not_in_IPCC[year])):
        get_publi_not_in_ipcc(json_not_in_IPCC,year,200,nb_publi,list_cursor[-1])
        if list_cursor[-1] is None:
            # OpenAlex sets next_cursor to null after the last page: stop
            # rather than request a page that does not exist.
            break

In [None]:
# Display data_counts (the {year: count} dict driving the sampling loop above).
data_counts