In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re

current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, '..', '..','..')) 
if root_directory not in sys.path:
    sys.path.append(root_directory)

import fasttext
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import warnings
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import neattext.functions as nfx
 
warnings.filterwarnings(action='ignore')



In [2]:
df_ipcc=pd.read_json(root_directory+f"\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipcc\\data_ipcc.jsonl", lines= True).dropna(subset=['year'])

In [3]:
wg_doi=pd.read_json('ipcc_wg.json')

In [4]:
df_ipcc=df_ipcc.merge(wg_doi,on='doi',how='left')

In [5]:
df_ipcc.columns

Index(['index', 'doi', 'title', 'year', 'countries', 'concepts', 'sdg',
       'topics', 'is_OA_available', 'title_OA', 'year_OA', 'authors_name',
       'rors', 'institutions_names', 'locations_names', 'locations_ids',
       'type', 'type_crossref', 'test_glutton', 'wg'],
      dtype='object')

In [6]:
dict_wg={'wg1':'science','wg2':'adaptation','wg2_cross':'adaptation','wg3':'mitigation'}

In [7]:
df_ipcc['wg_name']=df_ipcc['wg'].apply(lambda y: [dict_wg[str(x)] for x in y] if isinstance(y, list) else None)

In [8]:
df_ipcc=df_ipcc[['year','doi', 'title','topics','locations_names','locations_ids','wg_name']]

In [9]:
df_ipcc=df_ipcc.dropna(subset=['year','doi', 'title','topics','locations_names','locations_ids']).drop_duplicates(subset=['doi'])

In [10]:
data_counts=pd.Series([int(x) for x in list(df_ipcc.year.dropna())]).value_counts()
data_counts=pd.Series(dict(data_counts)).sort_index()

building the learning dataset

In [11]:
df_all=df_ipcc

In [12]:
df_all.loc[:,'doi']=df_all.loc[:,'doi'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('https://doi.org/','').lower())

In [13]:
df_all.loc[:,'title']=df_all.loc[:,'title'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('}','').replace('{','').lower())

In [14]:
df_all.loc[:,'topics']=df_all.loc[:,'topics'].apply(lambda x: nfx.remove_stopwords(' '.join(x)).lower())

In [15]:
df_all

Unnamed: 0,year,doi,title,topics,locations_names,locations_ids,wg_name
0,2019,10.3389/978-2-88963-118-6,oceanobs'19: ocean opportunity. volume 1,marine biology ecology research,Frontiers research topics,1664-8714,[science]
2,1900,10.1002/andp.19003081208,"\""uber die bedeutung des wasserdampfes und der...",atmospheric ozone climate atmospheric chemistr...,Annalen der Physik,0003-3804,[science]
3,1964,10.3402/tellusa.v16i1.8885,the parameters atmospheric turbidity,ocean waves remote sensing,Tellus A Dynamic Meteorology and Oceanography,0280-6495,[science]
4,1929,10.1080/20014422.1929.11880498,on atmospheric transmission sun radiation dust...,atmospheric aerosols clouds,Geografiska Annaler,1651-3215,[science]
5,2013,10.1002/rog.20022,a review global ocean temperature observations...,oceanographic atmospheric processes geophysics...,Reviews of Geophysics,1944-9208,"[science, adaptation]"
...,...,...,...,...,...,...,...
52465,2017,10.1007/s11625-017-0498-1,facilitating data-intensive approaches innovat...,smart cities technologies innovative approache...,Sustainability Science,1862-4057,[mitigation]
52466,2021,10.1016/j.worlddev.2020.105249,china's green transformation eco-industrial parks,sustainable industrial ecology sustainable sup...,World Development,0305-750X,[mitigation]
52467,2018,10.1016/j.jclepro.2018.05.194,"water-energy-food nexus: concepts, questions m...",water-energy-food nexus studies energy harvest...,Journal of Cleaner Production,0959-6526,[mitigation]
52468,2018,10.1016/j.apenergy.2017.07.036,water-food-energy nexus optimization approach ...,water-energy-food nexus studies water resource...,Applied Energy,0306-2619,[mitigation]


In [16]:
df_all.columns

Index(['year', 'doi', 'title', 'topics', 'locations_names', 'locations_ids',
       'wg_name'],
      dtype='object')

using fasttext

In [17]:
df_all['label']=df_all['wg_name'].apply(lambda x: ' '.join(['__label__' + str(y) for y in x])) 
df_all['category_description']=df_all['label']+' '+df_all['title']+' '+df_all['topics']+' '+df_all['locations_names']+' '+df_all['locations_ids']

In [18]:
train, test=train_test_split(df_all, test_size=0.2, random_state=42)

In [19]:
train.shape, test.shape

((38480, 9), (9621, 9))

In [20]:
train.to_csv("teds_wg.train", columns=["category_description"], index=False, header=False)
test.to_csv("teds_wg.test", columns=["category_description"], index=False, header=False)

In [21]:
model=fasttext.train_supervised(input="teds_wg.train", loss='ova')
model.test("teds_wg.test")

(7091, 0.8713862642786631, 0.7793895055499496)

In [23]:
model.save_model("fasttext_model_teds_wg0204.bin")

In [24]:
model=fasttext.load_model("fasttext_model_teds_wg0204.bin")



In [25]:
test_data=pd.read_csv("teds_wg.test", header=None, names=["text"])

In [26]:
# pour multi labels
true_labels=[]
predicted_labels=[]

label_order=['__label__science', '__label__adaptation', '__label__mitigation']

for line in test_data["text"]:
    label, text=line.split(' ', 1)
    if '__label__not_ipcc' in label:
        labels_list=[0, 0, 0]  
    else:
        labels_list=[1 if x in label.split(' ') else 0 for x in label_order]
    true_labels.append(labels_list)
    
    text=text.strip()
    labels, probabilities=model.predict(text, k=-1, threshold=0.5)
    if '__label__not_ipcc' in labels:
        predicted_label=[0, 0, 0]  
    else:
        predicted_label=[1 if x in labels else 0 for x in label_order]
    predicted_labels.append(predicted_label)

In [27]:
# pour multi labels
true_labels = []
predicted_labels = []

for line in test_data["text"]:
    label, text = line.split(' ', 1)
    labels_list = [x.replace('__label__', '').strip() for x in label.split(',')]
    true_labels.append(labels_list)
    
    text = text.strip()
    labels, probabilities = model.predict(text, k=-1, threshold=0.5)
    predicted_labels.append([x.replace('__label__', '').strip() for x in labels])

mlb = MultiLabelBinarizer()
true_labels = mlb.fit_transform(true_labels)
predicted_labels = mlb.transform(predicted_labels)

In [28]:
predicted_labels

array([[0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]])

In [29]:
true_labels

array([[0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]])

In [30]:
#conf_matrix=confusion_matrix(true_labels, predicted_labels)

In [31]:
labels=mlb.classes_

for i, label in enumerate(labels):
    conf_matrix=confusion_matrix(true_labels[:, i], predicted_labels[:, i])
    print(f"\nmatrice de confusion pour '{label}':")
    print(conf_matrix)
    accuracy=accuracy_score(true_labels[:, i], predicted_labels[:, i])
    recall=recall_score(true_labels[:, i], predicted_labels[:, i])
    f1=f1_score(true_labels[:, i], predicted_labels[:, i])
    print(f"Accuracy pour '{label}': {accuracy:.4f}")
    print(f"Recall pour '{label}': {recall:.4f}")
    print(f"F1 score pour '{label}': {f1:.4f}")


matrice de confusion pour 'adaptation':
[[3536  899]
 [ 581 4605]]
Accuracy pour 'adaptation': 0.8462
Recall pour 'adaptation': 0.8880
F1 score pour 'adaptation': 0.8616

matrice de confusion pour 'mitigation':
[[7159  519]
 [ 316 1627]]
Accuracy pour 'mitigation': 0.9132
Recall pour 'mitigation': 0.8374
F1 score pour 'mitigation': 0.7958

matrice de confusion pour 'science':
[[6748  381]
 [ 413 2079]]
Accuracy pour 'science': 0.9175
Recall pour 'science': 0.8343
F1 score pour 'science': 0.8397


In [32]:
t=test.iloc[1,:]

In [33]:
print(f"{t.title} {t.topics} {t.locations_names} {t.locations_ids}")
print(model.predict(nfx.remove_stopwords(f"{t.title} {t.topics} {t.locations_names} {t.locations_ids}"), k=-1))

quantifying human impacts catchment sediment yield: continental approach soil erosion sediment transport hydrology sediment transport processes hydrology watershed management studies Global and Planetary Change 0921-8181
(('__label__adaptation', '__label__science', '__label__mitigation'), array([0.74317801, 0.28141561, 0.02844604]))
