In [38]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import fasttext
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import warnings
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import neattext.functions as nfx
 
warnings.filterwarnings(action='ignore')

from code_utils.utils import preprocess,aplatir

In [2]:
df_ipcc=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipcc\\data_ipcc.jsonl', lines= True).dropna(subset=['year'])

In [3]:
wg_doi=pd.read_json('ipcc_wg.json')

In [4]:
df_ipcc=df_ipcc.merge(wg_doi,on='doi',how='left')

In [5]:
df_ipcc.columns

Index(['index', 'doi', 'title', 'year', 'countries', 'concepts', 'sdg',
       'topics', 'is_OA_available', 'title_OA', 'year_OA', 'authors_name',
       'rors', 'institutions_names', 'locations_names', 'locations_ids',
       'test_glutton', 'wg'],
      dtype='object')

In [6]:
dict_wg={'wg1':'science','wg2':'adaptation','wg2_cross':'adaptation','wg3':'mitigation'}

In [7]:
df_ipcc['wg_name']=df_ipcc['wg'].apply(lambda y: [dict_wg[str(x)] for x in y] if isinstance(y, list) else None)

In [8]:
df_not_ipcc=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipcc\\data_not_ipcc.jsonl', lines= True)

In [9]:
df_ipcc=df_ipcc[['year','doi', 'title','topics','locations_names','locations_ids','wg_name']]

In [10]:
df_not_ipcc=df_not_ipcc[['year','doi', 'title','topics','locations_names','locations_ids']]

In [11]:
df_ipcc=df_ipcc.dropna(subset=['year','doi', 'title','topics','locations_names','locations_ids']).drop_duplicates(subset=['doi'])

In [12]:
df_not_ipcc=df_not_ipcc.dropna(subset=['doi', 'title','topics','locations_names','locations_ids'])

In [13]:
data_counts=pd.Series([int(x) for x in list(df_ipcc.year.dropna())]).value_counts()
data_counts=pd.Series(dict(data_counts)).sort_index()

building the learning dataset

In [14]:
print(len(df_ipcc),len(df_not_ipcc))

48219 48219


In [15]:
df_all=pd.concat([df_ipcc,df_not_ipcc]).reset_index()
del df_all['index']

In [16]:
df_all.loc[:,'doi']=df_all.loc[:,'doi'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('https://doi.org/','').lower())

In [17]:
df_all.loc[:,'title']=df_all.loc[:,'title'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('}','').replace('{','').lower())

In [18]:
df_all.loc[:,'topics']=df_all.loc[:,'topics'].apply(lambda x: nfx.remove_stopwords(' '.join(x)).lower())

In [19]:
df_all

Unnamed: 0,year,doi,title,topics,locations_names,locations_ids,wg_name
0,2019.0,10.3389/978-2-88963-118-6,oceanobs'19: ocean opportunity. volume 1,marine biodiversity ecosystem functioning,Frontiers research topics,1664-8714,[science]
1,1900.0,10.1002/andp.19003081208,"\""uber die bedeutung des wasserdampfes und der...",stratospheric chemistry climate change impacts...,Annalen der Physik,0003-3804,[science]
2,1964.0,10.3402/tellusa.v16i1.8885,the parameters atmospheric turbidity,dynamics ocean surface waves wind interaction,Tellus A,0280-6495,[science]
3,1929.0,10.1080/20014422.1929.11880498,on atmospheric transmission sun radiation dust...,aerosols' impact climate hydrological cycle,Geografiska Annaler,1651-3215,[science]
4,2013.0,10.1002/rog.20022,a review global ocean temperature observations...,oceanic modeling circulation studies global se...,Reviews of Geophysics,1944-9208,"[science, adaptation]"
...,...,...,...,...,...,...,...
96433,1967.0,10.1007/bf02349230,played physicians patient's path mental hospital,crisis resolution home treatment teams mental ...,Community mental health journal,0010-3853,
96434,1966.0,10.1038/210701a0,delayed fluorescence chlorophyll vitro vivo,molecular mechanisms photosynthesis photoprote...,Nature,0028-0836,
96435,1966.0,10.1507/endocrine1927.42.8_880,子宮筋homogenateにおけるacetylcholineと組織蛋白との結合に及ぼすoxy...,principles stereochemical structure nomenclatu...,Nippon Naibunpi Gakkai zasshi/Nihon Naibunpi G...,0029-0661,
96436,1686.0,10.1098/rstl.1686.0008,"voyage emperour china eastern tartary, anno 1682",linguistic studies turkic languages silk roads...,Philosophical transactions of the Royal Societ...,0261-0523,


In [20]:
df_all.columns

Index(['year', 'doi', 'title', 'topics', 'locations_names', 'locations_ids',
       'wg_name'],
      dtype='object')

using fasttext

In [21]:
df_all['label']=df_all['wg_name'].apply(lambda x: ' '.join(['__label__' + str(y) for y in x]) if isinstance(x,list) else '__label__not_ipcc') 
df_all['category_description']=df_all['label']+' '+df_all['title']+' '+df_all['topics']+' '+df_all['locations_names']+' '+df_all['locations_ids']

In [22]:
train, test = train_test_split(df_all, test_size=0.2, random_state=42)

In [23]:
train.shape, test.shape

((77150, 9), (19288, 9))

In [24]:
train.to_csv("teds.train", columns=["category_description"], index=False, header=False)
test.to_csv("teds.test", columns=["category_description"], index=False, header=False)

In [25]:
model = fasttext.train_supervised(input="teds.train", loss='ova')
model.test("teds.test")

(15733, 0.9158456746964978, 0.8606498626209533)

In [26]:
model.save_model("fasttext_model_teds.bin")

In [27]:
model = fasttext.load_model("fasttext_model_teds.bin")

In [28]:
test_data = pd.read_csv("teds.test", header=None, names=["text"])

In [29]:
# pour 2 labels
true_labels = []
predicted_labels = []

for line in test_data["text"]:
    label, text = line.split(' ', 1)
    true_labels.append(label)
    text = text.replace('\n', '').strip()
    predicted_label = model.predict(text)[0][0]  
    predicted_labels.append(predicted_label)

In [None]:
# pour multi labels
true_labels = []
predicted_labels = []

for line in test_data["text"]:
    label, text = line.split(' ', 1)
    labels_list = [x.replace('__label__', '').strip() for x in label.split(',')]
    true_labels.append(labels_list)
    
    text = text.strip()
    labels, probabilities = model.predict(text, k=-1)
    prob_50 = [1 if x >= 0. else 0 for x in probabilities]
    predicted_label_50=[labels[i] for i in range(len(labels)) if prob_50[i] == 1]
    predicted_labels.append([x.replace('__label__', '').strip() for x in predicted_label_50])

In [79]:
mlb = MultiLabelBinarizer()
true_labels_bin = mlb.fit_transform(true_labels)
predicted_labels_bin = mlb.transform(predicted_labels)

In [80]:
predicted_labels_bin

array([[0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       ...,
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0]])

In [81]:
true_labels_bin

array([[0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       ...,
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1]])

In [82]:
#conf_matrix = confusion_matrix(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels_bin.flatten(), predicted_labels_bin.flatten())

In [83]:
labels = mlb.classes_

for i, label in enumerate(labels):
    conf_matrix = confusion_matrix(true_labels_bin[:, i], predicted_labels_bin[:, i])
    print(f"\nmatrice de confusion pour '{label}':")
    print(conf_matrix)
    accuracy = accuracy_score(true_labels_bin[:, i], predicted_labels_bin[:, i])
    recall = recall_score(true_labels_bin[:, i], predicted_labels_bin[:, i])
    f1 = f1_score(true_labels_bin[:, i], predicted_labels_bin[:, i])
    print(f"Accuracy pour '{label}': {accuracy:.4f}")
    print(f"Recall pour '{label}': {recall:.4f}")
    print(f"F1 score pour '{label}': {f1:.4f}")


matrice de confusion pour 'adaptation':
[[13158   893]
 [ 1047  4190]]
Accuracy pour 'adaptation': 0.8994
Recall pour 'adaptation': 0.8001
F1 score pour 'adaptation': 0.8120

matrice de confusion pour 'mitigation':
[[16891   427]
 [  594  1376]]
Accuracy pour 'mitigation': 0.9471
Recall pour 'mitigation': 0.6985
F1 score pour 'mitigation': 0.7294

matrice de confusion pour 'not_ipcc':
[[9466  228]
 [ 289 9305]]
Accuracy pour 'not_ipcc': 0.9732
Recall pour 'not_ipcc': 0.9699
F1 score pour 'not_ipcc': 0.9730

matrice de confusion pour 'science':
[[16540   261]
 [  569  1918]]
Accuracy pour 'science': 0.9570
Recall pour 'science': 0.7712
F1 score pour 'science': 0.8221


In [70]:
accuracy_score(true_labels_bin, predicted_labels_bin)

0.8498548320199087

In [71]:
recall_score(true_labels_bin, predicted_labels_bin, average="weighted")

0.8967751970136872

In [72]:
f1_score(true_labels_bin, predicted_labels_bin, average="weighted")

0.8902626400322705

In [73]:
t=test.iloc[0,:]

In [74]:
t

year                                                               2002.0
doi                                                  10.1029/2002gl015835
title                   an important constraint tropical cloud - clima...
topics                  stratospheric chemistry climate change impacts...
locations_names                              Geophysical Research Letters
locations_ids                                                   0094-8276
wg_name                                                         [science]
label                                                    __label__science
category_description    __label__science an important constraint tropi...
Name: 5998, dtype: object

In [75]:
model.predict(nfx.remove_stopwords(f"{t.title} {t.topics} {t.locations_names} {t.locations_ids}"), k=-1)

(('__label__science',
  '__label__adaptation',
  '__label__mitigation',
  '__label__not_ipcc'),
 array([9.69795406e-01, 1.91340372e-02, 2.99103255e-03, 1.00000034e-05]))

In [76]:
model.predict(nfx.remove_stopwords(f"{t.title} {t.topics} {t.locations_names} {t.locations_ids}"))

(('__label__science',), array([0.96979541]))