In [5]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re

module_path = os.path.abspath(os.path.join('..','..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import fasttext
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import warnings
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import neattext.functions as nfx
 
warnings.filterwarnings(action='ignore')

from code_utils.utils import preprocess,aplatir

In [6]:
df_ipbes=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipbes\\data_ipbes.jsonl', lines= True).dropna(subset=['year'])

In [7]:
df_not_ipbes=pd.read_json(module_path+f'\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipbes\\data_not_ipbes.jsonl', lines= True)

In [8]:
df_ipbes=df_ipbes[['year','doi', 'title','topics','locations_names','locations_id']]

In [9]:
df_not_ipbes=df_not_ipbes[['year','doi', 'title','topics','locations_names','locations_id']]

In [10]:
df_ipbes=df_ipbes.dropna(subset=['year','doi', 'title','topics','locations_names','locations_id']).drop_duplicates(subset=['doi'])

In [11]:
df_not_ipbes=df_not_ipbes.dropna(subset=['doi', 'title','topics','locations_names','locations_id'])

In [12]:
data_counts=pd.Series([int(x) for x in list(df_ipbes.year.dropna())]).value_counts()
data_counts=pd.Series(dict(data_counts)).sort_index()

building the learning dataset

In [13]:
print(len(df_ipbes),len(df_not_ipbes))

5531 5531


In [14]:
len(df_ipbes.drop_duplicates(subset='doi').dropna(subset='doi'))

5531

In [15]:
df_ipbes['label']='ipbes'
df_not_ipbes['label']='not_ipbes'

In [16]:
df_all=pd.concat([df_ipbes,df_not_ipbes]).reset_index()
del df_all['index']

In [17]:
df_all.loc[:,'doi']=df_all.loc[:,'doi'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('https://doi.org/','').lower())

In [18]:
df_all.loc[:,'title']=df_all.loc[:,'title'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('}','').replace('{','').lower())

In [19]:
df_all.loc[:,'topics']=df_all.loc[:,'topics'].apply(lambda x: nfx.remove_stopwords(' '.join(x)).lower())

In [20]:
df_all

Unnamed: 0,year,doi,title,topics,locations_names,locations_id,label
0,2000,10.1890/1051-0761(2000)010[1251:roteka]2.0.co;2,rediscovery traditional ecological knowledge a...,"conservation, biodiversity, resource managemen...",Ecological Applications,1051-0761,ipbes
1,2014,10.1016/j.gloenvcha.2013.12.012,evaluating knowledge exchange interdisciplinar...,sustainability climate change governance risk ...,Global Environmental Change,0959-3780,ipbes
2,2015,10.5751/es-07868-200344,multiple telecouplings complex interrelationships,land use ecosystem services species distributi...,Ecology and Society,1708-3087,ipbes
3,2008,10.1177/0959683607087927,concept human impacts past use-by date?,environmental philosophy ethics global energy ...,The Holocene,0959-6836,ipbes
4,2014,10.1038/nclimate2339,changing intellectual climate,sustainability climate change governance clima...,Nature Climate Change,1758-678X,ipbes
...,...,...,...,...,...,...,...
11057,1974,10.1021/ed051p623,remote pipeting device,chemistry research topics,Journal of Chemical Education,0021-9584,not_ipbes
11058,1975,10.7202/700546ar,problématique de l’économie socialiste décentr...,french urban social studies political social i...,Études internationales,0014-2123,not_ipbes
11059,1976,10.1063/1.433097,intramolecular 13c kinetic isotope effects dec...,chemical synthesis characterization zeolite ca...,The Journal of Chemical Physics,0021-9606,not_ipbes
11060,1954,10.1042/bj0570297,branched-chain fatty acids butterfat. 4. isola...,peroxisome proliferator-activated receptors li...,Biochemical Journal,0306-3283,not_ipbes


In [21]:
df_all.columns

Index(['year', 'doi', 'title', 'topics', 'locations_names', 'locations_id',
       'label'],
      dtype='object')

using fasttext

In [22]:
df_all['label'] = '__label__' + df_all['label'].astype(str)
df_all['category_description']=df_all['label']+' '+df_all['title']+' '+df_all['topics']+' '+df_all['locations_names']+' '+df_all['locations_id']

In [23]:
train, test = train_test_split(df_all, test_size=0.2, random_state=42)

In [24]:
train.shape, test.shape

((8849, 8), (2213, 8))

In [25]:
train.to_csv("teds_ipbes.train", columns=["category_description"], index=False, header=False)
test.to_csv("teds_ipbes.test", columns=["category_description"], index=False, header=False)

In [26]:
model = fasttext.train_supervised(input="teds_ipbes.train")
model.test("teds_ipbes.test")

(1448, 0.9509668508287292, 0.9509668508287292)

In [27]:
model.save_model("..\\models_fasttext_or_random_forest\\fasttext_model_teds_ipbes.bin")

In [28]:
model = fasttext.load_model("..\\models_fasttext_or_random_forest\\fasttext_model_teds_ipbes.bin")



In [29]:
test_data = pd.read_csv("teds_ipbes.test", header=None, names=["text"])

In [30]:
true_labels = []
predicted_labels = []

for line in test_data["text"]:
    label, text = line.split(' ', 1)
    true_labels.append(label)
    text = text.replace('\n', '').strip()
    predicted_label = model.predict(text)[0][0]  
    predicted_labels.append(predicted_label)

In [31]:
conf_matrix = confusion_matrix(true_labels, predicted_labels)

In [32]:
conf_matrix

array([[1049,   67],
       [  42, 1055]], dtype=int64)

In [33]:
accuracy_score(true_labels, predicted_labels)

0.9507455942159964

In [34]:
recall_score(true_labels, predicted_labels, average="weighted")

0.9507455942159964

In [35]:
f1_score(true_labels, predicted_labels, average="weighted")

0.9507440856069499

In [36]:
t=test.iloc[3,:]

In [37]:
t

year                                                                 1980
doi                                          10.1016/0006-3207(80)90003-8
title                   temperature dependence sexual differentiation ...
topics                  turtle biology conservation animal behavior re...
locations_names                                   Biological Conservation
locations_id                                                    0006-3207
label                                                      __label__ipbes
category_description    __label__ipbes temperature dependence sexual d...
Name: 3710, dtype: object

In [38]:
print(f"{t.title} {t.topics} {t.locations_names} {t.locations_id}",model.predict(nfx.remove_stopwords(f"{t.title} {t.topics} {t.locations_names} {t.locations_id}"), k=-1))

temperature dependence sexual differentiation sea turtles: implications conservation practices turtle biology conservation animal behavior reproduction genetic clinical aspects sex determination chromosomal abnormalities Biological Conservation 0006-3207 (('__label__ipbes', '__label__not_ipbes'), array([0.78853202, 0.21148801]))
