In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re

current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, '..', '..','..')) 
if root_directory not in sys.path:
    sys.path.append(root_directory)

import fasttext
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import warnings
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import neattext.functions as nfx
 
warnings.filterwarnings(action='ignore')



In [2]:
os.listdir(root_directory+f"\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipcc")

['data_ipcc.jsonl',
 'data_ipcc.zip',
 'data_model.json',
 'data_not_ipcc.jsonl',
 'data_not_ipcc.zip',
 'ipcc_vectors.json',
 'locations_ids.json',
 'locations_names.json',
 'title.json',
 'topics.json']

In [3]:
df_ipcc=pd.read_json(root_directory+f"\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipcc\\data_ipcc.jsonl", lines= True).dropna(subset=['year'])

In [4]:
df_not_ipcc=pd.read_json(root_directory+f"\\IPCC_bibliography\\AR6\\structured_data\\data_model_ipcc\\data_not_ipcc.jsonl", lines= True)

In [5]:
df_ipcc=df_ipcc[['year','doi', 'title','topics','locations_names','locations_ids']]

In [6]:
df_not_ipcc=df_not_ipcc[['year','doi', 'title','topics','locations_names','locations_ids']]

In [7]:
df_ipcc=df_ipcc.dropna(subset=['year','doi', 'title','topics','locations_names','locations_ids']).drop_duplicates(subset=['doi'])

In [8]:
df_not_ipcc=df_not_ipcc.dropna(subset=['doi', 'title','topics','locations_names','locations_ids'])

In [9]:
data_counts=pd.Series([int(x) for x in list(df_ipcc.year.dropna())]).value_counts()
data_counts=pd.Series(dict(data_counts)).sort_index()

building the learning dataset

In [10]:
print(len(df_ipcc),len(df_not_ipcc))

48101 48101


In [11]:
df_ipcc['label']='ipcc'
df_not_ipcc['label']='not_ipcc'

In [12]:
df_all=pd.concat([df_ipcc,df_not_ipcc]).reset_index()
del df_all['index']

In [13]:
df_all.loc[:,'doi']=df_all.loc[:,'doi'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('https://doi.org/','').lower())

In [14]:
df_all.loc[:,'title']=df_all.loc[:,'title'].apply(lambda x: str(nfx.remove_stopwords(x)).replace('}','').replace('{','').lower())

In [15]:
df_all.loc[:,'topics']=df_all.loc[:,'topics'].apply(lambda x: nfx.remove_stopwords(' '.join(x)).lower())

In [16]:
df_all

Unnamed: 0,year,doi,title,topics,locations_names,locations_ids,label
0,2019,10.3389/978-2-88963-118-6,oceanobs'19: ocean opportunity. volume 1,marine biology ecology research,Frontiers research topics,1664-8714,ipcc
1,1900,10.1002/andp.19003081208,"\""uber die bedeutung des wasserdampfes und der...",atmospheric ozone climate atmospheric chemistr...,Annalen der Physik,0003-3804,ipcc
2,1964,10.3402/tellusa.v16i1.8885,the parameters atmospheric turbidity,ocean waves remote sensing,Tellus A Dynamic Meteorology and Oceanography,0280-6495,ipcc
3,1929,10.1080/20014422.1929.11880498,on atmospheric transmission sun radiation dust...,atmospheric aerosols clouds,Geografiska Annaler,1651-3215,ipcc
4,2013,10.1002/rog.20022,a review global ocean temperature observations...,oceanographic atmospheric processes geophysics...,Reviews of Geophysics,1944-9208,ipcc
...,...,...,...,...,...,...,...
96197,2018,10.1016/j.cclet.2018.11.014,design synthesis novel branched fluorinated su...,per- polyfluoroalkyl substances research atmos...,Chinese Chemical Letters,1001-8417,not_ipcc
96198,2018,10.1504/ijhrdm.2018.10013654,relationship time perspective job satisfaction,psychological temporal perspectives research p...,International Journal of Human Resources Devel...,1465-6612,not_ipcc
96199,2018,10.31727/gzb.41.4.9,istraživanje preferencija u odabiru sobnog bil...,regional development management studies,Glasnik zaštite bilja,0350-9664,not_ipcc
96200,2018,10.1103/physrevlett.121.021102,origin cosmic ray galactic halo driven advecte...,astrophysics cosmic phenomena solar space plas...,Physical Review Letters,0031-9007,not_ipcc


In [17]:
df_all.columns

Index(['year', 'doi', 'title', 'topics', 'locations_names', 'locations_ids',
       'label'],
      dtype='object')

using fasttext

In [18]:
df_all['label'] = '__label__' + df_all['label'].astype(str)
df_all['category_description']=df_all['label']+' '+df_all['title']+' '+df_all['topics']+' '+df_all['locations_names']+' '+df_all['locations_ids']

In [19]:
train, test = train_test_split(df_all, test_size=0.2, random_state=42)

In [20]:
train.shape, test.shape

((76961, 8), (19241, 8))

In [21]:
train.to_csv("teds.train", columns=["category_description"], index=False, header=False)
test.to_csv("teds.test", columns=["category_description"], index=False, header=False)

In [22]:
model = fasttext.train_supervised(input="teds.train")
model.test("teds.test")

(14265, 0.9656501927795303, 0.9656501927795303)

In [23]:
model.save_model("fasttext_model_teds_0204.bin")

In [24]:
model = fasttext.load_model("fasttext_model_teds_0204.bin")



In [25]:
test_data = pd.read_csv("teds.test", header=None, names=["text"])

In [26]:
true_labels = []
predicted_labels = []

for line in test_data["text"]:
    label, text = line.split(' ', 1)
    true_labels.append(label)
    text = text.replace('\n', '').strip()
    predicted_label = model.predict(text)[0][0]  
    predicted_labels.append(predicted_label)

In [27]:
conf_matrix = confusion_matrix(true_labels, predicted_labels)

In [28]:
conf_matrix

array([[9270,  427],
       [ 313, 9231]], dtype=int64)

In [29]:
accuracy_score(true_labels, predicted_labels)

0.9615404604750273

In [30]:
recall_score(true_labels, predicted_labels, average="weighted")

0.9615404604750273

In [31]:
f1_score(true_labels, predicted_labels, average="weighted")

0.961540922345335

In [32]:
t=test.iloc[3,:]

In [33]:
t

year                                                                 2012
doi                                              10.1175/jamc-d-11-0137.1
title                   comparison impact global climate changes urban...
topics                  urban heat island mitigation climate variabili...
locations_names            Journal of Applied Meteorology and Climatology
locations_ids                                                   1558-8424
label                                                       __label__ipcc
category_description    __label__ipcc comparison impact global climate...
Name: 8938, dtype: object

In [34]:
print(f"{t.title} {t.topics} {t.locations_names} {t.locations_ids}",model.predict(nfx.remove_stopwords(f"{t.title} {t.topics} {t.locations_names} {t.locations_ids}"), k=-1))

comparison impact global climate changes urbanization summertime future climate tokyo metropolitan area urban heat island mitigation climate variability models land use ecosystem services Journal of Applied Meteorology and Climatology 1558-8424 (('__label__ipcc', '__label__not_ipcc'), array([9.99977112e-01, 4.29159190e-05]))
