In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from datasets import Dataset, DatasetDict
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the spreadsheet; its first column (index_col=0) becomes the DataFrame index.
df = pd.read_excel("../data/sample_llm_assessed.xlsx", index_col=0)

In [4]:
# Class balance of the label column before any processing.
df["is_relevant"].value_counts()

is_relevant
1    458
0    142
Name: count, dtype: int64

In [5]:
# Cast the label column to int, then display the class counts again so the
# effect of the cast can be compared with the previous cell's output.
df['is_relevant'] = df['is_relevant'].astype(int)
df["is_relevant"].value_counts()


is_relevant
1    458
0    142
Name: count, dtype: int64

In [6]:
# data shuffle
# Seeded so that Restart & Run All yields the same row order every time; the
# split cells further down use a fixed seed too, so an unseeded shuffle here
# was the only thing making the resulting train/test/validation sets vary
# between runs.
df = df.sample(frac=1, random_state=12)

In [7]:
# Preview the first rows of the shuffled frame (display only).
df.head()

Unnamed: 0,index_new,Authors,Author full names,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,...,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID,Potential_policy,is_relevant
5842,5842,Ceron A.; Gandini A.; Lodetti P.,"Ceron, Andrea (48761074100); Gandini, Alessand...",48761074100; 56865621300; 57217249124,Still ‘fire in the (full) belly’? Anti-establi...,2021,Information Communication and Society,24.0,10.0,,...,,English,Inf. Commun. Soc.,Article,Final,All Open Access; Green Open Access,Scopus,2-s2.0-85086907732,True,1
2464,2464,Cha S.,"Cha, Seokki (58983265300)",58983265300,The potential role of small modular reactors (...,2024,Nuclear Engineering and Technology,,,103314.0,...,,English,Nucl. Eng. Technol.,Article,Article in press,,Scopus,2-s2.0-85212323684,True,1
2898,2898,Yang H.; Liu F.,"Yang, Heng (57546034300); Liu, Fenghong (56133...",57546034300; 56133222600,A Preliminary Study on the FAIRification Chara...,2024,"Documentation, Information and Knowledge",41.0,2.0,,...,,Chinese,Doc. Inf. Knowl.,Article,Final,,Scopus,2-s2.0-85194197200,True,1
2599,2599,Sreedhar M.N.; Rebedea T.; Parisien C.,"Sreedhar, Makesh Narsimhan (57712450800); Rebe...",57712450800; 24338916400; 24169548200,Unsupervised Extraction of Dialogue Policies f...,2024,EMNLP 2024 - 2024 Conference on Empirical Meth...,,,,...,,English,EMNLP - Conf. Empir. Methods Nat. Lang. Proces...,Conference paper,Final,All Open Access; Green Open Access,Scopus,2-s2.0-85217743251,False,0
1192,1192,Xu Z.; Zhang X.,"Xu, Zengzhan (59932184200); Zhang, Xi (5993218...",59932184200; 59932184300,A Cross-Platform Comparison of Public Discussi...,2025,International Journal of Human-Computer Intera...,,,,...,,English,Int. J. Hum.-Comput. Interact.,Article,Article in press,,Scopus,2-s2.0-105007419850,True,1


In [8]:
def split_df(X, y, test_size, random_state=12):
    """Split samples and labels into two stratified partitions.

    Thin wrapper around ``train_test_split`` that always stratifies on ``y``.

    Args:
        X: Sequence of samples.
        y: Sequence of labels aligned with ``X``; also used as the
            stratification key.
        test_size: Forwarded unchanged to ``train_test_split``; it governs the
            size of the second partition.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 12,
            the value that used to be hard-coded, so existing calls are
            unaffected.

    Returns:
        Tuple ``(X_split_1, X_split_2, y_split_1, y_split_2)``.
    """
    X_split_1, X_split_2, y_split_1, y_split_2 = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return X_split_1, X_split_2, y_split_1, y_split_2

def reduce_imbalanced_dataset(X, y, proportion=1.2, random_state=None):
    """Down-sample the label-0 rows relative to the label-1 rows.

    Every sample labelled 1 is kept. Samples labelled 0 are drawn at random,
    without replacement, up to ``int(n_label_1 * proportion)`` rows; if fewer
    label-0 rows exist, all of them are kept (previously this case raised a
    ``ValueError`` from ``DataFrame.sample``). Rows carrying any other label
    are dropped. The combined rows are shuffled before being returned.

    Args:
        X: Sequence of samples, one entry per row.
        y: Sequence of labels aligned with ``X``.
        proportion: Upper bound on the number of label-0 rows, expressed as a
            multiple of the number of label-1 rows.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for both
            the draw and the final shuffle. ``None`` (default) keeps the
            original unseeded behaviour.

    Returns:
        Tuple ``(samples, labels)`` of two equally long lists.
    """
    # Explicit columns instead of passing the labels as the index and
    # recovering them with reset_index().
    dfx = pd.DataFrame({"x": list(X), "y": list(y)})
    df_1 = dfx[dfx.y == 1]
    df_0 = dfx[dfx.y == 0]

    # Never request more rows than are available: sampling without
    # replacement beyond the population size raises.
    n_keep = min(len(df_0), int(len(df_1) * proportion))
    df_0 = df_0.sample(n=n_keep, random_state=random_state)

    df_tbr = pd.concat([df_1, df_0]).sample(frac=1, random_state=random_state)

    print("[DEBUG] New number of values per label:")
    print(df_tbr.y.value_counts())

    return df_tbr.x.to_list(), df_tbr.y.to_list()

In [9]:
# Pull the texts and labels out of the frame once, then carve off 20 % as a
# hold-out and halve that hold-out into test and validation (80/10/10 overall).
abstracts = df["Abstract"].to_list()
labels = df["is_relevant"].to_list()
X_train, X_test, y_train, y_test = split_df(abstracts, labels, test_size=0.2)
X_test, X_val, y_test, y_val = split_df(X_test, y_test, test_size=0.5)

In [10]:
def create_dataset(X_train, X_test, X_val, y_train, y_test, y_val):
    """
    Bundle the three splits into a single DatasetDict for the baseline model.

    input:
        X_train, X_test, X_val: texts of the train / test / validation splits
        y_train, y_test, y_val: labels aligned with the corresponding texts
    returns:
        DatasetDict with the keys "train", "test" and "validation"; each entry
        is a Dataset holding a "label" and a "text" column
    """
    # `Dataset` is datasets.Dataset here: in the imports cell that import comes
    # after the torch.utils.data one and rebinds the name.
    split_table = {
        "train": (y_train, X_train),
        "test": (y_test, X_test),
        "validation": (y_val, X_val),
    }
    return DatasetDict(
        {
            split_name: Dataset.from_dict({"label": split_labels, "text": split_texts})
            for split_name, (split_labels, split_texts) in split_table.items()
        }
    )

# create dataset (keyword arguments make the texts/labels pairing explicit)
dataset = create_dataset(
    X_train=X_train,
    X_test=X_test,
    X_val=X_val,
    y_train=y_train,
    y_test=y_test,
    y_val=y_val,
)

In [None]:
# save dataset
# Pickle the DatasetDict to ../data/for_classification.pickle; opening with
# "wb" overwrites any existing file at that path.
# NOTE(review): DatasetDict.save_to_disk is the library-native alternative -
# worth considering if no consumer depends on the pickle format; confirm first.
with open("../data/for_classification.pickle", "wb") as f:
    pickle.dump(dataset, f)
