In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from datasets import Dataset, DatasetDict
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the assessed sample spreadsheet; the first sheet column becomes the index.
df = pd.read_excel(
    io="../data/sample_llm_assessed.xlsx",
    index_col=0,
)

In [3]:
# Class balance of the target column as loaded.
df.is_relevant.value_counts()

is_relevant
1    458
0    142
Name: count, dtype: int64

In [4]:
# Cast the label column to integers, then re-check the class counts.
df = df.assign(is_relevant=df.is_relevant.astype(int))
df.is_relevant.value_counts()


is_relevant
1    458
0    142
Name: count, dtype: int64

In [5]:
# Shuffle the rows. The seed is fixed (same value split_df passes to
# train_test_split) so the row order -- and therefore the contents of the
# splits derived from it -- is the same on every Restart & Run All.
df = df.sample(frac=1, random_state=12)

In [6]:
# Preview the first five rows of the shuffled frame.
df.head(n=5)

Unnamed: 0,index_new,Authors,Author full names,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,...,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID,Potential_policy,is_relevant
7998,7998,Mittal S.; Mitra A.; Gupta A.; Zeigler B.P.,"Mittal, Saurabh (9337568300); Mitra, Amit (598...",9337568300; 59802251700; 8451638300; 35576167000,Strengthening OV-6a semantics with rule-based ...,2006,Proceedings of the 2006 IEEE International Con...,,,4018469.0,...,,English,Proc. IEEE Int. Conf. Info. Reuse Integr.,Conference paper,Final,,Scopus,2-s2.0-34547452087,True,0
458,458,Liu B.; Zhang J.; Lin F.; Yang C.; Peng M.; Yi...,"Liu, Ben (57937723800); Zhang, Jihai (57222182...",57937723800; 57222182299; 57221314556; 5859965...,SymAgent: A Neural-Symbolic Self-Learning Agen...,2025,WWW 2025 - Proceedings of the ACM Web Conference,,,,...,,English,WWW - Proc. ACM Web Conf.,Conference paper,Final,All Open Access; Green Open Access,Scopus,2-s2.0-105005140683,True,0
6994,6994,Chao C.-H.; Lin C.-P.; Peng T.-H.; Yu C.-H.; L...,"Chao, Chian-Hsueng (50261299200); Lin, Chia-Pe...",50261299200; 57203393161; 57203388073; 5720338...,The open data and voices of social communities...,2018,ACM International Conference Proceeding Series,,,3227726.0,...,,English,ACM Int. Conf. Proc. Ser.,Conference paper,Final,,Scopus,2-s2.0-85051509747,False,1
6783,6783,,,,49th International Conference on Computers and...,2019,Proceedings of International Conference on Com...,2019-October,,,...,,English,"Proc. Int. Conf. Comput. Ind. Eng., CIE",Conference review,Final,,Scopus,2-s2.0-85079482850,True,0
896,896,Dong W.; Chen S.; Yang Y.,"Dong, Wenjie (59600858100); Chen, Sirong (5960...",59600858100; 59602187100; 56493572100,ProTOD: Proactive Task-oriented Dialogue Syste...,2025,Proceedings - International Conference on Comp...,Part F206484-1,,,...,,English,"Proc. Main Conf. Int. Conf. Comput. Linguist.,...",Conference paper,Final,,Scopus,2-s2.0-85218490543,True,0


In [7]:
def split_df(X, y, test_size):
    """Split samples and labels into two stratified parts.

    Args:
        X: samples to split.
        y: labels aligned with ``X``; also used as the stratification key.
        test_size: forwarded unchanged to ``train_test_split``; controls the
            size of the second part.

    Returns:
        Tuple ``(X_first, X_second, y_first, y_second)``.
    """
    # random_state is pinned, so the same input in the same order always
    # produces the same partition.
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=12,
    )
    X_first, X_second, y_first, y_second = parts
    return X_first, X_second, y_first, y_second

def reduce_imbalanced_dataset(X, y, proportion=1.2, random_state=None):
    """Downsample the label-0 rows relative to the label-1 rows.

    Every sample labelled 1 is kept. From the samples labelled 0 at most
    ``int(n_label_1 * proportion)`` are drawn at random without replacement;
    when fewer label-0 samples exist than that, all of them are kept instead
    of raising. The combined rows are shuffled before being returned.

    Args:
        X: samples (e.g. texts), aligned with ``y``.
        y: labels; only rows whose label equals 1 or 0 are retained.
        proportion: upper bound on the ratio of label-0 to label-1 rows.
        random_state: optional seed forwarded to ``DataFrame.sample`` for both
            the downsampling and the final shuffle. ``None`` (the default)
            keeps the draw unseeded.

    Returns:
        Tuple ``(samples, labels)`` of two equally long lists.
    """
    # Explicit columns instead of smuggling the labels in through the index.
    dfx = pd.DataFrame({"y": list(y), "x": list(X)})
    df_1 = dfx[dfx.y == 1]
    df_0_all = dfx[dfx.y == 0]

    # Never ask for more rows than exist: sampling without replacement raises
    # ValueError when n exceeds the population.
    n_wanted = int(df_1.shape[0] * proportion)
    n_drawn = min(n_wanted, df_0_all.shape[0])
    df_0 = df_0_all.sample(n_drawn, random_state=random_state)

    df_tbr = pd.concat([df_1, df_0])
    df_tbr = df_tbr.sample(frac=1, random_state=random_state)

    print("[DEBUG] New number of values per label:")
    print(df_tbr.y.value_counts())

    return df_tbr.x.to_list(), df_tbr.y.to_list()

In [8]:
# First pass: stratified split with 20% of the rows held out.
abstracts = df["Abstract"].to_list()
relevance = df["is_relevant"].to_list()
X_train, X_test, y_train, y_test = split_df(abstracts, relevance, 0.2)

# Second pass: cut the held-out part in half -> test and validation.
X_test, X_val, y_test, y_val = split_df(X_test, y_test, 0.5)

In [9]:
def create_dataset(X_train, X_test, X_val, y_train, y_test, y_val):
    """
    Bundle the three splits into one DatasetDict for the baseline model.

    input:
        X_train, X_test, X_val: texts of the train / test / validation splits
        y_train, y_test, y_val: labels matching the texts above
    returns:
        DatasetDict with the keys "train", "test" and "validation"; each
        split is built with a "label" and a "text" column
    """
    # Keys are inserted in the order train, test, validation.
    splits = (
        ("train", y_train, X_train),
        ("test", y_test, X_test),
        ("validation", y_val, X_val),
    )
    return DatasetDict(
        {
            split_name: Dataset.from_dict({"label": labels, "text": texts})
            for split_name, labels, texts in splits
        }
    )

# assemble the DatasetDict from the six split lists
dataset = create_dataset(
    X_train=X_train,
    X_test=X_test,
    X_val=X_val,
    y_train=y_train,
    y_test=y_test,
    y_val=y_val,
)

In [10]:
# Persist the DatasetDict as a pickle file (opened in binary write mode).
with open("../data/for_classification.pickle", "wb") as pickle_file:
    pickle.dump(dataset, pickle_file)
