In [13]:
import os 
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from datasets import Dataset, DatasetDict
import pickle

In [14]:
# Load the LLM-assessed sample; the first spreadsheet column is used as the index.
df = pd.read_excel(
    "../data/sample_llm_assessed.xlsx",
    index_col=0,
)

In [15]:
# Class balance of the relevance label.
label_counts = df["is_relevant"].value_counts()
label_counts

is_relevant
1    458
0    142
Name: count, dtype: int64

In [16]:
# Make sure the label column is integer-typed, then re-check the class balance.
df = df.astype({"is_relevant": int})
df["is_relevant"].value_counts()


is_relevant
1    458
0    142
Name: count, dtype: int64

In [17]:
# data shuffle
# Seeded: the later stratified split is seeded too, but its result depends on
# the input row order, so an unseeded shuffle here would make the
# train/test/validation splits differ on every run.
df = df.sample(frac=1, random_state=12)

In [18]:
# Preview the first five rows of the shuffled frame.
df.head(n=5)

Unnamed: 0,index_new,Authors,Author full names,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,...,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID,Potential_policy,is_relevant
5045,5045,Fagbola T.M.; Abayomi A.; Mutanga M.B.; Jugoo V.,"Fagbola, Temitayo Matthew (56801897400); Abayo...",56801897400; 57022238600; 37662103200; 5726821...,Lexicon-Based Sentiment Analysis and Emotion C...,2022,Lecture Notes in Networks and Systems,417 LNNS,,,...,,English,Lect. Notes Networks Syst.,Conference paper,Final,,Scopus,2-s2.0-85126183602,True,1
4445,4445,Kim J.; Choi H.; Mo J.,"Kim, Jiwan (58191996500); Choi, Hyunkyoo (7404...",58191996500; 7404339166; 9632348600,A Comparative Analysis of Research Trends in t...,2023,Journal of Information Science Theory and Prac...,11,1.0,,...,,English,J. Inf. Sci. Theory Pract.,Article,Final,,Scopus,2-s2.0-85153280162,False,1
1981,1981,Choi Y.-C.; Kee Y.,"Choi, Young-Chool (56911364500); Kee, Younghwa...",56911364500; 58996427200,The nature of Saemaul Undong as a rural develo...,2024,Iberoamerican Journal of Science Measurement a...,4,1.0,,...,,English,Iberoam. J. Sci. Meas. Commun.,Article,Final,All Open Access; Bronze Open Access; Green Ope...,Scopus,2-s2.0-85186392409,True,1
981,981,Wang C.; Li Z.,"Wang, Chuqi (58104629600); Li, Zhiyu (58705841...",58104629600; 58705841100,Identifying primary frames of official public ...,2025,Media Asia,52,1.0,,...,,English,Media Asia,Article,Final,,Scopus,2-s2.0-85185146555,True,1
4510,4510,Park Y.; Shin Y.-W.,"Park, Yumin (57224911137); Shin, Yong-Wook (56...",57224911137; 56296480600,Trend Analysis of Balcony Vegetable Gardens in...,2022,"Journal of People, Plants, and Environment",25,5.0,,...,,English,J. People. Plant. Environ.,Article,Final,All Open Access; Gold Open Access,Scopus,2-s2.0-85141366080,False,1


In [19]:
def split_df(X, y, test_size, random_state=12):
    """
    Split predictors and labels into two parts, stratified on the labels.
    input:
        X: predictors
        y: labels (also used for stratification)
        test_size: size of the second part, forwarded to train_test_split
        random_state: seed forwarded to train_test_split
                      (defaults to 12, the value that used to be hard-coded)
    returns:
        X_split_1, X_split_2, y_split_1, y_split_2
    """
    X_split_1, X_split_2, y_split_1, y_split_2 = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    print(f"[DEBUG]: original: {len(X)} split into {len(X_split_1)} and {len(X_split_2)} ")

    return X_split_1, X_split_2, y_split_1, y_split_2

def reduce_imbalanced_dataset(X, y, proportion=1.2, random_state=None):
    """
    Return a more balanced dataset for training by downsampling the larger label.
    Only rows labelled 0 or 1 are kept.
    input:
        X: predictors
        y: labels (0 / 1)
        proportion: percentage of the downsized label you want to keep (e.g. if 1.2, the downsized label will be 1.2 times the other label)
        random_state: optional seed for the downsampling and the final shuffle
                      (None keeps the result random)
    returns:
        X: X, after downsizing
        y: y, after downsizing
    raises:
        ValueError: if label 0 or label 1 is absent from y
    """

    # build the frame from explicit columns instead of passing y as the index
    dfx = pd.DataFrame({"y": list(y), "x": list(X)})

    df_1 = dfx[dfx.y == 1]
    df_0 = dfx[dfx.y == 0]
    if df_1.empty or df_0.empty:
        raise ValueError("both labels 0 and 1 must be present to balance the dataset")

    # the smaller label is kept whole and the larger one is downsized;
    # on a tie label 1 is the one kept whole
    if len(df_1) <= len(df_0):
        df_small, df_large = df_1, df_0
    else:
        df_small, df_large = df_0, df_1

    # never ask for more rows than exist: sampling without replacement
    # raises when the request exceeds the population
    n_keep = min(int(len(df_small) * proportion), len(df_large))
    df_large = df_large.sample(n_keep, random_state=random_state)

    # create new df and shuffle
    df_tbr = pd.concat([df_small, df_large]).sample(frac=1, random_state=random_state)

    print("[DEBUG] New number of values per label:")
    print(df_tbr.y.value_counts())

    return df_tbr.x.to_list(), df_tbr.y.to_list()

In [20]:
# First cut: 80% train, 20% hold-out (stratified inside split_df).
abstracts = df["Abstract"].to_list()
labels = df["is_relevant"].to_list()
X_train, X_holdout, y_train, y_holdout = split_df(abstracts, labels, 0.2)
# Second cut: the hold-out is halved into test and validation.
X_test, X_val, y_test, y_val = split_df(X_holdout, y_holdout, 0.5)

[DEBUG]: original: 600 split into 480 and 120 
[DEBUG]: original: 120 split into 60 and 60 


In [21]:
def create_dataset(X_train, X_test, X_val, y_train, y_test, y_val):
    """
    Bundle the three splits into one DatasetDict for the baseline model.
    input:
        X_train, X_test, X_val: predictors of each split, stored under "text"
        y_train, y_test, y_val: labels of each split, stored under "label"
    returns:
        DatasetDict with the keys "train", "test" and "validation"
    """
    # Dataset here is the one from `datasets`: it is imported after, and so
    # shadows, the torch.utils.data name.
    splits = (
        ("train", y_train, X_train),
        ("test", y_test, X_test),
        ("validation", y_val, X_val),
    )
    return DatasetDict(
        {
            split_name: Dataset.from_dict({"label": split_labels, "text": split_texts})
            for split_name, split_labels, split_texts in splits
        }
    )

# Assemble the DatasetDict from the three splits.
dataset = create_dataset(
    X_train=X_train,
    X_test=X_test,
    X_val=X_val,
    y_train=y_train,
    y_test=y_test,
    y_val=y_val,
)

In [23]:
# Serialise the dataset to disk so it can be taken to Colab.
with open("../data/for_classification.pickle", "wb") as pickle_file:
    pickle.dump(dataset, pickle_file)
