In [None]:
# Data source — Kaggle dataset page: https://www.kaggle.com/datasets/linhlpv/vietnamese-sentiment-analyst/data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and log
# directories used by this notebook live under that mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# All project artefacts sit under one folder on the mounted Drive.
_DRIVE_ROOT = Path("/content/drive/MyDrive/Semester 7")

# Raw input data (read from `raw/...` by the loading cell).
data_folder = _DRIVE_ROOT / "Data"

# Output locations for model artefacts and per-run result CSVs.
model_log_dir = _DRIVE_ROOT / "Log" / "model"
result_log_dir = _DRIVE_ROOT / "Log" / "result"

In [None]:
# --- Run configuration -------------------------------------------------------
# Which dataset branch the loading cell takes ("concat" merges all three sets).
DATASET_NAME = "concat"
# Passed to the tokeniser, preprocessing and training helpers.
MODEL_NAME = "phobert"

# Fractions handed to `split_data` as `firsttrain_frac` / `test_frac`.
TRAIN_FRAC = 0.5
TEST_FRAC = 0.1
# Used both as `n_splits` for `split_data` and as `num_gens` for `TrainArgs`.
NUM_GENS = 20

In [None]:
# Variables for plotting results (loss and metrics)
# One result sub-directory per model is expected under `result_log_dir`.
model_paths = ["phobert", "visobert", "vibert4news", "vnsbert"]

# Folder name encodes the run set-up: <dataset>_<train%><test%><generations>.
# `round` is used instead of `int` because products such as 0.29 * 100 evaluate
# to 28.999999999999996, which `int` would truncate to 28 and so point at the
# wrong folder.
which_folder = (
    f"{DATASET_NAME}_{round(TRAIN_FRAC * 100)}{round(TEST_FRAC * 100)}{NUM_GENS}"
)

# <result_log_dir>/<model>/<which_folder> for every model listed above.
result_paths = [result_log_dir / model_dir / which_folder for model_dir in model_paths]

# Metric columns to plot.
# NOTE(review): presumably these match the column names of the saved result CSVs — confirm.
cols = ["test_loss", "test_acc", "test_precision", "test_recall", "test_f1"]

### Set up

In [None]:
def _read_splits(dataset_name):
    """Load the raw train/val/test CSVs of ``dataset_name`` as one frame.

    The files ``raw/<dataset_name>_{train,val,test}.csv`` under ``data_folder``
    are passed to ``concat_data`` in that order.
    """
    return concat_data(
        [
            data_folder / f"raw/{dataset_name}_{split}.csv"
            for split in ("train", "val", "test")
        ]
    )


def _load_student_feedback():
    """Student feedback: all splits merged, with the unused ``topic`` column dropped."""
    return _read_splits("student_feedback").drop("topic", axis=1)


def _load_customer_feedback():
    """Customer feedback: all splits merged and renamed to ``sentence``/``sentiment``."""
    frame = (
        _read_splits("customer_feedback")
        .drop(["index", "date_time", "label"], axis=1)
        .rename(columns={"comment": "sentence", "n_star": "sentiment"})
    )
    # NOTE(review): `label_mapping` presumably converts star ratings into
    # sentiment classes — confirm against its definition.
    frame["sentiment"] = label_mapping(frame, stars_column="sentiment")
    return frame


def _load_vietnamese_sentiment():
    """Vietnamese sentiment: single CSV, renamed to ``sentence``/``sentiment``, NaN rows dropped."""
    frame = (
        pd.read_csv(data_folder / "raw/vietnamese_sentiment.csv")
        .drop(["label", "Unnamed: 3"], axis=1)
        .rename(columns={"comment": "sentence", "rate": "sentiment"})
        .dropna()
    )
    frame["sentiment"] = label_mapping(frame, stars_column="sentiment")
    return frame


# Insertion order matters: "concat" stacks the datasets in exactly this order
# (student, customer, vietnamese).
_DATASET_LOADERS = {
    "student_feedback": _load_student_feedback,
    "customer_feedback": _load_customer_feedback,
    "vietnamese_sentiment": _load_vietnamese_sentiment,
}

if DATASET_NAME == "concat":
    # Stack every dataset and renumber the rows from 0.
    df_concat = pd.concat(
        [load() for load in _DATASET_LOADERS.values()]
    ).reset_index(drop=True)
elif DATASET_NAME in _DATASET_LOADERS:
    df_concat = _DATASET_LOADERS[DATASET_NAME]()
else:
    # Fail here rather than leaving `df_concat` undefined (or stale from an
    # earlier run of this cell) when the name is misspelled.
    raise ValueError(
        f"Unknown DATASET_NAME {DATASET_NAME!r}; expected 'concat' or one of "
        f"{sorted(_DATASET_LOADERS)}"
    )

In [None]:
# Split the loaded frame into the initial training set, the held-out test set
# and NUM_GENS further folds (returned as `k_folds`).
train_df, test_df, k_folds = split_data(
    df=df_concat,
    n_splits=NUM_GENS,
    firsttrain_frac=TRAIN_FRAC,
    test_frac=TEST_FRAC,
)

In [None]:
# Use the GPU when the runtime exposes one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Build the tokeniser for the selected model via the project helper.
# The recorded output below shows tokenizer files (config.json, vocab.txt,
# bpe.codes, tokenizer.json) being downloaded on first use.
tokeniser = initialise_tokeniser(model_name=MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# --- Scalar hyper-parameters -------------------------------------------------
lr = 2e-5
batch_size = 32
epochs = 10
# NOTE(review): presumably the maximum token length per sentence — confirm in TrainArgs.
max_length = 32
num_gens = NUM_GENS

# --- Training components -----------------------------------------------------
loss_fn = torch.nn.CrossEntropyLoss()
early_stopper = EarlyStopper(patience=3, min_delta=0)

# Bundle everything into the argument object consumed by `train_model`.
args = TrainArgs(
    lr=lr,
    batch_size=batch_size,
    max_length=max_length,
    epochs=epochs,
    num_gens=num_gens,
    loss_fn=loss_fn,
    early_stopper=early_stopper,
    # NOTE(review): no optimiser is supplied here; presumably one is created
    # downstream from `lr` — confirm in `train_model`.
    optimiser=None,
)

### Train & Eval

In [None]:
def _preprocess_sentences(frame):
    """Apply `preprocess` to the "sentence" column using this run's model and dataset names."""
    return preprocess(
        df=frame, model_name=MODEL_NAME, dataset_name=DATASET_NAME, col="sentence"
    )


# NOTE: the names are rebound to the preprocessed frames, so re-running this
# cell feeds already-preprocessed data back into `preprocess`.
train_df = _preprocess_sentences(train_df)
test_df = _preprocess_sentences(test_df)
k_folds = list(map(_preprocess_sentences, k_folds))

In [None]:
# Run training/evaluation; returns the prediction frames, the metric history
# and the trained model.
preds, history, model = train_model(
    # model and runtime
    model_name=MODEL_NAME,
    tokeniser=tokeniser,
    device=device,
    args=args,
    # data
    train_df=train_df,
    test_df=test_df,
    k_folds=k_folds,
    # NOTE(review): the semantics of these two flags are defined in `train_model` — confirm there.
    use_predicted_labels=True,
    is_shuffle=True,
)

In [None]:
# Persist predictions and metric logs under <result_log_dir>/<MODEL_NAME>/.
model_result_dir = result_log_dir / MODEL_NAME
# `to_csv` does not create missing directories; make sure the target exists so
# a finished training run is not lost to an OSError at save time.
model_result_dir.mkdir(parents=True, exist_ok=True)

# Prediction frames are written only when there is more than one of them.
# NOTE(review): a single predictions frame is never saved — confirm this is intended.
if len(preds) > 1:
    for pred_no, pred_df in enumerate(preds, start=1):
        pred_df.to_csv(model_result_dir / f"{MODEL_NAME}_preds{pred_no}.csv")

if len(history) == 1:
    # A single log is stored with suffix _1.
    pd.DataFrame(history[0]).to_csv(model_result_dir / f"{MODEL_NAME}_1.csv")
    print(f"Saved results in {MODEL_NAME}/{MODEL_NAME}_1.csv")
else:
    # Multiple logs are numbered from _2 upwards (suffix _1 is reserved for the
    # single-log case above).
    for log_no, log in enumerate(history, start=2):
        pd.DataFrame(log).to_csv(model_result_dir / f"{MODEL_NAME}_{log_no}.csv")
        print(f"Saved results in {MODEL_NAME}/{MODEL_NAME}_{log_no}.csv")