## Import Package & Dataset

In [None]:
import tensorflow as tf

# Name of the GPU device TensorFlow reports (used below to decide which message to print).
device_name = tf.test.gpu_device_name()

# Report exactly one outcome: previously the "Found GPU" line was printed
# even when no GPU had been detected.
if "GPU" not in device_name:
    print("GPU device not found")
else:
    print('Found GPU at: {}'.format(device_name))

# Independent check through the physical-device listing.
print("GPU", "available (YESS!!!!)" if tf.config.list_physical_devices("GPU") else "not available :(")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets

In [None]:
import langid

In [None]:
!pip install evaluate

In [None]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import concatenate_datasets

In [None]:
import pandas as pd

# Load the preprocessed responses CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/praproses-responses-csv/praproses_responses.csv')
# pd.set_option('display.max_colwidth', None)
# Print each column's dtype, then show the frame as the cell output.
print(df.dtypes)
df

In [None]:
# df['translated'] = df['answer'].apply(translate_text)
# Cast the target column to str; it is later passed to the tokenizer as text.
df['LEVEL KOMPETENSI'] = df['LEVEL KOMPETENSI'].astype(str)

## TRAIN/TEST SPLIT

In [None]:
# yahoo_answers_qa = yahoo_answers_qa["train"].train_test_split(test_size=0.3)
from sklearn.model_selection import train_test_split

# Split the dataset into training data (70%) and test data (30%).
# random_state is fixed so the same split is produced on every run.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

df_train

In [None]:
from datasets import DatasetDict, Dataset

# Convert the two pandas splits into Hugging Face `Dataset` objects.
train_dataset, test_dataset = (Dataset.from_pandas(frame) for frame in (df_train, df_test))

# Bundle both splits into one DatasetDict, dropping the '__index_level_0__'
# column from each split along the way.
dataset = DatasetDict({
    split_name: split_data.remove_columns('__index_level_0__')
    for split_name, split_data in (("train", train_dataset), ("test", test_dataset))
})

# Show the resulting dataset structure
print(dataset)

## LOAD MODEL & TOKENIZER

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer and model loading cells.
model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

## TOKENIZE

In [None]:
from datasets import concatenate_datasets
from transformers import AutoTokenizer

# Measure token lengths over train + test together so both splits share the same limits.
# NOTE: lengths are measured on the raw column text only.
full_dataset = concatenate_datasets([dataset["train"], dataset["test"]])
raw_columns = ['RESPONSES', 'LEVEL KOMPETENSI']

# Longest tokenized source text (RESPONSES column).
tokenized_inputs = full_dataset.map(
    lambda batch: tokenizer(batch["RESPONSES"], truncation=True),
    batched=True,
    remove_columns=raw_columns,
)
max_source_length = max(len(token_ids) for token_ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized target text (LEVEL KOMPETENSI column).
tokenized_targets = full_dataset.map(
    lambda batch: tokenizer(batch["LEVEL KOMPETENSI"], truncation=True),
    batched=True,
    remove_columns=raw_columns,
)
max_target_length = max(len(token_ids) for token_ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

## PREPARE TRAINING DATA

In [None]:
# prefix = "tolong klasifikasikan respon tersebut dimana masukan tersebut terdapat kelompok kompetensi dan respon dengan pemisah simbol semicolons, tujuan output prediksi klasifikasi respon tersebut terdapat pada level kompetensi berapa berdasarkan dari pola dataset training : "
prefix = "terdapat input dengan format kelompok kompetensi dan respons yang dipisahkan dengan semicolon. klasifikasikan respons tersebut, berdasarkan kelompok kompetensi yang dimilikinya, responsnya masuk ke dalam level kompetensi berapa dalam rentang level 1 sampai 5 dan keluaran hanya berupa angka saja, responsnya adalah sebagai berikut : "

def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of examples into prefixed model inputs and labels.

    Args:
        sample: Batch mapping holding the "RESPONSES" and "LEVEL KOMPETENSI"
            columns as lists of strings.
        padding: Padding strategy forwarded to the tokenizer. With
            "max_length", pad tokens in the labels are replaced by -100 so
            they are ignored by the loss.

    Returns:
        The tokenizer output for the prefixed inputs, with the tokenized
        targets stored under "labels".
    """
    # add prefix to the input for t5
    inputs = [prefix + item for item in sample["RESPONSES"]]

    # max_source_length was measured on the raw responses, without the prefix.
    # Add the prefix's token count to the budget (capped at the tokenizer's own
    # limit) so prepending the prefix does not truncate the responses.
    prefix_length = len(tokenizer(prefix, add_special_tokens=False)["input_ids"])
    source_budget = min(max_source_length + prefix_length, tokenizer.model_max_length)

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=source_budget, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["LEVEL KOMPETENSI"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
## tokenize dataset with preprocess prefix
# Apply preprocess_function to every split in batches and drop the raw text
# columns, leaving only the tokenizer outputs and labels.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['RESPONSES', 'LEVEL KOMPETENSI'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

## Fine-tune and evaluate FLAN-T5

In [None]:
from transformers import AutoModelForSeq2SeqLM

# load model from the hub (same checkpoint id as the tokenizer: model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [None]:
import nltk
import evaluate

from nltk.tokenize import sent_tokenize

# Download the NLTK "punkt" resource (presumably required by sent_tokenize, which postprocess_text calls).
nltk.download("punkt")

# Metric: F1, computed in compute_metrics with average='macro'
metric = evaluate.load("f1")

# helper function to post-process decoded text
# (applied to decoded predictions and labels before the metric is computed)
def postprocess_text(preds, labels):
    """Strip each decoded string and put every sentence on its own line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A `(preds, labels)` tuple of lists with the cleaned strings.
    """
    def _one_sentence_per_line(text):
        # newline-separated sentences is the layout rougeLSum expects
        return "\n".join(sent_tokenize(text.strip()))

    return (
        [_one_sentence_per_line(pred) for pred in preds],
        [_one_sentence_per_line(label) for label in labels],
    )

# Returns the computed evaluation metrics: the macro-averaged F1 score (as a
# percentage) and the mean length of the generated predictions.
def compute_metrics(eval_preds):
    """Decode generated ids and labels and score them with macro F1.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays; the
            predictions may be wrapped in a tuple, in which case the first
            element is used.

    Returns:
        Dict with the metric values scaled to percentages (rounded to four
        decimals) plus "gen_len", the mean count of non-pad prediction tokens.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Replace -100 in predictions and labels as we can't decode them.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    def _to_class_id(text):
        # The F1 metric scores integer class ids. Generated text that is not a
        # plain integer is mapped to -1, which counts as a wrong prediction
        # instead of raising in the middle of evaluation.
        try:
            return int(text)
        except ValueError:
            return -1

    result = metric.compute(
        predictions=[_to_class_id(pred) for pred in decoded_preds],
        references=[_to_class_id(label) for label in decoded_labels],
        average='macro',
    )
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator: batches are built with the tokenizer, labels are padded with
# label_pad_token_id, and padding lengths are rounded to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

## TRAINING ARGUMENTS

In [None]:
# Define local output directory
local_output_dir = "/kaggle/working/model_id"  # Replace with the desired local directory path

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=local_output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=3e-4,

    num_train_epochs=2,
    # logging & evaluation strategies
    logging_dir=f"{local_output_dir}/logs",
    logging_strategy="epoch",
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=False,
    report_to="tensorboard",
    push_to_hub=False,  # Do not push to the Hugging Face Hub
)

# Create Trainer instance: trains on the "train" split and evaluates on the
# "test" split of tokenized_dataset.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
from sklearn.model_selection import KFold
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Assumes model_id, data_collator, tokenized_dataset and compute_metrics are already defined.

# Define local output directory
local_output_dir = "/kaggle/working/model_id"  # Replace with the desired local directory path

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=local_output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=2,
    # logging & evaluation strategies
    logging_dir=f"{local_output_dir}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",  # evaluate at the end of each epoch
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=False,
    report_to="tensorboard",
    push_to_hub=False,  # Do not push to Hugging Face Hub
)

# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(tokenized_dataset["train"])):
    print(f"Fold {fold+1}/5")

    # Fold-specific train / validation subsets of the training split
    fold_train = tokenized_dataset["train"].select(train_index)
    fold_eval = tokenized_dataset["train"].select(test_index)

    # Start every fold from the pretrained checkpoint. Reusing one model object
    # across folds would let later folds evaluate on examples the weights were
    # already trained on in earlier folds, inflating the scores.
    fold_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    # Fold-specific names keep the global `model` / `trainer` from earlier
    # cells untouched for the final training, saving and inference cells.
    fold_trainer = Seq2SeqTrainer(
        model=fold_model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=fold_train,
        eval_dataset=fold_eval,
        compute_metrics=compute_metrics,
    )

    # Train the model
    fold_trainer.train()

    # Evaluate the model on the held-out part of this fold
    results = fold_trainer.evaluate()

    # Print or store the evaluation results as needed
    print(results)

    # Drop the references so the fold's model can be freed before the next fold
    del fold_trainer, fold_model


In [None]:
from tqdm.auto import tqdm

In [None]:
from sklearn.model_selection import KFold
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

from sklearn.metrics import classification_report
from tqdm.auto import tqdm

# Assumes model_id, prefix, tokenizer, dataset, data_collator, tokenized_dataset
# and compute_metrics are already defined.

# Define local output directory
local_output_dir = "/kaggle/working/model_id"  # Replace with the desired local directory path

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=local_output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,
    learning_rate=3e-4,
    num_train_epochs=2,
    logging_dir=f"{local_output_dir}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=False,
    report_to="tensorboard",
    push_to_hub=False,
)

# Read the held-out test columns once instead of once per sample.
test_texts = dataset['test']['RESPONSES']
labels_list = list(dataset['test']['LEVEL KOMPETENSI'])
str_labels_list = [str(label) for label in labels_list]
samples_number = len(labels_list)

# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(tokenized_dataset["train"])):
    print(f"Fold {fold+1}/5")

    # Fold-specific train / validation subsets of the training split
    fold_train = tokenized_dataset["train"].select(train_index)
    fold_eval = tokenized_dataset["train"].select(test_index)

    # Start every fold from the pretrained checkpoint so no fold benefits from
    # weights that were already trained on its validation examples.
    fold_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    # Fold-specific names keep the global `model` / `trainer` from earlier
    # cells untouched for the final training, saving and inference cells.
    fold_trainer = Seq2SeqTrainer(
        model=fold_model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=fold_train,
        eval_dataset=fold_eval,
        compute_metrics=compute_metrics,
    )

    # Train the model
    fold_trainer.train()

    # Training leaves the module in train mode; switch to eval mode so dropout
    # is disabled while generating predictions.
    fold_model.eval()

    predictions_list = []
    for text in tqdm(test_texts, total=samples_number, desc=f"Fold {fold+1} inference"):
        # Use the same instruction prefix the model was fine-tuned with, and
        # place the tensors on whatever device the model lives on.
        inputs = tokenizer.encode_plus(prefix + text, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(fold_model.device)
        outputs = fold_model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions_list.append(prediction)

    # Per-class precision / recall / F1 of this fold's model on the held-out test split
    report = classification_report(str_labels_list, predictions_list, zero_division=0)
    print(report)

    # Drop the references so the fold's model can be freed before the next fold
    del fold_trainer, fold_model

#     # Evaluate the model
#     results = trainer.evaluate()

#     # Print or store the evaluation results as needed
#     print(results)

#     # Additional: Print or store the evaluation metrics
#     print(f"Fold {fold+1} Evaluation Metrics:")
#     for key, value in results.items():
#         print(f"{key}: {value}")

#     # Print a separator for better readability
#     print("=" * 50)


In [None]:
from tqdm.auto import tqdm

## START TRAIN

In [None]:
# Start training with whichever `trainer` was created most recently
trainer.train()

In [None]:
import numpy as np

# evaluate model on the trainer's eval_dataset; the returned metrics are shown as the cell output
trainer.evaluate()

## SAVE MODEL

In [None]:
# Save the trained model and tokenizer locally (both into the same directory)
local_model_dir = "/kaggle/working/ourmodel"
model.save_pretrained(local_model_dir)
tokenizer.save_pretrained(local_model_dir)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load BERT under its own names so the fine-tuned FLAN-T5 `model` / `tokenizer`
# that the later pickle and inference cells rely on are not overwritten.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.svm import SVC
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold

# Load the pre-trained classification tokenizer and model under their own names
# so the fine-tuned FLAN-T5 `model` / `tokenizer` that the later pickle and
# inference cells rely on are not overwritten.
# NOTE(review): confirm that the "flax-community/flan-t5-base" checkpoint id is correct.
clf_tokenizer = AutoTokenizer.from_pretrained("flax-community/flan-t5-base")
clf_model = AutoModelForSequenceClassification.from_pretrained("flax-community/flan-t5-base")


# # Assuming df_train is your training DataFrame with 'text' and 'label' columns
# texts = df_train['train'].tolist()
# labels = df_train['test'].tolist()

# # Tokenize the data
# encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
# labels = torch.tensor(labels)

# # Create DataLoader
# dataset = TensorDataset(encodings.input_ids, encodings.attention_mask, labels)
# loader = DataLoader(dataset, batch_size=8, shuffle=True)

# # Define the SVM classifier
# clf = SVC(kernel='linear', C=1, random_state=0)

# # Perform cross-validation
# scoring = {'precision_macro', 'recall_macro'}
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scores = cross_val_score(clf, X=encodings.input_ids, y=labels, cv=cv, scoring=make_scorer(recall_score, average='macro'))
# print("Cross-validated recall scores:", scores)


In [None]:
import pickle
# Save the model and tokenizer using pickle
# NOTE: pickle files must only ever be loaded from trusted sources.
with open("t5_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("t5_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

## Run Inference and Classification Report

In [None]:
from tqdm.auto import tqdm

# Make sure dropout is disabled while generating predictions.
model.eval()

# Read the test columns once instead of once per sample.
test_texts = dataset['test']['RESPONSES']
labels_list = list(dataset['test']['LEVEL KOMPETENSI'])
samples_number = len(labels_list)

predictions_list = []
for text in tqdm(test_texts, total=samples_number):
    # Use the same instruction prefix the model was fine-tuned with, and place
    # the tensors on whatever device the model lives on.
    inputs = tokenizer.encode_plus(prefix + text, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(model.device)
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions_list.append(prediction)

In [None]:
# String form of every reference label, so it is comparable with the decoded predictions.
str_labels_list = [str(label) for label in labels_list]

In [None]:
from sklearn.metrics import classification_report

# Compare the string labels with the generated predictions;
# zero_division=0 is passed through to classification_report.
report = classification_report(str_labels_list, predictions_list, zero_division=0)
print(report)

## OPTIONAL: RUN INFERENCE ON CPU

In [None]:
import torch

# move model & tensor to the CPU
loaded_model = model.to('cpu')
inputs = inputs.to('cpu')

In [None]:
# Define the text you want to generate predictions
input_text = "Pada saat saya menjabat sebagai Camat Cipaku saya mengadakan kegiatan pengajian syukuran dengan mengundang tetangga dan sanak keluarga dirumah ketika pengajian akan dimulai saya mendapatkan informasi terjadinya bencana alam angin puting beliung di Desa Bangbayang Kecamatan Cipaku sehingga saya langsung meminta maaf kepada tamu undangan karena saya tidak dapat mengikuti kegiatan tersebut hingga selesai karena saya selaku camat harus berada pada lokasi bencana untuk segera mengambil keputusan dan koordinasi terkait evakuasi warga yang terkena dampak bencana Jadi saya akan menghadapi dengan tenang dan saya akan memprioritaskan penyelesaian konflik kepentingan pekerjaan dengan mengidentifikasi permasalahan mengdiagnosis permasalahan yang terjadi memberikan solusi pelaksaan solusi dan mengevaluasi solusi tersebut Setelah menyelesaikan koflik pekerjaan selanjutnya adalah menyelesaikan konflik kepentingan pribadi"

# Tokenize and generate predictions.
# The input gets the same instruction prefix the model was fine-tuned with, and
# the tensors are placed on the device the model currently lives on.
inputs = tokenizer.encode_plus(prefix + input_text, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(model.device)
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the prediction
print("Input Text:", input_text, "\n")
print("Prediction:", prediction)

## OPTIONAL: RUN INFERENCE ON GPU (NVIDIA CUDA)

In [None]:
import torch

# move model & tensor to GPU cuda (`inputs` must already exist from an earlier inference cell)
loaded_model = model.to('cuda')
inputs = inputs.to('cuda')

In [None]:
# Define the text you want to generate predictions
input_text = "Saya belum pernah mengalami kepentingan konflik yang bersifat ekstrim terkait keluarga, misalnya orang tua sakit keras dan memerlukan dijenguk sesegera mungkin ketika saya sedang memiliki pekerjaan yang harus sesegera mungkin diselesaikan. Padahal sebagai seorang ASN telah terikat dalam sumpah yang harus mendahulukan kepentingan umum dibandingkan kepentingan pribadi atau golongan. Bilamana hal itu terjadi maka upaya yang akan dilakukan adalah meminta izin kepada atasan langsung dan meminta bantuan rekan kerja untuk menangani sementara pekerjaan tersebut di atas dengan terlebih dahulu memberikan penjelasan perihal pekerjaan dimaksud. Namun demikian di sisi lain sebagai bagian dari penyelenggara negara, konflik kepentingan pribadi dengan pekerjaan bila tidak dikelola dengan baik, salah satunya akan mendorong ke arah prilaku koruptif dan prilaku menyimpang lainnya dengan memanfaatkan fasilitas jabatan, kelemahan sistem ataupun keinginan untuk memperkaya diri sendiri atau orang lain. Oleh karena itu upaya pencegahan yang paling efektif adalah dengan tetap berpegang teguh pada sumpah ketika dilantik sebagai seorang PNS pegawai dan ketika dilantik sebagai seorang pemegang jabatan. Bahwa sumpah tersebut disamping disaksikan oleh manusia, juga disaksikan oleh Allah SWT  akan dihisab  dikemudian hari kelak , serta memperhatikan perundang  undangan dan kebijakan yang berlaku."
# Tokenize and generate predictions.
# The input gets the same instruction prefix the model was fine-tuned with, and
# the tensors are placed on the device the model currently lives on.
inputs = tokenizer.encode_plus(prefix + input_text, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(model.device)
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the prediction
print("Input Text:", input_text, "\n")
print("Prediction:", prediction)

In [None]:
# Show the test-split row at positional index 134.
df_test.iloc[134]

In [None]:
# Reload the pickled model and tokenizer written by the earlier pickle cell.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open("t5_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

with open("t5_tokenizer.pkl", "rb") as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)
    
# Define the text you want to generate predictions
input_text = "Saya belum pernah mengalami kepentingan konflik yang bersifat ekstrim terkait keluarga, misalnya orang tua sakit keras dan memerlukan dijenguk sesegera mungkin ketika saya sedang memiliki pekerjaan yang harus sesegera mungkin diselesaikan. Padahal sebagai seorang ASN telah terikat dalam sumpah yang harus mendahulukan kepentingan umum dibandingkan kepentingan pribadi atau golongan. Bilamana hal itu terjadi maka upaya yang akan dilakukan adalah meminta izin kepada atasan langsung dan meminta bantuan rekan kerja untuk menangani sementara pekerjaan tersebut di atas dengan terlebih dahulu memberikan penjelasan perihal pekerjaan dimaksud. Namun demikian di sisi lain sebagai bagian dari penyelenggara negara, konflik kepentingan pribadi dengan pekerjaan bila tidak dikelola dengan baik, salah satunya akan mendorong ke arah prilaku koruptif dan prilaku menyimpang lainnya dengan memanfaatkan fasilitas jabatan, kelemahan sistem ataupun keinginan untuk memperkaya diri sendiri atau orang lain. Oleh karena itu upaya pencegahan yang paling efektif adalah dengan tetap berpegang teguh pada sumpah ketika dilantik sebagai seorang PNS pegawai dan ketika dilantik sebagai seorang pemegang jabatan. Bahwa sumpah tersebut disamping disaksikan oleh manusia, juga disaksikan oleh Allah SWT  akan dihisab  dikemudian hari kelak , serta memperhatikan perundang  undangan dan kebijakan yang berlaku."
# Tokenize and generate predictions with the reloaded model/tokenizer.
# The input gets the same instruction prefix the model was fine-tuned with, and
# the tensors are placed on the device the reloaded model lives on.
inputs = loaded_tokenizer.encode_plus(prefix + input_text, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(loaded_model.device)
outputs = loaded_model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
prediction = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the prediction
print("Input Text:", input_text, "\n")
print("Prediction:", prediction)