In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
import os

In [3]:
def save_to_txt(data, filepath):
    """Write every item of ``data`` to ``filepath``, one item per line.

    The target is opened in text mode with UTF-8 encoding and is truncated if
    it already exists. Each item is concatenated with a newline, so the items
    must be strings.

    Args:
        data: Iterable of strings; each one becomes a line in the file.
        filepath: Path of the file to create or overwrite.
    """
    with open(filepath, 'w', encoding='utf-8') as out:
        # Emit each entry followed by a newline terminator.
        out.writelines(entry + '\n' for entry in data)

In [2]:
file_path = 'aste_annotator1.txt'

# Read the whole input file; every line of the file is one candidate record.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Strip leading/trailing whitespace and newlines, then drop lines that are
# empty after stripping. Without the filter, a blank line (for example extra
# newlines at the end of the file) would become an empty-string row that is
# later shuffled into the splits and written out as an empty sample.
data = [text for text in (line.strip() for line in lines) if text]
df = pd.DataFrame(data, columns=['text'])

In [4]:
# 1. First, set aside 15% of the data as a fixed hold-out test set.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.15,
    shuffle=True,
    random_state=42,
)

# Sizes of each partition, used for the summary below.
n_total = len(df)
n_test = len(test_df)
n_train_val = len(train_val_df)

print(f"Total data: {n_total}")
print(f"Test set (tetap): {n_test} ({n_test/n_total*100:.2f}%)")
print(f"Train + Val set: {n_train_val} ({n_train_val/n_total*100:.2f}%)\n")

Total data: 3319
Test set (tetap): 498 (15.00%)
Train + Val set: 2821 (85.00%)



In [7]:
# 2. Run 5-fold cross-validation over train_val_df to obtain the training and
#    validation splits. The hold-out test set is written unchanged into every
#    fold directory.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The test split does not depend on the fold, so build its list only once.
test_texts = test_df['text'].tolist()

# enumerate(start=1) numbers the folds from 1 without a hand-maintained counter.
for fold, (train_index, val_index) in enumerate(kf.split(train_val_df), start=1):
    # kf.split yields positional indices, hence .iloc.
    train_df = train_val_df.iloc[train_index]
    val_df = train_val_df.iloc[val_index]

    # Each fold gets its own directory: data/govapp/fold1, fold2, ...
    save_directory = os.path.join('data', 'govapp', f'fold{fold}')
    os.makedirs(save_directory, exist_ok=True)

    # Output file paths for the three splits of this fold.
    train_filepath = os.path.join(save_directory, 'train_triplets.txt')
    dev_filepath = os.path.join(save_directory, 'dev_triplets.txt')
    test_filepath = os.path.join(save_directory, 'test_triplets.txt')

    # Write the 'text' column of each split, one record per line.
    save_to_txt(train_df['text'].tolist(), train_filepath)
    save_to_txt(val_df['text'].tolist(), dev_filepath)
    save_to_txt(test_texts, test_filepath)

    print(f"Fold {fold}:")
    print(f"  Training data: {len(train_df)} data")
    print(f"  Validation data: {len(val_df)} data")
    print(f"  Testing data (tetap): {len(test_df)} data\n")

Fold 1:
  Training data: 2256 data
  Validation data: 565 data
  Testing data (tetap): 498 data

Fold 2:
  Training data: 2257 data
  Validation data: 564 data
  Testing data (tetap): 498 data

Fold 3:
  Training data: 2257 data
  Validation data: 564 data
  Testing data (tetap): 498 data

Fold 4:
  Training data: 2257 data
  Validation data: 564 data
  Testing data (tetap): 498 data

Fold 5:
  Training data: 2257 data
  Validation data: 564 data
  Testing data (tetap): 498 data

