## EDA (Step 2: remove duplicates and run EDA)

In [None]:
import ast
from collections import Counter

# Dataset to profile (swap in one of the commented alternatives when needed)
filename = '../Data Labeling/TJ - Overall Dataset/v1.9 (self_training batch 4)/ori.txt'
# filename = '../Data Labeling/TJ v1.8 (TwitterIO) Negative - v1.0/filtered_output.txt'
# filename = '../Data Labeling/TJ - Overall Dataset/combined_dataset_max2tok.txt'
# filename = '../Data Labeling/TJ - self_training/batch 5/filtered_output.txt'

# Running totals for the whole file
row_count = 0                     # non-blank rows seen so far
triplet_count = 0                 # triplets summed over all parsed rows
sentiment_counter = Counter()     # sentiment label -> number of triplets
triplet_count_dist = Counter()    # triplets in a row -> number of such rows
aspect_word_counts = Counter()    # aspect length -> number of triplets
opinion_word_counts = Counter()   # opinion length -> number of triplets

with open(filename, 'r', encoding='utf-8') as file:
    for raw in file:
        line = raw.strip()
        if not line:
            continue

        row_count += 1

        # The annotation literal is whatever follows the last delimiter
        pieces = line.rsplit("#### #### ####", 1)
        if len(pieces) != 2:
            print(f"Line {row_count} does not have the expected format.")
            continue
        annotation = pieces[1]

        try:
            annotations = ast.literal_eval(annotation.strip())
        except Exception as e:
            print(f"Error evaluating annotations on line {row_count}: {e}")
            continue

        # Per-row triplet tally
        n = len(annotations)
        triplet_count += n
        triplet_count_dist[n] += 1

        for triplet in annotations:
            # Only well-formed (aspect, opinion, sentiment) triplets are profiled
            if len(triplet) != 3:
                continue
            aspect, opinion, sentiment = triplet
            sentiment_counter[sentiment] += 1

            # A string is measured in whitespace-separated words, a list in items
            if isinstance(aspect, (str, list)):
                aspect_words = aspect.split() if isinstance(aspect, str) else aspect
                aspect_word_counts[len(aspect_words)] += 1
            else:
                print(f"Warning: Aspect on line {row_count} is not str or list: {type(aspect)}")

            if isinstance(opinion, (str, list)):
                opinion_words = opinion.split() if isinstance(opinion, str) else opinion
                opinion_word_counts[len(opinion_words)] += 1
            else:
                print(f"Warning: Opinion on line {row_count} is not str or list: {type(opinion)}")

# ---- Summary ----
print("Total number of rows:", row_count)
print("Total number of triplets:", triplet_count)

print("\nAspect word count distribution:")
for count in sorted(aspect_word_counts):
    print(f"  {count} kata: {aspect_word_counts[count]} label")

print("\nOpinion word count distribution:")
for count in sorted(opinion_word_counts):
    print(f"  {count} kata: {opinion_word_counts[count]} label")

# Sentiments are listed in the order they were first encountered
print("\nSentiment distribution:")
for sentiment, count in sentiment_counter.items():
    print(f"  {sentiment}: {count}")

print("\nTriplet distribution:")
for n in sorted(triplet_count_dist):
    label = "labels" if n != 1 else "label"
    print(f"  {n} {label}: {triplet_count_dist[n]} rows")

Total number of rows: 2522
Total number of triplets: 3676

Aspect word count distribution:
  1 kata: 2191 label
  2 kata: 1227 label
  3 kata: 190 label
  4 kata: 49 label
  5 kata: 11 label
  6 kata: 4 label
  7 kata: 3 label
  8 kata: 1 label

Opinion word count distribution:
  1 kata: 1291 label
  2 kata: 1763 label
  3 kata: 395 label
  4 kata: 147 label
  5 kata: 51 label
  6 kata: 19 label
  7 kata: 4 label
  8 kata: 5 label
  9 kata: 1 label

Sentiment distribution:
  NEG: 1583
  POS: 1058
  NEU: 1035

Triplet distribution:
  1 label: 1686 rows
  2 labels: 615 rows
  3 labels: 151 rows
  4 labels: 52 rows
  5 labels: 12 rows
  6 labels: 4 rows
  7 labels: 1 rows
  8 labels: 1 rows


In [4]:
# remove duplicate tweets

def hapus_tweet_duplikat(input_file):
    """
    Remove duplicate tweets from an annotated file, keeping the first occurrence.

    Two lines are duplicates when the text before their first '####' marker
    (surrounding whitespace ignored) is identical, whatever follows it.
    Blank lines are ignored: they are neither kept nor counted as duplicates.

    Args:
        input_file (str): Path to the input file.

    Returns:
        tuple: ``(unique_lines, duplicate_count)`` - the stripped lines that
        were kept, in file order, and the number of lines dropped as duplicates.
    """
    tweet_unik = []
    seen_tweets = set()
    jumlah_duplikat = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line has no tweet text; without this guard the first one
            # would be stored as a "unique tweet" and the rest counted as duplicates.
            if not line:
                continue

            # Keep only the tweet part, i.e. what precedes the first '####'
            tweet = line.split('####')[0].strip()
            if tweet in seen_tweets:
                jumlah_duplikat += 1
                continue

            seen_tweets.add(tweet)
            tweet_unik.append(line)
    return tweet_unik, jumlah_duplikat

def simpan_tweet_unik(output_file, list_tweet_unik):
    """
    Write the unique tweets to a file, one entry per line.

    The file is created or overwritten, and a newline is appended to every entry.

    Args:
        output_file (str): Path to the output file.
        list_tweet_unik (list): The unique tweet lines to write.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(tweet + '\n' for tweet in list_tweet_unik)

if __name__ == "__main__":
    # Source corpus and the deduplicated copy written beside it
    input_filename = "../Data Labeling/TJ - Overall Dataset/v1.10 (self_training batch 5)/ori.txt"
    output_filename = "../Data Labeling/TJ - Overall Dataset/v1.10 (self_training batch 5)/ori_removed duplicates.txt"

    # Deduplicate first, then persist the surviving lines
    hasil = hapus_tweet_duplikat(input_filename)
    tweet_unik, jumlah_duplikat = hasil
    simpan_tweet_unik(output_filename, tweet_unik)

    # Report where the result went and how many lines were dropped
    print(f"Tweet unik telah disimpan di file '{output_filename}'")
    print(f"Jumlah tweet duplikat yang ditemukan: {jumlah_duplikat}")

Tweet unik telah disimpan di file '../Data Labeling/TJ - Overall Dataset/v1.10 (self_training batch 5)/ori_removed duplicates.txt'
Jumlah tweet duplikat yang ditemukan: 0


In [23]:
# split dataset ke 4 skenario

# -*- coding: utf-8 -*-
"""
Pisahkan korpus Span-ASTE menjadi 4 skenario:
S1 = 1 label, 1-kata ; S2 = multi label, 1-kata ;
S3 = 1 label, multi-kata ; S4 = multi label, multi-kata (korpus penuh).
"""
import ast
from pathlib import Path

# Corpus file that gets split into the scenario buckets
INFILE  = Path("../Data Labeling/TJ v1.5 - v1.0.3/filtered_output_removed duplicates.txt")

# ------- helper ----------------------------------------------------------------
def parse_line(raw: str):
    """Split one corpus line into ``(text, triplets)``.

    The text is everything before the last ``#### #### ####`` delimiter, stripped.
    The triplets are the Python literal after it, where each triplet is
    ``([aspect_idx], [opinion_idx], sentiment)``.
    Raises ValueError when the delimiter is missing.
    """
    head, tail = raw.rsplit("#### #### ####", 1)
    return head.strip(), ast.literal_eval(tail.strip())

def is_single_word(span):
    """Return True when the index list ``span`` holds exactly one entry."""
    return len(span) == 1


def triplet_all_single_word(trs):
    """Return True when every aspect and opinion span in ``trs`` is one word long."""
    for t in trs:
        if not is_single_word(t[0]):
            return False
        if not is_single_word(t[1]):
            return False
    return True


def tweet_multi_word(trs):
    """Return True when at least one aspect or opinion span has more than one word."""
    for t in trs:
        if len(t[0]) > 1 or len(t[1]) > 1:
            return True
    return False

# ------- load & bucket ----------------------------------------------------------
# Raw lines per scenario; rows that match none of these belong to S4:
#   S1 = 1 label, single-word   S2 = multi label, single-word   S3 = 1 label, multi-word
buckets = {tag: [] for tag in ("S1", "S2", "S3")}

with INFILE.open(encoding="utf-8") as f:
    for raw in f:
        line = raw.strip()
        if not line:
            continue

        text, trips = parse_line(line)
        n_label = len(trips)

        tag = None
        if n_label == 1:
            if triplet_all_single_word(trips):
                tag = "S1"
            elif tweet_multi_word(trips):
                tag = "S3"
        elif n_label > 1 and triplet_all_single_word(trips):
            tag = "S2"

        # No tag: the row falls into scenario 4 and is not stored here
        if tag is not None:
            buckets[tag].append(line)
# ------- write out 3 skenario pertama ------------------------------------------
# Scenario files go into a "scenario_splits" folder beside the input corpus
OUT_DIR = INFILE.parent / "scenario_splits"
OUT_DIR.mkdir(exist_ok=True)

for tag, rows in buckets.items():
    out_path = OUT_DIR / f"{INFILE.stem}_{tag}.txt"
    # Terminate every row, including the last, with a newline. A file that ends
    # mid-line fuses its final record with the next file's first record when
    # several files are concatenated later.
    out_path.write_text("".join(f"{row}\n" for row in rows), encoding="utf-8")
    print(f"{tag}: {len(rows):>4} baris  →  {out_path}")

print("✅  Selesai.  S4 = sisa baris di dataset asli.")

S1:   96 baris  →  ..\Data Labeling\TJ v1.5 - v1.0.3\scenario_splits\filtered_output_removed duplicates_S1.txt
S2:   19 baris  →  ..\Data Labeling\TJ v1.5 - v1.0.3\scenario_splits\filtered_output_removed duplicates_S2.txt
S3:  520 baris  →  ..\Data Labeling\TJ v1.5 - v1.0.3\scenario_splits\filtered_output_removed duplicates_S3.txt
✅  Selesai.  S4 = sisa baris di dataset asli.


In [24]:
# version 1.2

import ast
from collections import Counter

# Files to merge, appended to the output in this order
filenames = [
    '../Data Labeling/TJ v1.5 - v1.1/filtered_output.txt',
    '../Data Labeling/TJ v1.4 - v1.6/filtered_output.txt'
]

# Destination of the merged corpus
output_filename = '../Data Labeling/TJ v1.5 - v1.0/merged_filtered_output.txt'

# Statistics accumulated across all input files
row_count = 0
triplet_count = 0
sentiment_counter = Counter()     # sentiment label -> number of triplets
triplet_count_dist = Counter()    # triplets in a row -> number of such rows

# Every non-blank line, kept for the output even if it fails to parse below
merged_lines = []

for filename in filenames:
    with open(filename, 'r', encoding='utf-8') as file:
        for raw in file:
            line = raw.strip()
            if not line:
                continue

            row_count += 1
            merged_lines.append(line)

            # The annotation literal is whatever follows the last delimiter
            pieces = line.rsplit("#### #### ####", 1)
            if len(pieces) != 2:
                print(f"Line {row_count} in {filename} does not have the expected format.")
                continue
            annotation = pieces[1]

            try:
                annotations = ast.literal_eval(annotation.strip())
            except Exception as e:
                print(f"Error evaluating annotations on line {row_count} in {filename}: {e}")
                continue

            # Per-row triplet tally
            n = len(annotations)
            triplet_count += n
            triplet_count_dist[n] += 1

            # Sentiment is the third element of each well-formed triplet
            sentiment_counter.update(
                triplet[2] for triplet in annotations if len(triplet) == 3
            )

# --- Save merged file ---
with open(output_filename, 'w', encoding='utf-8') as outfile:
    outfile.writelines(merged + '\n' for merged in merged_lines)

print(f"\n✅ Merged file saved to: {output_filename}")

# --- Print summary ---
print("\n--- Summary ---")
print("Total number of rows:", row_count)
print("Total number of triplets:", triplet_count)
print("\nSentiment distribution:")
for sentiment, count in sentiment_counter.items():
    print(f"  {sentiment}: {count}")

print("\nTriplet distribution:")
for n in sorted(triplet_count_dist):
    label = "labels" if n != 1 else "label"
    print(f"  {n} {label}: {triplet_count_dist[n]} rows")



✅ Merged file saved to: ../Data Labeling/TJ v1.5 - v1.0/merged_filtered_output.txt

--- Summary ---
Total number of rows: 771
Total number of triplets: 1059

Sentiment distribution:
  NEU: 353
  POS: 300
  NEG: 406

Triplet distribution:
  1 label: 547 rows
  2 labels: 182 rows
  3 labels: 29 rows
  4 labels: 8 rows
  5 labels: 3 rows
  6 labels: 1 rows
  8 labels: 1 rows


## Splitting the dataset (Step 5)

In [None]:
# version 1.0

import ast
from collections import Counter
from sklearn.model_selection import train_test_split

def read_and_assign_labels(filename):
    """
    Load annotated lines and pair each one with its dominant sentiment label.

    Every non-blank line is expected to be a tweet text and an annotation list
    separated by "#### #### ####". The dominant label is the most frequent
    sentiment (third element) among the line's three-element triplets, or None
    when the line has no such triplet. Lines that lack the delimiter or whose
    annotation cannot be parsed are reported and skipped.

    Returns:
        tuple: ``(data, dominant_labels)`` - the stripped lines that were kept
        and, index-aligned with them, their dominant labels.
    """
    data = []
    dominant_labels = []

    with open(filename, 'r', encoding='utf-8') as f:
        for idx, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue

            # Two pieces expected: tweet text and annotation list
            pieces = line.rsplit("#### #### ####", 1)
            if len(pieces) != 2:
                print(f"Skipping line {idx} due to unexpected format.")
                continue

            try:
                # literal_eval only accepts Python literals, never arbitrary code
                annotations = ast.literal_eval(pieces[1].strip())
            except Exception as e:
                print(f"Error parsing annotations on line {idx}: {e}")
                continue

            # Tally the sentiment of each well-formed triplet
            tally = Counter(
                triplet[2] for triplet in annotations if len(triplet) == 3
            )
            dominant = tally.most_common(1)[0][0] if tally else None

            data.append(line)
            dominant_labels.append(dominant)
    return data, dominant_labels

# Load every row together with the dominant label used for stratification.
data, dominant_labels = read_and_assign_labels("../Data Labeling/TJ - Overall Dataset/v1.8 (max2tok)/merged_max2tok.txt")
print("Total rows loaded:", len(data))

# Stage 1: hold out 15% of all rows as the test set, stratified by dominant
# label so the label proportions are roughly maintained.
train_dev_data, test_data, train_dev_labels, test_labels = train_test_split(
    data,
    dominant_labels,
    test_size=0.15,
    random_state=42,
    stratify=dominant_labels,
)

# Stage 2: carve the dev set out of the remainder. Dev should be roughly 15%
# of the entire dataset, i.e. about 15% / 85% ≈ 17.65% of the train_dev set.
train_data, dev_data, train_labels, dev_labels = train_test_split(
    train_dev_data,
    train_dev_labels,
    test_size=0.1765,
    random_state=42,
    stratify=train_dev_labels,
)

for caption, rows in (
    ("Training set size:", train_data),
    ("Dev set size:", dev_data),
    ("Test set size:", test_data),
):
    print(caption, len(rows))

def print_label_distribution(labels, split_name):
    """Print how many entries of ``labels`` carry each label, ignoring None.

    Labels are listed in the order they first appear.
    """
    counter = Counter()
    for lab in labels:
        if lab is None:
            continue
        counter[lab] += 1

    print(f"{split_name} label distribution:")
    for lab, count in counter.items():
        print(f"  {lab}: {count}")

# Dominant-label breakdown per split
for split_labels, split_name in (
    (train_labels, "Train"),
    (dev_labels, "Dev"),
    (test_labels, "Test"),
):
    print_label_distribution(split_labels, split_name)

# Write each split to its own text file, one annotated row per line.
for split_path, split_rows in (
    ("../Data Labeling/TJ - Overall Dataset/v1.8 (max2tok)/train.txt", train_data),
    ("../Data Labeling/TJ - Overall Dataset/v1.8 (max2tok)/dev.txt", dev_data),
    ("../Data Labeling/TJ - Overall Dataset/v1.8 (max2tok)/test.txt", test_data),
):
    with open(split_path, "w", encoding='utf-8') as split_file:
        split_file.writelines(row + "\n" for row in split_rows)


Total rows loaded: 1213
Training set size: 849
Dev set size: 182
Test set size: 182
Train label distribution:
  NEG: 273
  NEU: 310
  POS: 266
Dev label distribution:
  NEU: 66
  NEG: 59
  POS: 57
Test label distribution:
  NEG: 59
  NEU: 66
  POS: 57


In [3]:
# version 1.1 - Direct 70/15/15 split

import ast
from collections import Counter
from sklearn.model_selection import train_test_split

def read_and_assign_labels(filename):
    """
    Read an annotated file and derive one dominant sentiment per line.

    Non-blank lines are split at the last "#### #### ####". The part after it
    is parsed as a Python literal, and the most frequent third element among
    its three-element triplets becomes the line's dominant label (None when
    there is no such triplet). Lines without the delimiter, or with an
    unparsable annotation, are reported and skipped.

    Returns:
        tuple: ``(data, dominant_labels)``, two index-aligned lists holding the
        stripped lines and their dominant labels.
    """
    separator = "#### #### ####"
    data = []
    dominant_labels = []

    with open(filename, 'r', encoding='utf-8') as f:
        for idx, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue

            # rpartition cuts at the last separator; an empty middle means none found
            _text_part, found, annotation_str = line.rpartition(separator)
            if not found:
                print(f"Skipping line {idx} due to unexpected format.")
                continue

            try:
                annotations = ast.literal_eval(annotation_str.strip())
            except Exception as e:
                print(f"Error parsing annotations on line {idx}: {e}")
                continue

            labels = []
            for triplet in annotations:
                if len(triplet) == 3:
                    labels.append(triplet[2])

            dominant = None
            if labels:
                dominant = Counter(labels).most_common(1)[0][0]

            data.append(line)
            dominant_labels.append(dominant)
    return data, dominant_labels

# Corpus to split
filename = "../Data Labeling/TJ - Overall Dataset/v1.10 (self_training batch 5)/ori.txt"

# Load the rows and the dominant label of each (used for stratification)
data, dominant_labels = read_and_assign_labels(filename)
print("Total rows loaded:", len(data))

# --- Direct 70% train, 15% dev, 15% test split ---

# Stage 1: hold out the test set (15% of everything)
train_dev_data, test_data, train_dev_labels, test_labels = train_test_split(
    data,
    dominant_labels,
    test_size=0.15,
    random_state=42,
    stratify=dominant_labels,
)

# Stage 2: take dev from the remaining 85% (0.1765 of 85% ≈ 15% of the total)
train_data, dev_data, train_labels, dev_labels = train_test_split(
    train_dev_data,
    train_dev_labels,
    test_size=0.1765,
    random_state=42,
    stratify=train_dev_labels,
)

# Sanity check of the resulting sizes
for caption, rows in (
    ("Training set size:", train_data),
    ("Dev set size:", dev_data),
    ("Test set size:", test_data),
):
    print(caption, len(rows))

# Label distribution
def print_label_distribution(labels, split_name):
    """Print a per-label count for one split; None entries are not counted."""
    present = [lab for lab in labels if lab is not None]
    counter = Counter(present)

    print(f"{split_name} label distribution:")
    for lab, count in counter.items():
        print(f"  {lab}: {count}")

# Dominant-label breakdown per split
for split_labels, split_name in (
    (train_labels, "Train"),
    (dev_labels, "Dev"),
    (test_labels, "Test"),
):
    print_label_distribution(split_labels, split_name)

# --- Save splits to file ---
output_folder = "../Data Labeling/TJ - Overall Dataset/v1.10 (self_training batch 5)/"
for split_file, split_rows in (
    ("train.txt", train_data),
    ("dev.txt", dev_data),
    ("test.txt", test_data),
):
    # One annotated row per line
    with open(output_folder + split_file, "w", encoding='utf-8') as f:
        f.writelines(row + "\n" for row in split_rows)

print("✅ Data successfully split into train/dev/test!")


Total rows loaded: 3179
Training set size: 2225
Dev set size: 477
Test set size: 477
Train label distribution:
  NEG: 996
  POS: 555
  NEU: 674
Dev label distribution:
  POS: 119
  NEG: 213
  NEU: 145
Test label distribution:
  POS: 119
  NEU: 145
  NEG: 213
✅ Data successfully split into train/dev/test!


# (Step 1) Merge Txt Files

In [62]:
import os

def append_all_txt_files(folder_path, output_filename="combined_dataset.txt"):
    """
    Concatenate every .txt file in a folder into one output file in that folder.

    Files are read in sorted name order so the result is reproducible. The
    output file itself is never used as an input, so the function can be rerun
    without duplicating earlier results. Every line written is
    newline-terminated, even when a source file does not end with a newline.

    Errors are reported on stdout rather than raised. Nothing is written
    unless every input file was read successfully.

    Args:
        folder_path (str): Path to the folder containing the .txt files.
        output_filename (str): Name of the output file to create in that folder.
    """
    all_text = []
    try:
        # os.listdir returns names in arbitrary order; sort for a stable merge
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".txt"):
                continue
            # A previous run's output is also a .txt file in this folder;
            # re-reading it would append the whole dataset to itself.
            if filename == output_filename:
                continue

            filepath = os.path.join(folder_path, filename)
            print(f"Membaca file: {filepath}")
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    # Only the last line of a file can lack its newline; without
                    # this it fuses with the first record of the next file.
                    all_text.append(line if line.endswith("\n") else line + "\n")

        output_filepath = os.path.join(folder_path, output_filename)
        print(f"\nMenulis semua konten ke file: {output_filepath}")
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.writelines(all_text)

        print(f"\n✅ Semua file .txt berhasil digabungkan menjadi {output_filename} di dalam folder yang sama.")

    except FileNotFoundError:
        print(f"Error: Folder '{folder_path}' tidak ditemukan.")
    except Exception as e:
        # Best-effort utility: report the problem instead of aborting the caller
        print(f"Terjadi kesalahan: {e}")

# Point this at the folder on your machine that holds the .txt files to merge
folder_path = "../Data Labeling/TJ - Overall Dataset"
append_all_txt_files(folder_path=folder_path)

Membaca file: ../Data Labeling/TJ - Overall Dataset\NEG.txt
Membaca file: ../Data Labeling/TJ - Overall Dataset\NEU.txt
Membaca file: ../Data Labeling/TJ - Overall Dataset\POS.txt
Membaca file: ../Data Labeling/TJ - Overall Dataset\TJ v1.5 - v1.0.txt
Membaca file: ../Data Labeling/TJ - Overall Dataset\TJ v1.7 - v1.0.txt

Menulis semua konten ke file: ../Data Labeling/TJ - Overall Dataset\combined_dataset.txt

✅ Semua file .txt berhasil digabungkan menjadi combined_dataset.txt di dalam folder yang sama.


# Debugging (Step 4: Check for Format Error)

In [2]:
def detect_delimiter_issues(filename, delimiter="#### #### ####"):
    """
    Find the lines of a file that do not split into exactly two parts on the delimiter.

    Blank lines are ignored but still count towards the reported row numbers.

    Args:
        filename (str): Path to the training data file.
        delimiter (str): The expected delimiter.

    Returns:
        list: ``(row_number, stripped_line, reason)`` tuples for each offending
        line, or None when the file is missing or another error occurs.
    """
    delimiter_issues = []

    try:
        with open(filename, 'r', encoding='utf-8') as file:
            # Rows are numbered from 1, blank ones included
            for row_number, raw in enumerate(file, start=1):
                line = raw.strip()
                if not line:
                    continue

                parts = line.split(delimiter)
                if len(parts) == 2:
                    continue

                delimiter_issues.append((row_number, line, f"Jumlah pemisah tidak tepat (ditemukan {len(parts)}, diharapkan 2)"))

    except FileNotFoundError:
        print(f"Error: File '{filename}' tidak ditemukan.")
        return None
    except Exception as e:
        print(f"Terjadi kesalahan: {e}")
        return None

    return delimiter_issues

if __name__ == "__main__":
    input_filename = '../Data Labeling/TJ - Overall Dataset/v1.10 (self_training batch 5)/filtered_output.txt'
    delimiter_problems = detect_delimiter_issues(input_filename)

    # None means the check itself failed and has already been reported
    if delimiter_problems:
        print("Baris dengan masalah delimiter ditemukan:")
        for row_num, line_content, error_reason in delimiter_problems:
            print(f"Baris {row_num}: '{line_content}' - Alasan: {error_reason}")
    elif delimiter_problems is not None:
        print("Tidak ada baris dengan masalah delimiter ditemukan.")

Baris dengan masalah delimiter ditemukan:
Baris 1889: 'Busway bejir#### #### ####[([0], [1], 'NEG')]buat apa ada teknologi yg disebut melacak posisi anda live location buat apa kalo lo ga gunain jir pt_transjakarta#### #### ####[([9, 10], [15, 16], 'NEG')]' - Alasan: Jumlah pemisah tidak tepat (ditemukan 3, diharapkan 2)


# Filter Tweets to Aspects and Opinions of at Most 2 Words (Step 3)

In [11]:
import ast, pathlib

# ----------------- path -----------------
# Input corpus; the filtered copy is written beside it as 'merged_max2tok.txt'.
IN_FILE  = '../Data Labeling/TJ - Overall Dataset/v1.8 (max2tok)/merged.txt'
OUT_FILE = pathlib.Path(IN_FILE).with_name('merged_max2tok.txt')

# ----------------- helper ----------------
def keep_triplet(triplet):
    """
    Decide whether a triplet survives the max-2-token filter.

    triplet = ([a_idx...], [o_idx...], 'POL'). Returns True only when it has
    exactly three parts and neither index list has more than two entries.
    """
    if len(triplet) == 3:
        aspect_idx, opinion_idx, _polarity = triplet
        return not (len(aspect_idx) > 2 or len(opinion_idx) > 2)
    return False

# ----------------- main ------------------
total_lines = 0      # non-blank input lines
kept_lines = 0       # lines written to the output
total_trp_in = 0     # triplets found in parsed lines
total_trp_out = 0    # triplets that passed the filter

with open(IN_FILE, encoding='utf-8') as fin, \
     open(OUT_FILE, 'w',  encoding='utf-8') as fout:

    for raw in fin:
        line = raw.rstrip('\n')
        if not line.strip():
            continue

        total_lines += 1
        try:
            text, ann_str = line.rsplit("#### #### ####", 1)
            triplets = ast.literal_eval(ann_str.strip())
        except Exception as e:
            print(f"⚠️  Skip line {total_lines}: {e}")
            continue

        total_trp_in += len(triplets)

        # --- filter ---
        triplets_kept = list(filter(keep_triplet, triplets))
        if not triplets_kept:
            # Every triplet was dropped, so the whole line is left out
            continue

        kept_lines += 1
        total_trp_out += len(triplets_kept)
        fout.write(f"{text}#### #### ####{triplets_kept}\n")

# ----------------- report ----------------
print("─── Selesai ───")
print(f"Baris masuk          : {total_lines}")
print(f"Baris tertulis       : {kept_lines}")
print(f"Triplet sebelum      : {total_trp_in}")
print(f"Triplet sesudah      : {total_trp_out}")
print(f"Triplet ter-drop     : {total_trp_in - total_trp_out}")
print(f"→ File output        : {OUT_FILE}")


─── Selesai ───
Baris masuk          : 2275
Baris tertulis       : 1848
Triplet sebelum      : 3378
Triplet sesudah      : 2629
Triplet ter-drop     : 749
→ File output        : ..\Data Labeling\TJ - Overall Dataset\v1.8 (max2tok)\merged_max2tok.txt


# Shuffle the train.txt file

In [5]:
import random

INPUT_FILE = "../Data Labeling/TJ - Overall Dataset/v1.6 (self_training batch 3)/train.txt"
OUTPUT_FILE = "../Data Labeling/TJ - Overall Dataset/v1.6 (self_training batch 3)/train_shuffled.txt"

# Read all lines, making sure each one ends with a newline. If the file's last
# line has no newline and the shuffle moves it away from the end, it would
# otherwise fuse with the record that follows it in the output.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    lines = [line if line.endswith("\n") else line + "\n" for line in f]

# Shuffle in place (unseeded, so the order differs on every run)
random.shuffle(lines)

# Write the shuffled lines to a new file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.writelines(lines)

print(f"✅ {len(lines)} baris dari '{INPUT_FILE}' sudah diacak dan disimpan ke '{OUTPUT_FILE}'")


✅ 1817 baris dari '../Data Labeling/TJ - Overall Dataset/v1.6 (self_training batch 3)/train.txt' sudah diacak dan disimpan ke '../Data Labeling/TJ - Overall Dataset/v1.6 (self_training batch 3)/train_shuffled.txt'
