In [1]:
import os
import random

def split_files_in_folder(input_folder, train_ratio=0.7, seed=None):
    """Shuffle every .txt file in a folder and split its lines into train/test files.

    For each regular ``*.txt`` file directly inside ``input_folder`` the lines
    are shuffled; the first ``int(n_lines * train_ratio)`` lines are written to
    ``train/<name>_train.txt`` and the remaining ones to
    ``test/<name>_test.txt``. Both sub-folders are created inside
    ``input_folder`` when missing, and existing output files are overwritten.

    Args:
        input_folder: Folder holding the .txt files (relative or absolute).
        train_ratio: Fraction of each file's lines that goes to the train
            file; must lie in [0, 1].
        seed: Optional seed for a private random generator, making the split
            reproducible. With ``None`` the module-level ``random`` state is
            used, so ``random.seed(...)`` set by the caller still applies.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
        NotADirectoryError: If ``input_folder`` is not an existing directory.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Get absolute path
    input_folder = os.path.abspath(input_folder)
    print(input_folder)

    # Fail loudly on a mistyped path: os.makedirs below would otherwise create
    # the whole missing folder chain and the function would "succeed" on nothing.
    if not os.path.isdir(input_folder):
        raise NotADirectoryError(f"Input folder is not an existing directory: {input_folder}")

    # A private generator keeps a seeded split independent of other random users.
    rng = random.Random(seed) if seed is not None else random

    # Create train and test directories
    train_folder = os.path.join(input_folder, 'train')
    test_folder = os.path.join(input_folder, 'test')
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Loop through all .txt files in the folder; sorted() fixes the visiting
    # order so a seeded run yields the same split regardless of listing order.
    for filename in sorted(os.listdir(input_folder)):
        file_path = os.path.join(input_folder, filename)

        # Skip if it's not a .txt file or a folder
        if not filename.endswith('.txt') or not os.path.isfile(file_path):
            continue

        # Read lines from the file
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # A final line without a trailing newline would be glued onto whatever
        # line follows it once the order is shuffled, so terminate it first.
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'

        # Shuffle lines
        rng.shuffle(lines)

        # Split into train and test
        split_idx = int(len(lines) * train_ratio)
        train_lines = lines[:split_idx]
        test_lines = lines[split_idx:]

        # Build output filenames
        base_name = os.path.splitext(filename)[0]
        train_file = os.path.join(train_folder, f'{base_name}_train.txt')
        test_file = os.path.join(test_folder, f'{base_name}_test.txt')

        # Write to train and test files
        with open(train_file, 'w', encoding='utf-8') as f:
            f.writelines(train_lines)
        with open(test_file, 'w', encoding='utf-8') as f:
            f.writelines(test_lines)

        print(f"Processed: {filename} -> {train_file}, {test_file}")

    print("\n All files split and saved in 'train/' and 'test/' folders.")


# Fix the seed so that re-running this cell reproduces the same train/test
# split; the shuffle draws from the module-level `random` generator by default.
random.seed(42)

# NLS_Group04/ must already contain class1.txt, class2.txt and class3.txt;
# in this notebook they are written by the split_big_file(...) call further down.
# Other dataset folders (uncomment to split them instead):
# split_files_in_folder('../Group04/LS_Group04/')
# split_files_in_folder('../Group04/rd_group4/')
split_files_in_folder('../Group04/NLS_Group04/')



D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04
Processed: class1.txt -> D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04\train\class1_train.txt, D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04\test\class1_test.txt
Processed: class2.txt -> D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04\train\class2_train.txt, D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04\test\class2_test.txt
Processed: class3.txt -> D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04\train\class3_train.txt, D:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\NLS_Group04\test\class3_test.txt

 All files split and saved in 'train/' and 'test/' folders.


In [6]:
def split_big_file(big_file_path, output_folder, class_counts=(300, 500, 1000)):
    """Cut one big text file into consecutive per-class files.

    The first line of ``big_file_path`` is skipped. The remaining lines are
    then dealt out in order: the first ``class_counts[0]`` lines go to
    ``class1.txt``, the next ``class_counts[1]`` to ``class2.txt``, and so on.
    Lines beyond ``sum(class_counts)`` are ignored. Existing class files in
    ``output_folder`` are overwritten.

    Args:
        big_file_path: Path of the file holding all classes back to back.
        output_folder: Folder receiving ``class<N>.txt``; created if missing.
        class_counts: Number of lines for each class, in file order. Defaults
            to ``(300, 500, 1000)``.

    Raises:
        ValueError: If any count is negative, or if the file (after skipping
            its first line) has fewer lines than ``sum(class_counts)``.
    """
    counts = list(class_counts)
    if any(count < 0 for count in counts):
        raise ValueError("class_counts must not contain negative values")

    os.makedirs(output_folder, exist_ok=True)

    with open(big_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip the first line
    lines = lines[1:]

    total = len(lines)
    print(f"Total lines after skipping first line: {total}")

    if total < sum(counts):
        raise ValueError("Not enough lines in big file for the split counts")

    # Walk through the file once, handing each class its consecutive slice.
    start = 0
    for class_number, count in enumerate(counts, start=1):
        class_file = os.path.join(output_folder, f'class{class_number}.txt')
        with open(class_file, 'w', encoding='utf-8') as f:
            f.writelines(lines[start:start + count])
        start += count

    print("Class files created!")

In [7]:
big_file = '../Group04/NLS_Group04.txt'  # Big file: a first line that split_big_file skips, followed by the 1800 data lines
output_dir = '../Group04/NLS_Group04'  # Folder to save class1.txt, class2.txt, class3.txt

# Run this before split_files_in_folder('../Group04/NLS_Group04/'): that call
# reads the class*.txt files written here.
split_big_file(big_file, output_dir)

Total lines after skipping first line: 1800
Class files created!
