In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold

In [2]:
def load_data_for_split(file_path):
    """Read a text file and build a per-character punctuation mask for each line.

    Every line is stripped of surrounding whitespace and has its ASCII spaces
    removed. The remaining characters become one entry of ``texts``; the
    matching entry of ``labels`` is a string of the same length holding '1'
    where the character is one of '，', '。', '？', '！' and '0' elsewhere.
    Blank lines are kept and yield an empty list with an empty label string.

    Args:
        file_path: Path of a UTF-8 encoded text file, one sentence per line.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: ``texts`` holds
        lists of single characters, ``labels`` holds the '0'/'1' mask strings.
    """
    punctuation = ('，', '。', '？', '！')
    char_lists = []
    mask_strings = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            compact = raw_line.strip().replace(' ', '')
            chars = list(compact)
            flags = ['1' if ch in punctuation else '0' for ch in chars]
            char_lists.append(chars)
            mask_strings.append(''.join(flags))
            # Sanity check: one flag per character is expected for every line.
            if len(chars) != len(flags):
                print('Error:', compact, flags)
    return char_lists, mask_strings

In [3]:
from sklearn.model_selection import train_test_split

def recursive_split(data, n_splits, idx=0, output_dir='data/split6'):
    """Recursively halve ``data`` with stratified splits and write each leaf to a file.

    Nodes are numbered like a binary heap: the root is node 1 and the children
    of node ``idx`` are ``2 * idx`` and ``2 * idx + 1``. A node whose number has
    reached ``n_splits`` is a leaf and is written to
    ``<output_dir>/data_<idx - n_splits>.txt``, one sentence per line. Started
    at the root this yields ``n_splits`` files numbered ``0 .. n_splits - 1``;
    the files are of (nearly) equal size only when ``n_splits`` is a power of
    two, because otherwise the leaves sit at different depths.

    Args:
        data: DataFrame with a 'sentence' column (each value an iterable of
            strings that is joined into one output line) and a 'label' column
            used to stratify every split.
        n_splits: Node number at which the recursion stops and a file is written.
        idx: Heap number of the current node. Values below 1, including the
            default 0, are treated as the root (node 1): node 0 is its own left
            child (``0 * 2 == 0``), so it could never reach a leaf.
        output_dir: Directory that receives the leaf files; created if missing.

    Raises:
        Whatever ``train_test_split`` raises when a split cannot be stratified
        (for example a label with too few members at that level).
    """
    if idx < 1:
        idx = 1
    print(f'Splitting data_{idx} with {len(data)} samples')
    if idx >= n_splits:
        # Create the target directory on demand so a fresh checkout does not
        # fail with FileNotFoundError on the first leaf.
        os.makedirs(output_dir, exist_ok=True)
        leaf_path = os.path.join(output_dir, f'data_{idx - n_splits}.txt')
        with open(leaf_path, 'w', encoding='utf-8') as file:
            file.writelines(''.join(text) + '\n' for text in data['sentence'])
        return
    # A fixed random_state keeps the whole tree of splits reproducible.
    train_data, test_data = train_test_split(
        data, test_size=0.5, random_state=42, stratify=data['label'])
    recursive_split(train_data, n_splits, idx * 2, output_dir)
    recursive_split(test_data, n_splits, idx * 2 + 1, output_dir)

In [4]:
# Parse the raw corpus into per-sentence character lists and their
# punctuation-mask label strings.
sentences, labels = load_data_for_split('data/train_large_2.txt')

# One row per input line; 'label' is the column recursive_split stratifies on.
data = pd.DataFrame({'sentence': sentences, 'label': labels})

In [5]:
# Count how many sentences share each exact label string.
label_counts = data['label'].value_counts()
# Keep only label strings that occur at least 4096 times. 4096 is also the
# n_splits value passed to recursive_split further down — presumably so every
# label still has enough members for each stratified halving; TODO confirm.
valid_labels = label_counts[label_counts >= 4096].index
filtered_data = data[data['label'].isin(valid_labels)]

In [6]:
# Report how many samples the label-frequency filter kept.
print(f'Original data with {len(data)} samples')
print(f'Filtered data with {len(filtered_data)} samples')

Original data with 592088 samples
Filtered data with 413525 samples


In [7]:
# Start at node 1: children are numbered 2*idx and 2*idx+1, so the leaves get
# numbers 4096..8191 and are written as data_0.txt .. data_4095.txt.
recursive_split(filtered_data, 4096, 1)

Splitting data_1 with 413525 samples
Splitting data_2 with 206762 samples
Splitting data_4 with 103381 samples
Splitting data_8 with 51690 samples
Splitting data_16 with 25845 samples
Splitting data_32 with 12922 samples
Splitting data_64 with 6461 samples
Splitting data_128 with 3230 samples
Splitting data_256 with 1615 samples
Splitting data_512 with 807 samples
Splitting data_1024 with 403 samples
Splitting data_2048 with 201 samples
Splitting data_4096 with 100 samples
Splitting data_4097 with 101 samples
Splitting data_2049 with 202 samples
Splitting data_4098 with 101 samples
Splitting data_4099 with 101 samples
Splitting data_1025 with 404 samples
Splitting data_2050 with 202 samples
Splitting data_4100 with 101 samples
Splitting data_4101 with 101 samples
Splitting data_2051 with 202 samples
Splitting data_4102 with 101 samples
Splitting data_4103 with 101 samples
Splitting data_513 with 808 samples
Splitting data_1026 with 404 samples
Splitting data_2052 with 202 samples
Split