In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold

In [2]:
def load_data_for_split(file_path, punctuation=('，', '。', '？', '！')):
    """Read a one-sentence-per-line file and build per-character punctuation labels.

    Each line is stripped of surrounding whitespace and all ASCII spaces are
    removed. The remaining characters form the sentence; the label string has
    the same length, with '1' at every position holding a punctuation mark and
    '0' everywhere else.

    Args:
        file_path: Path to a UTF-8 encoded text file.
        punctuation: Iterable of single characters that count as punctuation.
            Defaults to the four full-width marks the function always used.

    Returns:
        A ``(texts, labels)`` tuple. ``texts[k]`` is the list of characters of
        line ``k`` and ``labels[k]`` is the matching '0'/'1' string. Blank
        lines are kept as ``[]`` / ``''`` so indices stay aligned with the
        line numbers of the file.
    """
    marks = frozenset(punctuation)  # O(1) membership test per character
    texts, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            text = line.strip().replace(' ', '')
            texts.append(list(text))
            # One label per character, so the two sequences cannot differ in
            # length; no post-hoc consistency check is needed.
            labels.append(''.join('1' if ch in marks else '0' for ch in text))
    return texts, labels

In [3]:
# Load the corpus: `texts` holds one character list per line, `labels` the
# matching '0'/'1' string (1 = punctuation mark at that position).
texts, labels = load_data_for_split('data/train_large_2.txt')

In [4]:
# Sanity check: show the first five character lists and their label strings.
print(texts[:5])
print(labels[:5])

[['忽', '忽', '幾', '晨', '昏', '，', '離', '別', '間', '之', '，', '疾', '病', '間', '之', '，', '不', '及', '終', '年', '同', '靜', '好', '。'], ['煢', '煢', '小', '兒', '女', '，', '孱', '羸', '若', '此', '，', '嬌', '憨', '若', '此', '，', '更', '煩', '二', '老', '費', '精', '神', '。'], ['毋', '人', '負', '我', '，', '毋', '我', '負', '人', '，', '柳', '下', '雖', '和', '有', '介', '稱', '，', '先', '生', '字', '此', '，', '可', '以', '諡', '此', '。'], ['愛', '老', '臣', '少', '，', '愛', '少', '臣', '老', '，', '馮', '唐', '爲', '郎', '無', '倦', '意', '，', '吾', '輩', '慕', '之', '，', '不', '能', '效', '之', '。'], ['深', '院', '落', '滕', '花', '，', '石', '不', '點', '頭', '龍', '不', '語', '。']]
['000001000010000100000001', '000001000010000100000001', '0000100001000000010000100001', '0000100001000000010000100001', '00000100000001']


In [5]:
# NOTE: no ratio is computed here. `bins` is just another name for `labels`,
# so the stratification class of a sentence is its complete '0'/'1' label
# string (i.e. its exact punctuation pattern).
bins = labels
print(bins[:5])

['000001000010000100000001', '000001000010000100000001', '0000100001000000010000100001', '0000100001000000010000100001', '00000100000001']


In [6]:
# One group id per sentence (0 .. len(labels) - 1), so no two rows share a group.
groups = list(range(len(labels)))

In [7]:
# Gather the per-sentence columns in one frame so fold indices can be applied
# to all of them at once with .iloc.
df = pd.DataFrame(dict(labels=labels, bins=bins, texts=texts, groups=groups))

# Two 5-fold splitters, both stratified on the `bins` column by the cells that
# use them: a plain stratified one and a group-aware one.
skf, sgkf = StratifiedKFold(n_splits=5), StratifiedGroupKFold(n_splits=5)

In [72]:
# Persist the full frame (labels, bins, texts, groups) as CSV, without the index.
# NOTE(review): the 'texts' column holds Python lists; presumably they are
# written as their string representation — confirm before reading the file back.
df.to_csv('data/train_large_bin_2.csv', index=False)

In [8]:
# Export the 5 StratifiedGroupKFold train/validation splits to data/split3/.
# Only the sentence text is written (one sentence per line); the label strings
# in df['bins'] drive the stratification and are not saved.
out_dir = 'data/split3'
os.makedirs(out_dir, exist_ok=True)  # open(..., 'w') does not create missing directories

cnt_train = 0
cnt_valid = 0

for i, (train_index, val_index) in enumerate(sgkf.split(df['texts'], df['bins'], df['groups'])):
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    X_train = train_df['texts'].tolist()
    X_val = val_df['texts'].tolist()

    with open(f'{out_dir}/train_{i}.txt', 'w', encoding='utf-8') as file:
        for text in X_train:
            file.write(''.join(text) + '\n')
    cnt_train += len(X_train)

    with open(f'{out_dir}/valid_{i}.txt', 'w', encoding='utf-8') as file:
        for text in X_val:
            file.write(''.join(text) + '\n')
    cnt_valid += len(X_val)

# Totals over all folds: every sentence is counted once as validation and
# (n_splits - 1) times as training.
print(cnt_train, cnt_valid)




2368352 592088


In [16]:
# Export the 5 StratifiedKFold train/validation splits to data/split3/.
# Only the sentence text is written (one sentence per line); the label strings
# in df['bins'] drive the stratification and are not saved.
#
# NOTE: these are the same data/split3/train_{i}.txt / valid_{i}.txt paths the
# StratifiedGroupKFold cell writes to, so whichever of the two cells runs last
# determines what is on disk.
out_dir = 'data/split3'
os.makedirs(out_dir, exist_ok=True)  # open(..., 'w') does not create missing directories

cnt_train = 0
cnt_valid = 0

for i, (train_index, val_index) in enumerate(skf.split(df['texts'], df['bins'])):
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    X_train = train_df['texts'].tolist()
    X_val = val_df['texts'].tolist()

    with open(f'{out_dir}/train_{i}.txt', 'w', encoding='utf-8') as file:
        for text in X_train:
            file.write(''.join(text) + '\n')
    cnt_train += len(X_train)

    with open(f'{out_dir}/valid_{i}.txt', 'w', encoding='utf-8') as file:
        for text in X_val:
            file.write(''.join(text) + '\n')
    cnt_valid += len(X_val)

# Totals over all folds: every sentence is counted once as validation and
# (n_splits - 1) times as training.
print(cnt_train, cnt_valid)




2368352 592088


In [None]:
import os

import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Reload the corpus: `sentences` is a list of character lists, `labels` the
# matching '0'/'1' strings used as the stratification class.
sentences, labels = load_data_for_split('data/train_large_2.txt')
data = pd.DataFrame({'sentence': sentences, 'label': labels})

# Shuffled, seeded stratified splitter.
# NOTE: this rebinds `skf`, which an earlier cell defined as an unshuffled
# 5-fold splitter; re-running that earlier export cell afterwards would use
# this 64-fold splitter instead.
n_splits = 64  # number of stratified folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

out_dir = 'data/split4'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories

# For each of the n_splits folds, write the held-out sentences and the
# remaining training sentences as single-column, header-less CSV files.
# File numbering is 1-based.
for i, (train_index, test_index) in enumerate(skf.split(sentences, labels)):
    # Held-out part of this fold
    fold_sentences = [''.join(sentences[idx]) for idx in test_index]
    fold_df = pd.DataFrame({'sentence': fold_sentences})
    fold_df.to_csv(f'{out_dir}/valid_{i+1}.csv', index=False, header=False)
    print(f'Valid {i+1} saved with {len(fold_sentences)} sentences.')

    # Training part of this fold (everything not held out)
    fold_sentences = [''.join(sentences[idx]) for idx in train_index]
    fold_df = pd.DataFrame({'sentence': fold_sentences})
    fold_df.to_csv(f'{out_dir}/train_{i+1}.csv', index=False, header=False)
    print(f'Train {i+1} saved with {len(fold_sentences)} sentences.')


In [37]:
from sklearn.model_selection import train_test_split

def recursive_split(data, n_splits, idx=0):
    """Recursively halve ``data`` (stratified on 'label') and write the leaves to disk.

    Nodes are numbered like a binary heap: the root is node 1 and the children
    of node ``idx`` are ``2 * idx`` and ``2 * idx + 1``. A node with
    ``idx >= n_splits`` is a leaf and is written, one joined sentence per
    line, to ``data/split5/data_{idx - n_splits}.txt``. Starting from the root
    this yields ``n_splits`` leaf files numbered ``0 .. n_splits - 1``; the
    leaves all sit at the same depth only when ``n_splits`` is a power of two.

    Args:
        data: DataFrame with a 'sentence' column (sequences of strings) and a
            'label' column used for stratification.
        n_splits: Number of leaf files to produce when starting at the root.
        idx: Heap index of the current node. ``0`` (the default) means
            "start at the root" and is treated as ``1``.

    Raises:
        Whatever ``train_test_split`` raises when a stratified 50/50 split of
        ``data`` is not possible.
    """
    import os  # function-scope import: the definition does not rely on other cells

    if idx == 0 and n_splits > 0:
        # 0 is its own left child (0 * 2 == 0), so a walk that starts at 0 can
        # never reach a leaf. Start at the heap root instead.
        idx = 1

    print(f'Splitting data_{idx} with {len(data)} samples')
    if idx >= n_splits:
        os.makedirs('data/split5', exist_ok=True)  # open(..., 'w') does not create it
        with open(f'data/split5/data_{idx - n_splits}.txt', 'w', encoding='utf-8') as file:
            for text in data['sentence'].to_list():
                file.write(''.join(text) + '\n')
        return

    # Fixed seed keeps every level of the recursion reproducible.
    train_data, test_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['label'])
    recursive_split(train_data, n_splits, idx * 2)
    recursive_split(test_data, n_splits, idx * 2 + 1)

In [None]:
# Reload the corpus and put it in a frame with one row per sentence:
# 'sentence' holds the character list, 'label' the matching '0'/'1' string.
sentences, labels = load_data_for_split('data/train_large_2.txt')

data = pd.DataFrame({'sentence': sentences, 'label': labels})

In [24]:
# Keep only the sentences whose label string occurs at least 64 times.
# NOTE(review): presumably 64 mirrors the number of leaves requested from
# recursive_split, so each class can survive the repeated stratified halving — confirm.
label_counts = data['label'].value_counts()
valid_labels = label_counts.loc[label_counts.ge(64)].index
filtered_data = data.loc[data['label'].isin(valid_labels)]

In [25]:
# Report how many samples remain after dropping the rare label strings.
print(f'Original data with {len(data)} samples')
print(f'Filtered data with {len(filtered_data)} samples')

Original data with 592088 samples
Filtered data with 569625 samples


In [38]:
# Start at node 1 (the root): children are numbered 2*idx and 2*idx + 1, and
# the nodes with idx >= 64 are written to data/split5/data_{idx - 64}.txt.
recursive_split(filtered_data, 64, 1)

Splitting data_1 with 569625 samples
Splitting data_2 with 284812 samples
Splitting data_4 with 142406 samples
Splitting data_8 with 71203 samples
Splitting data_16 with 35601 samples
Splitting data_32 with 17800 samples
Splitting data_64 with 8900 samples
Splitting data_65 with 8900 samples
Splitting data_33 with 17801 samples
Splitting data_66 with 8900 samples
Splitting data_67 with 8901 samples
Splitting data_17 with 35602 samples
Splitting data_34 with 17801 samples
Splitting data_68 with 8900 samples
Splitting data_69 with 8901 samples
Splitting data_35 with 17801 samples
Splitting data_70 with 8900 samples
Splitting data_71 with 8901 samples
Splitting data_9 with 71203 samples
Splitting data_18 with 35601 samples
Splitting data_36 with 17800 samples
Splitting data_72 with 8900 samples
Splitting data_73 with 8900 samples
Splitting data_37 with 17801 samples
Splitting data_74 with 8900 samples
Splitting data_75 with 8901 samples
Splitting data_19 with 35602 samples
Splitting data_