In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [2]:
def create_folds(data, n_splits, random_state=None, target_col="target"):
    """Assign a stratified cross-validation fold index to every row.

    The continuous target is discretised into equal-width bins (bin count
    chosen with Sturges' rule) and those bin labels are used as the
    stratification classes for ``StratifiedKFold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``target_col`` column. It is NOT modified; all
        work happens on a shuffled copy.
    n_splits : int
        Number of folds to create.
    random_state : int or None, default None
        Seed used for both the row shuffle and the fold split.
    target_col : str, default "target"
        Name of the numeric column to stratify on.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` with a fresh ``0..len-1`` index and an
        integer ``"fold"`` column. Rows never selected for validation would
        keep the sentinel value ``-1``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    """
    # DataFrame.sample returns a new frame, so shuffling *before* writing any
    # column guarantees the caller's frame is never touched.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Number of bins from Sturges' rule: 1 + log2(n), rounded down.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # Bin the target into equal-width intervals. The labels are kept in a
    # local array rather than a temporary "bins" column, so an existing
    # column with that name in the input is neither overwritten nor dropped.
    bin_labels = pd.cut(shuffled[target_col], bins=num_bins, labels=False).to_numpy()

    # -1 marks "not assigned yet"; each validation split overwrites its rows.
    fold_ids = np.full(len(shuffled), -1, dtype=np.int64)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold_number, (_, val_idx) in enumerate(kf.split(X=shuffled, y=bin_labels)):
        fold_ids[val_idx] = fold_number

    shuffled["fold"] = fold_ids
    return shuffled

In [3]:
# Read the raw training CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
data_train = pd.read_csv(filepath_or_buffer='../datasets/train.csv')
data_train.head(n=5)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
# Split the training data into 5 stratified folds using a fixed seed (42),
# then preview the first rows of the result.
df_train_oof = create_folds(data=data_train, n_splits=5, random_state=42)
df_train_oof.head(n=5)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,fold
0,b51730f9c,,,"Alice looked at the jury-box, and saw that, in...",-0.432678,0.487498,0
1,4d403fd57,https://en.wikipedia.org/wiki/Artificial_intel...,CC BY-SA 3.0,Artificial intelligence (AI) is intelligence e...,-1.161746,0.458396,2
2,0f789ee41,,,A gruff squire on horseback with shiny top boo...,-2.367914,0.519369,4
3,87f96eb79,,,But that hadn't helped Washington.\nThe Americ...,-0.842596,0.466193,0
4,b9cca6661,,,The principal business of the people of this c...,-0.748452,0.433,4


In [5]:
# Persist the fold assignments. The row index is written as the first
# (unnamed) column, followed by a header row of column names.
df_train_oof.to_csv(
    "df_train_oof.csv",
    header=True,
    index=True,
)

In [6]:
fold_list = list(range(5))

for fold in fold_list:
    # Hold out the current fold for validation and train on all the others.
    train_df = df_train_oof.loc[df_train_oof['fold'] != fold]  # training split
    val_df = df_train_oof.loc[df_train_oof['fold'] == fold]  # validation split
    print(train_df.shape, val_df.shape, sep="\n")

(2267, 7)
(567, 7)
(2267, 7)
(567, 7)
(2267, 7)
(567, 7)
(2267, 7)
(567, 7)
(2268, 7)
(566, 7)
