In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import model_selection


In [2]:
def create_folds(data):
    # initialise kfold
    data["kfold"] = np.nan
    # randomise order of data 
    data = data.sample(frac = 1).reset_index(drop = True)
    # compute number of bins using Stuge's formula  
    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    print('number of bins:' + str(num_bins))
    # bin targets 
    data['bin'] = pd.cut(data["target"], bins = num_bins, labels = False)
    
    # initialise stratifed k-fold
    kf = model_selection.StratifiedKFold(n_splits=5)
     
    for f, (t_, v_) in enumerate(kf.split(X=data, y = data.bin.values)):
        data.loc[v_, 'kfold'] = f
        
    #data = data.drop('bins', axis = 1)
    
    return data
    

In [3]:
X,y = datasets.make_regression(n_samples = 15000, n_features = 100, n_targets = 1)

# create dataframe
df = pd.DataFrame(X,columns  = [f'f_{i}' for i in range(X.shape[1])])
df['target'] = y


df2 = create_folds(df)
df2

number of bins:14


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target,kfold,bin
0,0.000168,0.156122,1.053787,-1.474836,0.084890,1.273745,0.317444,-1.472214,-0.291865,0.362112,...,0.635367,-1.155209,-0.273301,0.463089,0.170018,0.099300,0.996037,-371.884741,0.0,2
1,-0.394630,0.255668,0.640350,-0.506832,-0.159595,0.103816,-0.195199,-0.511062,0.014911,-0.460145,...,0.789137,1.824937,0.171317,0.742868,-0.713352,0.665039,-2.304133,-229.175708,0.0,4
2,-0.501601,0.368665,-0.807573,0.043472,-1.105996,0.417992,2.367252,-0.684868,1.256810,-1.125689,...,-0.196882,1.223802,1.918301,-0.209707,0.034601,2.723993,0.742939,-214.228177,0.0,4
3,0.526680,2.114034,-0.470551,-0.550593,-0.642026,0.035462,-0.170190,0.785919,-0.753388,1.496474,...,-0.316637,0.737899,-0.013125,-0.671264,0.033682,1.036933,-1.085248,-218.864753,0.0,4
4,-0.403516,0.145258,-1.197099,0.234717,-0.024343,-0.013526,-0.301745,-0.904267,1.376897,0.177681,...,0.549223,-0.255289,-2.981996,1.060285,-1.028116,-0.945038,1.078542,-187.145293,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,-1.981482,0.296624,1.394384,-0.171253,-0.109941,-0.581578,2.997644,-0.104166,1.614524,0.904526,...,1.006982,-0.180575,-3.158167,0.667192,-0.507610,-0.319883,-1.062641,-70.558906,4.0,6
14996,0.063846,0.341253,-0.170594,1.549745,1.672067,1.062455,0.489518,-0.186206,0.177196,-0.253057,...,-0.068482,-0.755661,0.671904,-0.252507,-1.011217,1.484437,0.560517,58.074013,4.0,7
14997,0.169524,2.485389,-0.070706,-0.703944,0.858655,0.778205,0.214873,-0.725226,-1.382634,-0.612910,...,1.148840,1.228293,0.553711,-1.240441,0.130566,0.802353,-0.468743,84.264823,4.0,8
14998,-0.644996,0.457519,0.299706,-0.245555,0.235255,-2.010355,0.331049,-0.658318,-1.073212,-0.367443,...,0.299926,-0.995862,0.889335,0.733660,1.414834,0.056796,2.568886,-33.696445,4.0,6


In [5]:
df2.bin.sort_values().unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])