In [3]:
import numpy as np
import pandas as pd

In [4]:
# Toy data set with 20 rows: column A holds 0..19 and column B holds A squared.
data = pd.DataFrame({
    'A': pd.Series(list(range(20))),
    'B': pd.Series([a * a for a in range(20)]),
})

In [5]:
# Preview the first rows of the toy data set (head() defaults to five rows).
data.head()

Unnamed: 0,A,B
0,0,0
1,1,1
2,2,4
3,3,9
4,4,16


In [13]:
class NestedSubsets():
    """Iterate over nested random subsets of a data set, largest first.

    Input: a pd.DataFrame, an optional step_size, an optional percentage
    and an optional random_state. Step_size overrides percentage.

    Output: an iterator whose elements are nested subsets of the input,
    decreasing in size. The first element is a random permutation of all
    rows; every later element is a random sample of the previous one with
    step_size fewer rows. Iteration stops once no rows would be left, so an
    empty subset is never produced. The iterator is single-use.

    Parameters
    ----------
    data : pd.DataFrame
        The data set to subsample. It is not modified.
    step_size : int, optional
        Number of rows removed at every step. A falsy value (None or 0)
        means "derive the step from percentage".
    percentage : float, default 0.1
        Fraction of the rows of data (0.1 means 10%) removed at every step.
        Only used when step_size is not given. The product is truncated to
        an integer and raised to 1 if it would otherwise be 0.
    random_state : int, np.random.RandomState or None, default None
        Seed or generator used for every draw, for reproducible subsets.
        None keeps using numpy's global random state.

    Raises
    ------
    ValueError
        If step_size is given but smaller than 1, or if percentage is
        negative when it is needed to derive the step.
    """
    def __init__(self, data, step_size = None, percentage = 0.1, random_state = None):
        n_rows = data.shape[0]

        # One generator shared by all draws; None defers to numpy's global state.
        if random_state is None or isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)

        if step_size:
            step = int(step_size)
            if step < 1:
                raise ValueError(
                    "step_size must be a positive integer, got %r" % (step_size,))
        else:
            if percentage < 0:
                raise ValueError(
                    "percentage must be non-negative, got %r" % (percentage,))
            # Clamp to 1: a step of 0 would never shrink the sample and
            # would make the modulo below divide by zero.
            step = max(int(percentage * n_rows), 1)

        self.sample = data.sample(n_rows, random_state=self._rng) # shuffled full data set
        self.step_size = step # number of rows removed per iteration
        self.remainder_size = n_rows % self.step_size # n_rows modulo step_size
        self.sample_size = n_rows # size of the subset returned by the next __next__

    def __iter__(self):
        return self

    def __next__(self):
        if self.sample_size == 0:
            raise StopIteration
        current_sample = self.sample # subset handed out by this call
        self.sample_size = max(self.sample_size - self.step_size, 0) # size of the following subset
        # Draw the following subset from the current one so the subsets stay nested.
        self.sample = self.sample.sample(self.sample_size, random_state=self._rng)
        return current_sample

    def get_step_size(self):
        """Return the number of rows removed at each iteration."""
        return self.step_size

In [14]:
# step_size is given explicitly, so it takes precedence over percentage.
subs = NestedSubsets(data, step_size=7, percentage=0.4)

In [15]:
# Print the first two rows of every nested subset, largest subset first.
# subs is consumed by this loop (its __iter__ returns the object itself),
# so a new NestedSubsets must be created to iterate again.
for subsets in subs:
    print(subsets.head(2))

     A    B
19  19  361
2    2    4
     A    B
15  15  225
12  12  144
   A   B
6  6  36
8  8  64


The code to turn into a script:

In [16]:
import numpy as np
import pandas as pd

class NestedSubsets():
    """Iterate over nested random subsets of a data set, largest first.

    Input: a pd.DataFrame, an optional step_size, an optional percentage
    and an optional random_state. Step_size overrides percentage.

    Output: an iterator whose elements are nested subsets of the input,
    decreasing in size. The first element is a random permutation of all
    rows; every later element is a random sample of the previous one with
    step_size fewer rows. Iteration stops once no rows would be left, so an
    empty subset is never produced. The iterator is single-use.

    Parameters
    ----------
    data : pd.DataFrame
        The data set to subsample. It is not modified.
    step_size : int, optional
        Number of rows removed at every step. A falsy value (None or 0)
        means "derive the step from percentage".
    percentage : float, default 0.1
        Fraction of the rows of data (0.1 means 10%) removed at every step.
        Only used when step_size is not given. The product is truncated to
        an integer and raised to 1 if it would otherwise be 0.
    random_state : int, np.random.RandomState or None, default None
        Seed or generator used for every draw, for reproducible subsets.
        None keeps using numpy's global random state.

    Raises
    ------
    ValueError
        If step_size is given but smaller than 1, or if percentage is
        negative when it is needed to derive the step.
    """
    def __init__(self, data, step_size = None, percentage = 0.1, random_state = None):
        n_rows = data.shape[0]

        # One generator shared by all draws; None defers to numpy's global state.
        if random_state is None or isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)

        if step_size:
            step = int(step_size)
            if step < 1:
                raise ValueError(
                    "step_size must be a positive integer, got %r" % (step_size,))
        else:
            if percentage < 0:
                raise ValueError(
                    "percentage must be non-negative, got %r" % (percentage,))
            # Clamp to 1: a step of 0 would never shrink the sample and
            # would make the modulo below divide by zero.
            step = max(int(percentage * n_rows), 1)

        self.sample = data.sample(n_rows, random_state=self._rng) # shuffled full data set
        self.step_size = step # number of rows removed per iteration
        self.remainder_size = n_rows % self.step_size # n_rows modulo step_size
        self.sample_size = n_rows # size of the subset returned by the next __next__

    def __iter__(self):
        return self

    def __next__(self):
        if self.sample_size == 0:
            raise StopIteration
        current_sample = self.sample # subset handed out by this call
        self.sample_size = max(self.sample_size - self.step_size, 0) # size of the following subset
        # Draw the following subset from the current one so the subsets stay nested.
        self.sample = self.sample.sample(self.sample_size, random_state=self._rng)
        return current_sample

    def get_step_size(self):
        """Return the number of rows removed at each iteration."""
        return self.step_size

In [17]:
# Defaults only: no step_size, so the step is derived from percentage=0.1.
subs = NestedSubsets(data)

In [18]:
# Step derived from the defaults: int(0.1 * 20 rows) = 2.
subs.get_step_size()

2