In [1]:
def create_single_df(N, K, nfiles, dir, i):
    """
    Generate one chunk of synthetic group-by data and write it to a CSV file.

    The chunk holds ``int(N / nfiles)`` rows and is written to
    ``{dir}/groupby-N_{N}_K_{K}_file_{i}.csv`` without the index column and
    with floats formatted to six decimal places. Nothing is returned.

    Columns
    -------
    id1, id2: str,
     Labels ``id001`` .. ``id<K>`` (zero-padded to 3 digits)
    id3: str,
     Labels zero-padded to 10 digits, numbered 1 .. int(N / K)
    id4, id5: int,
     Integers drawn from ``range(K)``
    id6: int,
     Integers drawn from ``range(int(N / K))``
    v1, v2: int,
     Integers drawn from ``range(5)`` and ``range(15)``
    v3: float,
     Uniform draws between 0 and 100

    Parameters
    ----------
    N: int,
     Total number of rows across all files
    K: int,
     Number of groups
    nfiles: int,
     Number of output files the N rows are spread over
    dir: str,
     Output directory
    i: int,
     Index of this file, used in the file name e.g. one of range(nfiles)
    """
    rows_per_file = int(N / nfiles)
    n_large_groups = int(N / K)

    small_labels = [f"id{str(k).zfill(3)}" for k in range(1, K + 1)]
    large_labels = [f"id{str(k).zfill(10)}" for k in range(1, n_large_groups + 1)]

    def draw(population):
        # Sample one full column, with replacement, from `population`.
        return np.random.choice(population, size=rows_per_file, replace=True)

    # The dict is evaluated top to bottom, so the global NumPy RNG is consumed
    # in column order: id1, id2, ..., v3.
    columns = {
        "id1": draw(small_labels),
        "id2": draw(small_labels),
        "id3": draw(large_labels),
        "id4": draw(K),
        "id5": draw(K),
        "id6": draw(n_large_groups),
        "v1": draw(5),
        "v2": draw(15),
        "v3": np.random.uniform(0, 100, size=rows_per_file),
    }
    frame = pd.DataFrame(columns)

    out_path = f"{dir}/groupby-N_{N}_K_{K}_file_{i}.csv"
    frame.to_csv(out_path, index=False, float_format="{:.6f}".format)

```r
# Split a random permutation of 1..(n*1.1) into three disjoint key sets:
# common (0.9*n keys), left-only (0.1*n keys) and right-only (0.1*n keys).
# Returns a list with elements x (common), l (left-only) and r (right-only).
split_xlr = function(n) {
  key = sample.int(n*1.1) # 1.1 = 0.9+0.1+0.1
  list(
    x = key[seq.int(1, n*0.9)],
    l = key[seq.int(n*0.9+1, n)],
    r = key[seq.int(n+1, n*1.1)]
  )
}
```

In [97]:
def split_xlr(n):
    """
    Shuffle the integer keys 1..int(1.1 * n) and cut them into three disjoint groups.

    Parameters
    ----------
    n: int or float,
     Base size; converted with ``int()`` before use

    Returns
    -------
    dict with three numpy arrays taken from consecutive parts of the shuffle:
     "x" - the first int(0.9 * n) keys (common),
     "l" - the keys from position int(0.9 * n) up to n (left only),
     "r" - the keys from position n up to int(1.1 * n) (right only)
    """
    n = int(n)
    n_common = int(n * 0.9)
    n_total = int(n * 1.1)  # 1.1 = 0.9 + 0.1 + 0.1

    population = np.arange(1, n_total + 1)
    shuffled = np.random.choice(population, size=n_total, replace=False)

    return {
        "x": shuffled[:n_common],
        "l": shuffled[n_common:n],
        "r": shuffled[n:n_total],
    }

```r
# Return a shuffled vector of length `size` that contains every element of x:
# x itself plus (size - length(x)) extra draws from x with replacement.
# Stops with an error when x has more than `size` elements.
# NOTE(review): base R sample() treats a length-one numeric x >= 1 as 1:x;
# confirm callers never pass a single-element x.
sample_all = function(x, size) {
  stopifnot(length(x) <= size)
  y = c(x, sample(x, size=max(size-length(x), 0), replace=TRUE))
  sample(y)
}
# lhs = ['x', 'l']
id1 = sample_all(unlist(key1[lhs], use.names=FALSE), N),
id2 = sample_all(unlist(key2[lhs], use.names=FALSE), N),
id3 = sample_all(unlist(key3[lhs], use.names=FALSE), N)
```

In [102]:
def sample_all(x, n_rows):
    """
    Return a shuffled array of length ``n_rows`` in which every element of ``x`` appears.

    The output consists of all of ``x`` plus ``n_rows - len(x)`` additional
    values drawn from ``x`` with replacement, in random order.

    Parameters
    ----------
    x: array-like,
     One-dimensional values that must all be present in the output
    n_rows: int or float,
     Desired output length; converted with ``int()`` before use

    Returns
    -------
    numpy.ndarray of length ``n_rows``

    Raises
    ------
    AssertionError
     If ``x`` holds more than ``n_rows`` elements
    """
    n_rows = int(n_rows)
    n_extra = n_rows - len(x)

    # Raised explicitly instead of via `assert` so the guard is not stripped
    # when Python runs with -O; the exception type is the same as before.
    if n_extra < 0:
        raise AssertionError(
            f"x has {len(x)} elements, which cannot all fit into n_rows={n_rows}"
        )

    padding = np.random.choice(x, size=n_extra, replace=True)
    padded = np.append(x, padding)
    # permutation() returns a shuffled copy and leaves `padded` untouched.
    return np.random.permutation(padded)

In [98]:
# Base size; it is kept as a float and the key cardinalities below are derived from it.
N = 1e7

# Three key sets of increasing cardinality (N/1e6 = 10, N/1e3 = 10000, N/1 = 1e7),
# each split by split_xlr into common / left-only / right-only keys.
key1, key2, key3 = (split_xlr(N / scale) for scale in (1e6, 1e3, 1))

In [107]:
# One id column per key set: the common ("x") and left-only ("l") keys are
# joined and passed to sample_all together with N.
id1, id2, id3 = (
    sample_all(np.concatenate((keys["x"], keys["l"])), N)
    for keys in (key1, key2, key3)
)