# Split data into groups

The purpose of this demo is to show how to split data into several groups based on the desired parameters.

In [9]:
# Silence every warning for the rest of the session. This is placed ahead of
# the remaining imports, presumably so import-time warnings are hidden too.
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import logging
import numpy as np
import pandas as pd

# Put the parent directory on the import path; presumably this lets the local
# `abacus` package be imported without installing it (confirm repo layout).
# This must run before the `abacus` imports below.
sys.path.append('../')

from abacus.splitter.params import SplitBuilderParams
from abacus.splitter.split_builder import SplitBuilder

# Show log records of level INFO and above.
logging.basicConfig(level = logging.INFO)

# Reload imported modules automatically before each cell executes, so edits
# to the library source take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# The two synthetic columns below are drawn at random. A seeded, local
# generator makes them identical on every run (Restart & Run All) without
# touching NumPy's global random state.
SEED = 42
rng = np.random.default_rng(SEED)

# Load the first 15,000 rows of the demo dataset.
df = pd.read_csv('./data/ab_data.csv', nrows=15_000)

# Synthetic "moda_city" column: random integers 1..4 (upper bound exclusive),
# stored as strings.
df["moda_city"] = rng.integers(1, 5, df.shape[0]).astype(str)

# Synthetic "country" column: random integers 1 or 2.
df["country"] = rng.integers(1, 3, df.shape[0])

# Use the row index as the identifier column.
df["id"] = df.index

In [11]:
# Parameters for the split: the groups to create, the columns that play the
# id / strata / metric roles, and the numeric settings.
# NOTE(review): the exact meaning of each option is defined by
# SplitBuilderParams, which is not visible in this notebook.
split_builder_params = SplitBuilderParams(
    # Group names; their sizes are left unset (None) in this demo.
    map_group_names_to_sizes={"control": None, "target": None},
    # Column roles.
    id_col="id",
    main_strata_col="moda_city",
    split_metric_col="height_now",
    metric_type="continuous",
    # Additional numeric and categorical columns passed to the splitter.
    cols=["height_prev"],
    cat_cols=["country"],
    # Numeric settings.
    alpha=0.05,
    n_bins=6,
    min_cluster_size=500,
)

In [12]:
# Create the splitter from the prepared DataFrame and the parameters object.
split_builder = SplitBuilder(df, split_builder_params)

In [13]:
# Run the split. The result is a table; the preview of it in this notebook
# includes `strata` and `group_name` columns alongside the input columns.
split = split_builder.collect()

In [14]:
# Preview the first rows of the split result.
split.head()

Unnamed: 0,height_now,height_prev,weight_now,weight_prev,noise_now,noise_prev,groups,id,moda_city,country,numerator,denominator,conversion,strata,group_name
0,172.470442,166.382868,163.648256,173.422115,5.644384,-1.923929,A,0,1,0.495536,3,2,1,11-1,control
1,178.610565,173.386388,173.195935,180.831845,11.664751,2.711829,B,1,4,0.504472,1,4,1,44-1,control
2,180.382301,178.563579,174.520065,181.571024,7.183178,-0.453939,A,2,1,0.504472,2,2,1,15-1,target
3,180.954018,179.73208,175.731245,182.840388,-4.682656,6.358723,B,3,4,0.504472,3,2,0,45-1,control
4,167.590042,167.684244,166.109779,170.189261,9.134517,-1.153977,B,4,3,0.495536,3,2,1,30-1,target
