In [1]:
import sd4py

In [2]:
import numpy as np
import pandas as pd

## Generate some data

In order to showcase sd4py, we need some data in a Pandas dataframe. 

The code below just generates a dataframe with a variety of different data types.  

In [3]:
df2 = pd.DataFrame(
    {
        "A": pd.Series(np.random.randn(100), dtype="float16"),
        "B": pd.Series(np.random.randn(100)),
        "C": pd.Series(np.array(np.random.randn(100), dtype="uint8")),
        "D": ["foo"]*50 + ["bar"]*50,
        "E": pd.date_range("2018-01-01", periods=100, freq="H"),
        "G": ([1]*5 + [10]*3 + [5,5]) * 10,
        "H": [pd.Timedelta(hours=x) for x in [1,3,2,6,5,2,5,9,1,6]*10],
        "I": [True, False] * 50
    }
)

In [4]:
df2.head()

Unnamed: 0,A,B,C,D,E,G,H,I
0,-1.708984,-0.505487,1,foo,2018-01-01 00:00:00,1,0 days 01:00:00,True
1,-1.18457,0.725483,0,foo,2018-01-01 01:00:00,1,0 days 03:00:00,False
2,2.269531,-0.99616,0,foo,2018-01-01 02:00:00,1,0 days 02:00:00,True
3,2.544922,0.079119,1,foo,2018-01-01 03:00:00,1,0 days 06:00:00,False
4,0.773438,-1.9845,0,foo,2018-01-01 04:00:00,1,0 days 05:00:00,True


## Discover subgroups

With a pandas dataframe containing the data we are interested in, performing subgroup discovery is quite easy. Below is the minimal code to perform subgroup discovery on a pandas dataframe. 

In [5]:
subgroups = sd4py.discover_subgroups(df2, "A")

SDMapNum
Time - took: 95 ms. (48 ms. CPU time)
Steps: 166


The results from the `discover_subgroups()` function are returned in a custom `PySubgroupResults` object, which has three important attributes: 

 * `subgroups`, a list of custom `PySubgroup` objects 
 * `population_value`, the average target value of the full dataset, and
 * `population_size`, the size of the full dataset. 

It also has a convenient `to_dataframe()` function that converts the results into a dataframe for easy viewing. 

A `PySubgroup` object contains the quality, subgroup size, average target value, and a list of custom selector objects. 

The selector objects come in two varieties. `PyNominalSelector` objects contain a column name and a corresponding value. `PyNumericSelector` objects contain a column name, a minimum value, and a maximum value.

Now, let's call the `PySubgroupResults.to_df()` function to take a look at the results.  

In [6]:
subgroups.to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,2018-01-02 09:00:00 <= E < 2018-01-03 18:00:00...,0.423735,16.0,7.575341
1,B < -0.51,0.178044,33.0,7.516354
2,2018-01-02 09:00:00 <= E < 2018-01-03 18:00:00...,0.538943,12.0,7.064005
3,B < -0.51 AND I = True AND C < 85.00,0.463708,13.0,6.674612
4,B < -0.51 AND C < 85.00,0.195395,27.0,6.618217
5,B < -0.51 AND I = True,0.361532,15.0,6.168836
6,2018-01-02 09:00:00 <= E < 2018-01-03 18:00:00...,0.695988,8.0,5.965695
7,G < 4.00 AND C < 85.00,0.085519,42.0,5.680225
8,B < -0.51 AND D = bar,0.239908,19.0,5.503016
9,E < 2018-01-02 09:00:00 AND I = True AND H < 0...,0.42818,11.0,5.256942


## Selecting rows from a dataframe based on a subgroup

After finding subgroups, it is possible to use them to select rows from a dataframe (i.e. select rows matching the pattern). 

This can be done on the same dataframe that was originally used to discover subgroups, or another dataframe with the same columns (which might be useful if, e.g., using a train/test split). 

In [7]:
results = sd4py.discover_subgroups(df2, "A")

## Select rows based on the first subgroup
print(results.subgroups[0])

results.subgroups[0].get_rows(df2)

SDMapNum
Time - took: 19 ms. (19 ms. CPU time)
Steps: 166
2018-01-02 09:00:00 <= E < 2018-01-03 18:00:00 AND B < -0.51


Unnamed: 0,A,B,C,D,E,G,H,I
33,0.111511,-0.560857,255,foo,2018-01-02 09:00:00,1,0 days 06:00:00,False
34,0.155029,-1.069154,255,foo,2018-01-02 10:00:00,1,0 days 05:00:00,True
36,-0.758789,-0.54593,0,foo,2018-01-02 12:00:00,10,0 days 05:00:00,True
37,1.257812,-0.877017,0,foo,2018-01-02 13:00:00,10,0 days 09:00:00,False
38,0.678711,-0.871467,0,foo,2018-01-02 14:00:00,5,0 days 01:00:00,True
40,-0.760254,-1.778476,255,foo,2018-01-02 16:00:00,1,0 days 01:00:00,True
41,-0.246582,-0.896526,0,foo,2018-01-02 17:00:00,1,0 days 03:00:00,False
44,0.774414,-1.687122,0,foo,2018-01-02 20:00:00,1,0 days 05:00:00,True
52,2.009766,-1.212723,1,bar,2018-01-03 04:00:00,1,0 days 02:00:00,True
55,0.806152,-1.241968,255,bar,2018-01-03 07:00:00,10,0 days 02:00:00,False


## Examples using more parameters

The below examples show the use of a more complex combination of parameters

In [8]:
sd4py.discover_subgroups(
    df2, 
    target="I",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="bsd",
    qf="gain",
    k=100,
    minqual=0,
    minsize=0,
    mintp=1,
    max_selectors=7,
    ignore_defaults=True,
    filter_irrelevant=True,
    postfilter="weighted_covering",
    postfilter_param=5
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,H < 0 days 02:36:00,0.75,40.0,0.124511
1,-1.10 <= B < -0.22 AND H < 0 days 02:36:00,0.818182,11.0,0.038731
2,-0.61 <= A < 0.44 AND H < 0 days 02:36:00,0.705882,17.0,0.025687
3,-1.66 <= A < -0.61 AND -0.22 <= B < 0.66,0.222222,9.0,0.023204
4,0.44 <= A < 1.49 AND H < 0 days 02:36:00,0.727273,11.0,0.019039


In [9]:
sd4py.discover_subgroups(
    df2, 
    target="A",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="beam",
    qf="wracc",
    k=50,
    minqual=0,
    minsize=2,
    mintp=0,
    max_selectors=7,
    ignore_defaults=False,
    filter_irrelevant=False,
    postfilter="min_improve_global",
    postfilter_param=0.3
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,E < 2018-01-01 19:48:00,0.263303,20.0,0.062605
1,E < 2018-01-01 19:48:00 AND -0.22 <= B < 0.66,1.089929,5.0,0.056983
2,E < 2018-01-01 19:48:00 AND 0 days 05:48:00 <=...,0.802856,4.0,0.034103
3,B < -1.10,0.300537,9.0,0.031524
4,B < -1.10 AND 2018-01-02 15:36:00 <= E < 2018-...,0.70752,4.0,0.03029
5,H < 0 days 02:36:00 AND 0.66 <= B < 1.55,0.508496,5.0,0.027911
6,I = True AND 2018-01-02 15:36:00 <= E < 2018-0...,0.95459,2.0,0.020086
7,2018-01-02 15:36:00 <= E < 2018-01-03 11:24:00...,0.394043,4.0,0.017751
8,B < -1.10 AND 0 days 04:12:00 <= H < 0 days 05...,0.773926,2.0,0.016473
9,2018-01-04 07:12:00 <= E AND -1.10 <= B < -0.2...,0.631531,2.0,0.013625


In [10]:
sd4py.discover_subgroups(
    df2, 
    target="I",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="bsd",
    qf="chi2",
    k=10,
    minqual=0,
    minsize=0,
    mintp=10
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,0 days 04:12:00 <= H < 0 days 05:48:00,1.0,20.0,25.0
1,H < 0 days 02:36:00,0.75,40.0,16.666667
2,-0.61 <= A < 0.44 AND H < 0 days 02:36:00,0.705882,17.0,3.472714
3,-0.22 <= B < 0.66 AND H < 0 days 02:36:00,0.666667,18.0,2.439024
4,-1.10 <= B < -0.22,0.4,35.0,2.153846
5,-0.22 <= B < 0.66 AND -0.61 <= A < 0.44,0.666667,15.0,1.960784
6,-1.66 <= A < -0.61,0.384615,26.0,1.871102
7,0.66 <= B < 1.55,0.578947,19.0,0.584795
8,0.44 <= A < 1.49,0.555556,27.0,0.456621
9,-0.22 <= B < 0.66,0.484848,33.0,0.045228
