In [2]:
import sd4py

In [3]:
import numpy as np
import pandas as pd

## Generate some data for testing

The code below generates a pandas dataframe with a variety of different data types 

In [4]:
df2 = pd.DataFrame(
    {
        "A": pd.Series(np.random.randn(100), dtype="float16"),
        "B": pd.Series(np.random.randn(100)),
        "C": pd.Series(np.array(np.random.randn(100), dtype="uint8")),
        "D": ["foo"]*50 + ["bar"]*50,
        "E": pd.date_range("2018-01-01", periods=100, freq="H"),
        "G": ([1]*5 + [10]*3 + [5,5]) * 10,
        "H": [pd.Timedelta(hours=x) for x in [1,3,2,6,5,2,5,9,1,6]*10],
        "I": [True, False] * 50
    }
)

In [5]:
df2

Unnamed: 0,A,B,C,D,E,G,H,I
0,-0.023346,-0.908445,0,foo,2018-01-01 00:00:00,1,0 days 01:00:00,True
1,-1.632812,1.654652,0,foo,2018-01-01 01:00:00,1,0 days 03:00:00,False
2,1.100586,-1.161146,0,foo,2018-01-01 02:00:00,1,0 days 02:00:00,True
3,-0.855469,-1.077292,0,foo,2018-01-01 03:00:00,1,0 days 06:00:00,False
4,0.292725,-1.360608,1,foo,2018-01-01 04:00:00,1,0 days 05:00:00,True
...,...,...,...,...,...,...,...,...
95,-0.319336,-1.231090,0,bar,2018-01-04 23:00:00,10,0 days 02:00:00,False
96,0.202759,-0.207234,0,bar,2018-01-05 00:00:00,10,0 days 05:00:00,True
97,-0.211304,-0.261575,2,bar,2018-01-05 01:00:00,10,0 days 09:00:00,False
98,-1.298828,0.682877,0,bar,2018-01-05 02:00:00,5,0 days 01:00:00,True


## Discover subgroups

Below is the minimal code to perform subgroup discovery on a pandas dataframe (and then convert the results into a new dataframe for easier viewing).

In [7]:
sd4py.discover_subgroups(df2, "A").to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,I = False AND -0.39 <= B < 1.20,0.510049,22.0,8.69186
1,7.00 <= G AND C < 85.00,0.398511,25.0,7.088667
2,B < -0.39 AND D = foo,0.419648,22.0,6.703037
3,-0.39 <= B < 1.20 AND D = foo,0.446286,20.0,6.626439
4,7.00 <= G AND E < 2018-01-02 09:00:00,0.837023,9.0,6.498527
5,7.00 <= G AND E < 2018-01-02 09:00:00 AND D = foo,0.837023,9.0,6.498527
6,D = foo,0.240161,50.0,6.259861
7,7.00 <= G AND E < 2018-01-02 09:00:00 AND C < ...,0.871826,8.0,6.054897
8,E < 2018-01-02 09:00:00 AND -0.39 <= B < 1.20,0.57909,13.0,6.033635
9,E < 2018-01-02 09:00:00 AND -0.39 <= B < 1.20 ...,0.57909,13.0,6.033635


### Examples using more parameters

The below examples show the use of a more complex combination of parameters

In [9]:
sd4py.discover_subgroups(
    df2, 
    target="I",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="bsd",
    qf="gain",
    k=100,
    minqual=0,
    minsize=0,
    mintp=1,
    max_selectors=7,
    ignore_defaults=True,
    filter_irrelevant=True,
    postfilter="weighted_covering",
    postfilter_param=5
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,H < 0 days 02:36:00,0.75,40.0,0.124511
1,-0.07 <= B < 0.88 AND H < 0 days 02:36:00,0.8125,16.0,0.057214
2,-0.52 <= A < 0.39 AND H < 0 days 02:36:00,0.833333,12.0,0.047251
3,-1.03 <= B < -0.07 AND H < 0 days 02:36:00,0.818182,11.0,0.038731
4,-1.43 <= A < -0.52 AND H < 0 days 02:36:00,0.857143,7.0,0.030523


In [16]:
sd4py.discover_subgroups(
    df2, 
    target="A",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="beam",
    qf="wracc",
    k=50,
    minqual=0,
    minsize=2,
    mintp=0,
    max_selectors=7,
    ignore_defaults=False,
    filter_irrelevant=False,
    postfilter="min_improve_global",
    postfilter_param=0.3
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,-1.30 <= B < -0.37,0.273844,26.0,0.083587
1,2018-01-01 19:48:00 <= E < 2018-01-02 15:36:00...,1.215495,3.0,0.037894
2,0 days 02:36:00 <= H < 0 days 04:12:00,0.253949,10.0,0.030159
3,2018-01-02 15:36:00 <= E < 2018-01-03 11:24:00...,0.638306,4.0,0.027438
4,H < 0 days 02:36:00 AND -0.37 <= B < 0.57 AND ...,0.484155,5.0,0.02659
5,B < -1.30 AND I = False,0.837565,3.0,0.026556
6,-1.30 <= B < -0.37 AND E < 2018-01-01 19:48:00,0.607468,4.0,0.026204
7,0 days 02:36:00 <= H < 0 days 04:12:00 AND E <...,1.175293,2.0,0.024459
8,0.57 <= B < 1.50 AND 0 days 02:36:00 <= H < 0 ...,1.074707,2.0,0.022447
9,0.57 <= B < 1.50 AND 2018-01-03 11:24:00 <= E ...,0.493958,4.0,0.021664


### Selecting rows from a dataframe based on a subgroup

After finding subgroups, it is possible to use them to select rows from a dataframe (i.e. select rows matching the pattern). 

This can be done on the same dataframe that was originally used to discover subgroups, or another dataframe with the same columns (which might be useful if, e.g., using a train/test split). 

In [10]:
results = sd4py.discover_subgroups(df2, "A")

## Select rows based on the first subgroup
print(results.subgroups[0])

results.subgroups[0].get_rows(df2)

NameError: name 'pySubgroup' is not defined