In [1]:
import pySubgroup

In [2]:
import numpy as np
import pandas as pd

## Generate some data for testing

The code below generates a pandas dataframe with a variety of different data types 

In [3]:
df2 = pd.DataFrame(
    {
        "A": pd.Series(np.random.randn(100), dtype="float16"),
        "B": pd.Series(np.random.randn(100)),
        "C": pd.Series(np.array(np.random.randn(100), dtype="uint8")),
        "D": ["foo"]*50 + ["bar"]*50,
        "E": pd.date_range("2018-01-01", periods=100, freq="H"),
        "G": ([1]*5 + [10]*3 + [5,5]) * 10,
        "H": [pd.Timedelta(hours=x) for x in [1,3,2,6,5,2,5,9,1,6]*10],
        "I": [True, False] * 50
    }
)

In [4]:
df2

Unnamed: 0,A,B,C,D,E,G,H,I
0,-0.083740,0.029423,1,foo,2018-01-01 00:00:00,1,01:00:00,True
1,1.768555,-1.297747,0,foo,2018-01-01 01:00:00,1,03:00:00,False
2,1.403320,2.431297,255,foo,2018-01-01 02:00:00,1,02:00:00,True
3,-0.142212,1.137068,0,foo,2018-01-01 03:00:00,1,06:00:00,False
4,-0.391602,0.334579,0,foo,2018-01-01 04:00:00,1,05:00:00,True
...,...,...,...,...,...,...,...,...
95,0.140259,0.413646,0,bar,2018-01-04 23:00:00,10,02:00:00,False
96,0.958984,-1.093116,0,bar,2018-01-05 00:00:00,10,05:00:00,True
97,0.764648,-1.096593,0,bar,2018-01-05 01:00:00,10,09:00:00,False
98,0.919922,0.239081,0,bar,2018-01-05 02:00:00,5,01:00:00,True


## Discover subgroups

Below is the minimal code to perform subgroup discovery on a pandas dataframe (and then convert the results into a new dataframe for easier viewing).

In [6]:
pySubgroup.discover_subgroups(df2, "A").to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,D = foo AND H < 0 days 03:40:00,0.338252,25.0,9.647388
1,B < -0.68 AND C < 85.00,0.375562,20.0,8.464102
2,B < -0.68,0.287661,23.0,7.712013
3,B < -0.68 AND I = False AND C < 85.00,0.883057,8.0,7.445602
4,G < 4.00 AND C < 85.00,0.121935,43.0,7.291884
5,B < -0.68 AND 0 days 03:40:00 <= H < 0 days 06...,0.747057,9.0,7.152303
6,D = foo,0.093961,50.0,7.080238
7,G < 4.00 AND I = True,0.186336,30.0,7.01938
8,G < 4.00 AND D = foo AND H < 0 days 03:40:00,0.419714,15.0,7.010369
9,170.00 <= C AND H < 0 days 03:40:00 AND D = foo,0.827255,8.0,6.999191


### Examples using more parameters

The below examples show the use of a more complex combination of parameters

In [12]:
pySubgroup.discover_subgroups(
    df2, 
    target="I",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="bsd",
    qf="gain",
    k=100,
    minqual=0,
    minsize=0,
    mintp=1,
    max_selectors=7,
    ignore_defaults=True,
    filter_irrelevant=True,
    postfilter="weighted_covering",
    postfilter_param=5
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,H < 0 days 02:36:00,0.75,40.0,0.124511
1,-1.30 <= B < -0.37 AND H < 0 days 02:36:00,0.909091,11.0,0.068232
2,-0.58 <= A < 0.41 AND H < 0 days 02:36:00,0.857143,14.0,0.065573
3,-1.30 <= B < -0.37 AND -0.58 <= A < 0.41,0.875,8.0,0.03934
4,0.41 <= A < 1.39 AND H < 0 days 02:36:00,0.75,12.0,0.0256


In [16]:
pySubgroup.discover_subgroups(
    df2, 
    target="A",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="beam",
    qf="wracc",
    k=50,
    minqual=0,
    minsize=2,
    mintp=0,
    max_selectors=7,
    ignore_defaults=False,
    filter_irrelevant=False,
    postfilter="min_improve_global",
    postfilter_param=0.3
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,-1.30 <= B < -0.37,0.273844,26.0,0.083587
1,2018-01-01 19:48:00 <= E < 2018-01-02 15:36:00...,1.215495,3.0,0.037894
2,0 days 02:36:00 <= H < 0 days 04:12:00,0.253949,10.0,0.030159
3,2018-01-02 15:36:00 <= E < 2018-01-03 11:24:00...,0.638306,4.0,0.027438
4,H < 0 days 02:36:00 AND -0.37 <= B < 0.57 AND ...,0.484155,5.0,0.02659
5,B < -1.30 AND I = False,0.837565,3.0,0.026556
6,-1.30 <= B < -0.37 AND E < 2018-01-01 19:48:00,0.607468,4.0,0.026204
7,0 days 02:36:00 <= H < 0 days 04:12:00 AND E <...,1.175293,2.0,0.024459
8,0.57 <= B < 1.50 AND 0 days 02:36:00 <= H < 0 ...,1.074707,2.0,0.022447
9,0.57 <= B < 1.50 AND 2018-01-03 11:24:00 <= E ...,0.493958,4.0,0.021664


### Selecting rows from a dataframe based on a subgroup

After finding subgroups, it is possible to use them to select rows from a dataframe (i.e. select rows matching the pattern). 

This can be done on the same dataframe that was originally used to discover subgroups, or another dataframe with the same columns (which might be useful if, e.g., using a train/test split). 

In [24]:
results = pySubgroup.discover_subgroups(df2, "A")

## Select rows based on the first subgroup
print(results.subgroups[0])

results.subgroups[0].get_rows(df2)

D = foo AND H < 0 days 03:40:00


Unnamed: 0,A,B,C,D,E,G,H,I
0,-0.08374,0.029423,1,foo,2018-01-01 00:00:00,1,01:00:00,True
1,1.768555,-1.297747,0,foo,2018-01-01 01:00:00,1,03:00:00,False
2,1.40332,2.431297,255,foo,2018-01-01 02:00:00,1,02:00:00,True
5,0.227783,-0.111382,255,foo,2018-01-01 05:00:00,10,02:00:00,False
8,1.134766,-0.482188,0,foo,2018-01-01 08:00:00,5,01:00:00,True
10,-0.38623,-2.233425,0,foo,2018-01-01 10:00:00,1,01:00:00,True
11,0.582031,-0.340126,255,foo,2018-01-01 11:00:00,1,03:00:00,False
12,-1.076172,1.603681,0,foo,2018-01-01 12:00:00,1,02:00:00,True
15,-1.116211,2.259045,1,foo,2018-01-01 15:00:00,10,02:00:00,False
18,0.078308,-0.592756,0,foo,2018-01-01 18:00:00,5,01:00:00,True
