In [1]:
import sd4py

In [2]:
import numpy as np
import pandas as pd

## Generate some data

In order to showcase sd4py, we need some data in a Pandas dataframe. 

The code below just generates a dataframe with a variety of different data types.  

In [44]:
df2 = pd.DataFrame(
    {
        "A": pd.Series(np.random.randn(100), dtype="float16"),
        "B": pd.Series(np.random.randn(100)),
        "C": pd.Series(np.array(np.random.randn(100), dtype="uint8")),
        "D": ["foo"]*50 + ["bar"]*50,
        "E": pd.date_range("2018-01-01", periods=100, freq="H"),
        "F": ["red", "yellow", "green", "blue","magenta"] * 20,
        "G": ([1]*5 + [10]*3 + [5,5]) * 10,
        "H": [pd.Timedelta(hours=x) for x in [1,3,2,6,5,2,5,9,1,6]*10],
        "I": [True, False] * 50
    }
)

In [45]:
df2.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I
0,1.040039,-0.768964,0,foo,2018-01-01 00:00:00,red,1,0 days 01:00:00,True
1,1.569336,-0.030586,0,foo,2018-01-01 01:00:00,yellow,1,0 days 03:00:00,False
2,-0.392578,1.356334,0,foo,2018-01-01 02:00:00,green,1,0 days 02:00:00,True
3,-0.662598,0.559898,0,foo,2018-01-01 03:00:00,blue,1,0 days 06:00:00,False
4,-0.717773,-0.147408,254,foo,2018-01-01 04:00:00,magenta,1,0 days 05:00:00,True


In [47]:
df2.E < pd.to_datetime('2018-01-02')

0      True
1      True
2      True
3      True
4      True
      ...  
95    False
96    False
97    False
98    False
99    False
Name: E, Length: 100, dtype: bool

In [48]:
df2.E.values < pd.to_datetime('2018-01-02')

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [38]:
df2.H < pd.Timedelta(hours=5)

0      True
1      True
2      True
3     False
4     False
      ...  
95     True
96    False
97    False
98     True
99    False
Name: H, Length: 100, dtype: bool

In [39]:
df2.H.values < pd.Timedelta(hours=5)

array([ True,  True,  True, False, False,  True, False, False,  True,
       False,  True,  True,  True, False, False,  True, False, False,
        True, False,  True,  True,  True, False, False,  True, False,
       False,  True, False,  True,  True,  True, False, False,  True,
       False, False,  True, False,  True,  True,  True, False, False,
        True, False, False,  True, False,  True,  True,  True, False,
       False,  True, False, False,  True, False,  True,  True,  True,
       False, False,  True, False, False,  True, False,  True,  True,
        True, False, False,  True, False, False,  True, False,  True,
        True,  True, False, False,  True, False, False,  True, False,
        True,  True,  True, False, False,  True, False, False,  True,
       False])

## Discover subgroups

With a pandas dataframe containing the data we are interested in, performing subgroup discovery is quite easy. Below is the minimal code to perform subgroup discovery on a pandas dataframe. 

In [33]:
subgroups = sd4py.discover_subgroups(df2, "A")

SDMapNum
Time - took: 5 ms. (4 ms. CPU time)
Steps: 177


The results from the `discover_subgroups()` function are returned in a custom `PySubgroupResults` object, which has three important attributes: 

 * `subgroups`, a list of custom `PySubgroup` objects 
 * `population_value`, the average target value of the full dataset, and
 * `population_size`, the size of the full dataset. 

It also has a convenient `to_dataframe()` function that converts the results into a dataframe for easy viewing. 

A `PySubgroup` object contains the quality, subgroup size, average target value, and a list of custom selector objects. 

The selector objects come in two varieties. `PyNominalSelector` objects contain a column name and a corresponding value. `PyNumericSelector` objects contain a column name, a minimum value, and a maximum value.

Now, let's call the `PySubgroupResults.to_df()` function to take a look at the results.  

In [34]:
subgroups.to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,I = False AND D = bar AND C < 85.00,0.467548,24.0,9.940075
1,2018-01-03 18:00:00 <= E AND I = False AND C <...,0.658218,16.0,9.677449
2,0 days 03:40:00 <= H < 0 days 06:20:00 AND I =...,0.53392,20.0,9.61085
3,0 days 03:40:00 <= H < 0 days 06:20:00 AND I =...,0.53392,20.0,9.61085
4,I = False AND D = bar,0.43618,25.0,9.570046
5,2018-01-03 18:00:00 <= E AND I = False,0.600873,17.0,9.307421
6,2018-01-03 18:00:00 <= E AND I = False AND D =...,0.600873,17.0,9.307421
7,I = False AND C < 85.00,0.248967,45.0,8.801502
8,2018-01-02 09:00:00 <= E < 2018-01-03 18:00:00...,0.54695,17.0,8.390734
9,I = False AND G < 4.00 AND D = bar,0.889326,10.0,8.359483


## Selecting rows from a dataframe based on a subgroup

After finding subgroups, it is possible to use them to select rows from a dataframe (i.e. select rows matching the pattern). 

This can be done on the same dataframe that was originally used to discover subgroups, or another dataframe with the same columns (which might be useful if, e.g., using a train/test split). 

In [32]:
results = sd4py.discover_subgroups(df2, "A")

## Select rows based on the first subgroup
print(results.subgroups[0])

results.subgroups[0].get_rows(df2)

SDMapNum
Time - took: 4 ms. (3 ms. CPU time)
Steps: 177
I = False AND D = bar AND C < 85.00


Unnamed: 0,A,B,C,D,E,F,G,H,I
51,2.0,0.363034,0,bar,2018-01-03 03:00:00,yellow,1,0 days 03:00:00,False
53,0.133057,-0.669227,0,bar,2018-01-03 05:00:00,blue,1,0 days 06:00:00,False
55,-1.022461,-1.159036,0,bar,2018-01-03 07:00:00,red,10,0 days 02:00:00,False
57,-0.277832,-1.993315,0,bar,2018-01-03 09:00:00,green,10,0 days 09:00:00,False
59,-0.859863,1.254929,0,bar,2018-01-03 11:00:00,magenta,5,0 days 06:00:00,False
61,-0.006886,-0.388429,0,bar,2018-01-03 13:00:00,yellow,1,0 days 03:00:00,False
63,0.389404,1.875706,0,bar,2018-01-03 15:00:00,blue,1,0 days 06:00:00,False
65,0.334229,-0.03745,1,bar,2018-01-03 17:00:00,red,10,0 days 02:00:00,False
67,1.233398,0.489307,1,bar,2018-01-03 19:00:00,green,10,0 days 09:00:00,False
69,0.435059,1.359281,0,bar,2018-01-03 21:00:00,magenta,5,0 days 06:00:00,False


## Examples using more parameters

The below examples show the use of a more complex combination of parameters

In [8]:
sd4py.discover_subgroups(
    df2, 
    target="I",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="bsd",
    qf="gain",
    k=100,
    minqual=0,
    minsize=0,
    mintp=1,
    max_selectors=7,
    ignore_defaults=True,
    filter_irrelevant=True,
    postfilter="weighted_covering",
    postfilter_param=5
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,H < 0 days 02:36:00,0.75,40.0,0.124511
1,-1.10 <= B < -0.22 AND H < 0 days 02:36:00,0.818182,11.0,0.038731
2,-0.61 <= A < 0.44 AND H < 0 days 02:36:00,0.705882,17.0,0.025687
3,-1.66 <= A < -0.61 AND -0.22 <= B < 0.66,0.222222,9.0,0.023204
4,0.44 <= A < 1.49 AND H < 0 days 02:36:00,0.727273,11.0,0.019039


In [9]:
sd4py.discover_subgroups(
    df2, 
    target="A",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="beam",
    qf="wracc",
    k=50,
    minqual=0,
    minsize=2,
    mintp=0,
    max_selectors=7,
    ignore_defaults=False,
    filter_irrelevant=False,
    postfilter="min_improve_global",
    postfilter_param=0.3
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,E < 2018-01-01 19:48:00,0.263303,20.0,0.062605
1,E < 2018-01-01 19:48:00 AND -0.22 <= B < 0.66,1.089929,5.0,0.056983
2,E < 2018-01-01 19:48:00 AND 0 days 05:48:00 <=...,0.802856,4.0,0.034103
3,B < -1.10,0.300537,9.0,0.031524
4,B < -1.10 AND 2018-01-02 15:36:00 <= E < 2018-...,0.70752,4.0,0.03029
5,H < 0 days 02:36:00 AND 0.66 <= B < 1.55,0.508496,5.0,0.027911
6,I = True AND 2018-01-02 15:36:00 <= E < 2018-0...,0.95459,2.0,0.020086
7,2018-01-02 15:36:00 <= E < 2018-01-03 11:24:00...,0.394043,4.0,0.017751
8,B < -1.10 AND 0 days 04:12:00 <= H < 0 days 05...,0.773926,2.0,0.016473
9,2018-01-04 07:12:00 <= E AND -1.10 <= B < -0.2...,0.631531,2.0,0.013625


In [10]:
sd4py.discover_subgroups(
    df2, 
    target="I",
    included_attributes=["B", "E", "H", "I", "A"],
    nbins=5,
    method="bsd",
    qf="chi2",
    k=10,
    minqual=0,
    minsize=0,
    mintp=10
).to_df()

Unnamed: 0,pattern,target_quantity,size,quality
0,0 days 04:12:00 <= H < 0 days 05:48:00,1.0,20.0,25.0
1,H < 0 days 02:36:00,0.75,40.0,16.666667
2,-0.61 <= A < 0.44 AND H < 0 days 02:36:00,0.705882,17.0,3.472714
3,-0.22 <= B < 0.66 AND H < 0 days 02:36:00,0.666667,18.0,2.439024
4,-1.10 <= B < -0.22,0.4,35.0,2.153846
5,-0.22 <= B < 0.66 AND -0.61 <= A < 0.44,0.666667,15.0,1.960784
6,-1.66 <= A < -0.61,0.384615,26.0,1.871102
7,0.66 <= B < 1.55,0.578947,19.0,0.584795
8,0.44 <= A < 1.49,0.555556,27.0,0.456621
9,-0.22 <= B < 0.66,0.484848,33.0,0.045228
