# Prepare the demo on 21st of March

Subgroup Discovery with the frequent sugars (9) and the frequent pairs of genes (43)

In [26]:
from Serotype_Data import * #Dataset of Serotypes
from Serotype_Functions import *
from subgroup_dataset import *

#Biopython ver.1.76
from Bio.Seq import Seq #Represent biological sequences with alphabets
from Bio.pairwise2 import format_alignment #Functions to get global and local alignments between two sequences
from Bio import pairwise2

from IPython.display import Image

import numpy as np
import pandas as pd
import tabulate

In [27]:
import json
import os

In [28]:
# Load the serotype records from the JSON file next to this notebook.
path = os.path.relpath('./serotypes.json')
# The context manager closes the file handle as soon as parsing finishes;
# previously the handle stayed open for the lifetime of the kernel.
with open(path, 'r') as f:
    serotypes = json.load(f)

In [29]:
# Gather every distinct simplified gene name over all serotypes.
total_genes = []  # initialised here but not filled in this cell
sets = set()

for key, record in serotypes.items():
    # Simplified gene sequence of this serotype
    simply_genes = simplify_genes(record['genes'])
    sets.update(simply_genes)

In [30]:
# Ordered list of the distinct adjacent gene pairs ("left-right"), taken
# only from serotypes that have a sugar structure.
total_pairgenes = []

for key in serotypes:
    entry = serotypes[key]
    # Serotypes without any sugar structure are skipped entirely.
    if len(entry['sugars']) == 0:
        continue

    # Simplify the gene sequence of this serotype.
    simply_genes = simplify_genes(entry['genes'])
    # Local handle on the sugars (not used further in this cell).
    lin_sugars = entry['sugars']

    # Join each gene with its successor, in sequence order.
    p_genes = [
        left + '-' + right
        for left, right in zip(simply_genes, simply_genes[1:])
    ]

    # Keep only the first occurrence of every pair.
    for pairgene in p_genes:
        if pairgene not in total_pairgenes:
            total_pairgenes.append(pairgene)
            

In [31]:
# Renumber the rows from 0: reset_index() moves the old index into an
# 'index' column, which is then dropped again.
# NOTE(review): eli_total_pairgenes_df is not created in any cell visible
# here - confirm it is defined before this cell on a fresh kernel.
eli_total_pairgenes_df = (
    eli_total_pairgenes_df
    .reset_index()
    .drop(columns=['index'])
)

In [32]:
# Give every entry of the 'Pair of genes' column its own list of nine zeros.
# NOTE(review): nine presumably matches the number of frequent sugars - confirm.
li_pairgenes = {
    pair: [0] * 9
    for pair in eli_total_pairgenes_df['Pair of genes']
}

In [73]:
# Turn every key of li_pairgenes into a "first-second" column label.
# Each key is indexed with [0] and [1], so it is presumably a two-item pair
# of gene names - TODO confirm.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom to confirm the order works.
pairgenes = [pair[0] + '-' + pair[1] for pair in li_pairgenes]

In [34]:
# Empty containers for the training table: one list of flags per gene-pair
# label, plus a single list for the outcome labels.
outcome = {'outcome': []}
data_dict = {pair_label: [] for pair_label in pairgenes}

In [35]:
# Sugars of interest: the nine sugars used as outcome labels.
soi = [
    'galactose',
    'glucose',
    'rhamnose',
    'N-acetylglucosamine',
    'N-acetylgalactosamine',
    'N-acetylmannosamine',
    'ribitol',
    'glucuronic acid',
    'N-acetylfucosamine',
]

In [36]:
# Map each sugar of interest to its position in soi (0, 1, 2, ...).
outcome_dict = {sugar: code for code, sugar in enumerate(soi)}

In [37]:
# Display the sugar -> integer code mapping.
outcome_dict

{'galactose': 0,
 'glucose': 1,
 'rhamnose': 2,
 'N-acetylglucosamine': 3,
 'N-acetylgalactosamine': 4,
 'N-acetylmannosamine': 5,
 'ribitol': 6,
 'glucuronic acid': 7,
 'N-acetylfucosamine': 8}

In [38]:
# Build one training row per (serotype, frequent sugar) combination: a 0/1
# flag for every gene-pair column of data_dict, and the sugar name appended
# to outcome['outcome'] as the label of that row.

# Start from empty columns so that re-running this cell rebuilds the rows
# instead of appending a second copy to the lists left by the previous run.
data_dict = {column: [] for column in data_dict}
outcome = {'outcome': []}

for key in serotypes:
    simply_genes = simplify_genes(serotypes[key]['genes'])
    # Adjacent gene pairs of this serotype, written as "left-right".
    p_genes = [
        left + '-' + right
        for left, right in zip(simply_genes, simply_genes[1:])
    ]
    # Set copy so the per-column membership test below is a hash lookup.
    present_pairs = set(p_genes)

    sugars = serotypes[key]['sugars']
    for sugar in sugars:
        # Only the frequent sugars are used as outcomes.
        if sugar not in outcome_dict:
            continue
        # The lists are mutated in place, so no re-assignment is needed.
        for column, flags in data_dict.items():
            flags.append(1 if column in present_pairs else 0)
        outcome['outcome'].append(sugar)

In [39]:
# Feature matrix: one column per gene-pair label, one row per appended flag.
# from_dict is a classmethod, so it is called on the class itself rather
# than on a throwaway empty DataFrame instance.
df = pd.DataFrame.from_dict(data_dict)

In [40]:
# Display the gene-pair feature matrix.
df

Unnamed: 0,rmlA-rmlC,rmlC-rmlB,rmlB-rmlD,wchA-wchF,wchJ-wchK,wchA-wchJ,rmlD-glf-,wchA-wchO,wchO-wchP,wchA-wciB,wchP-wchQ,wchM-wchN,wcyS-wcrN,wchK-wchL,wchL-wchM,fnlB-fnlC,gtp1-gtp2,wchX-gtp1,gtp2-gtp3,wchF-wciU,wciV-wciW,wchO-wcjA,wciY-gct,wcjA-mnaA,gct-HG94-,mnaA-wcjB,wcjB-wcjC,HG94--rmlA,fnlA-fnlB,mnaA-rmlA,wchQ-wcyS,wcyI-wchQ,wcyH-wcyI,wcwT-wcwU,wchF-wcyH,wcrL-wcwT,wciI-wciJ,wchQ-wchR,wchK-wcyK,wciG-glf,rbsF-mnaA,wchS-rbsF,wchR-wchS
0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [41]:
# Attach the sugar label of every row as the 'outcome' column.
# Note: this modifies df in place, so the earlier display of df is stale.
df['outcome'] = outcome['outcome']

In [42]:
# Display the feature matrix again, now including the 'outcome' column.
df

Unnamed: 0,rmlA-rmlC,rmlC-rmlB,rmlB-rmlD,wchA-wchF,wchJ-wchK,wchA-wchJ,rmlD-glf-,wchA-wchO,wchO-wchP,wchA-wciB,wchP-wchQ,wchM-wchN,wcyS-wcrN,wchK-wchL,wchL-wchM,fnlB-fnlC,gtp1-gtp2,wchX-gtp1,gtp2-gtp3,wchF-wciU,wciV-wciW,wchO-wcjA,wciY-gct,wcjA-mnaA,gct-HG94-,mnaA-wcjB,wcjB-wcjC,HG94--rmlA,fnlA-fnlB,mnaA-rmlA,wchQ-wcyS,wcyI-wchQ,wcyH-wcyI,wcwT-wcwU,wchF-wcyH,wcrL-wcwT,wciI-wciJ,wchQ-wchR,wchK-wcyK,wciG-glf,rbsF-mnaA,wchS-rbsF,wchR-wchS,outcome
0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,glucose
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,rhamnose
2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,rhamnose
3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,rhamnose
4,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,glucose
5,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,glucuronic acid
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,glucose
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,glucuronic acid
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,N-acetylgalactosamine
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,N-acetylfucosamine


In [43]:
import pysubgroup as ps

In [44]:
# One-hot encode df: the string-valued 'outcome' column becomes one
# 'outcome_<sugar>' indicator column per sugar (see the displayed header);
# the 0/1 gene-pair columns are shown unchanged.
df_dummies = pd.get_dummies(df)
df_dummies

Unnamed: 0,rmlA-rmlC,rmlC-rmlB,rmlB-rmlD,wchA-wchF,wchJ-wchK,wchA-wchJ,rmlD-glf-,wchA-wchO,wchO-wchP,wchA-wciB,wchP-wchQ,wchM-wchN,wcyS-wcrN,wchK-wchL,wchL-wchM,fnlB-fnlC,gtp1-gtp2,wchX-gtp1,gtp2-gtp3,wchF-wciU,wciV-wciW,wchO-wcjA,wciY-gct,wcjA-mnaA,gct-HG94-,mnaA-wcjB,wcjB-wcjC,HG94--rmlA,fnlA-fnlB,mnaA-rmlA,wchQ-wcyS,wcyI-wchQ,wcyH-wcyI,wcwT-wcwU,wchF-wcyH,wcrL-wcwT,wciI-wciJ,wchQ-wchR,wchK-wcyK,wciG-glf,rbsF-mnaA,wchS-rbsF,wchR-wchS,outcome_N-acetylfucosamine,outcome_N-acetylgalactosamine,outcome_N-acetylglucosamine,outcome_N-acetylmannosamine,outcome_galactose,outcome_glucose,outcome_glucuronic acid,outcome_rhamnose,outcome_ribitol
0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
5,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [45]:
# Set maximum num of rows & cols
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

In [46]:
# Subgroup discovery for the target sugar N-acetylfucosamine.
data = df_dummies

target = ps.BinaryTarget('outcome_N-acetylfucosamine', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [47]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.019197,fnlB-fnlC==1 AND gtp1-gtp2==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
1,0.019197,fnlB-fnlC==1 AND rmlD-glf-==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
2,0.019197,fnlB-fnlC==1 AND gtp2-gtp3==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
3,0.019197,fnlB-fnlC==1 AND wchM-wchN==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
4,0.019197,fnlB-fnlC==1 AND wchA-wciB==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
5,0.019197,fnlB-fnlC==1 AND wchK-wchL==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
6,0.019197,fnlB-fnlC==1 AND wciY-gct==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
7,0.019197,fnlB-fnlC==1,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
8,0.019197,rmlD-glf-==0 AND wciI-wciJ==1,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889
9,0.019197,fnlB-fnlC==1 AND wchX-gtp1==0,18.0,241.0,5.0,5.0,223.0,0.074689,0.925311,1.0,0.0,0.277778,0.0,0.020747,13.388889


In [48]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

fnlB-fnlC==1 AND gtp1-gtp2==0
fnlB-fnlC==1 AND rmlD-glf-==0
fnlB-fnlC==1 AND gtp2-gtp3==0
fnlB-fnlC==1 AND wchM-wchN==0
fnlB-fnlC==1 AND wchA-wciB==0
fnlB-fnlC==1 AND wchK-wchL==0
fnlB-fnlC==1 AND wciY-gct==0
fnlB-fnlC==1
rmlD-glf-==0 AND wciI-wciJ==1
fnlB-fnlC==1 AND wchX-gtp1==0
wciI-wciJ==1 AND wciY-gct==0
fnlB-fnlC==1 AND wchL-wchM==0
wchL-wchM==0 AND wciI-wciJ==1
wchK-wchL==0 AND wciI-wciJ==1
gtp1-gtp2==0 AND wciI-wciJ==1
gtp2-gtp3==0 AND wciI-wciJ==1
wchA-wciB==0 AND wciI-wciJ==1
wchM-wchN==0 AND wciI-wciJ==1
wciI-wciJ==1
wchX-gtp1==0 AND wciI-wciJ==1


In [49]:
# Subgroup discovery for the target sugar N-acetylgalactosamine.
data = df_dummies

target = ps.BinaryTarget('outcome_N-acetylgalactosamine', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [50]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.021281,mnaA-wcjB==0 AND rmlA-rmlC==0 AND wchA-wchJ==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
1,0.021281,mnaA-wcjB==0 AND rmlB-rmlD==0 AND wchA-wchJ==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
2,0.021281,mnaA-wcjB==0 AND rmlB-rmlD==0 AND wchA-wciB==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
3,0.021281,rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
4,0.021281,mnaA-wcjB==0 AND rmlC-rmlB==0 AND wchA-wchJ==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
5,0.021281,rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wciB==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
6,0.021281,rmlB-rmlD==0 AND wchA-wciB==0 AND wchJ-wchK==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
7,0.021281,rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wchO==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
8,0.021281,rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576
9,0.021281,rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0...,41.0,241.0,7.0,11.0,200.0,0.170124,0.829876,0.636364,0.363636,0.170732,0.02,0.045643,3.740576


In [51]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

mnaA-wcjB==0 AND rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0
mnaA-wcjB==0 AND rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wciB==0
mnaA-wcjB==0 AND rmlB-rmlD==0 AND wchA-wciB==0 AND wchJ-wchK==0
rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wchO-wcjA==0
mnaA-wcjB==0 AND rmlC-rmlB==0 AND wchA-wchJ==0 AND wchA-wciB==0
rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wchO-wcjA==0
rmlB-rmlD==0 AND wchA-wciB==0 AND wchJ-wchK==0 AND wchO-wcjA==0
rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wchO==0 AND wchA-wciB==0
rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wcjA-mnaA==0
rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wcjB-wcjC==0
rmlA-rmlC==0 AND wchA-wchJ==0 AND wchA-wchO==0 AND wchA-wciB==0
rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wcjB-wcjC==0
rmlB-rmlD==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wcjA-mnaA==0
rmlC-rmlB==0 AND wchA-wchJ==0 AND wchA-wciB==0 AND wcjA-mnaA==0
rmlC-rmlB==0 AND wchA-wchJ==0 AND wchA-wchO==0 AND wchA-wciB==0
rmlC-rmlB==0 AND wchA-wchJ==0 AND wchA-w

In [52]:
# Subgroup discovery for the target sugar N-acetylglucosamine.
data = df_dummies

target = ps.BinaryTarget('outcome_N-acetylglucosamine', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [53]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.026015,fnlB-fnlC==0 AND wchA-wchJ==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
1,0.026015,fnlB-fnlC==0 AND wchJ-wchK==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
2,0.026015,wchA-wchJ==1 AND wchA-wciB==0,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
3,0.026015,rmlC-rmlB==0 AND wchA-wchJ==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
4,0.026015,rmlA-rmlC==0 AND wchJ-wchK==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
5,0.026015,wchA-wchJ==1 AND wchJ-wchK==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
6,0.026015,wchA-wchJ==1 AND wchP-wchQ==0,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
7,0.026015,rmlD-glf-==0 AND wchA-wchJ==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
8,0.026015,rmlC-rmlB==0 AND wchJ-wchK==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353
9,0.026015,wchA-wchO==0 AND wchJ-wchK==1,47.0,241.0,9.0,14.0,194.0,0.195021,0.804979,0.642857,0.357143,0.191489,0.025773,0.058091,3.296353


In [54]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

fnlB-fnlC==0 AND wchA-wchJ==1
fnlB-fnlC==0 AND wchJ-wchK==1
wchA-wchJ==1 AND wchA-wciB==0
rmlC-rmlB==0 AND wchA-wchJ==1
rmlA-rmlC==0 AND wchJ-wchK==1
wchA-wchJ==1 AND wchJ-wchK==1
wchA-wchJ==1 AND wchP-wchQ==0
rmlD-glf-==0 AND wchA-wchJ==1
rmlC-rmlB==0 AND wchJ-wchK==1
wchA-wchO==0 AND wchJ-wchK==1
wchA-wciB==0 AND wchJ-wchK==1
wchJ-wchK==1
wchA-wchJ==1 AND wciI-wciJ==0
wchA-wchJ==1
wchJ-wchK==1 AND wciI-wciJ==0
wchA-wchF==0 AND wchA-wchJ==1
wchA-wchF==0 AND wchJ-wchK==1
wchJ-wchK==1 AND wchP-wchQ==0
rmlD-glf-==0 AND wchJ-wchK==1
wchJ-wchK==1 AND wchO-wchP==0


In [55]:
# Subgroup discovery for the target sugar N-acetylmannosamine.
data = df_dummies

target = ps.BinaryTarget('outcome_N-acetylmannosamine', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [56]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.034676,HG94--rmlA==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
1,0.034676,fnlA-fnlB==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
2,0.034676,wchA-wchF==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
3,0.034676,gtp2-gtp3==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
4,0.034676,fnlB-fnlC==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
5,0.034676,wchA-wchO==1 AND wchX-gtp1==0,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
6,0.034676,wchA-wchO==1 AND wchF-wciU==0,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
7,0.034676,wchA-wchO==1 AND wchA-wciB==0,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
8,0.034676,wchA-wchJ==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859
9,0.034676,gct-HG94-==0 AND wchA-wchO==1,36.0,241.0,10.0,11.0,205.0,0.149378,0.850622,0.909091,0.090909,0.277778,0.004878,0.045643,6.085859


In [57]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

HG94--rmlA==0 AND wchA-wchO==1
fnlA-fnlB==0 AND wchA-wchO==1
wchA-wchF==0 AND wchA-wchO==1
gtp2-gtp3==0 AND wchA-wchO==1
fnlB-fnlC==0 AND wchA-wchO==1
wchA-wchO==1 AND wchX-gtp1==0
wchA-wchO==1 AND wchF-wciU==0
wchA-wchO==1 AND wchA-wciB==0
wchA-wchJ==0 AND wchA-wchO==1
gct-HG94-==0 AND wchA-wchO==1
wchA-wchO==1 AND wcyS-wcrN==0
wchA-wchO==1 AND wciV-wciW==0
wchA-wchO==1 AND wciY-gct==0
wchA-wchO==1 AND wchK-wchL==0
wchA-wchO==1 AND wchJ-wchK==0
wchA-wchO==1 AND wciG-glf==0
wchA-wchO==1 AND wchM-wchN==0
wchA-wchO==1
wchA-wchO==1 AND wchL-wchM==0
gtp1-gtp2==0 AND wchA-wchO==1


In [58]:
# Subgroup discovery for the target sugar galactose.
data = df_dummies

target = ps.BinaryTarget('outcome_galactose', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [59]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.088996,HG94--rmlA==0 AND fnlB-fnlC==0 AND wchA-wchF==...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
1,0.088996,fnlA-fnlB==0 AND fnlB-fnlC==0 AND wchA-wchF==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
2,0.088996,fnlB-fnlC==0 AND gct-HG94-==0 AND wchA-wchF==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
3,0.088996,fnlB-fnlC==0 AND mnaA-rmlA==0 AND wchA-wchF==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
4,0.088996,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
5,0.088996,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
6,0.088996,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
7,0.088996,fnlB-fnlC==0 AND mnaA-wcjB==0 AND wchA-wchF==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
8,0.088996,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291
9,0.088996,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,119.0,241.0,57.0,72.0,122.0,0.493776,0.506224,0.791667,0.208333,0.478992,0.122951,0.298755,1.603291


In [60]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

HG94--rmlA==0 AND fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0
fnlA-fnlB==0 AND fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0
fnlB-fnlC==0 AND gct-HG94-==0 AND wchA-wchF==0 AND wchA-wchO==0
fnlB-fnlC==0 AND mnaA-rmlA==0 AND wchA-wchF==0 AND wchA-wchO==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchF-wcyH==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchO-wchP==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchP-wchQ==0
fnlB-fnlC==0 AND mnaA-wcjB==0 AND wchA-wchF==0 AND wchA-wchO==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchF-wciU==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchQ-wcyS==0
wchA-wchF==0 AND wchA-wchO==0 AND wciI-wciJ==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wcyS-wcrN==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchO-wcjA==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wciV-wciW==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wcjB-wcjC==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wcyI-

In [61]:
# Subgroup discovery for the target sugar glucose.
data = df_dummies

target = ps.BinaryTarget('outcome_glucose', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [62]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.024655,fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
1,0.024655,fnlB-fnlC==0 AND gtp2-gtp3==0 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
2,0.024655,gtp1-gtp2==0 AND mnaA-wcjB==0 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
3,0.024655,fnlB-fnlC==0 AND wchA-wchF==1 AND wchX-gtp1==0,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
4,0.024655,gtp1-gtp2==0 AND rmlA-rmlC==1 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
5,0.024655,gtp1-gtp2==0 AND rmlB-rmlD==1 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
6,0.024655,gtp2-gtp3==0 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
7,0.024655,gtp1-gtp2==0 AND wchA-wchF==1 AND wchA-wchJ==0,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
8,0.024655,gtp1-gtp2==0 AND wchA-wchF==1,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044
9,0.024655,gtp1-gtp2==0 AND wchA-wchF==1 AND wchK-wchL==0,64.0,241.0,24.0,68.0,177.0,0.26556,0.73444,0.352941,0.647059,0.375,0.248588,0.282158,1.329044


In [63]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==1
fnlB-fnlC==0 AND gtp2-gtp3==0 AND wchA-wchF==1
gtp1-gtp2==0 AND mnaA-wcjB==0 AND wchA-wchF==1
fnlB-fnlC==0 AND wchA-wchF==1 AND wchX-gtp1==0
gtp1-gtp2==0 AND rmlA-rmlC==1 AND wchA-wchF==1
gtp1-gtp2==0 AND rmlB-rmlD==1 AND wchA-wchF==1
gtp2-gtp3==0 AND wchA-wchF==1
gtp1-gtp2==0 AND wchA-wchF==1 AND wchA-wchJ==0
gtp1-gtp2==0 AND wchA-wchF==1
gtp1-gtp2==0 AND wchA-wchF==1 AND wchK-wchL==0
gtp2-gtp3==0 AND wchA-wchF==1 AND wchA-wchJ==0
wchA-wchF==1 AND wchX-gtp1==0
gtp2-gtp3==0 AND rmlB-rmlD==1 AND wchA-wchF==1
rmlB-rmlD==1 AND wchA-wchF==1 AND wchX-gtp1==0
wchA-wchF==1 AND wchA-wchJ==0 AND wchX-gtp1==0
rmlA-rmlC==1 AND wchA-wchF==1 AND wchX-gtp1==0
gtp2-gtp3==0 AND mnaA-wcjB==0 AND wchA-wchF==1
gtp2-gtp3==0 AND rmlA-rmlC==1 AND wchA-wchF==1
mnaA-wcjB==0 AND wchA-wchF==1 AND wchX-gtp1==0
wchA-wchF==1 AND wchK-wchL==0 AND wchX-gtp1==0


In [64]:
# Subgroup discovery for the target sugar glucuronic acid.
data = df_dummies

target = ps.BinaryTarget('outcome_glucuronic acid', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [65]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.013154,HG94--rmlA==0 AND mnaA-wcjB==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
1,0.013154,HG94--rmlA==0 AND wcjB-wcjC==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
2,0.013154,HG94--rmlA==0 AND wchO-wcjA==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
3,0.013154,wchA-wchF==0 AND wchO-wcjA==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
4,0.013154,mnaA-wcjB==1 AND wchF-wciU==0,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
5,0.013154,mnaA-wcjB==1 AND wchA-wchF==0,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
6,0.013154,HG94--rmlA==0 AND wcjA-mnaA==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
7,0.013154,wchF-wciU==0 AND wchO-wcjA==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
8,0.013154,wchA-wchF==0 AND wcjA-mnaA==1,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82
9,0.013154,mnaA-wcjB==1 AND wciY-gct==0,20.0,241.0,4.0,10.0,221.0,0.082988,0.917012,0.4,0.6,0.2,0.027149,0.041494,4.82


In [66]:
# Print every subgroup description in full, one per line.
for subgroup in df_result['subgroup']:
    print(subgroup)

HG94--rmlA==0 AND mnaA-wcjB==1
HG94--rmlA==0 AND wcjB-wcjC==1
HG94--rmlA==0 AND wchO-wcjA==1
wchA-wchF==0 AND wchO-wcjA==1
mnaA-wcjB==1 AND wchF-wciU==0
mnaA-wcjB==1 AND wchA-wchF==0
HG94--rmlA==0 AND wcjA-mnaA==1
wchF-wciU==0 AND wchO-wcjA==1
wchA-wchF==0 AND wcjA-mnaA==1
mnaA-wcjB==1 AND wciY-gct==0
wcjB-wcjC==1
wciY-gct==0 AND wcjB-wcjC==1
mnaA-wcjB==1
wchA-wchF==0 AND wcjB-wcjC==1
wchO-wcjA==1
wchO-wcjA==1 AND wciY-gct==0
wcjA-mnaA==1
wchF-wciU==0 AND wcjA-mnaA==1
wciY-gct==0 AND wcjA-mnaA==1
wchF-wciU==0 AND wcjB-wcjC==1


In [67]:
# Subgroup discovery for the target sugar rhamnose.
data = df_dummies

target = ps.BinaryTarget('outcome_rhamnose', True)
# Keep every one-hot outcome column out of the search space. The list is
# derived from the 'outcome_' prefix instead of being spelled out by hand,
# so it cannot drift from the columns actually present in the data.
outcome_columns = [col for col in data.columns if str(col).startswith('outcome_')]
searchspace = ps.create_selectors(data, ignore=outcome_columns)
# Beam search with WRAcc as quality function, result_set_size=20, depth=5.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [68]:
# Convert the search result into a DataFrame (one row per subgroup) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.089978,mnaA-wcjB==0 AND rmlA-rmlC==1,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
1,0.089978,rmlA-rmlC==1 AND rmlC-rmlB==1,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
2,0.089978,mnaA-wcjB==0 AND rmlC-rmlB==1,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
3,0.089978,rmlA-rmlC==1 AND wchM-wchN==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
4,0.089978,rmlA-rmlC==1 AND wchL-wchM==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
5,0.089978,rmlB-rmlD==1 AND wchM-wchN==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
6,0.089978,rmlA-rmlC==1 AND wcjB-wcjC==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
7,0.089978,rmlA-rmlC==1 AND wchO-wcjA==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
8,0.089978,rmlC-rmlB==1 AND wchM-wchN==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336
9,0.089978,rmlA-rmlC==1 AND wcjA-mnaA==0,107.0,241.0,39.0,39.0,134.0,0.443983,0.556017,1.0,0.0,0.364486,0.0,0.161826,2.252336


In [69]:
# Print each subgroup description on its own line.
# Iterating the column directly avoids the positional counter and the
# repeated label-based lookup df_result['subgroup'][i], which only works
# when the frame carries the default 0..n-1 index.
for subgroup_description in df_result['subgroup']:
    print(subgroup_description)

mnaA-wcjB==0 AND rmlA-rmlC==1
rmlA-rmlC==1 AND rmlC-rmlB==1
mnaA-wcjB==0 AND rmlC-rmlB==1
rmlA-rmlC==1 AND wchM-wchN==0
rmlA-rmlC==1 AND wchL-wchM==0
rmlB-rmlD==1 AND wchM-wchN==0
rmlA-rmlC==1 AND wcjB-wcjC==0
rmlA-rmlC==1 AND wchO-wcjA==0
rmlC-rmlB==1 AND wchM-wchN==0
rmlA-rmlC==1 AND wcjA-mnaA==0
rmlC-rmlB==1 AND wchO-wcjA==0
rmlC-rmlB==1
rmlC-rmlB==1 AND wcjB-wcjC==0
rmlB-rmlD==1 AND wchL-wchM==0
rmlC-rmlB==1 AND wchL-wchM==0
rmlA-rmlC==1
rmlA-rmlC==1 AND wcwT-wcwU==0
rmlC-rmlB==1 AND wcwT-wcwU==0
rmlC-rmlB==1 AND wcjA-mnaA==0
rmlB-rmlD==1 AND rmlC-rmlB==1


In [70]:
# Subgroup discovery with 'outcome_ribitol' as the binary target.
data = df_dummies

# The outcome_* sugar columns below are kept out of the selector search
# space, so no subgroup description can be built from an outcome column.
ignored_outcome_columns = [
    'outcome_N-acetylfucosamine',
    'outcome_N-acetylgalactosamine',
    'outcome_N-acetylglucosamine',
    'outcome_N-acetylmannosamine',
    'outcome_galactose',
    'outcome_glucose',
    'outcome_glucuronic acid',
    'outcome_rhamnose',
    'outcome_ribitol',
]

target = ps.BinaryTarget('outcome_ribitol', True)
searchspace = ps.create_selectors(data, ignore=ignored_outcome_columns)

# Keep the 20 best subgroups, allow up to 5 selectors per description,
# and rank candidates with the WRAcc quality function.
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=20,
    depth=5,
    qf=ps.WRAccQF(),
)
result = ps.BeamSearch().execute(task)

In [71]:
# Convert the most recent beam-search `result` into a DataFrame (one row per
# subgroup, with quality, size and target-share statistics) and display it.
df_result = result.to_dataframe()
df_result

  df = pd.DataFrame(res, columns=headers, dtype=np.float64)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.027651,fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
1,0.027651,fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
2,0.027651,fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
3,0.027651,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
4,0.027651,gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
5,0.027651,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
6,0.027651,fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
7,0.027651,wchA-wchF==0 AND wchA-wchO==0 AND wchM-wchN==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
8,0.027651,wchA-wchF==0 AND wchA-wchO==0 AND wchL-wchM==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842
9,0.027651,gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0...,95.0,241.0,11.0,11.0,146.0,0.394191,0.605809,1.0,0.0,0.115789,0.0,0.045643,2.536842


In [72]:
# Print each subgroup description in full, one per line; the DataFrame
# display abbreviates long descriptions with "...".
# Iterating the column directly avoids the positional counter and the
# repeated label-based lookup df_result['subgroup'][i], which only works
# when the frame carries the default 0..n-1 index.
for subgroup_description in df_result['subgroup']:
    print(subgroup_description)

fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchK-wchL==0
fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchL-wchM==0
fnlB-fnlC==0 AND gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchM-wchN==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchL-wchM==0
gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchK-wchL==0 AND wciI-wciJ==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchM-wchN==0
fnlB-fnlC==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchK-wchL==0
wchA-wchF==0 AND wchA-wchO==0 AND wchM-wchN==0 AND wciI-wciJ==0 AND wcjB-wcjC==0
wchA-wchF==0 AND wchA-wchO==0 AND wchL-wchM==0 AND wchX-gtp1==0 AND wciI-wciJ==0
gtp1-gtp2==0 AND wchA-wchF==0 AND wchA-wchO==0 AND wchM-wchN==0 AND wciI-wciJ==0
wchA-wchF==0 AND wchA-wchO==0 AND wchL-wchM==0 AND wciI-wciJ==0
wchA-wchF==0 AND wchA-wchO==0 AND wchL-wchM==0 AND wciI-wciJ==0 AND wcjB-wcjC==0
wchA-wchF==0 AND wchA-wchO==0 AND wchK-wchL==0 AND wciI-wciJ==0
wchA-wchF==0 AND wchA-wchO==0 AN