In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from utils_printable import printable



# Short display labels for the verbose ACS PUMS names; passed to the
# table-printing helpers. Insertion order is kept as originally written.
abbreviations = {
    "RAC1P": "RAC",
    "White alone": "White",
    "Reference person": "Ref person",
    "Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions": "Empl. for-profit-c",
    "Employee of a private not-for-profit, tax-exempt, or charitable organization": "Empl. no-profit-c",
    "Self-employed in own not incorporated business, professional practice, or farm": "Self-Empl. not incorp",
    "Self-employed in own incorporated business, professional practice or farm": "Self-Empl. incorp",
    "Local government employee (city, county, etc.)": "Local gov. Empl. ",
    "Federal government employee": "Federal gov.  Empl.",
    "California/CA": "CA",
    "Never married or under 15 years old": "Never married/<15yrs",
    "Biological son or daughter": "Son/daughter",
    "Regular high school diploma": "HS",
    "Asian alone": "Asian",
    "1 or more years of college credit": "1+ collage cr",
    "Some Other Race alone": "Other",
    "Bachelor's degree": "Bachelor",
    "Master's degree": "Master",
    "Some college, but less than 1 year": "<1y collage",
    "Associate's degree": "Associate",
    "Noninstitutionalized group quarters population": "Noninstit. GQs",
    "OCCP=CMM-Software Developers": "OCCP=CMM-SW Dev",
    "Professional degree beyond a bachelor's degree": "Prof beyond bachelor",
}

In [3]:
def printable_with_ratio(df_pr, abbreviations=None, show_weighted_ratio=False):
    """Format a pattern table for display and add the outcome/mean ratio.

    Parameters
    ----------
    df_pr : pandas.DataFrame
        Pattern table with at least an ``outcome`` column. It is not modified.
    abbreviations : dict, optional
        Forwarded to ``printable``; an empty mapping is used when omitted.
    show_weighted_ratio : bool, default False
        When True the ``wlogr`` column is appended to the returned columns
        (it must be present in the frame returned by ``printable``).

    Returns
    -------
    pandas.DataFrame
        The output of ``printable`` restricted to the display columns.

    Notes
    -----
    Reads the notebook-level variable ``mean_outcome``, which therefore has to
    be defined before this function is called.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    if abbreviations is None:
        abbreviations = {}

    out_cols = ["itemsets", "sup", "Δ_outcome", "t_outcome", "ratio"]
    if show_weighted_ratio:
        out_cols.append("wlogr")

    # assign() returns a new frame, so the caller's object (typically a
    # .head(n) slice) is neither mutated nor flagged by SettingWithCopyWarning.
    df_with_ratio = df_pr.assign(ratio=df_pr["outcome"] / mean_outcome)
    df_with_ratio = printable(df_with_ratio, abbreviations=abbreviations)
    return df_with_ratio[out_cols]



# Load data

In [4]:
from folktables import ACSDataSource, ACSIncome

data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=True)
features, label, group = ACSIncome.df_to_numpy(acs_data)

Downloading data for 2018 1-Year person survey for CA...


In [5]:
from folktables.acs import adult_filter
from folktables import BasicProblem
import numpy as np

feature_names = [
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'POBP',
        'RELP',
        'WKHP',
        'SEX',
        'RAC1P'
    ]

target='PINCP'

ACSIncome_outcome = BasicProblem(
    features=feature_names,
    target=target,
    #target_transform=lambda x: x > 50000,    
    group='RAC1P',
    preprocess=adult_filter,
    #postprocess=lambda x: np.nan_to_num(x, -1),
)

In [6]:
features, label, group = ACSIncome_outcome.df_to_numpy(acs_data)

In [7]:
continuous_attributes = ['AGEP', 'WKHP']

In [8]:
categorical_attributes = list(set(feature_names)-set(continuous_attributes))

In [9]:
import pandas as pd


df = pd.DataFrame(features, columns=feature_names)
df["income"] = label
df.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,income
0,30.0,6.0,14.0,1.0,9610.0,6.0,16.0,40.0,1.0,8.0,48500.0
1,21.0,4.0,16.0,5.0,1970.0,6.0,17.0,20.0,1.0,1.0,7700.0
2,65.0,2.0,22.0,5.0,2040.0,6.0,17.0,8.0,1.0,1.0,17200.0
3,33.0,1.0,14.0,3.0,9610.0,36.0,16.0,40.0,1.0,1.0,12000.0
4,18.0,2.0,19.0,5.0,1021.0,6.0,17.0,18.0,2.0,1.0,300.0


# Remap columns

From label encoding to interpretable notation: e.g., 1, 2 to "Male", "Female"

In [10]:

import pandas as pd
import os
filename = os.path.join(os.path.curdir, "datasets", "ACSPUMS", "PUMS_Data_Dictionary_2018.csv")

In [11]:
df_mappings = pd.read_csv(filename)

In [12]:
remapping_cols = {}

In [13]:
# Names of the two data-dictionary columns used below: the raw code and its
# textual description (presumably taken from the CSV's first row - verify).
orig_col = "1"
new_col = "Record Type"
cols_i = [orig_col, new_col]

# Placeholder codes that cannot be parsed as numbers; they are mapped to -1.
blank_codes = ["b", "bb", 'bbb', 'bbbb']

# For every categorical attribute build {numeric code -> description}.
for col_name in categorical_attributes:
    value_rows = df_mappings.loc["VAL"].loc[col_name][cols_i]
    raw_mapping = value_rows.set_index(orig_col).to_dict()[new_col]

    dict_i = {}
    for raw_code, description in raw_mapping.items():
        numeric_code = -1 if raw_code in blank_codes else float(raw_code)
        dict_i[numeric_code] = description
    remapping_cols[col_name] = dict_i

In [14]:
from copy import deepcopy

# Work on a copy so the label-encoded frame `df` stays untouched.
df_renamed = deepcopy(df)

# Swap the numeric codes of each mapped column for their descriptions.
for column_name, code_to_description in remapping_cols.items():
    df_renamed[column_name] = df[column_name].replace(code_to_description)

We substitute NaN with the string "NaN".

In [15]:
# Replace missing values with the literal string "NaN" and report which
# columns were affected.
for c in df_renamed:
    if df_renamed[c].isna().any():
        print(c)
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column selection: the in-place form acts on an intermediate
        # object and does not update the frame under pandas copy-on-write.
        df_renamed[c] = df_renamed[c].fillna('NaN')

In [16]:
df_renamed.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,income
0,30.0,Self-employed in own not incorporated business...,Grade 11,Married,TRN-Cleaners Of Vehicles And Equipment,California/CA,Institutionalized group quarters population,40.0,Male,Some Other Race alone,48500.0
1,21.0,State government employee,Regular high school diploma,Never married or under 15 years old,"SCI-Other Life, Physical, And Social Science T...",California/CA,Noninstitutionalized group quarters population,20.0,Male,White alone,7700.0
2,65.0,"Employee of a private not-for-profit, tax-exem...",Master's degree,Never married or under 15 years old,CMS-Clergy,California/CA,Noninstitutionalized group quarters population,8.0,Male,White alone,17200.0
3,33.0,Employee of a private for-profit company or bu...,Grade 11,Divorced,TRN-Cleaners Of Vehicles And Equipment,New York/NY,Institutionalized group quarters population,40.0,Male,White alone,12000.0
4,18.0,"Employee of a private not-for-profit, tax-exem...","1 or more years of college credit, no degree",Never married or under 15 years old,CMM-Software Developers,California/CA,Noninstitutionalized group quarters population,18.0,Female,White alone,300.0


In [17]:
[i for i in df_renamed["OCCP"].unique() if "MGR" in i]

['MGR-Lodging Managers',
 'MGR-Food Service Managers',
 'MGR-Social And Community Service Managers',
 'MGR-Other Managers',
 'MGR-Medical And Health Services Managers',
 'MGR-Human Resources Managers',
 'MGR-Sales Managers',
 'MGR-Farmers, Ranchers, And Other Agricultural Managers',
 'MGR-Education And Childcare Administrators',
 'MGR-General And Operations Managers',
 'MGR-Training And Development Managers',
 'MGR-Chief Executives And Legislators',
 'MGR-Property, Real Estate, And Community Association Managers',
 'MGR-Marketing Managers',
 'MGR-Industrial Production Managers',
 'MGR-Construction Managers',
 'MGR-Entertainment and Recreation Managers',
 'MGR-Financial Managers',
 'MGR-Computer And Information Systems Managers',
 'MGR-Architectural And Engineering Managers',
 'MGR-Purchasing Managers',
 'MGR-Public Relations And Fundraising Managers',
 'MGR-Advertising And Promotions Managers',
 'MGR-Administrative Services Managers',
 'MGR-Transportation, Storage, And Distribution Man

In [18]:
import pickle

with open(
    os.path.join(os.path.curdir, "datasets", "ACSPUMS", "census_column_mapping.pickle"),
    "wb",
) as fp:
    pickle.dump(remapping_cols, fp)

In [19]:
from copy import deepcopy

df_analysis = deepcopy(df_renamed)

In [20]:
df_analysis.to_csv(os.path.join(os.path.curdir, "datasets", "ACSPUMS", "adult_dataset_income.csv"), index=False)

In [21]:
continuous_attributes = ['AGEP', 'WKHP']

In [22]:
df_analysis.shape

(195665, 11)

# Taxonomy

In [23]:
df_analysis_proc = deepcopy(df_analysis)

## Occupation

In [24]:
attr = "OCCP"
from copy import deepcopy

# Taxonomy container: attribute name -> {child value -> parent value}.
generalization_dict_tax = {}
# Occupation labels look like "<GROUP>-<title>"; each one generalizes to its
# "<GROUP>" prefix. Iterating over the distinct labels (not every row) builds
# the same mapping without redundant work.
generalization_dict_tax[attr] = {
    occupation: occupation.split("-")[0]
    for occupation in df_analysis_proc[attr].unique()
    if "-" in occupation
}

## Means of transport

In [25]:
attr = "JWTR"

if attr in df_analysis_proc.columns:
    print(df_analysis_proc[attr].value_counts()/len(df_analysis_proc))
    generalization_dict_tax[attr] =  {'Bicycle': 'Private Transportation', 
    'Car, truck, or van': 'Private Transportation', 
    'Bus or trolley bus': 'Public Transportation',
    'Railroad': 'Public Transportation',
    'Subway or elevated': 'Public Transportation',
    'Motorcycle': 'Private Transportation', 
    'Streetcar or trolley car (carro publico in Puerto Rico)': 'Public Transportation',
    'Ferryboat': 'Public Transportation'}

## Working weeks

In [26]:
attr = "WKW"

if attr in df_analysis_proc.columns:
    print(df_analysis_proc[attr].value_counts())
    generalization_dict_tax[attr] =  {'14 to 26 weeks worked during past 12 months': '14 to 39 weeks worked during past 12 months',
    '27 to 39 weeks worked during past 12 months': '14 to 39 weeks worked during past 12 months',
    '40 to 47 weeks worked during past 12 months': '40 to 49 weeks worked during past 12 months',
    '48 to 49 weeks worked during past 12 months': '40 to 49 weeks worked during past 12 months'}

## Arrival and departure time at work

In [27]:
def taxonomy_split_in_slot(values):
    """Build a flat child -> parent taxonomy from time-interval labels.

    Each label is parsed as ``"<hour>:<minutes> <a.m.|p.m.> ..."``; only the
    leading hour and the marker that follows the minutes are read. The literal
    string ``"NaN"`` is skipped.

    The returned dict chains three levels:
    label -> 24h hour (as a string) -> 2-hour slot -> 4-hour slot.
    """
    def slot_label(hour, width):
        # Hour 24 lying on a slot boundary keeps its special "24-<width>" label.
        if hour % width == 0 and hour == 24:
            return f"{hour}-{width}"
        start = hour - hour % width
        return f"{start}-{start + width}"

    taxonomy = {}
    for label in values:
        if label == "NaN":
            continue

        pieces = label.split(":")
        hour = int(pieces[0])
        meridiem = pieces[1].split(" ")[1]

        # Convert the 12-hour clock reading to a 24-hour one.
        if hour == 12 and meridiem == "a.m.":
            hour = 0
        elif hour != 12 and meridiem == "p.m.":
            hour += 12

        hour_key = str(hour)
        two_hour_slot = slot_label(hour, 2)
        four_hour_slot = slot_label(hour, 4)

        taxonomy[label] = hour_key
        taxonomy[hour_key] = two_hour_slot
        taxonomy[two_hour_slot] = four_hour_slot
    return taxonomy

In [28]:
# Arrival time at work: hour -> 2h slot -> 4h slot taxonomy.
attr = "JWAP"
tax = None

if attr in df_analysis_proc.columns:
    tax = taxonomy_split_in_slot(df_analysis_proc[attr].unique())

    generalization_dict_tax[attr] = tax

# Departure time: reuse the arrival-time taxonomy when it exists.
# NOTE(review): this assumes JWDP uses the same interval labels as JWAP - confirm.
attr = "JWDP"
if attr in df_analysis_proc.columns:
    if tax is None:
        # JWAP is absent, so there is nothing to reuse: build the taxonomy
        # from the JWDP labels instead of failing on an undefined `tax`.
        tax = taxonomy_split_in_slot(df_analysis_proc[attr].unique())
    generalization_dict_tax[attr] = deepcopy(tax)

## Country

We rename some of the country names for visualization and mapping reasons.

In [29]:
# Place-of-birth labels rewritten to the spelling used by the country taxonomy.
pobp_renames = {
    "England": "United Kingdom",
    'Scotland': "United Kingdom",
    "United Kingdom, Not Specified": "United Kingdom",
    'Northern Ireland': "United Kingdom",
    "USSR": "Russia",
    "Hong Kong": "Hong Kong SAR",
    "Democratic Republic of Congo (Zaire)": 'Democratic Republic of the Congo',
    "Trinidad & Tobago": "Trinidad and Tobago",
    'Macedonia': 'North Macedonia',
    'Antigua & Barbuda': 'Antigua and Barbuda',
    'Commonwealth of the Northern Mariana Islands': 'Northern Mariana Islands',
    'St. Vincent & the Grenadines': 'St. Vincent and the Grenadines',
    # simplification
    "Azores Islands": "Portugal",
}

if "POBP" in df_analysis_proc.columns:
    df_analysis_proc["POBP"] = df_analysis_proc["POBP"].replace(pobp_renames)

In [30]:
# https://statisticstimes.com/geography/countries-by-continents.php
df_taxonomy_country = pd.read_csv("country_hierarchies.csv", header=None, index_col=0)

df_taxonomy_country.columns=["Country or Area","ISO-alpha3 Code", "M49 Code", "Region 1", "Region 2", "Continent"]

df_taxonomy_country["Country or Area"] = df_taxonomy_country["Country or Area"].replace({
    "Viet Nam":"Vietnam", "Bolivia (Plurinational State of)": "Bolivia" ,\
      "Russian Federation": "Russia",'Iran (Islamic Republic of)': 'Iran', \
    "Lao People's Democratic Republic": "Laos", 'Syrian Arab Republic': "Syria", 
    'Venezuela (Bolivarian Republic of)': "Venezuela",
    "Republic of Moldova": "Moldova", "Czechia": "Czech Republic",
        'United Republic of Tanzania': "Tanzania", 'Côte d’Ivoire': "Ivory Coast",
        'United States Virgin Islands': 'US Virgin Islands', 'Saint Lucia': 'St. Lucia', 
    'Saint Kitts and Nevis': 'St. Kitts-Nevis',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    "United Kingdom of Great Britain and Northern Ireland" : "United Kingdom",\
    "China, Hong Kong Special Administrative Region" : "Hong Kong SAR"
})


# Child -> parent links of the country taxonomy, read row by row along
# "Country or Area" -> "Region 1" -> "Continent" (missing levels are skipped).
g_hierarchy = {}
target_cols = ["Country or Area", "Region 1", "Continent"]
for i, row in df_taxonomy_country.iterrows():
    levels = [col for col in target_cols if pd.notnull(row[col])]

    # Walk consecutive (child level, parent level) pairs.
    for child_col, parent_col in zip(levels, levels[1:]):
        tax_c, tax_p = row[child_col], row[parent_col]
        # Any name containing "Korea" is collapsed to the single label "Korea".
        if "Korea" in tax_c:
            tax_c = "Korea"

        # A child that is already registered must keep the same parent.
        if tax_c in g_hierarchy and tax_p != g_hierarchy[tax_c]:
            raise ValueError(
                f"Conflicting parents for {tax_c!r}: "
                f"{g_hierarchy[tax_c]!r} (already stored) vs {tax_p!r} "
                f"in row {i!r}: {row.to_dict()}"
            )
        # Skip self-links (child and parent carry the same name).
        if tax_c == tax_p:
            continue
        g_hierarchy[tax_c] = tax_p
attr = 'POBP'
#g_hierarchy = {}
g_hierarchy.update({"North America": "America", 
 "South America": "America", 
 "Americas, Not Specified" : "America", 
"Other Europe, Not Specified": "Europe",
"Eastern Africa, Not Specified": "Africa",
"Other Asia, Not Specified": "Asia",
"South Central Asia, Not Specified": "Asia",
"Caribbean, Not Specified": "North America",
"Other US Island Areas, Oceania, Not Specified, or at Sea": "Oceania",
"Other Africa, Not Specified": "Africa",
"Western Africa, Not Specified": "Africa",
                   "Taiwan" : 'Eastern Asia',
                   'Czechoslovakia':'Eastern Europe', 'Yugoslavia':'Southern Europe'})


In [31]:
# Snapshot of the hierarchy that is stored as the POBP taxonomy.
# NOTE(review): the snapshot is taken before the "<state>/<code>" ->
# "United States of America" links are added to g_hierarchy below, so it does
# not contain them - confirm this is intended.
generalization_dict_tax_u = deepcopy(g_hierarchy)

# Birthplaces missing from the hierarchy: labels containing "/" are linked to
# the USA, everything else is collected for removal.
delete_values = []
for c in df_analysis_proc["POBP"].value_counts().index:
    if c in g_hierarchy:
        continue
    if "/" in c:
        g_hierarchy[c] = "United States of America"
    else:
        delete_values.append(c)

delete_values.extend(["North America", "South America", "Micronesia"])

# Report how many rows each removed label accounts for, then the total and
# its share of the dataset.
tot_deleted = 0
for c in delete_values:
    n_rows = int((df_analysis_proc["POBP"] == c).sum())
    print(c, n_rows)
    tot_deleted += n_rows
print(tot_deleted, round(tot_deleted / len(df_analysis_proc), 5))

Asia 54
Europe 23
Africa 18
West Indies 2
North America 0
South America 4
Micronesia 8
109 0.00056


Delete rows for which the detailed information is not available

In [32]:
# Keep only the rows whose birthplace is not in the removal list.
df_analysis_proc = df_analysis_proc.loc[~df_analysis_proc["POBP"].isin(delete_values)]

In [33]:
df_analysis_proc.loc[df_analysis_proc["POBP"].isin(delete_values)]

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,income


In [34]:
generalization_dict_tax[attr] = generalization_dict_tax_u

In [35]:
df_analysis_proc.to_csv(os.path.join(os.path.curdir, "datasets", "ACSPUMS", "adult_dataset_income_tax.csv"), index=False)

In [36]:
import json

with open(os.path.join(os.path.curdir, "datasets", "ACSPUMS", "adult_taxonomies.json"), "w") as fp:
    json.dump(generalization_dict_tax, fp)

# Tree divergence - income - individual

## Tree

In [37]:
df_analysis.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,income
0,30.0,Self-employed in own not incorporated business...,Grade 11,Married,TRN-Cleaners Of Vehicles And Equipment,California/CA,Institutionalized group quarters population,40.0,Male,Some Other Race alone,48500.0
1,21.0,State government employee,Regular high school diploma,Never married or under 15 years old,"SCI-Other Life, Physical, And Social Science T...",California/CA,Noninstitutionalized group quarters population,20.0,Male,White alone,7700.0
2,65.0,"Employee of a private not-for-profit, tax-exem...",Master's degree,Never married or under 15 years old,CMS-Clergy,California/CA,Noninstitutionalized group quarters population,8.0,Male,White alone,17200.0
3,33.0,Employee of a private for-profit company or bu...,Grade 11,Divorced,TRN-Cleaners Of Vehicles And Equipment,New York/NY,Institutionalized group quarters population,40.0,Male,White alone,12000.0
4,18.0,"Employee of a private not-for-profit, tax-exem...","1 or more years of college credit, no degree",Never married or under 15 years old,CMM-Software Developers,California/CA,Noninstitutionalized group quarters population,18.0,Female,White alone,300.0


In [38]:
min_support = 0.1

In [39]:
metric = "d_outcome"


target = "income"

In [40]:
from tree_discretization_ranking import TreeDiscretization_ranking

tree_discr = TreeDiscretization_ranking()

type_experiment = "one_at_time"
type_criterion = "weighted_sum_abs_reference_s"

# Tree-based discretization of the continuous attributes with respect to the
# target column; returns the interval hierarchy and the interval definitions.
generalization_dict, discretizations = tree_discr.get_tree_discretization(
    df_analysis_proc,
    type_splitting=type_experiment,
    min_support=min_support,
    metric=metric,
    continuous_attributes=list(continuous_attributes),
    storeTree=True,
    type_criterion=type_criterion,
    minimal_gain=0.0,
    target_col=target,
)
tree_discr.printDiscretizationTrees()

# One graph object per continuous attribute that has a tree.
dot = {
    attribute: tree_discr.trees[attribute].visualizeTreeDiGraph()
    for attribute in continuous_attributes
    if attribute in tree_discr.trees
}

AGEP
 root s=1.00000 --> d_outcome=0.00000
         AGEP<=34.0 s=0.34460 --> d_outcome=-27957.34967
                 AGEP<=26.0 s=0.17023 --> d_outcome=-43902.74131
                 AGEP>=27.0 s=0.17437 --> d_outcome=-12391.18714
         AGEP>=35.0 s=0.65540 --> d_outcome=14699.71082
                 AGEP<=45.0 s=0.21916 --> d_outcome=10075.43355
                         AGEP<=39.0 s=0.10386 --> d_outcome=7208.71884
                         AGEP>=40.0 s=0.11531 --> d_outcome=12657.49820
                 AGEP>=46.0 s=0.43623 --> d_outcome=17022.96186
                         AGEP<=62.0 s=0.33534 --> d_outcome=15339.48589
                                 AGEP<=56.0 s=0.22497 --> d_outcome=15764.15092
                                         AGEP<=51.0 s=0.12258 --> d_outcome=15071.81348
                                         AGEP>=52.0 s=0.10239 --> d_outcome=16592.99880
                                 AGEP>=57.0 s=0.11037 --> d_outcome=14473.90428
                         AGEP>=63.0

In [41]:
generalization_dict

{'AGEP': {'<=26.0': '<=34.0',
  '[27.0-34.0]': '<=34.0',
  '[35.0-45.0]': '>=35.0',
  '[35.0-39.0]': '[35.0-45.0]',
  '[40.0-45.0]': '[35.0-45.0]',
  '>=46.0': '>=35.0',
  '[46.0-62.0]': '>=46.0',
  '[46.0-56.0]': '[46.0-62.0]',
  '[46.0-51.0]': '[46.0-56.0]',
  '[52.0-56.0]': '[46.0-56.0]',
  '[57.0-62.0]': '[46.0-62.0]',
  '>=63.0': '>=46.0'},
 'WKHP': {'<=29.0': '<=39.0',
  '[30.0-39.0]': '<=39.0',
  '[40.0-43.0]': '>=40.0',
  '>=44.0': '>=40.0'}}

In [42]:
discretizations

{'AGEP': {'<=26.0': {'rels': ['<='], 'vals': [26.0]},
  '[27.0-34.0]': {'rels': ['>=', '<='], 'vals': [27.0, 34.0]},
  '[35.0-39.0]': {'rels': ['>=', '<='], 'vals': [35.0, 39.0]},
  '[40.0-45.0]': {'rels': ['>=', '<='], 'vals': [40.0, 45.0]},
  '[46.0-51.0]': {'rels': ['>=', '<='], 'vals': [46.0, 51.0]},
  '[52.0-56.0]': {'rels': ['>=', '<='], 'vals': [52.0, 56.0]},
  '[57.0-62.0]': {'rels': ['>=', '<='], 'vals': [57.0, 62.0]},
  '>=63.0': {'rels': ['>='], 'vals': [63.0]}},
 'WKHP': {'<=29.0': {'rels': ['<='], 'vals': [29.0]},
  '[30.0-39.0]': {'rels': ['>=', '<='], 'vals': [30.0, 39.0]},
  '[40.0-43.0]': {'rels': ['>=', '<='], 'vals': [40.0, 43.0]},
  '>=44.0': {'rels': ['>='], 'vals': [44.0]}}}

In [43]:
FP_results = {"base": {}, "generalized": {}}

## min_sup_divergence  = 0.05

In [44]:
generalization_dict_all = deepcopy(generalization_dict)
generalization_dict_all.update(generalization_dict_tax)

In [45]:
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [46]:
min_sup_divergence = 0.05

### Without Generalization 

In [47]:
import pandas as pd
pd.set_option('max_colwidth', None)

In [48]:
INFO = ['support', 'itemsets', 'tn', 'fp', 'fn', 'tp', 'd_fpr', 't_value_fp', 'error']

In [49]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Pattern extraction on the leaf items only (no generalization).
apply_generalization = False
pattern_type = "generalized" if apply_generalization else "base"

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # Overlap is enabled only for the "all_attributes" experiment
    # (the keyword spelling is the one the call already used).
    allow_overalp=(type_experiment == "all_attributes"),
)

Unnamed: 0,RELP,RAC1P,COW,POBP,MAR,SCHL,OCCP,SEX,AGEP,WKHP,income
0,Institutionalized group quarters population,Some Other Race alone,"Self-employed in own not incorporated business, professional practice, or farm",California/CA,Married,Grade 11,TRN-Cleaners Of Vehicles And Equipment,Male,[27.0-34.0],[40.0-43.0],48500.0
1,Noninstitutionalized group quarters population,White alone,State government employee,California/CA,Never married or under 15 years old,Regular high school diploma,"SCI-Other Life, Physical, And Social Science Technicians",Male,<=26.0,<=29.0,7700.0
2,Noninstitutionalized group quarters population,White alone,"Employee of a private not-for-profit, tax-exempt, or charitable organization",California/CA,Never married or under 15 years old,Master's degree,CMS-Clergy,Male,>=63.0,<=29.0,17200.0
3,Institutionalized group quarters population,White alone,"Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions",New York/NY,Divorced,Grade 11,TRN-Cleaners Of Vehicles And Equipment,Male,[27.0-34.0],[40.0-43.0],12000.0
4,Noninstitutionalized group quarters population,White alone,"Employee of a private not-for-profit, tax-exempt, or charitable organization",California/CA,Never married or under 15 years old,"1 or more years of college credit, no degree",CMM-Software Developers,Female,<=26.0,<=29.0,300.0
...,...,...,...,...,...,...,...,...,...,...,...
195660,Reference person,Asian alone,"Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions",India,Married,Master's degree,CMM-Software Developers,Male,[35.0-39.0],[40.0-43.0],565280.0
195661,Husband/wife,Asian alone,"Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions",India,Married,Master's degree,CMM-Software Developers,Female,[35.0-39.0],[40.0-43.0],210000.0
195662,Reference person,White alone,"Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions",Illinois/IL,Married,"1 or more years of college credit, no degree",OFF-Customer Service Representatives,Male,[57.0-62.0],>=44.0,105000.0
195663,Reference person,Asian alone,"Self-employed in own incorporated business, professional practice or farm",China,Married,Doctorate degree,CMS-Clergy,Male,>=63.0,>=44.0,30000.0


In [334]:
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [335]:
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [336]:
# Single-item patterns sorted by divergence; show those involving WKHP.
l = FP_fm.loc[FP_fm["length"] == 1].sort_values("d_outcome")
has_wkhp = l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))
l.loc[has_wkhp].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


In [337]:
FP_fm.sort_values(metric, ascending = True).head()

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
528,0.051494,"(WKHP=<=29.0, AGEP=<=26.0, POBP=California/CA)",3,10070.0,7927.402582,-57257.667843,238.857644
375,0.063741,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old)",3,12465.0,7990.126274,-57194.944151,249.298074
524,0.051556,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions)",3,10082.0,8009.281293,-57175.789131,236.757609
339,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
374,0.063757,"(WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",3,12468.0,10175.319538,-55009.750887,218.028712


In [338]:
FP_fm.sort_values(metric, ascending = False).head()

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
341,0.067035,"(RAC1P=White alone, WKHP=>=44.0, MAR=Married, SEX=Male)",4,13109.0,146171.347471,80986.277046,62.329294
468,0.054925,"(MAR=Married, RELP=Reference person, WKHP=>=44.0, SEX=Male)",4,10741.0,144403.15371,79218.083285,55.26603
450,0.056797,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RAC1P=White alone, WKHP=>=44.0, MAR=Married)",4,11107.0,144314.13613,79129.065706,56.418323
541,0.050921,"(RAC1P=White alone, RELP=Reference person, WKHP=>=44.0, MAR=Married)",4,9958.0,143632.726853,78447.656428,53.882423
408,0.061266,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0, MAR=Married, SEX=Male)",4,11981.0,141333.358651,76148.288226,56.490536


In [339]:
print(df_analysis["income"].describe()["mean"])
FP_fm.sort_values("length", ascending = True).head(1)

65192.04881813304


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
0,1.0,(),0,195556.0,65185.070425,0.0,0.0


#### Redundancy

In [340]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset row, used as the reference mean.
# .iloc[0] takes the first matching row by position; the previous `[0]` was a
# label lookup and only worked while that row carried index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset()]["outcome"].iloc[0]
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending=True)

# Redundancy threshold expressed as 5% of the reference mean.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending=True)

3259.25, 65185.07


#### Top-3

In [341]:
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending = True)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
528,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57257.668,238.9,0.122
375,"AGEP=<=26.0, MAR=Never married/<15yrs, WKHP=<=29.0",0.06,-57194.944,249.3,0.123
524,"AGEP=<=26.0, COW=Empl. for-profit-c, WKHP=<=29.0",0.05,-57175.789,236.8,0.123


In [342]:
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending = False)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
341,"MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.07,80986.277,62.3,2.242
468,"MAR=Married, RELP=Ref person, SEX=Male, WKHP=>=44.0",0.05,79218.083,55.3,2.215
450,"COW=Empl. for-profit-c, MAR=Married, RAC=White, WKHP=>=44.0",0.06,79129.066,56.4,2.214


#### Top-3 redundancy

In [343]:
v = mean_outcome*0.01
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

printable_with_ratio(fpdiv_t.head(3), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
374,"COW=Empl. for-profit-c, MAR=Never married/<15yrs, WKHP=<=29.0",0.06,-55009.751,218.0,0.156
368,"MAR=Never married/<15yrs, POBP=CA, WKHP=<=29.0",0.06,-54666.19,202.6,0.161


In [344]:
v = mean_outcome*0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

printable_with_ratio(fpdiv_t.head(3), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
201,"MAR=Never married/<15yrs, WKHP=<=29.0",0.09,-53376.975,206.4,0.181
255,"AGEP=<=26.0, RELP=Son/daughter",0.08,-49170.757,191.6,0.246


In [345]:
v = mean_outcome*0.1
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

printable_with_ratio(fpdiv_t.head(3), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
201,"MAR=Never married/<15yrs, WKHP=<=29.0",0.09,-53376.975,206.4,0.181
145,"COW=Empl. for-profit-c, WKHP=<=29.0",0.11,-46995.221,145.5,0.279


In [355]:
df_analysis_proc["POBP"]

0         California/CA
1         California/CA
2         California/CA
3           New York/NY
4         California/CA
              ...      
195660            India
195661            India
195662      Illinois/IL
195663            China
195664           Mexico
Name: POBP, Length: 195556, dtype: object

In [353]:
# The POBP taxonomy maps each child name to its parent name (a string), so the
# keys live on the attribute-level dict; indexing one country first and then
# calling .keys() on the resulting string raises AttributeError.
generalization_dict_all['POBP'].keys()

dict_keys(['Afghanistan', 'Southern Asia', 'Åland Islands', 'Northern Europe', 'Albania', 'Southern Europe', 'Algeria', 'Northern Africa', 'American Samoa', 'Polynesia', 'Andorra', 'Angola', 'Middle Africa', 'Anguilla', 'Caribbean', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Western Asia', 'Aruba', 'Australia', 'Australia and New Zealand', 'Austria', 'Western Europe', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Eastern Europe', 'Belgium', 'Belize', 'Central America', 'Benin', 'Western Africa', 'Bermuda', 'Northern America', 'Bhutan', 'Bolivia', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Southern Africa', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Eastern Africa', 'British Virgin Islands', 'Brunei Darussalam', 'South-eastern Asia', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Eastern Asia'

### With Generalization 

In [346]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Pattern extraction with generalized items enabled; the run is timed.
apply_generalization = True
pattern_type = "generalized" if apply_generalization else "base"

import time

st = time.time()

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    considerOnlyContinuos=False,
    # Overlap is enabled only for the "all_attributes" experiment
    # (the keyword spelling is the one the call already used).
    allow_overalp=(type_experiment == "all_attributes"),
)

print(time.time()-st)

19.13944911956787


In [347]:
# Working frame for the cells below: the patterns just stored for the current
# pattern_type and min_sup_divergence.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [348]:
# Attributes that have an entry in the generalization dictionary.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [349]:
# Single-item patterns, ordered by d_outcome (most negative first).
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")

# First the WKHP items, then (as the cell result) the AGEP items.
display(
    l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)
)
l[l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


In [350]:
# Five patterns with the lowest value of `metric`.
FP_fm.sort_values(by=metric).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1910,0.051494,"(WKHP=<=29.0, AGEP=<=26.0, POBP=California/CA)",3,10070.0,7927.402582,-57257.667843,238.857644
1301,0.063741,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old)",3,12465.0,7990.126274,-57194.944151,249.298074
1902,0.051556,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions)",3,10082.0,8009.281293,-57175.789131,236.757609
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
1554,0.057615,"(WKHP=<=29.0, MAR=Never married or under 15 years old, POBP=California/CA, AGEP=<=34.0)",4,11267.0,8883.009142,-56302.061283,228.174844


In [351]:
# Five patterns with the highest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1858,0.052082,"(OCCP=MGR, AGEP=>=35.0, SEX=Male)",3,10185.0,155389.089838,90204.019413,60.614209
1567,0.057298,"(AGEP=>=35.0, MAR=Married, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",5,11205.0,155344.132441,90159.062016,62.025598
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.01036,89924.939935,64.128105
1950,0.050983,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",4,9970.0,152503.482849,87318.412424,56.646643
1767,0.053729,"(AGEP=>=46.0, MAR=Married, WKHP=>=44.0, SEX=Male)",4,10507.0,150511.681165,85326.61074,56.369136


In [68]:
# Patterns with at least one item mentioning "JWTR", lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(lambda items: any("JWTR" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [69]:
# Patterns with at least one item mentioning "JWAP", lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(lambda items: any("JWAP" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [70]:
# Patterns with at least one "JWAP" item that is not the "JWAP=NaN" item,
# lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(
        lambda items: any(
            "JWAP" in item and "JWAP=NaN" not in item for item in items
        )
    )
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [71]:
# Single-item patterns ordered by d_outcome, restricted to AGEP items.
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
l[l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


#### Redundancy

In [72]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset, used as the overall mean outcome.
# Picked by position (.iloc[0]) from the filtered rows: a label lookup `[0]`
# only works while that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Ranking with th_redundancy=0, sorted by the divergence metric (lowest first).
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=True
)

# Same ranking with the redundancy threshold set to 10% of the mean outcome.
v = mean_outcome * 0.1
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

In [73]:
# First 20 rows of the th_redundancy=0 ranking (lowest divergence first).
display(fpdiv.head(20))

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1910,0.051494,"(WKHP=<=29.0, AGEP=<=26.0, POBP=California/CA)",3,10070.0,7927.402582,-57257.667843,238.857644
1301,0.063741,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old)",3,12465.0,7990.126274,-57194.944151,249.298074
1902,0.051556,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions)",3,10082.0,8009.281293,-57175.789131,236.757609
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
1554,0.057615,"(WKHP=<=29.0, MAR=Never married or under 15 years old, POBP=California/CA, AGEP=<=34.0)",4,11267.0,8883.009142,-56302.061283,228.174844
1591,0.056904,"(WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old, AGEP=<=34.0)",4,11128.0,8887.360173,-56297.710252,232.799979
899,0.075845,"(WKHP=<=29.0, MAR=Never married or under 15 years old, AGEP=<=34.0)",3,14832.0,9288.519013,-55896.551412,237.604617
1261,0.064713,"(WKHP=<=29.0, POBP=California/CA, AGEP=<=34.0)",3,12655.0,9741.66132,-55443.409105,221.588755
1249,0.064989,"(WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=<=34.0)",3,12709.0,9772.428515,-55412.64191,224.874556
1300,0.063757,"(WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",3,12468.0,10175.319538,-55009.750887,218.028712


In [74]:
# First 20 rows of the ranking obtained with redundancy threshold v.
fpdiv_t.head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
676,0.088159,"(WKHP=<=29.0, AGEP=<=34.0)",2,17240.0,10338.782135,-54846.28829,227.314599
601,0.094413,"(AGEP=<=26.0, WKHP=<=39.0)",2,18463.0,10956.889671,-54228.180754,230.807747
653,0.089739,"(WKHP=<=29.0, MAR=Never married or under 15 years old)",2,17549.0,11808.095846,-53376.974579,206.363504
1172,0.06706,"(RELP=Biological son or daughter, WKHP=<=39.0)",2,13114.0,12223.590819,-52961.479606,222.591009
285,0.135935,"(WKHP=<=39.0, AGEP=<=34.0)",2,26583.0,15309.197382,-49875.873043,203.613355
264,0.139034,"(MAR=Never married or under 15 years old, WKHP=<=39.0)",2,27189.0,17282.848358,-47902.222067,179.66941
468,0.108097,"(WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions)",2,21139.0,18189.849283,-46995.221142,145.456512
1642,0.055871,"(SCHL=Regular high school diploma, WKHP=<=39.0)",2,10926.0,19104.645799,-46080.424626,120.157673
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073


In [75]:
# Last 10 rows (highest divergence) of both rankings, for side-by-side comparison:
# first the th_redundancy=0 ranking, then the thresholded one.
display(fpdiv.tail(10))
fpdiv_t.tail(10)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1372,0.061962,"(WKHP=>=40.0, OCCP=MGR, MAR=Married)",3,12117.0,147351.839564,82166.769139,64.148109
1900,0.051602,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, OCCP=MGR)",3,10091.0,147725.9231,82540.852675,57.753015
1812,0.052967,"(RAC1P=White alone, AGEP=>=35.0, OCCP=MGR, WKHP=>=40.0)",4,10358.0,149646.473257,84461.402833,61.203329
1734,0.054399,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, MAR=Married)",4,10638.0,149774.836811,84589.766387,57.58504
1895,0.051668,"(AGEP=>=35.0, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0, SEX=Male)",5,10104.0,149883.079968,84698.009543,55.774965
1767,0.053729,"(AGEP=>=46.0, MAR=Married, WKHP=>=44.0, SEX=Male)",4,10507.0,150511.681165,85326.61074,56.369136
1950,0.050983,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",4,9970.0,152503.482849,87318.412424,56.646643
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.01036,89924.939935,64.128105
1567,0.057298,"(AGEP=>=35.0, MAR=Married, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",5,11205.0,155344.132441,90159.062016,62.025598
1858,0.052082,"(OCCP=MGR, AGEP=>=35.0, SEX=Male)",3,10185.0,155389.089838,90204.019413,60.614209


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1173,0.067035,"(RAC1P=White alone, WKHP=>=44.0, MAR=Married, SEX=Male)",4,13109.0,146171.347471,80986.277046,62.329294
1371,0.061992,"(OCCP=MGR, AGEP=>=35.0, MAR=Married)",3,12123.0,147197.202838,82012.132413,62.387184
1372,0.061962,"(WKHP=>=40.0, OCCP=MGR, MAR=Married)",3,12117.0,147351.839564,82166.769139,64.148109
1900,0.051602,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, OCCP=MGR)",3,10091.0,147725.9231,82540.852675,57.753015
1734,0.054399,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, MAR=Married)",4,10638.0,149774.836811,84589.766387,57.58504
1767,0.053729,"(AGEP=>=46.0, MAR=Married, WKHP=>=44.0, SEX=Male)",4,10507.0,150511.681165,85326.61074,56.369136
1950,0.050983,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",4,9970.0,152503.482849,87318.412424,56.646643
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.01036,89924.939935,64.128105
1567,0.057298,"(AGEP=>=35.0, MAR=Married, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",5,11205.0,155344.132441,90159.062016,62.025598
1858,0.052082,"(OCCP=MGR, AGEP=>=35.0, SEX=Male)",3,10185.0,155389.089838,90204.019413,60.614209


#### Analysis

In [76]:
# Alternative query kept for reference (single-item POBP patterns):
# FP_fm.loc[FP_fm["length"] == 1].loc[FP_fm["itemsets"].apply(lambda x: len([i for i in x if "POBP" in i ])>0)].sort_values(metric, ascending = True)

# Patterns with at least one item mentioning "OCCP", lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1839,0.052369,(OCCP=EAT),1,10241.0,21855.516063,-43329.554362,120.649128
1553,0.057666,"(OCCP=TRN, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions)",2,11277.0,34750.778576,-30434.291849,73.11069
1060,0.070205,(OCCP=TRN),1,13729.0,36495.081943,-28689.988482,70.796573
1718,0.054573,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=OFF, SEX=Female)",3,10672.0,38881.453336,-26303.617089,61.293815
1701,0.054803,"(OCCP=TRN, SEX=Male)",2,10717.0,39517.90893,-25667.161495,55.189505
1256,0.064861,"(OCCP=OFF, POBP=California/CA)",2,12684.0,39574.504888,-25610.565537,60.309615
822,0.080064,"(OCCP=OFF, SEX=Female)",2,15657.0,40632.856869,-24552.213556,65.848044
856,0.077543,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=OFF)",2,15164.0,40674.680823,-24510.389602,59.144009
1992,0.050272,"(SEX=Female, OCCP=SAL)",2,9831.0,42517.137626,-22667.932799,32.115795
430,0.11228,(OCCP=OFF),1,21957.0,42589.386073,-22595.684352,62.883211


In [77]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Rebuild the divergence helper on the current FP_fm (metric "d_outcome") and
# show its result for th_redundancy=0.
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=0)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1858,0.052082,"(OCCP=MGR, AGEP=>=35.0, SEX=Male)",3,10185.0,155389.089838,90204.019413,60.614209
1567,0.057298,"(AGEP=>=35.0, MAR=Married, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",5,11205.0,155344.132441,90159.062016,62.025598
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.010360,89924.939935,64.128105
1950,0.050983,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",4,9970.0,152503.482849,87318.412424,56.646643
1767,0.053729,"(AGEP=>=46.0, MAR=Married, WKHP=>=44.0, SEX=Male)",4,10507.0,150511.681165,85326.610740,56.369136
...,...,...,...,...,...,...,...
1554,0.057615,"(WKHP=<=29.0, MAR=Never married or under 15 years old, POBP=California/CA, AGEP=<=34.0)",4,11267.0,8883.009142,-56302.061283,228.174844
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.626190
1902,0.051556,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions)",3,10082.0,8009.281293,-57175.789131,236.757609
1301,0.063741,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old)",3,12465.0,7990.126274,-57194.944151,249.298074


In [78]:
# Outcome of the empty itemset, used as the overall mean outcome.
# Picked by position (.iloc[0]) from the filtered rows: a label lookup `[0]`
# only works while that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Redundancy threshold: 10% of the mean outcome.
v = mean_outcome * 0.1

In [79]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Print the redundancy threshold, then show the divergence result obtained with it.
print(v)
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=v)

6518.507042483994


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1858,0.052082,"(OCCP=MGR, AGEP=>=35.0, SEX=Male)",3,10185.0,155389.089838,90204.019413,60.614209
1567,0.057298,"(AGEP=>=35.0, MAR=Married, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",5,11205.0,155344.132441,90159.062016,62.025598
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.010360,89924.939935,64.128105
1950,0.050983,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",4,9970.0,152503.482849,87318.412424,56.646643
1767,0.053729,"(AGEP=>=46.0, MAR=Married, WKHP=>=44.0, SEX=Male)",4,10507.0,150511.681165,85326.610740,56.369136
...,...,...,...,...,...,...,...
1172,0.067060,"(RELP=Biological son or daughter, WKHP=<=39.0)",2,13114.0,12223.590819,-52961.479606,222.591009
653,0.089739,"(WKHP=<=29.0, MAR=Never married or under 15 years old)",2,17549.0,11808.095846,-53376.974579,206.363504
601,0.094413,"(AGEP=<=26.0, WKHP=<=39.0)",2,18463.0,10956.889671,-54228.180754,230.807747
676,0.088159,"(WKHP=<=29.0, AGEP=<=34.0)",2,17240.0,10338.782135,-54846.288290,227.314599


In [80]:
# Thresholded ranking restricted to patterns with an "OCCP" item,
# highest `metric` first.
f = fp_divergence_o.getDivergence(th_redundancy=v)

f[
    f["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(by=metric, ascending=False)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1858,0.052082,"(OCCP=MGR, AGEP=>=35.0, SEX=Male)",3,10185.0,155389.089838,90204.019413,60.614209
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.01036,89924.939935,64.128105
1900,0.051602,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, OCCP=MGR)",3,10091.0,147725.9231,82540.852675,57.753015
1372,0.061962,"(WKHP=>=40.0, OCCP=MGR, MAR=Married)",3,12117.0,147351.839564,82166.769139,64.148109
1371,0.061992,"(OCCP=MGR, AGEP=>=35.0, MAR=Married)",3,12123.0,147197.202838,82012.132413,62.387184
934,0.074598,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR)",3,14588.0,143578.596792,78393.526367,69.204062
1563,0.057406,"(AGEP=>=46.0, OCCP=MGR)",2,11226.0,142563.348477,77378.278052,57.550155
1319,0.063399,"(OCCP=MGR, SEX=Male)",2,12398.0,141752.798839,76567.728414,58.873202
1043,0.07066,"(OCCP=MGR, MAR=Married)",2,13818.0,139590.40165,74405.331225,61.989538
1661,0.055478,"(WKHP=>=40.0, RELP=Reference person, OCCP=MGR)",3,10849.0,139225.741543,74040.671118,57.767871


#### Top-3

In [81]:
# Top-3 patterns with the lowest divergence, th_redundancy=0.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=True
)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv (which can trigger
# pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1910,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57257.668,238.9,0.122
1301,"AGEP=<=26.0, MAR=Never married/<15yrs, WKHP=<=29.0",0.06,-57194.944,249.3,0.123
1902,"AGEP=<=26.0, COW=Empl. for-profit-c, WKHP=<=29.0",0.05,-57175.789,236.8,0.123


In [82]:
# Top-3 patterns with the highest divergence, th_redundancy=0.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=False
)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv (which can trigger
# pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1858,"AGEP=>=35.0, OCCP=MGR, SEX=Male",0.05,90204.019,60.6,2.384
1567,"AGEP=>=35.0, MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.06,90159.062,62.0,2.383
1745,"AGEP=>=35.0, MAR=Married, OCCP=MGR, WKHP=>=40.0",0.05,89924.94,64.1,2.38


#### Top-3 redundancy

In [83]:
# Redundancy threshold: 1% of the overall mean outcome.
v = mean_outcome * 0.01
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv_t (which can
# trigger pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
899,"AGEP=<=34.0, MAR=Never married/<15yrs, WKHP=<=29.0",0.08,-55896.551,237.6,0.142
1300,"COW=Empl. for-profit-c, MAR=Never married/<15yrs, WKHP=<=29.0",0.06,-55009.751,218.0,0.156


In [84]:
# Redundancy threshold: 5% of the overall mean outcome.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv_t (which can
# trigger pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
676,"AGEP=<=34.0, WKHP=<=29.0",0.09,-54846.288,227.3,0.159
601,"AGEP=<=26.0, WKHP=<=39.0",0.09,-54228.181,230.8,0.168


In [85]:
# Redundancy threshold: 10% of the overall mean outcome.
v = mean_outcome * 0.1
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv_t (which can
# trigger pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
676,"AGEP=<=34.0, WKHP=<=29.0",0.09,-54846.288,227.3,0.159
601,"AGEP=<=26.0, WKHP=<=39.0",0.09,-54228.181,230.8,0.168


In [86]:
# Single-item patterns ordered by d_outcome, restricted to WKHP items.
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


## min_sup_divergence  = 0.025

In [87]:
# Attributes that have an entry in the generalization dictionary.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [88]:
# Minimum support used by the extraction calls in this section; also the key
# under which their results are stored in FP_results[pattern_type].
min_sup_divergence = 0.025

### Without Generalization 

In [89]:
import pandas as pd

# Do not truncate cell contents when displaying frames (long itemset strings).
# The fully qualified option name is used instead of the abbreviated pattern.
pd.set_option("display.max_colwidth", None)

In [90]:
# Column names for a compact result view.
# NOTE(review): not referenced by the visible cells, and names such as tn/fp/d_fpr
# do not appear in the d_outcome result frames shown here — possibly a leftover.
INFO = [
    "support",
    "itemsets",
    "tn",
    "fp",
    "fn",
    "tp",
    "d_fpr",
    "t_value_fp",
    "error",
]

In [91]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Run the divergence extraction with generalization disabled and store the
# result under FP_results["base"][min_sup_divergence].
apply_generalization = False
pattern_type = "generalized" if apply_generalization else "base"

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # Keyword spelling kept exactly as the call already uses it.
    # True only for the "all_attributes" experiment type.
    allow_overalp=(type_experiment == "all_attributes"),
)

In [92]:
# Working frame for the cells below: the patterns just stored for the current
# pattern_type and min_sup_divergence.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [93]:
# Attributes that have an entry in the generalization dictionary.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [94]:
# Single-item patterns ordered by d_outcome, restricted to WKHP items.
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


In [95]:
# Five patterns with the lowest value of `metric`.
FP_fm.sort_values(by=metric).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1276,0.028948,"(AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,5661.0,7339.512454,-57845.557971,243.867037
1244,0.02947,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Biological son or daughter)",4,5763.0,7354.799584,-57830.270841,244.629639
937,0.035524,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter, MAR=Never married or under 15 years old)",4,6947.0,7395.747805,-57789.32262,241.063771
915,0.036179,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter)",3,7075.0,7409.028975,-57776.04145,242.071604
1214,0.02992,"(POBP=California/CA, AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,5851.0,7428.044779,-57757.025646,229.182128


In [96]:
# Five patterns with the highest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1285,0.02879,(SCHL=Professional degree beyond a bachelor's degree),1,5630.0,170441.813499,105256.743074,46.671452
1300,0.02858,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SCHL=Master's degree, SEX=Male)",3,5589.0,161340.767579,96155.697154,50.09306
1425,0.027015,"(SCHL=Bachelor's degree, RAC1P=White alone, WKHP=>=44.0, MAR=Married)",4,5283.0,158178.654174,92993.583749,46.210936
998,0.034318,"(MAR=Married, SCHL=Master's degree, SEX=Male)",3,6711.0,157121.586947,91936.516522,52.763516
1208,0.029997,"(SCHL=Master's degree, WKHP=>=44.0)",2,5866.0,156834.481759,91649.411334,48.885913


In [97]:
# Mean income in df_analysis, shown next to the shortest pattern in FP_fm
# (the empty itemset in the recorded output).
# NOTE(review): in the recorded output the two means differ slightly
# (65192.05 vs 65185.07) — presumably FP_fm is computed on a filtered frame; confirm.
print(df_analysis["income"].mean())
FP_fm.sort_values(by="length").head(1)

65192.04881813304


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
0,1.0,(),0,195556.0,65185.070425,0.0,0.0


#### Redundancy

In [98]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset, used as the overall mean outcome.
# Picked by position (.iloc[0]) from the filtered rows: a label lookup `[0]`
# only works while that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Ranking with th_redundancy=0, sorted by the divergence metric (lowest first).
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=True
)

# Same ranking with the redundancy threshold set to 5% of the mean outcome.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

3259.25, 65185.07


#### Top-3

In [99]:
# Top-3 patterns with the lowest divergence, th_redundancy=0.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=True
)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv (which can trigger
# pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1276,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57845.558,243.9,0.113
1244,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Son/daughter, WKHP=<=29.0",0.03,-57830.271,244.6,0.113
937,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.04,-57789.323,241.1,0.113


In [100]:
# Top-3 patterns with the highest divergence, th_redundancy=0.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=False
)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv (which can trigger
# pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1285,SCHL=Prof beyond bachelor,0.03,105256.743,46.7,2.615
1300,"COW=Empl. for-profit-c, SCHL=Master, SEX=Male",0.03,96155.697,50.1,2.475
1425,"MAR=Married, RAC=White, SCHL=Bachelor, WKHP=>=44.0",0.03,92993.584,46.2,2.427


#### Top-3 redundancy

In [101]:
# Redundancy threshold: 1% of the overall mean outcome.
v = mean_outcome * 0.01
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv_t (which can
# trigger pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
915,"AGEP=<=26.0, RELP=Son/daughter, WKHP=<=29.0",0.04,-57776.041,242.1,0.114
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
625,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137


In [102]:
# Redundancy threshold: 5% of the overall mean outcome.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv_t (which can
# trigger pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
625,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137
201,"MAR=Never married/<15yrs, WKHP=<=29.0",0.09,-53376.975,206.4,0.181


In [103]:
# Redundancy threshold: 10% of the overall mean outcome.
v = mean_outcome * 0.1
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass
# an explicit copy so it never writes into a slice of fpdiv_t (which can
# trigger pandas' SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
625,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137
201,"MAR=Never married/<15yrs, WKHP=<=29.0",0.09,-53376.975,206.4,0.181


In [104]:
# Single-item WKHP patterns from the stored "base" results at support 0.025,
# ordered by d_outcome.
l = FP_results["base"][0.025]
l = l[l["length"] == 1].sort_values(by="d_outcome")
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


### With Generalization 

In [105]:
import time

from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Run the divergence extraction with generalization enabled and store the
# result under FP_results["generalized"][min_sup_divergence].
apply_generalization = True
pattern_type = "generalized" if apply_generalization else "base"

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
st = time.perf_counter()

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    considerOnlyContinuos=False,
    # Keyword spelling kept exactly as the call already uses it.
    # True only for the "all_attributes" experiment type.
    allow_overalp=(type_experiment == "all_attributes"),
)

# Elapsed seconds for the extraction.
print(time.perf_counter() - st)

24.775001764297485


In [106]:
# Working frame for the cells below: the patterns just stored for the current
# pattern_type and min_sup_divergence.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [107]:
# Attributes that have an entry in the generalization dictionary.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [108]:
# Single-item patterns, ordered by d_outcome (most negative first).
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")

# First the WKHP items, then (as the cell result) the AGEP items.
display(
    l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)
)
l[l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


In [109]:
# Five patterns with the lowest value of `metric`.
FP_fm.sort_values(by=metric).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
5206,0.028948,"(AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,5661.0,7339.512454,-57845.557971,243.867037
5037,0.02947,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Biological son or daughter)",4,5763.0,7354.799584,-57830.270841,244.629639
3680,0.035524,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter, MAR=Never married or under 15 years old)",4,6947.0,7395.747805,-57789.32262,241.063771
3573,0.036179,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter)",3,7075.0,7409.028975,-57776.04145,242.071604
4934,0.02992,"(POBP=California/CA, AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,5851.0,7428.044779,-57757.025646,229.182128


In [110]:
# Five patterns with the highest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
6602,0.025047,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, SEX=Male)",4,4898.0,184525.279706,119340.209281,50.568105
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.94927,50.479892
6015,0.026576,"(AGEP=>=35.0, RAC1P=White alone, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",6,5197.0,178260.681162,113075.610737,51.016658
6612,0.025006,"(AGEP=>=46.0, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",5,4890.0,177898.789366,112713.718941,49.19337


In [111]:
# Patterns with at least one item mentioning "JWTR", lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(lambda items: any("JWTR" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [112]:
# Patterns with at least one item mentioning "JWAP", lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(lambda items: any("JWAP" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [113]:
# Patterns with at least one "JWAP" item that is not the "JWAP=NaN" item,
# lowest `metric` first.
FP_fm[
    FP_fm["itemsets"].apply(
        lambda items: any(
            "JWAP" in item and "JWAP=NaN" not in item for item in items
        )
    )
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [114]:
# Single-item patterns ordered by d_outcome, restricted to AGEP items.
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
l[l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


#### Redundancy

In [115]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset, used below as the reference mean outcome.
# Select row and column in one .loc call and take the first match positionally:
# a bare `[0]` on the resulting Series is a *label* lookup and only works while
# the empty-itemset row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Ranking with th_redundancy=0, sorted ascending by fp_divergence_o.metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending=True)

# Same ranking with the redundancy threshold set to 10% of the mean outcome.
v = mean_outcome * 0.1
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending=True)

In [116]:
# First 20 rows of fpdiv (built with th_redundancy=0 and sorted ascending in the
# "Redundancy" cell above).
display(fpdiv.head(20))

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
5206,0.028948,"(AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,5661.0,7339.512454,-57845.557971,243.867037
5037,0.02947,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Biological son or daughter)",4,5763.0,7354.799584,-57830.270841,244.629639
3680,0.035524,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter, MAR=Never married or under 15 years old)",4,6947.0,7395.747805,-57789.32262,241.063771
3573,0.036179,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter)",3,7075.0,7409.028975,-57776.04145,242.071604
4934,0.02992,"(POBP=California/CA, AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,5851.0,7428.044779,-57757.025646,229.182128
4790,0.030436,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter, POBP=California/CA)",4,5952.0,7436.009745,-57749.06068,230.3311
5955,0.026744,"(SEX=Female, AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,5230.0,7653.497897,-57531.572528,227.213969
5711,0.027368,"(SEX=Female, POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,5352.0,7677.936099,-57507.134326,224.490326
5211,0.028928,"(WKHP=<=29.0, AGEP=<=26.0, POBP=California/CA, SEX=Female)",4,5657.0,7784.469507,-57400.600918,227.831734
3209,0.038521,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,7533.0,7796.32736,-57388.743065,226.157503


In [117]:
# First 20 rows of fpdiv_t (built with th_redundancy=v and sorted ascending in the
# "Redundancy" cell above).
fpdiv_t.head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
2322,0.045915,"(WKHP=<=29.0, RELP=Biological son or daughter)",2,8979.0,8922.750863,-56262.319562,229.360368
676,0.088159,"(WKHP=<=29.0, AGEP=<=34.0)",2,17240.0,10338.782135,-54846.28829,227.314599
601,0.094413,"(AGEP=<=26.0, WKHP=<=39.0)",2,18463.0,10956.889671,-54228.180754,230.807747
653,0.089739,"(WKHP=<=29.0, MAR=Never married or under 15 years old)",2,17549.0,11808.095846,-53376.974579,206.363504
1172,0.06706,"(RELP=Biological son or daughter, WKHP=<=39.0)",2,13114.0,12223.590819,-52961.479606,222.591009
4037,0.033648,"(WKHP=<=29.0, SCHL=Regular high school diploma)",2,6580.0,14767.244681,-50417.825744,118.237561
4090,0.033433,"(OCCP=EAT, WKHP=<=39.0)",2,6538.0,15102.535944,-50082.534481,147.027199
285,0.135935,"(WKHP=<=39.0, AGEP=<=34.0)",2,26583.0,15309.197382,-49875.873043,203.613355
2812,0.041497,"(WKHP=<=29.0, SCHL=1 or more years of college credit, no degree)",2,8115.0,16970.069501,-48215.000924,112.994316


In [118]:
# Last 10 rows of both ascending-sorted rankings: first the th_redundancy=0 list,
# then the th_redundancy=v list (rendered as the cell's own output).
display(fpdiv.tail(10))
fpdiv_t.tail(10)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
6441,0.025466,"(AGEP=>=35.0, RELP=Reference person, MAR=Married, OCCP=MGR, SEX=Male)",5,4980.0,173191.901606,108006.831182,47.60124
5579,0.027767,"(RAC1P=White alone, AGEP=>=35.0, WKHP=>=44.0, OCCP=MGR)",4,5430.0,174526.918969,109341.848544,51.73993
5647,0.027552,"(OCCP=MGR, WKHP=>=40.0, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,5388.0,174558.158872,109373.088447,50.964568
5079,0.029362,"(AGEP=>=35.0, RAC1P=White alone, MAR=Married, OCCP=MGR, SEX=Male)",5,5742.0,174718.685127,109533.614702,51.353075
4793,0.030426,"(OCCP=MGR, WKHP=>=44.0, MAR=Married)",3,5950.0,176303.536134,111118.46571,53.874208
6612,0.025006,"(AGEP=>=46.0, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",5,4890.0,177898.789366,112713.718941,49.19337
6015,0.026576,"(AGEP=>=35.0, RAC1P=White alone, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",6,5197.0,178260.681162,113075.610737,51.016658
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.94927,50.479892
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735
6602,0.025047,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, SEX=Male)",4,4898.0,184525.279706,119340.209281,50.568105


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
3539,0.036353,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0)",3,7109.0,170701.136587,105516.066163,57.916484
5894,0.026918,"(AGEP=>=46.0, RAC1P=White alone, OCCP=MGR, SEX=Male)",4,5264.0,170963.860182,105778.789758,47.744936
4987,0.029685,"(OCCP=MGR, WKHP=>=44.0, SEX=Male)",3,5805.0,171542.440999,106357.370574,50.680086
5248,0.02881,"(MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=MGR, SEX=Male)",4,5634.0,172163.954562,106978.884137,50.679673
6308,0.025783,"(AGEP=>=35.0, WKHP=>=40.0, SCHL=Master's degree, MAR=Married, SEX=Male)",5,5042.0,172959.617215,107774.546791,51.674119
5432,0.028176,"(AGEP=>=46.0, OCCP=MGR, MAR=Married, SEX=Male)",4,5510.0,173003.283122,107818.212697,49.522467
4793,0.030426,"(OCCP=MGR, WKHP=>=44.0, MAR=Married)",3,5950.0,176303.536134,111118.46571,53.874208
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.94927,50.479892
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735
6602,0.025047,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, SEX=Male)",4,4898.0,184525.279706,119340.209281,50.568105


#### Analysis

In [119]:
# FP_fm.loc[FP_fm["length"] == 1].loc[FP_fm["itemsets"].apply(lambda x: len([i for i in x if "POBP" in i ])>0)].sort_values(metric, ascending = True)

# Patterns with at least one item mentioning "OCCP", ordered by ascending `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
5626,0.027619,"(OCCP=EAT, MAR=Never married or under 15 years old, AGEP=<=34.0)",3,5401.0,14708.446584,-50476.623841,165.667236
6230,0.025987,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=EAT, MAR=Never married or under 15 years old, AGEP=<=34.0)",4,5082.0,14749.008264,-50436.062160,161.774657
4832,0.030283,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=EAT, WKHP=<=39.0)",3,5922.0,14939.034110,-50246.036315,148.312365
4090,0.033433,"(OCCP=EAT, WKHP=<=39.0)",2,6538.0,15102.535944,-50082.534481,147.027199
6287,0.025839,"(AGEP=<=26.0, OCCP=SAL)",2,5053.0,15682.841876,-49502.228549,115.474898
...,...,...,...,...,...,...,...
6612,0.025006,"(AGEP=>=46.0, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",5,4890.0,177898.789366,112713.718941,49.193370
6015,0.026576,"(AGEP=>=35.0, RAC1P=White alone, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",6,5197.0,178260.681162,113075.610737,51.016658
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.949270,50.479892
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735


In [120]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Rebuild the divergence helper on the current FP_fm for the "d_outcome" metric
# and show its ranking with th_redundancy=0.
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=0)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
6602,0.025047,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, SEX=Male)",4,4898.0,184525.279706,119340.209281,50.568105
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.949270,50.479892
6015,0.026576,"(AGEP=>=35.0, RAC1P=White alone, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",6,5197.0,178260.681162,113075.610737,51.016658
6612,0.025006,"(AGEP=>=46.0, WKHP=>=40.0, MAR=Married, OCCP=MGR, SEX=Male)",5,4890.0,177898.789366,112713.718941,49.193370
...,...,...,...,...,...,...,...
4934,0.029920,"(POBP=California/CA, AGEP=<=26.0, RELP=Biological son or daughter, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,5851.0,7428.044779,-57757.025646,229.182128
3573,0.036179,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter)",3,7075.0,7409.028975,-57776.041450,242.071604
3680,0.035524,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Biological son or daughter, MAR=Never married or under 15 years old)",4,6947.0,7395.747805,-57789.322620,241.063771
5037,0.029470,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Biological son or daughter)",4,5763.0,7354.799584,-57830.270841,244.629639


In [121]:
# Outcome of the empty itemset, used as the reference mean outcome.
# Select row and column in one .loc call and take the first match positionally:
# a bare `[0]` on the resulting Series is a *label* lookup and only works while
# the empty-itemset row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Redundancy threshold: 10% of the mean outcome.
v = mean_outcome * 0.1

In [122]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Print the threshold v (set in the previous cell), then show the ranking
# obtained with th_redundancy=v.
print(v)
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=v)

6518.507042483994


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
6602,0.025047,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, SEX=Male)",4,4898.0,184525.279706,119340.209281,50.568105
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.949270,50.479892
4793,0.030426,"(OCCP=MGR, WKHP=>=44.0, MAR=Married)",3,5950.0,176303.536134,111118.465710,53.874208
5432,0.028176,"(AGEP=>=46.0, OCCP=MGR, MAR=Married, SEX=Male)",4,5510.0,173003.283122,107818.212697,49.522467
...,...,...,...,...,...,...,...
653,0.089739,"(WKHP=<=29.0, MAR=Never married or under 15 years old)",2,17549.0,11808.095846,-53376.974579,206.363504
601,0.094413,"(AGEP=<=26.0, WKHP=<=39.0)",2,18463.0,10956.889671,-54228.180754,230.807747
676,0.088159,"(WKHP=<=29.0, AGEP=<=34.0)",2,17240.0,10338.782135,-54846.288290,227.314599
2322,0.045915,"(WKHP=<=29.0, RELP=Biological son or daughter)",2,8979.0,8922.750863,-56262.319562,229.360368


In [123]:
# Ranking obtained with th_redundancy=v ...
f = fp_divergence_o.getDivergence(th_redundancy=v)

# ... restricted to patterns with an "OCCP" item, ordered by descending `metric`.
f[
    f["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(by=metric, ascending=False)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
6602,0.025047,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, SEX=Male)",4,4898.0,184525.279706,119340.209281,50.568105
5850,0.027026,"(OCCP=MGR, AGEP=>=35.0, WKHP=>=44.0, MAR=Married)",4,5285.0,184364.491958,119179.421534,53.300735
6447,0.025445,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SEX=Male)",5,4976.0,180937.019695,115751.949270,50.479892
4793,0.030426,"(OCCP=MGR, WKHP=>=44.0, MAR=Married)",3,5950.0,176303.536134,111118.465710,53.874208
5432,0.028176,"(AGEP=>=46.0, OCCP=MGR, MAR=Married, SEX=Male)",4,5510.0,173003.283122,107818.212697,49.522467
...,...,...,...,...,...,...,...
5142,0.029163,"(OCCP=TRN, MAR=Never married or under 15 years old)",2,5703.0,24241.879712,-40943.190712,104.884368
3655,0.035662,"(OCCP=OFF, WKHP=<=39.0)",2,6974.0,23426.852595,-41758.217829,89.357806
5767,0.027225,"(OCCP=TRN, AGEP=<=34.0)",2,5324.0,22513.914350,-42671.156075,119.300494
1839,0.052369,(OCCP=EAT),1,10241.0,21855.516063,-43329.554362,120.649128


#### Top-3

In [124]:
# Ranking with th_redundancy=0, sorted ascending by fp_divergence_o.metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending = True)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5206,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57845.558,243.9,0.113
5037,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Son/daughter, WKHP=<=29.0",0.03,-57830.271,244.6,0.113
3680,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.04,-57789.323,241.1,0.113


In [125]:
# Ranking with th_redundancy=0, sorted descending by fp_divergence_o.metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending = False)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
6602,"AGEP=>=35.0, OCCP=MGR, SEX=Male, WKHP=>=44.0",0.03,119340.209,50.6,2.831
5850,"AGEP=>=35.0, MAR=Married, OCCP=MGR, WKHP=>=44.0",0.03,119179.422,53.3,2.828
6447,"AGEP=>=35.0, COW=Empl. for-profit-c, MAR=Married, OCCP=MGR, SEX=Male",0.03,115751.949,50.5,2.776


#### Top-3 redundancy

In [126]:
# Redundancy threshold: 1% of the mean outcome.
v = mean_outcome*0.01
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv_t
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
3573,"AGEP=<=26.0, RELP=Son/daughter, WKHP=<=29.0",0.04,-57776.041,242.1,0.114
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
2775,"AGEP=<=34.0, RELP=Son/daughter, WKHP=<=29.0",0.04,-57007.316,237.0,0.125


In [127]:
# Redundancy threshold: 5% of the mean outcome.
v = mean_outcome*0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv_t
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
2322,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137
676,"AGEP=<=34.0, WKHP=<=29.0",0.09,-54846.288,227.3,0.159


In [128]:
# Redundancy threshold: 10% of the mean outcome.
v = mean_outcome*0.1
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv_t
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
2322,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137
676,"AGEP=<=34.0, WKHP=<=29.0",0.09,-54846.288,227.3,0.159


In [129]:
# Single-item patterns ordered by ascending d_outcome ...
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
# ... restricted to items that mention WKHP (first 20 rows).
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


## min_sup_divergence  = 0.01

In [130]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [131]:
# Support threshold for this section: passed to extract_divergence_generalized
# and used as the key under which its result is stored in FP_results.
min_sup_divergence = 0.01

### Without Generalization 

In [132]:
import pandas as pd

# Do not truncate long cell contents (the itemset descriptions are long).
# Use the fully qualified option name instead of relying on pandas'
# partial-name matching, which fails if the short name ever becomes ambiguous.
pd.set_option("display.max_colwidth", None)

In [133]:
# Column names grouped under INFO: pattern identity, confusion-matrix counts,
# and the false-positive-rate statistics.
INFO = [
    "support",
    "itemsets",
    "tn",
    "fp",
    "fn",
    "tp",
    "d_fpr",
    "t_value_fp",
    "error",
]

In [134]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# This run keeps apply_generalization off, so the result is stored under "base".
apply_generalization = False
pattern_type = "generalized" if apply_generalization else "base"

# Results are stored per pattern type and per support threshold.
FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # True only for the "all_attributes" experiment; `allow_overalp` (sic) is the
    # keyword spelling used by this call.
    allow_overalp=(type_experiment == "all_attributes"),
)

In [135]:
# Rebind FP_fm to the result stored for the current pattern_type and
# min_sup_divergence; the following cells read FP_fm.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [136]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [137]:
# Single-item patterns ordered by ascending d_outcome ...
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
# ... restricted to items that mention WKHP (first 20 rows).
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


In [138]:
# Five patterns with the smallest `metric` value.
FP_fm.sort_values(by=metric).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
5265,0.010115,"(WKHP=<=29.0, MAR=Never married or under 15 years old, RELP=Noninstitutionalized group quarters population)",3,1978.0,6795.462083,-58389.608342,200.289408
3137,0.01508,"(WKHP=<=29.0, AGEP=<=26.0, SCHL=Regular high school diploma, MAR=Never married or under 15 years old)",4,2949.0,6948.565615,-58236.504809,174.290882
4248,0.011961,"(SCHL=Regular high school diploma, POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,2339.0,7044.236853,-58140.833571,150.22667
5103,0.010365,"(WKHP=<=29.0, RELP=Noninstitutionalized group quarters population)",2,2027.0,7057.648742,-58127.421683,195.474882
4048,0.012411,"(SCHL=Regular high school diploma, AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,2427.0,7111.915946,-58073.154479,154.674865


In [139]:
# Five patterns with the largest `metric` value.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
4388,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922
4366,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566
5235,0.010151,"(SCHL=Professional degree beyond a bachelor's degree, RELP=Reference person, SEX=Male)",3,1985.0,212307.662469,147122.592044,34.782031
4651,0.011127,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SCHL=Master's degree, WKHP=>=44.0, SEX=Male)",4,2176.0,209515.284926,144330.214502,39.830737
4472,0.01146,"(RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2241.0,208800.406069,143615.335644,36.158773


In [140]:
# Print the mean of the raw income column, then show the shortest pattern in
# FP_fm so the two values can be compared side by side.
print(df_analysis["income"].mean())
FP_fm.sort_values(by="length").head(1)

65192.04881813304


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
0,1.0,(),0,195556.0,65185.070425,0.0,0.0


#### Redundancy

In [141]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset, used below as the reference mean outcome.
# Select row and column in one .loc call and take the first match positionally:
# a bare `[0]` on the resulting Series is a *label* lookup and only works while
# the empty-itemset row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Ranking with th_redundancy=0, sorted ascending by fp_divergence_o.metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending=True)

# Same ranking with the redundancy threshold set to 5% of the mean outcome.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending=True)

3259.25, 65185.07


#### Top-3

In [142]:
# Ranking with th_redundancy=0, sorted ascending by fp_divergence_o.metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending = True)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5265,"MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58389.608,200.3,0.104
3137,"AGEP=<=26.0, MAR=Never married/<15yrs, SCHL=HS, WKHP=<=29.0",0.02,-58236.505,174.3,0.107
4248,"AGEP=<=26.0, MAR=Never married/<15yrs, POBP=CA, SCHL=HS, WKHP=<=29.0",0.01,-58140.834,150.2,0.108


In [143]:
# Ranking with th_redundancy=0, sorted descending by fp_divergence_o.metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending = False)
# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
4388,"SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,163479.862,40.3,3.508
4366,"MAR=Married, SCHL=Prof beyond bachelor, SEX=Male",0.01,156512.345,39.1,3.401
5235,"RELP=Ref person, SCHL=Prof beyond bachelor, SEX=Male",0.01,147122.592,34.8,3.257


#### Top-3 redundancy

In [144]:
# Redundancy threshold: 1% of the mean outcome.
v = mean_outcome*0.01
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv_t
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5103,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
2932,"AGEP=<=26.0, SCHL=HS, WKHP=<=29.0",0.02,-57926.0,156.5,0.111
915,"AGEP=<=26.0, RELP=Son/daughter, WKHP=<=29.0",0.04,-57776.041,242.1,0.114


In [145]:
# Redundancy threshold: 5% of the mean outcome.
v = mean_outcome*0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv_t
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5103,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
625,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137


In [146]:
# Redundancy threshold: 10% of the mean outcome.
v = mean_outcome*0.1
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending = True)

# printable_with_ratio assigns a "ratio" column on the frame it receives; pass an
# explicit copy so that assignment does not target a slice of fpdiv_t
# (pandas SettingWithCopyWarning).
printable_with_ratio(fpdiv_t.head(3).copy(), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5103,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
339,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
625,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137


In [147]:
# Single-item patterns of the "base" run stored under support threshold 0.025,
# ordered by ascending d_outcome ...
l = FP_results["base"][0.025]
l = l[l["length"] == 1].sort_values(by="d_outcome")
# ... restricted to items that mention WKHP (first 20 rows).
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


### With Generalization 

In [148]:
import time

from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# This run turns apply_generalization on, so the result is stored under
# "generalized".
apply_generalization = True
pattern_type = "generalized" if apply_generalization else "base"

# Time the extraction with perf_counter(): it is monotonic and meant for
# measuring elapsed intervals, whereas time.time() follows the wall clock and
# can jump if the system clock is adjusted.
st = time.perf_counter()

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    considerOnlyContinuos=False,
    # True only for the "all_attributes" experiment; `allow_overalp` (sic) is the
    # keyword spelling used by this call.
    allow_overalp=(type_experiment == "all_attributes"),
)

# Elapsed seconds for the extraction.
print(time.perf_counter() - st)

1 10000
2 20000
38.97925043106079


In [149]:
# Rebind FP_fm to the result stored for the current pattern_type and
# min_sup_divergence; the following cells read FP_fm.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [150]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [151]:
# Single-item patterns ordered by ascending d_outcome.
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
# First the WKHP items (explicit display), then the AGEP items as the cell output.
display(l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20))
l[l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


In [152]:
# Five patterns with the smallest `metric` value.
FP_fm.sort_values(by=metric).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
23642,0.010846,"(MAR=Never married or under 15 years old, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=39.0)",4,2121.0,6721.76992,-58463.300505,203.760619
26155,0.010115,"(WKHP=<=29.0, MAR=Never married or under 15 years old, RELP=Noninstitutionalized group quarters population)",3,1978.0,6795.462083,-58389.608342,200.289408
23265,0.010964,"(AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=39.0)",3,2144.0,6831.079291,-58353.991134,200.128757
19821,0.012222,"(AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old, OCCP=SAL)",5,2390.0,6857.711297,-58327.359128,237.233403
25260,0.01035,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, MAR=Never married or under 15 years old, OCCP=SAL)",5,2024.0,6873.552372,-58311.518053,226.388457


In [153]:
# Five patterns with the largest `metric` value.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
25541,0.010268,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,2008.0,237481.040837,172295.970412,39.344466
24251,0.010652,"(MAR=Married, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",4,2083.0,228965.972156,163780.901731,38.327022
21275,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922
26256,0.010094,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,1974.0,221792.608916,156607.538491,35.238657
21164,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566


In [154]:
# Patterns whose itemset contains at least one item mentioning "JWTR",
# ordered by ascending `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("JWTR" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [155]:
# Patterns whose itemset contains at least one item mentioning "JWAP",
# ordered by ascending `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("JWAP" in item for item in items))
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [156]:
# Patterns with at least one "JWAP" item that is not the "JWAP=NaN" item,
# ordered by ascending `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(
        lambda items: any("JWAP" in item and "JWAP=NaN" not in item for item in items)
    )
].sort_values(by=metric)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [157]:
# Single-item patterns ordered by ascending d_outcome ...
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
# ... restricted to items that mention AGEP (first 20 rows).
l[l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


#### Redundancy

In [158]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Divergence helper over the mined patterns, using d_outcome as the metric.
fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset (the row with itemsets == frozenset()).
# Selected positionally with .iloc[0]: a plain `[0]` is a label lookup and raises
# KeyError unless that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# All patterns (redundancy threshold 0), ordered by increasing metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending=True)

# Same ranking with the redundancy threshold set to 10% of mean_outcome.
v = mean_outcome * 0.1
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending=True)

In [159]:
# First 20 rows of fpdiv (built with th_redundancy=0 and sorted by ascending metric).
display(fpdiv.head(20))

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
23642,0.010846,"(MAR=Never married or under 15 years old, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=39.0)",4,2121.0,6721.76992,-58463.300505,203.760619
26155,0.010115,"(WKHP=<=29.0, MAR=Never married or under 15 years old, RELP=Noninstitutionalized group quarters population)",3,1978.0,6795.462083,-58389.608342,200.289408
23265,0.010964,"(AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=39.0)",3,2144.0,6831.079291,-58353.991134,200.128757
19821,0.012222,"(AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old, OCCP=SAL)",5,2390.0,6857.711297,-58327.359128,237.233403
25260,0.01035,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, MAR=Never married or under 15 years old, OCCP=SAL)",5,2024.0,6873.552372,-58311.518053,226.388457
25950,0.010171,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=SAL)",5,1989.0,6884.786325,-58300.2841,226.234964
18381,0.012876,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=SAL)",4,2518.0,6939.722002,-58245.348423,238.016882
14560,0.01508,"(WKHP=<=29.0, AGEP=<=26.0, SCHL=Regular high school diploma, MAR=Never married or under 15 years old)",4,2949.0,6948.565615,-58236.504809,174.290882
18124,0.013004,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old, OCCP=SAL)",4,2543.0,6951.207236,-58233.863189,235.935647
23941,0.010754,"(WKHP=<=29.0, AGEP=<=26.0, POBP=California/CA, OCCP=SAL)",4,2103.0,6970.361388,-58214.709036,224.861004


In [160]:
# First 20 rows of fpdiv_t (built with th_redundancy=v and sorted by ascending metric).
fpdiv_t.head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
25212,0.010365,"(WKHP=<=29.0, RELP=Noninstitutionalized group quarters population)",2,2027.0,7057.648742,-58127.421683,195.474882
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
2322,0.045915,"(WKHP=<=29.0, RELP=Biological son or daughter)",2,8979.0,8922.750863,-56262.319562,229.360368
24772,0.010488,"(WKHP=<=29.0, OCCP=SAL-Cashiers)",2,2051.0,9040.068259,-56145.002165,141.678107
21438,0.011603,"(OCCP=SAL-Cashiers, AGEP=<=26.0)",2,2269.0,9399.171441,-55785.898984,201.619177
676,0.088159,"(WKHP=<=29.0, AGEP=<=34.0)",2,17240.0,10338.782135,-54846.28829,227.314599
601,0.094413,"(AGEP=<=26.0, WKHP=<=39.0)",2,18463.0,10956.889671,-54228.180754,230.807747
8836,0.020787,"(WKHP=<=29.0, OCCP=EAT)",2,4065.0,11243.729397,-53941.341028,144.803803
653,0.089739,"(WKHP=<=29.0, MAR=Never married or under 15 years old)",2,17549.0,11808.095846,-53376.974579,206.363504
1172,0.06706,"(RELP=Biological son or daughter, WKHP=<=39.0)",2,13114.0,12223.590819,-52961.479606,222.591009


In [161]:
# Last 10 rows of the th_redundancy=0 ranking, followed by the last 10 rows of the
# ranking computed with the redundancy threshold v, for side-by-side comparison.
display(fpdiv.tail(10))
fpdiv_t.tail(10)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
19787,0.012237,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2393.0,218808.332637,153623.262212,39.524042
21228,0.011674,"(AGEP=>=46.0, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree)",3,2283.0,219292.229523,154107.159098,38.725222
17510,0.01329,"(WKHP=>=40.0, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,2599.0,219292.527895,154107.457471,42.242858
26286,0.010084,"(RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,1972.0,220337.312373,155152.241948,35.768288
24734,0.010498,"(AGEP=>=35.0, SCHL=Master's degree, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",5,2053.0,221466.678032,156281.607607,40.481408
21164,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566
26256,0.010094,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,1974.0,221792.608916,156607.538491,35.238657
21275,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922
24251,0.010652,"(MAR=Married, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",4,2083.0,228965.972156,163780.901731,38.327022
25541,0.010268,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,2008.0,237481.040837,172295.970412,39.344466


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
17345,0.013372,"(SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",3,2615.0,217735.774379,152550.703954,40.445336
19787,0.012237,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2393.0,218808.332637,153623.262212,39.524042
21228,0.011674,"(AGEP=>=46.0, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree)",3,2283.0,219292.229523,154107.159098,38.725222
17510,0.01329,"(WKHP=>=40.0, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,2599.0,219292.527895,154107.457471,42.242858
24734,0.010498,"(AGEP=>=35.0, SCHL=Master's degree, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",5,2053.0,221466.678032,156281.607607,40.481408
21164,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566
26256,0.010094,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,1974.0,221792.608916,156607.538491,35.238657
21275,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922
24251,0.010652,"(MAR=Married, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",4,2083.0,228965.972156,163780.901731,38.327022
25541,0.010268,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,2008.0,237481.040837,172295.970412,39.344466


#### Analysis

In [162]:
# FP_fm.loc[FP_fm["length"] == 1].loc[FP_fm["itemsets"].apply(lambda x: len([i for i in x if "POBP" in i ])>0)].sort_values(metric, ascending = True)

# Patterns containing at least one item whose text includes "OCCP", by increasing `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda itemset: any("OCCP" in item for item in itemset))
].sort_values(metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
19821,0.012222,"(AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old, OCCP=SAL)",5,2390.0,6857.711297,-58327.359128,237.233403
25260,0.010350,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, MAR=Never married or under 15 years old, OCCP=SAL)",5,2024.0,6873.552372,-58311.518053,226.388457
25950,0.010171,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=SAL)",5,1989.0,6884.786325,-58300.284100,226.234964
18381,0.012876,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=SAL)",4,2518.0,6939.722002,-58245.348423,238.016882
18124,0.013004,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old, OCCP=SAL)",4,2543.0,6951.207236,-58233.863189,235.935647
...,...,...,...,...,...,...,...
25388,0.010314,(OCCP=MGR-Chief Executives And Legislators),1,2017.0,208556.807139,143371.736714,34.251326
22257,0.011291,"(OCCP=MGR, AGEP=>=46.0, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",5,2208.0,210097.196558,144912.126133,38.777271
24810,0.010478,"(OCCP=MGR, AGEP=>=46.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0, SEX=Male)",5,2049.0,210158.799414,144973.728990,37.211200
17294,0.013403,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0, SEX=Male)",6,2621.0,212623.227776,147438.157351,42.551811


In [163]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Rebuild the divergence helper on the current FP_fm with d_outcome as the metric,
# then display its divergence table with the redundancy threshold set to 0.
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=0)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
25541,0.010268,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,2008.0,237481.040837,172295.970412,39.344466
24251,0.010652,"(MAR=Married, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",4,2083.0,228965.972156,163780.901731,38.327022
21275,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922
26256,0.010094,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,1974.0,221792.608916,156607.538491,35.238657
21164,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566
...,...,...,...,...,...,...,...
25260,0.010350,"(POBP=California/CA, AGEP=<=26.0, WKHP=<=29.0, MAR=Never married or under 15 years old, OCCP=SAL)",5,2024.0,6873.552372,-58311.518053,226.388457
19821,0.012222,"(AGEP=<=26.0, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old, OCCP=SAL)",5,2390.0,6857.711297,-58327.359128,237.233403
23265,0.010964,"(AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=39.0)",3,2144.0,6831.079291,-58353.991134,200.128757
26155,0.010115,"(WKHP=<=29.0, MAR=Never married or under 15 years old, RELP=Noninstitutionalized group quarters population)",3,1978.0,6795.462083,-58389.608342,200.289408


In [164]:
# Outcome of the empty itemset (the row with itemsets == frozenset()).
# Selected positionally with .iloc[0]: a plain `[0]` is a label lookup and raises
# KeyError unless that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Redundancy threshold: 10% of mean_outcome.
v = mean_outcome * 0.1

In [165]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Show the redundancy threshold in use (v is set to mean_outcome*0.1 in the previous cell).
print(v)
# Rebuild the divergence helper with d_outcome as the metric and display the
# divergence table obtained with th_redundancy=v.
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=v)

6518.507042483994


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
25541,0.010268,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,2008.0,237481.040837,172295.970412,39.344466
24251,0.010652,"(MAR=Married, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",4,2083.0,228965.972156,163780.901731,38.327022
21275,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922
26256,0.010094,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,1974.0,221792.608916,156607.538491,35.238657
21164,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566
...,...,...,...,...,...,...,...
21438,0.011603,"(OCCP=SAL-Cashiers, AGEP=<=26.0)",2,2269.0,9399.171441,-55785.898984,201.619177
24772,0.010488,"(WKHP=<=29.0, OCCP=SAL-Cashiers)",2,2051.0,9040.068259,-56145.002165,141.678107
2322,0.045915,"(WKHP=<=29.0, RELP=Biological son or daughter)",2,8979.0,8922.750863,-56262.319562,229.360368
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.626190


In [166]:
# Divergence table computed with redundancy threshold v.
f = fp_divergence_o.getDivergence(th_redundancy=v)

# Of those rows, keep the patterns with an "OCCP" item, by decreasing `metric`.
f.loc[
    f["itemsets"].apply(lambda itemset: any("OCCP" in item for item in itemset))
].sort_values(metric, ascending=False)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
17294,0.013403,"(AGEP=>=35.0, OCCP=MGR, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0, SEX=Male)",6,2621.0,212623.227776,147438.157351,42.551811
24810,0.010478,"(OCCP=MGR, AGEP=>=46.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0, SEX=Male)",5,2049.0,210158.799414,144973.728990,37.211200
22257,0.011291,"(OCCP=MGR, AGEP=>=46.0, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",5,2208.0,210097.196558,144912.126133,38.777271
25388,0.010314,(OCCP=MGR-Chief Executives And Legislators),1,2017.0,208556.807139,143371.736714,34.251326
25377,0.010314,"(AGEP=>=35.0, OCCP=MGR, RELP=Reference person, MAR=Married, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",6,2017.0,208230.019831,143044.949407,36.626199
...,...,...,...,...,...,...,...
14533,0.015106,"(RELP=Biological son or daughter, OCCP=EAT)",2,2954.0,13155.142180,-52029.928245,159.578722
7476,0.023257,"(AGEP=<=26.0, OCCP=EAT)",2,4548.0,12802.069041,-52383.001384,153.320276
8836,0.020787,"(WKHP=<=29.0, OCCP=EAT)",2,4065.0,11243.729397,-53941.341028,144.803803
21438,0.011603,"(OCCP=SAL-Cashiers, AGEP=<=26.0)",2,2269.0,9399.171441,-55785.898984,201.619177


#### Top-3

In [167]:
# Rank all patterns (redundancy threshold 0) by increasing metric and render
# the three lowest ones as an abbreviated table with the outcome ratio.
fpdiv = (
    fp_divergence_o
    .getDivergence(th_redundancy=0)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv.head(3), abbreviations=abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
23642,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=39.0",0.01,-58463.301,203.8,0.103
26155,"MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58389.608,200.3,0.104
23265,"AGEP=<=26.0, RELP=Noninstit. GQs, WKHP=<=39.0",0.01,-58353.991,200.1,0.105


In [168]:
# Rank all patterns (redundancy threshold 0) by decreasing metric and render
# the three highest ones as an abbreviated table with the outcome ratio.
fpdiv = (
    fp_divergence_o
    .getDivergence(th_redundancy=0)
    .sort_values(fp_divergence_o.metric, ascending=False)
)
printable_with_ratio(fpdiv.head(3), abbreviations=abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
25541,"AGEP=>=35.0, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=40.0",0.01,172295.97,39.3,3.643
24251,"AGEP=>=35.0, MAR=Married, SCHL=Prof beyond bachelor, SEX=Male",0.01,163780.902,38.3,3.513
21275,"SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,163479.862,40.3,3.508


#### Top-3 redundancy

In [169]:
# Redundancy threshold: 1% of mean_outcome.
v = 0.01 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Rank the patterns kept under that threshold by increasing metric; show the first three.
fpdiv_t = (
    fp_divergence_o
    .getDivergence(th_redundancy=v)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv_t.head(3), abbreviations=abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
23265,"AGEP=<=26.0, RELP=Noninstit. GQs, WKHP=<=39.0",0.01,-58353.991,200.1,0.105
25212,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
16689,"AGEP=<=26.0, OCCP=SAL, WKHP=<=29.0",0.01,-58097.099,235.2,0.109


In [170]:
# Redundancy threshold: 5% of mean_outcome.
v = 0.05 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Rank the patterns kept under that threshold by increasing metric; show the first three.
fpdiv_t = (
    fp_divergence_o
    .getDivergence(th_redundancy=v)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv_t.head(3), abbreviations=abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
25212,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
19230,"RELP=Noninstit. GQs, WKHP=<=39.0",0.01,-57115.581,183.2,0.124
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125


In [171]:
# Redundancy threshold: 10% of mean_outcome.
v = 0.1 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Rank the patterns kept under that threshold by increasing metric; show the first three.
fpdiv_t = (
    fp_divergence_o
    .getDivergence(th_redundancy=v)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv_t.head(3), abbreviations=abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
25212,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
1168,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125
2322,"RELP=Son/daughter, WKHP=<=29.0",0.05,-56262.32,229.4,0.137


In [172]:
# Length-1 patterns, ordered by increasing d_outcome.
l = FP_fm.loc[lambda patterns: patterns["length"] == 1].sort_values("d_outcome")
# First 20 of them whose item mentions WKHP.
l.loc[
    l["itemsets"].apply(lambda itemset: any("WKHP" in item for item in itemset))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


## min_sup_divergence = 0.005

In [173]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [174]:
# Minimum support used for this section: passed to extract_divergence_generalized
# and used as the key under which its result is stored in FP_results.
min_sup_divergence = 0.005

### Without Generalization 

In [175]:
import pandas as pd

# Show full cell contents (itemset strings are long). Use the fully-qualified option
# name: abbreviated names are resolved by pattern matching and can become ambiguous.
pd.set_option("display.max_colwidth", None)

In [176]:
# Column names grouped under INFO.
# NOTE(review): looks like the column set of a classification-style (FPR) divergence table;
# it is not referenced by the nearby cells — confirm it is still needed.
INFO = [
    "support",
    "itemsets",
    "tn",
    "fp",
    "fn",
    "tp",
    "d_fpr",
    "t_value_fp",
    "error",
]

In [177]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Run the extraction without generalization; the result is stored under the "base" key.
apply_generalization = False
pattern_type = "base" if not apply_generalization else "generalized"

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    #true_class_name=pred_name,
    #predicted_class_name=class_name,
    target_name=target,
    #class_map=class_map,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # Keyword spelling ("overalp") matches the callee's parameter name.
    allow_overalp=(type_experiment == "all_attributes"),
)

1 10000


In [178]:
# Point FP_fm at the table stored in FP_results for the current pattern_type and min_sup_divergence.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [179]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [180]:
# Length-1 patterns, ordered by increasing d_outcome.
l = FP_fm.loc[lambda patterns: patterns["length"] == 1].sort_values("d_outcome")
# First 20 of them whose item mentions WKHP.
l.loc[
    l["itemsets"].apply(lambda itemset: any("WKHP" in item for item in itemset))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


In [181]:
# Five patterns with the smallest value of `metric`.
FP_fm.sort_values(by=metric, ascending=True).head(n=5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
12572,0.005016,"(WKHP=<=29.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",3,981.0,4594.79103,-60590.279395,203.925669
10233,0.005911,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Noninstitutionalized group quarters population)",4,1156.0,5289.233564,-59895.836861,205.227141
10378,0.00585,"(AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,1144.0,5315.16958,-59869.900844,204.01992
9521,0.00628,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, POBP=California/CA)",4,1228.0,5662.104235,-59522.96619,197.543755
9597,0.006239,"(POBP=California/CA, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",5,1220.0,5678.085246,-59506.985179,196.773637


In [182]:
# Five patterns with the largest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(n=5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
11244,0.005482,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1072.0,277420.419776,212235.349351,33.472128
11627,0.005339,"(RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1044.0,262434.971264,197249.90084,31.414025
10584,0.005748,(OCCP=MED-Physicians),1,1124.0,262061.352313,196876.281888,33.230349
8269,0.007072,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",3,1383.0,258110.115691,192925.045266,34.810372
9743,0.006152,"(RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, MAR=Married)",4,1203.0,254145.544472,188960.474047,32.787377


In [183]:
# Mean of the income column of df_analysis, printed next to the shortest
# pattern in FP_fm so the two values can be compared.
print(df_analysis["income"].describe().loc["mean"])
FP_fm.sort_values(by="length", ascending=True).head(1)

65192.04881813304


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
0,1.0,(),0,195556.0,65185.070425,0.0,0.0


#### Redundancy

In [184]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Divergence helper over the mined patterns, using d_outcome as the metric.
fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset (the row with itemsets == frozenset()).
# Selected positionally with .iloc[0]: a plain `[0]` is a label lookup and raises
# KeyError unless that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# All patterns (redundancy threshold 0), ordered by increasing metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending=True)

# Same ranking with the redundancy threshold set to 5% of mean_outcome.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending=True)

3259.25, 65185.07


#### Top-3

In [185]:
# Rank all patterns (redundancy threshold 0) by increasing metric and render
# the three lowest ones as an abbreviated table with the outcome ratio.
fpdiv = (
    fp_divergence_o
    .getDivergence(th_redundancy=0)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv.head(3), abbreviations=abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
12572,"MAR=Never married/<15yrs, SCHL=Grade 11, WKHP=<=29.0",0.01,-60590.279,203.9,0.07
10233,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-59895.837,205.2,0.081
10378,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-59869.901,204.0,0.082


In [186]:
# Rank all patterns (redundancy threshold 0) by decreasing metric and render
# the three highest ones as an abbreviated table with the outcome ratio.
fpdiv = (
    fp_divergence_o
    .getDivergence(th_redundancy=0)
    .sort_values(fp_divergence_o.metric, ascending=False)
)
printable_with_ratio(fpdiv.head(3), abbreviations=abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
11244,"MAR=Married, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=44.0",0.01,212235.349,33.5,4.256
11627,"RAC=White, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=44.0",0.01,197249.901,31.4,4.026
10584,OCCP=MED-Physicians,0.01,196876.282,33.2,4.02


#### Top-3 redundancy

In [187]:
# Redundancy threshold: 1% of mean_outcome.
v = 0.01 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Rank the patterns kept under that threshold by increasing metric; show the first three.
fpdiv_t = (
    fp_divergence_o
    .getDivergence(th_redundancy=v)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv_t.head(3), abbreviations=abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
12572,"MAR=Never married/<15yrs, SCHL=Grade 11, WKHP=<=29.0",0.01,-60590.279,203.9,0.07
10233,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-59895.837,205.2,0.081
9144,"COW=Empl. for-profit-c, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-59088.537,195.8,0.094


In [188]:
# Redundancy threshold: 5% of mean_outcome.
v = 0.05 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Rank the patterns kept under that threshold by increasing metric; show the first three.
fpdiv_t = (
    fp_divergence_o
    .getDivergence(th_redundancy=v)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv_t.head(3), abbreviations=abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5103,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
6315,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Noninstit. GQs",0.01,-57889.881,139.2,0.112
9864,"SCHL=Grade 11, WKHP=<=29.0",0.01,-57703.87,87.4,0.115


In [189]:
# Redundancy threshold: 10% of mean_outcome.
v = 0.1 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Rank the patterns kept under that threshold by increasing metric; show the first three.
fpdiv_t = (
    fp_divergence_o
    .getDivergence(th_redundancy=v)
    .sort_values(fp_divergence_o.metric, ascending=True)
)
printable_with_ratio(fpdiv_t.head(3), abbreviations=abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5103,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
9864,"SCHL=Grade 11, WKHP=<=29.0",0.01,-57703.87,87.4,0.115
8745,"AGEP=<=26.0, SCHL=Grade 11",0.01,-57585.412,142.9,0.117


In [190]:
# Length-1 base patterns, ordered by increasing d_outcome.
# NOTE(review): this reads the results stored under the hard-coded key 0.025, while this
# section sets min_sup_divergence = 0.005 — confirm the 0.025 run is the intended one.
l = (
    FP_results["base"][0.025]
    .loc[lambda patterns: patterns["length"] == 1]
    .sort_values("d_outcome")
)
# First 20 of them whose item mentions WKHP.
l.loc[
    l["itemsets"].apply(lambda itemset: any("WKHP" in item for item in itemset))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


### With Generalization 

In [191]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Run the extraction with generalization; the result is stored under the "generalized" key.
apply_generalization = True
pattern_type = "base" if not apply_generalization else "generalized"

import time

# Wall-clock start, so the elapsed seconds can be printed after the call.
st = time.time()

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    considerOnlyContinuos=False,
    # Keyword spelling ("overalp") matches the callee's parameter name.
    allow_overalp=(type_experiment == "all_attributes"),
)

print(time.time() - st)

1 10000
2 20000
3 30000
4 40000
5 50000
6 60000
7 70000
58.37598443031311


In [192]:
# Point FP_fm at the table stored in FP_results for the current pattern_type and min_sup_divergence.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [193]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [194]:
# Length-1 patterns, ordered by increasing d_outcome.
l = FP_fm.loc[lambda patterns: patterns["length"] == 1].sort_values("d_outcome")
# First 20 WKHP items (explicit display), then first 20 AGEP items (cell output).
display(
    l.loc[
        l["itemsets"].apply(lambda itemset: any("WKHP" in item for item in itemset))
    ].head(20)
)
l.loc[
    l["itemsets"].apply(lambda itemset: any("AGEP" in item for item in itemset))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


In [195]:
# Five patterns with the smallest value of `metric`.
FP_fm.sort_values(by=metric, ascending=True).head(n=5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
70280,0.005016,"(WKHP=<=29.0, SCHL=Grade 11, AGEP=<=34.0)",3,981.0,4410.214067,-60774.856358,213.639704
70329,0.005016,"(WKHP=<=29.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",3,981.0,4594.79103,-60590.279395,203.925669
65032,0.005318,"(WKHP=<=39.0, AGEP=<=26.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",4,1040.0,4676.923077,-60508.147348,208.483231
63369,0.005415,"(WKHP=<=39.0, AGEP=<=26.0, SCHL=Grade 11)",3,1059.0,4818.338055,-60366.73237,205.947561
55998,0.005911,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Noninstitutionalized group quarters population)",4,1156.0,5289.233564,-59895.836861,205.227141


In [196]:
# Five patterns with the largest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(n=5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
56034,0.005906,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, AGEP=>=35.0, SEX=Male)",4,1155.0,281019.203463,215834.133038,34.792665
62302,0.005482,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1072.0,277420.419776,212235.349351,33.472128
63625,0.0054,"(AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married, RAC1P=White alone, WKHP=>=44.0)",5,1056.0,267271.676136,202086.605712,32.243483
70577,0.005001,"(AGEP=>=46.0, RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",4,978.0,266948.680982,201763.610557,30.364818
42645,0.007195,"(AGEP=>=35.0, WKHP=>=44.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,1407.0,266824.292822,201639.222397,37.107282


In [197]:
# Patterns containing at least one item whose text includes "JWTR", by increasing `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda itemset: any("JWTR" in item for item in itemset))
].sort_values(metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [198]:
# Patterns containing at least one item whose text includes "JWAP", by increasing `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda itemset: any("JWAP" in item for item in itemset))
].sort_values(metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [199]:
# Patterns with at least one "JWAP" item that is not the "JWAP=NaN" one, by increasing `metric`.
FP_fm.loc[
    FP_fm["itemsets"].apply(
        lambda itemset: any("JWAP" in item and "JWAP=NaN" not in item for item in itemset)
    )
].sort_values(metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [200]:
# Length-1 patterns, ordered by increasing d_outcome.
l = FP_fm.loc[lambda patterns: patterns["length"] == 1].sort_values("d_outcome")
# First 20 of them whose item mentions AGEP.
l.loc[
    l["itemsets"].apply(lambda itemset: any("AGEP" in item for item in itemset))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


#### Redundancy

In [201]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset row, used as the overall mean outcome.
# Select it by position: `[...][0]` is a *label* lookup and only works while
# that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# th_redundancy=0 result, sorted ascending by the divergence metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=True
)

# Same ranking with th_redundancy set to 10% of the mean outcome.
v = mean_outcome * 0.1
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

In [202]:
# Show the first 20 rows of fpdiv (the th_redundancy=0 ranking).
display(fpdiv.head(20))

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
70280,0.005016,"(WKHP=<=29.0, SCHL=Grade 11, AGEP=<=34.0)",3,981.0,4410.214067,-60774.856358,213.639704
70329,0.005016,"(WKHP=<=29.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",3,981.0,4594.79103,-60590.279395,203.925669
65032,0.005318,"(WKHP=<=39.0, AGEP=<=26.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",4,1040.0,4676.923077,-60508.147348,208.483231
63369,0.005415,"(WKHP=<=39.0, AGEP=<=26.0, SCHL=Grade 11)",3,1059.0,4818.338055,-60366.73237,205.947561
55998,0.005911,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Noninstitutionalized group quarters population)",4,1156.0,5289.233564,-59895.836861,205.227141
58850,0.005707,"(WKHP=<=39.0, MAR=Never married or under 15 years old, SCHL=Grade 11, AGEP=<=34.0)",4,1116.0,5293.422939,-59891.647486,201.374686
56847,0.00585,"(AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,1144.0,5315.16958,-59869.900844,204.01992
66755,0.005211,"(POBP=California/CA, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=<=39.0)",5,1019.0,5644.017664,-59541.05276,205.466779
67997,0.005144,"(POBP=California/CA, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old, WKHP=<=39.0)",6,1006.0,5658.502982,-59526.567443,204.153187
51493,0.00628,"(WKHP=<=29.0, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, POBP=California/CA)",4,1228.0,5662.104235,-59522.96619,197.543755


In [203]:
# First 20 rows of fpdiv_t (the ranking built with a non-zero th_redundancy).
fpdiv_t.head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
25211,0.010365,"(WKHP=<=29.0, RELP=Noninstitutionalized group quarters population)",2,2027.0,7057.648742,-58127.421683,195.474882
53771,0.00609,"(WKHP=<=29.0, SCHL=Grade 11)",2,1191.0,7481.200672,-57703.869753,87.367529
46745,0.006735,"(AGEP=<=26.0, SCHL=Grade 11)",2,1317.0,7599.658314,-57585.41211,142.904024
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
65041,0.005318,"(RELP=Biological son or daughter, SCHL=Grade 11)",2,1040.0,8353.144231,-56831.926194,134.434957
2322,0.045915,"(WKHP=<=29.0, RELP=Biological son or daughter)",2,8979.0,8922.750863,-56262.319562,229.360368
24773,0.010488,"(WKHP=<=29.0, OCCP=SAL-Cashiers)",2,2051.0,9040.068259,-56145.002165,141.678107
21441,0.011603,"(OCCP=SAL-Cashiers, AGEP=<=26.0)",2,2269.0,9399.171441,-55785.898984,201.619177
676,0.088159,"(WKHP=<=29.0, AGEP=<=34.0)",2,17240.0,10338.782135,-54846.28829,227.314599
36389,0.008039,"(WKHP=<=39.0, SCHL=Grade 11)",2,1572.0,10348.861323,-54836.209102,81.517239


In [204]:
# Last 10 rows of both rankings, for side-by-side comparison:
# fpdiv (th_redundancy=0) first, then fpdiv_t as the cell's result.
display(fpdiv.tail(10))
fpdiv_t.tail(10)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
43678,0.007072,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",3,1383.0,258110.115691,192925.045266,34.810372
58285,0.005748,(OCCP=MED-Physicians),1,1124.0,262061.352313,196876.281888,33.230349
64709,0.005339,"(RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1044.0,262434.971264,197249.90084,31.414025
65325,0.005298,"(OCCP=MED, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0)",4,1036.0,265348.735521,200163.665096,32.683352
51239,0.0063,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",3,1232.0,266402.321429,201217.251004,33.719249
42645,0.007195,"(AGEP=>=35.0, WKHP=>=44.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,1407.0,266824.292822,201639.222397,37.107282
70577,0.005001,"(AGEP=>=46.0, RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",4,978.0,266948.680982,201763.610557,30.364818
63625,0.0054,"(AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married, RAC1P=White alone, WKHP=>=44.0)",5,1056.0,267271.676136,202086.605712,32.243483
62302,0.005482,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1072.0,277420.419776,212235.349351,33.472128
56034,0.005906,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, AGEP=>=35.0, SEX=Male)",4,1155.0,281019.203463,215834.133038,34.792665


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
29482,0.009307,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, AGEP=>=35.0)",3,1820.0,254248.714286,189063.643861,40.079374
69969,0.005032,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, OCCP=MGR-Chief Executives And Legislators)",3,984.0,255728.79065,190543.720226,29.270607
54494,0.006024,"(AGEP=>=46.0, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married, SEX=Male)",5,1178.0,257434.219015,192249.14859,32.387721
43678,0.007072,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",3,1383.0,258110.115691,192925.045266,34.810372
58285,0.005748,(OCCP=MED-Physicians),1,1124.0,262061.352313,196876.281888,33.230349
65325,0.005298,"(OCCP=MED, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0)",4,1036.0,265348.735521,200163.665096,32.683352
51239,0.0063,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",3,1232.0,266402.321429,201217.251004,33.719249
42645,0.007195,"(AGEP=>=35.0, WKHP=>=44.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,1407.0,266824.292822,201639.222397,37.107282
62302,0.005482,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1072.0,277420.419776,212235.349351,33.472128
56034,0.005906,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, AGEP=>=35.0, SEX=Male)",4,1155.0,281019.203463,215834.133038,34.792665


#### Analysis

In [205]:
# All patterns with at least one OCCP item, lowest `metric` first.
# (An earlier, disabled variant of this cell looked at length-1 POBP patterns instead.)
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(by=metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
46283,0.006786,"(AGEP=<=26.0, OCCP=SAL-Cashiers, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",5,1327.0,6163.896006,-59021.174419,234.808239
43114,0.007139,"(WKHP=<=29.0, AGEP=<=26.0, MAR=Never married or under 15 years old, OCCP=SAL-Cashiers)",4,1396.0,6230.351003,-58954.719422,234.288367
43121,0.007134,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=SAL-Cashiers)",4,1395.0,6240.501792,-58944.568633,236.427517
62941,0.005441,"(POBP=California/CA, AGEP=<=26.0, OCCP=SAL-Cashiers, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, MAR=Never married or under 15 years old)",6,1064.0,6249.915414,-58935.155011,219.318957
40112,0.007512,"(WKHP=<=29.0, AGEP=<=26.0, OCCP=SAL-Cashiers)",3,1469.0,6303.458135,-58881.612290,236.240866
...,...,...,...,...,...,...,...
69348,0.005068,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=MGR-Chief Executives And Legislators, WKHP=>=40.0)",3,991.0,248602.441978,183417.371553,29.307662
65483,0.005287,"(AGEP=>=35.0, OCCP=MGR, SCHL=Master's degree, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",5,1034.0,251942.669246,186757.598821,32.809004
69969,0.005032,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, OCCP=MGR-Chief Executives And Legislators)",3,984.0,255728.790650,190543.720226,29.270607
58285,0.005748,(OCCP=MED-Physicians),1,1124.0,262061.352313,196876.281888,33.230349


In [206]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Rebuild the divergence helper on the current FP_fm for the "d_outcome" metric
# and display its result with th_redundancy=0.
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=0)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
56034,0.005906,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, AGEP=>=35.0, SEX=Male)",4,1155.0,281019.203463,215834.133038,34.792665
62302,0.005482,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1072.0,277420.419776,212235.349351,33.472128
63625,0.005400,"(AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married, RAC1P=White alone, WKHP=>=44.0)",5,1056.0,267271.676136,202086.605712,32.243483
70577,0.005001,"(AGEP=>=46.0, RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",4,978.0,266948.680982,201763.610557,30.364818
42645,0.007195,"(AGEP=>=35.0, WKHP=>=44.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,1407.0,266824.292822,201639.222397,37.107282
...,...,...,...,...,...,...,...
55998,0.005911,"(WKHP=<=29.0, AGEP=<=26.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, RELP=Noninstitutionalized group quarters population)",4,1156.0,5289.233564,-59895.836861,205.227141
63369,0.005415,"(WKHP=<=39.0, AGEP=<=26.0, SCHL=Grade 11)",3,1059.0,4818.338055,-60366.732370,205.947561
65032,0.005318,"(WKHP=<=39.0, AGEP=<=26.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",4,1040.0,4676.923077,-60508.147348,208.483231
70329,0.005016,"(WKHP=<=29.0, MAR=Never married or under 15 years old, SCHL=Grade 11)",3,981.0,4594.791030,-60590.279395,203.925669


In [207]:
# Outcome of the empty itemset row, used as the overall mean outcome.
# Select it by position: `[...][0]` is a *label* lookup and only works while
# that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Threshold later passed as th_redundancy: 10% of the mean outcome.
v = mean_outcome * 0.1

In [208]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Print the threshold value, then display the divergence table obtained
# when v is passed as th_redundancy.
print(v)
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=v)

6518.507042483994


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
56034,0.005906,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, AGEP=>=35.0, SEX=Male)",4,1155.0,281019.203463,215834.133038,34.792665
62302,0.005482,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",4,1072.0,277420.419776,212235.349351,33.472128
42645,0.007195,"(AGEP=>=35.0, WKHP=>=44.0, SCHL=Professional degree beyond a bachelor's degree, MAR=Married)",4,1407.0,266824.292822,201639.222397,37.107282
51239,0.006300,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",3,1232.0,266402.321429,201217.251004,33.719249
65325,0.005298,"(OCCP=MED, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0)",4,1036.0,265348.735521,200163.665096,32.683352
...,...,...,...,...,...,...,...
65041,0.005318,"(RELP=Biological son or daughter, SCHL=Grade 11)",2,1040.0,8353.144231,-56831.926194,134.434957
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.626190
46745,0.006735,"(AGEP=<=26.0, SCHL=Grade 11)",2,1317.0,7599.658314,-57585.412110,142.904024
53771,0.006090,"(WKHP=<=29.0, SCHL=Grade 11)",2,1191.0,7481.200672,-57703.869753,87.367529


In [209]:
# Divergence table with v passed as th_redundancy.
f = fp_divergence_o.getDivergence(th_redundancy=v)

# Rows of f with at least one OCCP item, highest `metric` first.
f.loc[
    f["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(by=metric, ascending=False)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
65325,0.005298,"(OCCP=MED, WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0)",4,1036.0,265348.735521,200163.665096,32.683352
58285,0.005748,(OCCP=MED-Physicians),1,1124.0,262061.352313,196876.281888,33.230349
69969,0.005032,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, OCCP=MGR-Chief Executives And Legislators)",3,984.0,255728.790650,190543.720226,29.270607
65483,0.005287,"(AGEP=>=35.0, OCCP=MGR, SCHL=Master's degree, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, WKHP=>=44.0)",5,1034.0,251942.669246,186757.598821,32.809004
62790,0.005451,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=MGR-Chief Executives And Legislators)",2,1066.0,246954.174484,181769.104059,29.395599
...,...,...,...,...,...,...,...
69250,0.005073,"(OCCP=OFF-Customer Service Representatives, WKHP=<=29.0)",2,992.0,12681.058468,-52504.011957,80.582104
36943,0.007947,"(OCCP=PRS, AGEP=<=26.0)",2,1554.0,11997.844273,-53187.226152,135.128395
8837,0.020787,"(WKHP=<=29.0, OCCP=EAT)",2,4065.0,11243.729397,-53941.341028,144.803803
21441,0.011603,"(OCCP=SAL-Cashiers, AGEP=<=26.0)",2,2269.0,9399.171441,-55785.898984,201.619177


#### Top-3

In [210]:
# th_redundancy=0 ranking, ascending by the divergence metric; show the first three rows.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
70280,"AGEP=<=34.0, SCHL=Grade 11, WKHP=<=29.0",0.01,-60774.856,213.6,0.068
70329,"MAR=Never married/<15yrs, SCHL=Grade 11, WKHP=<=29.0",0.01,-60590.279,203.9,0.07
65032,"AGEP=<=26.0, MAR=Never married/<15yrs, SCHL=Grade 11, WKHP=<=39.0",0.01,-60508.147,208.5,0.072


In [211]:
# th_redundancy=0 ranking, descending by the divergence metric; show the first three rows.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    by=fp_divergence_o.metric, ascending=False
)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
56034,"AGEP=>=35.0, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=44.0",0.01,215834.133,34.8,4.311
62302,"MAR=Married, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=44.0",0.01,212235.349,33.5,4.256
63625,"AGEP=>=35.0, MAR=Married, RAC=White, SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,202086.606,32.2,4.1


#### Top-3 redundancy

In [212]:
# Threshold passed as th_redundancy: 1% of the mean outcome.
v = 0.01 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Ranking at that threshold, ascending by the divergence metric; show the first three rows.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
70280,"AGEP=<=34.0, SCHL=Grade 11, WKHP=<=29.0",0.01,-60774.856,213.6,0.068
70329,"MAR=Never married/<15yrs, SCHL=Grade 11, WKHP=<=29.0",0.01,-60590.279,203.9,0.07
63369,"AGEP=<=26.0, SCHL=Grade 11, WKHP=<=39.0",0.01,-60366.732,205.9,0.074


In [213]:
# Threshold passed as th_redundancy: 5% of the mean outcome.
v = 0.05 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Ranking at that threshold, ascending by the divergence metric; show the first three rows.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
54358,"AGEP=<=34.0, SCHL=Grade 11, WKHP=<=39.0",0.01,-59451.971,196.3,0.088
53099,"MAR=Never married/<15yrs, SCHL=Grade 11, WKHP=<=39.0",0.01,-58943.971,180.9,0.096
25211,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108


In [214]:
# Threshold passed as th_redundancy: 10% of the mean outcome.
v = 0.1 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Ranking at that threshold, ascending by the divergence metric; show the first three rows.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
25211,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
53771,"SCHL=Grade 11, WKHP=<=29.0",0.01,-57703.87,87.4,0.115
46745,"AGEP=<=26.0, SCHL=Grade 11",0.01,-57585.412,142.9,0.117


In [215]:
# Single-item patterns ordered by d_outcome; then keep only the WKHP ones.
l = FP_fm[FP_fm["length"].eq(1)].sort_values(by="d_outcome")
l.loc[
    l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


## min_sup_divergence = 0.001

In [216]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [217]:
# Support threshold used by the extraction cells of this section (0.001).
min_sup_divergence = 1e-3

### Without Generalization 

In [218]:
import pandas as pd

# Remove the column-width limit so long itemset strings are not truncated.
pd.set_option("display.max_colwidth", None)

In [219]:
# Column names of interest. NOTE(review): these look like columns of a
# classification-style divergence table; confirm where INFO is consumed.
INFO = [
    "support",
    "itemsets",
    "tn",
    "fp",
    "fn",
    "tp",
    "d_fpr",
    "t_value_fp",
    "error",
]

In [220]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Extraction with generalization switched off; results are stored under "base".
apply_generalization = False
pattern_type = "base" if not apply_generalization else "generalized"

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # Keyword spelling ("overalp") kept exactly as in the original call.
    # True only for the "all_attributes" experiment type.
    allow_overalp=(type_experiment == "all_attributes"),
)

1 10000
2 20000
3 30000
4 40000
5 50000
6 60000
7 70000
8 80000


In [221]:
# Point FP_fm at the results stored for the current pattern_type and support threshold.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [222]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [223]:
# Single-item patterns ordered by d_outcome; then keep only the WKHP ones.
l = FP_fm[FP_fm["length"].eq(1)].sort_values(by="d_outcome")
l.loc[
    l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


In [224]:
# Five patterns with the lowest value of `metric`.
FP_fm.sort_values(by=metric, ascending=True).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
66565,0.001202,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0)",5,235.0,3305.276596,-61879.793829,200.307841
67452,0.001186,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,232.0,3311.810345,-61873.26008,198.741469
61822,0.001278,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, MAR=Never married or under 15 years old, SEX=Male)",7,250.0,3314.12,-61870.950425,194.43034
61520,0.001284,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, SEX=Male)",6,251.0,3380.59761,-61804.472815,190.589818
53673,0.001447,"(POBP=California/CA, AGEP=<=26.0, RELP=Biological son or daughter, SCHL=Grade 11, WKHP=<=29.0, SEX=Male)",6,283.0,3419.434629,-61765.635796,202.567104


In [225]:
# Five patterns with the highest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
58603,0.00134,"(OCCP=MED-Physicians, SCHL=Professional degree beyond a bachelor's degree, MAR=Married, WKHP=>=44.0, SEX=Male)",5,262.0,351761.030534,286575.96011,20.578337
48891,0.00157,"(MAR=Married, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,307.0,347915.602606,282730.532181,22.098218
66709,0.001197,"(RAC1P=White alone, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,234.0,339797.264957,274612.194532,19.186114
71608,0.001125,"(OCCP=MED-Physicians, SCHL=Professional degree beyond a bachelor's degree, RELP=Reference person, WKHP=>=44.0, SEX=Male)",5,220.0,335527.136364,270342.065939,18.081164
45349,0.001682,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,329.0,331673.404255,266488.33383,20.955622


In [226]:
# Mean of the "income" column of df_analysis, printed for comparison with the
# outcome of the shortest pattern shown below.
print(df_analysis["income"].describe().loc["mean"])

# Shortest pattern in FP_fm (first row after sorting by length).
FP_fm.sort_values(by="length", ascending=True).head(1)

65192.04881813304


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
0,1.0,(),0,195556.0,65185.070425,0.0,0.0


#### Redundancy

In [227]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset row, used as the overall mean outcome.
# Select it by position: `[...][0]` is a *label* lookup and only works while
# that row happens to carry index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# th_redundancy=0 result, sorted ascending by the divergence metric.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    fp_divergence_o.metric, ascending=True
)

# Same ranking with th_redundancy set to 5% of the mean outcome.
v = mean_outcome * 0.05
print(f"{v:.2f}, {mean_outcome:.2f}")
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    fp_divergence_o.metric, ascending=True
)

3259.25, 65185.07


#### Top-3

In [228]:
# th_redundancy=0 ranking, ascending by the divergence metric; show the first three rows.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
66565,"AGEP=<=26.0, RELP=Noninstit. GQs, SCHL=HS, SEX=Female, WKHP=<=29.0",0.0,-61879.794,200.3,0.051
67452,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Noninstit. GQs, SCHL=HS, SEX=Female, WKHP=<=29.0",0.0,-61873.26,198.7,0.051
61822,"AGEP=<=26.0, MAR=Never married/<15yrs, POBP=CA, RAC=White, SCHL=Grade 11, SEX=Male, WKHP=<=29.0",0.0,-61870.95,194.4,0.051


In [229]:
# th_redundancy=0 ranking, descending by the divergence metric; show the first three rows.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(
    by=fp_divergence_o.metric, ascending=False
)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
58603,"MAR=Married, OCCP=MED-Physicians, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=44.0",0.0,286575.96,20.6,5.396
48891,"MAR=Married, OCCP=MED-Physicians, SEX=Male, WKHP=>=44.0",0.0,282730.532,22.1,5.337
66709,"OCCP=MED-Physicians, RAC=White, SEX=Male, WKHP=>=44.0",0.0,274612.195,19.2,5.213


#### Top-3 redundancy

In [230]:
# Threshold passed as th_redundancy: 1% of the mean outcome.
v = 0.01 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Ranking at that threshold, ascending by the divergence metric; show the first three rows.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
75741,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Noninstit. GQs, SCHL=HS, SEX=Female",0.0,-61745.07,193.1,0.053
61649,"RELP=Noninstit. GQs, SCHL=HS, SEX=Female, WKHP=<=29.0",0.0,-61511.047,193.3,0.056
13623,"AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0",0.0,-61216.65,219.6,0.061


In [231]:
# Threshold passed as th_redundancy: 5% of the mean outcome.
v = 0.05 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Ranking at that threshold, ascending by the divergence metric; show the first three rows.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
13623,"AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0",0.0,-61216.65,219.6,0.061
17696,"RELP=Son/daughter, SCHL=Grade 11, WKHP=<=29.0",0.0,-61199.323,229.6,0.061
44152,"OCCP=EAT-Fast Food And Counter Workers, WKHP=<=29.0",0.0,-58352.34,104.5,0.105


In [232]:
# Threshold passed as th_redundancy: 10% of the mean outcome.
v = 0.1 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Ranking at that threshold, ascending by the divergence metric; show the first three rows.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(
    by=fp_divergence_o.metric, ascending=True
)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
5103,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
9872,"SCHL=Grade 11, WKHP=<=29.0",0.01,-57703.87,87.4,0.115
8748,"AGEP=<=26.0, SCHL=Grade 11",0.01,-57585.412,142.9,0.117


In [233]:
# Single-item patterns of the stored "base" run keyed by 0.025, ordered by d_outcome.
# NOTE(review): the key 0.025 is hard-coded and differs from the
# min_sup_divergence used by the neighbouring cells; confirm this is intended.
l = (
    FP_results["base"][0.025]
    .loc[lambda patterns: patterns["length"] == 1]
    .sort_values(by="d_outcome")
)
# Keep only the WKHP ones.
l.loc[
    l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
113,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
6,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
38,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


### With Generalization 

In [234]:
import time

from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Extraction with generalization switched on; results are stored under "generalized".
apply_generalization = True
pattern_type = "base" if not apply_generalization else "generalized"

# Wall-clock start; the elapsed seconds are printed once extraction finishes.
st = time.time()

FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    considerOnlyContinuos=False,
    # Keyword spelling ("overalp") kept exactly as in the original call.
    # True only for the "all_attributes" experiment type.
    allow_overalp=(type_experiment == "all_attributes"),
)

print(time.time() - st)

1 10000
2 20000
3 30000
4 40000
5 50000
6 60000
7 70000
8 80000
9 90000
10 100000
11 110000
12 120000
13 130000
14 140000
15 150000
16 160000
17 170000
18 180000
19 190000
20 200000
21 210000
22 220000
23 230000
24 240000
25 250000
26 260000
27 270000
28 280000
29 290000
30 300000
31 310000
32 320000
33 330000
34 340000
35 350000
36 360000
37 370000
38 380000
39 390000
40 400000
41 410000
42 420000
43 430000
44 440000
45 450000
46 460000
47 470000
48 480000
49 490000
50 500000
51 510000
52 520000
53 530000
54 540000
55 550000
56 560000
57 570000
204.33544826507568


In [235]:
# Point FP_fm at the results stored for the current pattern_type and support threshold.
FP_fm = FP_results[pattern_type][min_sup_divergence]

In [236]:
# List the attribute names that have an entry in generalization_dict_all.
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [237]:
# Single-item patterns ordered by d_outcome.
l = FP_fm[FP_fm["length"].eq(1)].sort_values(by="d_outcome")

# First the WKHP items (explicit display), then the AGEP items as the cell result.
display(
    l.loc[
        l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))
    ].head(20)
)
l.loc[
    l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


In [238]:
# Five patterns with the lowest value of `metric`.
FP_fm.sort_values(by=metric, ascending=True).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
459458,0.001202,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0)",5,235.0,3305.276596,-61879.793829,200.307841
466068,0.001186,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,232.0,3311.810345,-61873.26008,198.741469
424027,0.001278,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, MAR=Never married or under 15 years old, SEX=Male)",7,250.0,3314.12,-61870.950425,194.43034
453989,0.001212,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0)",5,237.0,3371.054852,-61814.015573,198.875479
461231,0.001197,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,234.0,3378.376068,-61806.694356,197.322449


In [239]:
# Five patterns with the highest value of `metric`.
FP_fm.sort_values(by=metric, ascending=False).head(5)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
509859,0.001105,"(AGEP=>=46.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,216.0,391068.055556,325882.985131,21.703299
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.95098,319619.880556,21.478808
398927,0.001345,"(AGEP=>=35.0, OCCP=MED-Physicians, MAR=Married, WKHP=>=44.0, SEX=Male)",5,263.0,382907.528517,317722.458092,23.961065
559959,0.001023,"(AGEP=[46.0-62.0], SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, OCCP=MED-Physicians)",4,200.0,380763.65,315578.579575,20.590568


In [240]:
# Patterns with at least one item mentioning JWTR, lowest `metric` first.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("JWTR" in item for item in items))
].sort_values(by=metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [241]:
# Patterns with at least one item mentioning JWAP, lowest `metric` first.
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("JWAP" in item for item in items))
].sort_values(by=metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [242]:
# Patterns with a JWAP item other than the "JWAP=NaN" one, lowest `metric` first.
FP_fm.loc[
    FP_fm["itemsets"].apply(
        lambda items: any("JWAP" in item and "JWAP=NaN" not in item for item in items)
    )
].sort_values(by=metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome


In [243]:
# Single-item patterns ordered by d_outcome; then keep only the AGEP ones.
l = FP_fm[FP_fm["length"].eq(1)].sort_values(by="d_outcome")
l.loc[
    l["itemsets"].apply(lambda items: any("AGEP" in item for item in items))
].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437
159,0.174375,(AGEP=[27.0-34.0]),1,34100.0,52793.883284,-12391.18714,34.097372
499,0.103858,(AGEP=[35.0-39.0]),1,20310.0,72393.789266,7208.718842,11.706102
94,0.219165,(AGEP=[35.0-45.0]),1,42859.0,75260.503978,10075.433553,21.481532
398,0.115307,(AGEP=[40.0-45.0]),1,22549.0,77842.568628,12657.498203,19.562325
446,0.110372,(AGEP=[57.0-62.0]),1,21584.0,79658.974703,14473.904279,21.234203
2,0.655398,(AGEP=>=35.0),1,128167.0,79884.781246,14699.710822,44.135717
342,0.122579,(AGEP=[46.0-51.0]),1,23971.0,80256.883901,15071.813477,22.759425
27,0.335341,(AGEP=[46.0-62.0]),1,65578.0,80524.556315,15339.48589,35.776606


#### Redundancy

In [244]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Outcome of the empty itemset, used below as the reference value for the threshold.
# Positional access (.iloc[0]) avoids relying on that row carrying index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# All patterns (no redundancy pruning), most negative divergence first.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0).sort_values(fp_divergence_o.metric, ascending=True)

# Redundancy threshold set to 10% of the reference outcome.
v = mean_outcome*0.1
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v).sort_values(fp_divergence_o.metric, ascending=True)

In [245]:
# First 20 rows of `fpdiv` (built with th_redundancy=0 and sorted ascending by the divergence metric).
display(fpdiv.head(20))

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
459458,0.001202,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0)",5,235.0,3305.276596,-61879.793829,200.307841
466068,0.001186,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,232.0,3311.810345,-61873.26008,198.741469
424027,0.001278,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, MAR=Never married or under 15 years old, SEX=Male)",7,250.0,3314.12,-61870.950425,194.43034
453989,0.001212,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0)",5,237.0,3371.054852,-61814.015573,198.875479
461231,0.001197,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,234.0,3378.376068,-61806.694356,197.322449
422439,0.001284,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, SEX=Male)",6,251.0,3380.59761,-61804.472815,190.589818
362901,0.001447,"(POBP=California/CA, AGEP=<=26.0, RELP=Biological son or daughter, SCHL=Grade 11, WKHP=<=29.0, SEX=Male)",6,283.0,3419.434629,-61765.635796,202.567104
344427,0.001514,"(AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, MAR=Never married or under 15 years old, SEX=Male)",6,296.0,3431.216216,-61753.854209,205.960221
291344,0.001723,"(AGEP=<=26.0, RELP=Biological son or daughter, SCHL=Grade 11, WKHP=<=29.0, SEX=Male)",5,337.0,3435.548961,-61749.521463,215.47709
294823,0.001708,"(AGEP=<=26.0, RELP=Biological son or daughter, SCHL=Grade 11, WKHP=<=29.0, MAR=Never married or under 15 years old, SEX=Male)",6,334.0,3437.844311,-61747.226113,215.017834


In [246]:
# First 20 rows of `fpdiv_t`, the redundancy-pruned counterpart of `fpdiv`.
fpdiv_t.head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
25216,0.010365,"(WKHP=<=29.0, RELP=Noninstitutionalized group quarters population)",2,2027.0,7057.648742,-58127.421683,195.474882
200419,0.002301,"(RELP=Noninstitutionalized group quarters population, OCCP=SAL)",2,450.0,7212.4,-57972.670425,133.254493
516577,0.001094,"(OCCP=PRS, RELP=Noninstitutionalized group quarters population)",2,214.0,7446.682243,-57738.388182,56.371489
53698,0.00609,"(WKHP=<=29.0, SCHL=Grade 11)",2,1191.0,7481.200672,-57703.869753,87.367529
46777,0.006735,"(AGEP=<=26.0, SCHL=Grade 11)",2,1317.0,7599.658314,-57585.41211,142.904024
456821,0.001207,"(WKHP=<=29.0, OCCP=EAT-Dining Room And Cafeteria Attendants And Bartender Helpers)",2,236.0,7756.610169,-57428.460255,123.498221
1168,0.067244,"(WKHP=<=29.0, AGEP=<=26.0)",2,13150.0,8166.591939,-57018.478486,246.62619
256091,0.001907,"(AGEP=<=26.0, OCCP=EDU-Tutors)",2,373.0,8242.439678,-56942.630747,53.350791
488930,0.00114,"(AGEP=<=26.0, OCCP=EAT-Dining Room And Cafeteria Attendants And Bartender Helpers)",2,223.0,8319.865471,-56865.204954,106.201798
65027,0.005318,"(RELP=Biological son or daughter, SCHL=Grade 11)",2,1040.0,8353.144231,-56831.926194,134.434957


In [247]:
# Last 10 rows of each ascending-sorted frame, i.e. the largest divergence values:
# first without redundancy pruning, then with it.
display(fpdiv.tail(10))
fpdiv_t.tail(10)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
525500,0.001079,"(AGEP=>=35.0, OCCP=MED-Physicians, RELP=Reference person, WKHP=>=44.0, SEX=Male)",5,211.0,374314.407583,309129.337158,21.313214
561679,0.001018,"(AGEP=[46.0-62.0], MAR=Married, SEX=Male, OCCP=MED-Physicians)",4,199.0,374690.351759,309505.281334,20.298544
385416,0.001381,"(AGEP=>=35.0, OCCP=MED-Physicians, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, SEX=Male)",5,270.0,376533.111111,311348.040686,23.136784
320465,0.001601,"(AGEP=>=35.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,313.0,379176.485623,313991.415198,25.509617
472886,0.001171,"(AGEP=>=35.0, OCCP=MED-Physicians, SCHL=Professional degree beyond a bachelor's degree, MAR=Married, WKHP=>=44.0, SEX=Male)",6,229.0,379664.104803,314479.034379,21.733546
559959,0.001023,"(AGEP=[46.0-62.0], SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, OCCP=MED-Physicians)",4,200.0,380763.65,315578.579575,20.590568
398927,0.001345,"(AGEP=>=35.0, OCCP=MED-Physicians, MAR=Married, WKHP=>=44.0, SEX=Male)",5,263.0,382907.528517,317722.458092,23.961065
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.95098,319619.880556,21.478808
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743
509859,0.001105,"(AGEP=>=46.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,216.0,391068.055556,325882.985131,21.703299


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
404135,0.00133,"(AGEP=>=35.0, OCCP=MED-Physicians, WKHP=>=40.0, RAC1P=White alone, SEX=Male)",5,260.0,358431.538462,293246.468037,23.766323
335905,0.001539,"(AGEP=>=46.0, WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,301.0,360385.946844,295200.876419,24.146196
384672,0.001381,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, OCCP=MED-Physicians)",4,270.0,360565.518519,295380.448094,22.384352
548340,0.001038,"(AGEP=[46.0-62.0], WKHP=>=40.0, RELP=Reference person, OCCP=MED-Physicians)",4,203.0,363662.857143,298477.786718,20.5768
334943,0.001544,"(AGEP=>=46.0, WKHP=>=44.0, OCCP=MED-Physicians)",3,302.0,367929.437086,302744.366661,24.517022
473447,0.001171,"(AGEP=[46.0-62.0], SEX=Male, OCCP=MED-Physicians)",3,229.0,371065.851528,305880.781104,21.537511
320465,0.001601,"(AGEP=>=35.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,313.0,379176.485623,313991.415198,25.509617
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.95098,319619.880556,21.478808
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743
509859,0.001105,"(AGEP=>=46.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,216.0,391068.055556,325882.985131,21.703299


#### Analysis

In [248]:
# Patterns that contain at least one OCCP item, ordered by `metric` (ascending).
FP_fm.loc[
    FP_fm["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(metric, ascending=True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
503393,0.001115,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, OCCP=EAT, MAR=Never married or under 15 years old)",6,218.0,3578.027523,-61607.042902,183.978958
500358,0.001120,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, OCCP=EAT)",5,219.0,3579.954338,-61605.116087,184.529438
531570,0.001069,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=EAT, MAR=Never married or under 15 years old)",7,209.0,3580.191388,-61604.879037,181.746440
528087,0.001074,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, OCCP=EAT)",6,210.0,3582.190476,-61602.879949,182.320646
494384,0.001130,"(POBP=California/CA, AGEP=<=34.0, SCHL=Grade 11, WKHP=<=29.0, OCCP=EAT, MAR=Never married or under 15 years old)",6,221.0,3640.316742,-61544.753683,184.021926
...,...,...,...,...,...,...,...
559959,0.001023,"(AGEP=[46.0-62.0], SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, OCCP=MED-Physicians)",4,200.0,380763.650000,315578.579575,20.590568
398927,0.001345,"(AGEP=>=35.0, OCCP=MED-Physicians, MAR=Married, WKHP=>=44.0, SEX=Male)",5,263.0,382907.528517,317722.458092,23.961065
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.950980,319619.880556,21.478808
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743


In [249]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Rank the patterns of FP_fm on the "d_outcome" column; th_redundancy=0 requests
# the result without a redundancy threshold.
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
fp_divergence_o.getDivergence(th_redundancy=0)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
509859,0.001105,"(AGEP=>=46.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,216.0,391068.055556,325882.985131,21.703299
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.950980,319619.880556,21.478808
398927,0.001345,"(AGEP=>=35.0, OCCP=MED-Physicians, MAR=Married, WKHP=>=44.0, SEX=Male)",5,263.0,382907.528517,317722.458092,23.961065
559959,0.001023,"(AGEP=[46.0-62.0], SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0, OCCP=MED-Physicians)",4,200.0,380763.650000,315578.579575,20.590568
...,...,...,...,...,...,...,...
461231,0.001197,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,234.0,3378.376068,-61806.694356,197.322449
453989,0.001212,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=34.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0)",5,237.0,3371.054852,-61814.015573,198.875479
424027,0.001278,"(POBP=California/CA, AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0, RAC1P=White alone, MAR=Never married or under 15 years old, SEX=Male)",7,250.0,3314.120000,-61870.950425,194.430340
466068,0.001186,"(SCHL=Regular high school diploma, SEX=Female, AGEP=<=26.0, RELP=Noninstitutionalized group quarters population, WKHP=<=29.0, MAR=Never married or under 15 years old)",6,232.0,3311.810345,-61873.260080,198.741469


In [250]:
# Outcome of the empty itemset, used as the reference value for the threshold.
# Positional access (.iloc[0]) avoids relying on that row carrying index label 0.
mean_outcome = FP_fm.loc[FP_fm["itemsets"] == frozenset(), "outcome"].iloc[0]

# Redundancy threshold: 10% of the reference outcome.
v = mean_outcome*0.1

In [251]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence
# Show the threshold that is passed as th_redundancy below.
print(v)
fp_divergence_o=FP_Divergence(FP_fm, "d_outcome")
# Same ranking as with th_redundancy=0, now with `v` as the redundancy threshold.
fp_divergence_o.getDivergence(th_redundancy=v)

6518.507042483994


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
509859,0.001105,"(AGEP=>=46.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,216.0,391068.055556,325882.985131,21.703299
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.950980,319619.880556,21.478808
320465,0.001601,"(AGEP=>=35.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,313.0,379176.485623,313991.415198,25.509617
473447,0.001171,"(AGEP=[46.0-62.0], SEX=Male, OCCP=MED-Physicians)",3,229.0,371065.851528,305880.781104,21.537511
...,...,...,...,...,...,...,...
46777,0.006735,"(AGEP=<=26.0, SCHL=Grade 11)",2,1317.0,7599.658314,-57585.412110,142.904024
53698,0.006090,"(WKHP=<=29.0, SCHL=Grade 11)",2,1191.0,7481.200672,-57703.869753,87.367529
516577,0.001094,"(OCCP=PRS, RELP=Noninstitutionalized group quarters population)",2,214.0,7446.682243,-57738.388182,56.371489
200419,0.002301,"(RELP=Noninstitutionalized group quarters population, OCCP=SAL)",2,450.0,7212.400000,-57972.670425,133.254493


In [252]:
# Redundancy-pruned patterns (threshold `v`) ...
f = fp_divergence_o.getDivergence(th_redundancy=v)

# ... restricted to those with an OCCP item, largest `metric` first.
f.loc[
    f["itemsets"].apply(lambda items: any("OCCP" in item for item in items))
].sort_values(metric, ascending=False)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
509859,0.001105,"(AGEP=>=46.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,216.0,391068.055556,325882.985131,21.703299
483588,0.001151,"(AGEP=[46.0-62.0], WKHP=>=44.0, OCCP=MED-Physicians)",3,225.0,387013.022222,321827.951797,22.473743
544800,0.001043,"(AGEP=[46.0-62.0], WKHP=>=40.0, SEX=Male, OCCP=MED-Physicians)",4,204.0,384804.950980,319619.880556,21.478808
320465,0.001601,"(AGEP=>=35.0, WKHP=>=44.0, SEX=Male, OCCP=MED-Physicians)",4,313.0,379176.485623,313991.415198,25.509617
473447,0.001171,"(AGEP=[46.0-62.0], SEX=Male, OCCP=MED-Physicians)",3,229.0,371065.851528,305880.781104,21.537511
...,...,...,...,...,...,...,...
488930,0.001140,"(AGEP=<=26.0, OCCP=EAT-Dining Room And Cafeteria Attendants And Bartender Helpers)",2,223.0,8319.865471,-56865.204954,106.201798
256091,0.001907,"(AGEP=<=26.0, OCCP=EDU-Tutors)",2,373.0,8242.439678,-56942.630747,53.350791
456821,0.001207,"(WKHP=<=29.0, OCCP=EAT-Dining Room And Cafeteria Attendants And Bartender Helpers)",2,236.0,7756.610169,-57428.460255,123.498221
516577,0.001094,"(OCCP=PRS, RELP=Noninstitutionalized group quarters population)",2,214.0,7446.682243,-57738.388182,56.371489


#### Top-3

In [253]:
# Three lowest-divergence patterns, no redundancy pruning.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0)
fpdiv = fpdiv.sort_values(by=fp_divergence_o.metric, ascending=True)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
459458,"AGEP=<=26.0, RELP=Noninstit. GQs, SCHL=HS, SEX=Female, WKHP=<=29.0",0.0,-61879.794,200.3,0.051
466068,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Noninstit. GQs, SCHL=HS, SEX=Female, WKHP=<=29.0",0.0,-61873.26,198.7,0.051
424027,"AGEP=<=26.0, MAR=Never married/<15yrs, POBP=CA, RAC=White, SCHL=Grade 11, SEX=Male, WKHP=<=29.0",0.0,-61870.95,194.4,0.051


In [254]:
# Three highest-divergence patterns, no redundancy pruning.
fpdiv = fp_divergence_o.getDivergence(th_redundancy=0)
fpdiv = fpdiv.sort_values(by=fp_divergence_o.metric, ascending=False)
printable_with_ratio(fpdiv.head(3), abbreviations)

Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
509859,"AGEP=>=46.0, OCCP=MED-Physicians, SEX=Male, WKHP=>=44.0",0.0,325882.985,21.7,5.999
483588,"AGEP=[46.0-62.0], OCCP=MED-Physicians, WKHP=>=44.0",0.0,321827.952,22.5,5.937
544800,"AGEP=[46.0-62.0], OCCP=MED-Physicians, SEX=Male, WKHP=>=40.0",0.0,319619.881,21.5,5.903


#### Top-3 redundancy

In [255]:
# Redundancy threshold at 1% of mean_outcome.
v = 0.01 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Three lowest-divergence patterns that survive the pruning.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v)
fpdiv_t = fpdiv_t.sort_values(by=fp_divergence_o.metric, ascending=True)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

651.85, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
531583,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Noninstit. GQs, SCHL=HS, SEX=Female",0.0,-61745.07,193.1,0.053
422782,"RELP=Noninstit. GQs, SCHL=HS, SEX=Female, WKHP=<=29.0",0.0,-61511.047,193.3,0.056
77088,"AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0",0.0,-61216.65,219.6,0.061


In [256]:
# Redundancy threshold at 5% of mean_outcome.
v = 0.05 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Three lowest-divergence patterns that survive the pruning.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v)
fpdiv_t = fpdiv_t.sort_values(by=fp_divergence_o.metric, ascending=True)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

3259.25, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
77088,"AGEP=<=26.0, SCHL=Grade 11, WKHP=<=29.0",0.0,-61216.65,219.6,0.061
104238,"RELP=Son/daughter, SCHL=Grade 11, WKHP=<=29.0",0.0,-61199.323,229.6,0.061
87720,"RELP=Son/daughter, SCHL=Grade 11, WKHP=<=39.0",0.0,-60625.418,219.8,0.07


In [257]:
# Redundancy threshold at 10% of mean_outcome.
v = 0.1 * mean_outcome
print(f"{v:.2f}, {mean_outcome:.2f}")

# Three lowest-divergence patterns that survive the pruning.
fpdiv_t = fp_divergence_o.getDivergence(th_redundancy=v)
fpdiv_t = fpdiv_t.sort_values(by=fp_divergence_o.metric, ascending=True)
printable_with_ratio(fpdiv_t.head(3), abbreviations)

6518.51, 65185.07


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio
25216,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108
200419,"OCCP=SAL, RELP=Noninstit. GQs",0.0,-57972.67,133.3,0.111
516577,"OCCP=PRS, RELP=Noninstit. GQs",0.0,-57738.388,56.4,0.114


In [258]:
# Single-item patterns ordered by d_outcome (most negative first) ...
l = FP_fm[FP_fm["length"] == 1].sort_values(by="d_outcome")
# ... restricted to the WKHP items; show at most the first 20.
l[l["itemsets"].apply(lambda items: any("WKHP" in item for item in items))].head(20)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
144,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595
358,0.120395,(WKHP=[30.0-39.0]),1,23544.0,43446.36298,-21738.707445,46.955137
9,0.477193,(WKHP=[40.0-43.0]),1,93318.0,65774.672882,589.602457,1.962819
1,0.694983,(WKHP=>=40.0),1,135908.0,79681.731156,14496.660732,45.859105
95,0.217789,(WKHP=>=44.0),1,42590.0,110153.176661,44968.106236,72.224167


# Plot

In [279]:
def get_summary(FP_results, metric, sort_by_value=None, \
                show_weighted_ratio = True,
                type_gens = ["base", "generalized"], th_redundancy = 0, ascending = True, k = 1, \
                abbreviations = abbreviations, key = None, sups = [0.05, 0.025, 0.01]):
    """Collect the top-k patterns for every (min_sup, type) combination.

    For each `sup` in `sups` and each `type_gen` in `type_gens`, the frame
    `FP_results[type_gen][sup]` is ranked with `FP_Divergence(..., metric)`,
    sorted by `sort_by_value`, and its first `k` rows are formatted with
    `printable_with_ratio`.

    NOTE: this function reads the module-level `mean_outcome`; it must be
    defined before the function is called.

    Parameters
    ----------
    FP_results : mapping indexed as FP_results[type_gen][sup]
    metric : column name passed to FP_Divergence.
    sort_by_value : column to sort on; defaults to `metric`. "wlogr" and
        "wlogr2" are computed here and can be used even when
        `show_weighted_ratio` is False.
    show_weighted_ratio : include the "wlogr" column in the output.
    type_gens, sups : the combinations to iterate over (sups is the outer loop).
    th_redundancy : forwarded to FP_Divergence.getDivergence.
    ascending, key : forwarded to DataFrame.sort_values.
    k : number of rows kept per combination.
    abbreviations : forwarded to printable_with_ratio.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-combination tables, with "min_sup" and
        "type" as the leading columns.
    """
    import math

    if sort_by_value is None:
        sort_by_value = metric

    # The weighted columns are needed for display or when they are the sort key.
    needs_weighted = show_weighted_ratio or sort_by_value in ("wlogr", "wlogr2")

    ds = []
    for sup in sups:
        for type_gen in type_gens:
            FP_fm = FP_results[type_gen][sup]
            fp_divergence_o = FP_Divergence(FP_fm, metric)
            fpdiv = fp_divergence_o.getDivergence(th_redundancy=th_redundancy)

            if needs_weighted:
                # assign() builds a new frame, so the object returned by
                # getDivergence is not modified in place.
                fpdiv = fpdiv.assign(
                    wlogr=fpdiv["support"] * (fpdiv["outcome"]/mean_outcome).apply(lambda x: math.log(x)),
                    wlogr2=fpdiv["support"] * (mean_outcome/fpdiv["outcome"]).apply(lambda x: math.log(x)),
                )
            fpdiv = fpdiv.sort_values(sort_by_value, ascending = ascending, key = key)

            # printable_with_ratio adds a column to the frame it receives;
            # hand it an independent copy rather than a slice of fpdiv.
            top_k = fpdiv.head(k).copy()
            d = printable_with_ratio(top_k, abbreviations, show_weighted_ratio = show_weighted_ratio)
            cols = list(d.columns)
            d = d.assign(min_sup=sup, type=type_gen)
            ds.append(d[["min_sup", "type"]+cols])

    return pd.concat(ds)

In [284]:
# Scratch arithmetic. NOTE(review): presumably the gap, in thousands, between the
# d_outcome of WKHP=<=29.0 and WKHP=<=39.0 shown earlier — confirm or remove.
40.4-33

7.399999999999999

In [260]:
# Lowest value per (min_sup, type), no redundancy pruning: ranked by d_outcome ...
asc = get_summary(
    FP_results,
    "d_outcome",
    ascending=True,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Divergence")
display(asc)

# ... and ranked by the weighted log ratio.
asc = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    ascending=True,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Weighted log ratio")
display(asc)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
528,0.05,base,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57257.668,238.9,0.122,-0.108
1910,0.05,generalized,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57257.668,238.9,0.122,-0.108
1276,0.025,base,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57845.558,243.9,0.113,-0.063
5206,0.025,generalized,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57845.558,243.9,0.113,-0.063
5265,0.01,base,"MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58389.608,200.3,0.104,-0.023
23642,0.01,generalized,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=39.0",0.01,-58463.301,203.8,0.103,-0.025


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,0.05,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
40,0.05,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
58,0.025,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
40,0.025,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
58,0.01,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
40,0.01,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216


In [261]:
# Three lowest values per (min_sup, type), no redundancy pruning: ranked by d_outcome ...
asc = get_summary(
    FP_results,
    "d_outcome",
    k=3,
    ascending=True,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Divergence")
display(asc)

# ... and ranked by the weighted log ratio.
asc = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    k=3,
    ascending=True,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Weighted log ratio")
display(asc)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
528,0.05,base,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57257.668,238.9,0.122,-0.108
375,0.05,base,"AGEP=<=26.0, MAR=Never married/<15yrs, WKHP=<=29.0",0.06,-57194.944,249.3,0.123,-0.134
524,0.05,base,"AGEP=<=26.0, COW=Empl. for-profit-c, WKHP=<=29.0",0.05,-57175.789,236.8,0.123,-0.108
1910,0.05,generalized,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57257.668,238.9,0.122,-0.108
1301,0.05,generalized,"AGEP=<=26.0, MAR=Never married/<15yrs, WKHP=<=29.0",0.06,-57194.944,249.3,0.123,-0.134
1902,0.05,generalized,"AGEP=<=26.0, COW=Empl. for-profit-c, WKHP=<=29.0",0.05,-57175.789,236.8,0.123,-0.108
1276,0.025,base,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57845.558,243.9,0.113,-0.063
1244,0.025,base,"AGEP=<=26.0, COW=Empl. for-profit-c, RELP=Son/daughter, WKHP=<=29.0",0.03,-57830.271,244.6,0.113,-0.064
937,0.025,base,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.04,-57789.323,241.1,0.113,-0.077
5206,0.025,generalized,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57845.558,243.9,0.113,-0.063


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,0.05,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
75,0.05,base,"AGEP=<=26.0, MAR=Never married/<15yrs",0.15,-44945.122,175.9,0.31,-0.179
51,0.05,base,WKHP=<=29.0,0.18,-40394.306,122.5,0.38,-0.178
40,0.05,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
285,0.05,generalized,"AGEP=<=34.0, WKHP=<=39.0",0.14,-49875.873,203.6,0.235,-0.197
26,0.05,generalized,AGEP=<=34.0,0.34,-27957.35,104.3,0.571,-0.193
58,0.025,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
75,0.025,base,"AGEP=<=26.0, MAR=Never married/<15yrs",0.15,-44945.122,175.9,0.31,-0.179
51,0.025,base,WKHP=<=29.0,0.18,-40394.306,122.5,0.38,-0.178
40,0.025,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216


In [262]:
# Redundancy threshold: 5% of the mean income in df_analysis_proc.
mean_income = df_analysis_proc["income"].mean()
v = 0.05 * mean_income

# Lowest value per (min_sup, type) after pruning: ranked by d_outcome ...
asc_th = get_summary(
    FP_results,
    "d_outcome",
    ascending=True,
    th_redundancy=v,
    abbreviations=abbreviations,
)
print("Divergence")
display(asc_th)

# ... and ranked by the weighted log ratio.
asc_th = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    ascending=True,
    th_redundancy=v,
    abbreviations=abbreviations,
)
print("Weighted log ratio")
display(asc_th)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
339,0.05,base,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125,-0.14
1168,0.05,generalized,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125,-0.14
339,0.025,base,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125,-0.14
1168,0.025,generalized,"AGEP=<=26.0, WKHP=<=29.0",0.07,-57018.478,246.6,0.125,-0.14
5103,0.01,base,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108,-0.023
25212,0.01,generalized,"RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58127.422,195.5,0.108,-0.023


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,0.05,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
40,0.05,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
58,0.025,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
40,0.025,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
58,0.01,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
40,0.01,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216


In [263]:
# Highest value per (min_sup, type), no redundancy pruning: ranked by d_outcome ...
desc = get_summary(
    FP_results,
    "d_outcome",
    ascending=False,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Divergence")
display(desc)

# ... and ranked by the weighted log ratio.
desc = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    ascending=False,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Weighted log ratio")
display(desc)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
341,0.05,base,"MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.07,80986.277,62.3,2.242,0.054
1858,0.05,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male",0.05,90204.019,60.6,2.384,0.045
1285,0.025,base,SCHL=Prof beyond bachelor,0.03,105256.743,46.7,2.615,0.028
6602,0.025,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male, WKHP=>=44.0",0.03,119340.209,50.6,2.831,0.026
4388,0.01,base,"SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,163479.862,40.3,3.508,0.015
25541,0.01,generalized,"AGEP=>=35.0, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=40.0",0.01,172295.97,39.3,3.643,0.013


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
4,0.05,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
7,0.05,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166
4,0.025,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
7,0.025,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166
4,0.01,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
7,0.01,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166


In [264]:
# Three highest values per (min_sup, type), no redundancy pruning: ranked by d_outcome ...
desc = get_summary(
    FP_results,
    "d_outcome",
    k=3,
    ascending=False,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Divergence")
display(desc)

# ... and ranked by the weighted log ratio.
desc = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    k=3,
    ascending=False,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Weighted log ratio")
display(desc)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
341,0.05,base,"MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.07,80986.277,62.3,2.242,0.054
468,0.05,base,"MAR=Married, RELP=Ref person, SEX=Male, WKHP=>=44.0",0.05,79218.083,55.3,2.215,0.044
450,0.05,base,"COW=Empl. for-profit-c, MAR=Married, RAC=White, WKHP=>=44.0",0.06,79129.066,56.4,2.214,0.045
1858,0.05,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male",0.05,90204.019,60.6,2.384,0.045
1567,0.05,generalized,"AGEP=>=35.0, MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.06,90159.062,62.0,2.383,0.05
1745,0.05,generalized,"AGEP=>=35.0, MAR=Married, OCCP=MGR, WKHP=>=40.0",0.05,89924.94,64.1,2.38,0.047
1285,0.025,base,SCHL=Prof beyond bachelor,0.03,105256.743,46.7,2.615,0.028
1300,0.025,base,"COW=Empl. for-profit-c, SCHL=Master, SEX=Male",0.03,96155.697,50.1,2.475,0.026
1425,0.025,base,"MAR=Married, RAC=White, SCHL=Bachelor, WKHP=>=44.0",0.03,92993.584,46.2,2.427,0.024
6602,0.025,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male, WKHP=>=44.0",0.03,119340.209,50.6,2.831,0.026


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
4,0.05,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
20,0.05,base,"MAR=Married, SEX=Male",0.29,33802.864,65.7,1.519,0.122
38,0.05,base,WKHP=>=44.0,0.22,44968.106,72.2,1.69,0.114
7,0.05,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166
18,0.05,generalized,"MAR=Married, WKHP=>=40.0",0.4,29142.31,69.0,1.447,0.146
32,0.05,generalized,"AGEP=>=35.0, MAR=Married, WKHP=>=40.0",0.33,34991.585,73.7,1.537,0.141
4,0.025,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
20,0.025,base,"MAR=Married, SEX=Male",0.29,33802.864,65.7,1.519,0.122
38,0.025,base,WKHP=>=44.0,0.22,44968.106,72.2,1.69,0.114
7,0.025,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166


In [265]:
# Redundancy threshold: 5% of the mean income in df_analysis_proc.
mean_income = df_analysis_proc["income"].mean()
v = 0.05 * mean_income

# Highest value per (min_sup, type) after pruning: ranked by d_outcome ...
desc_th = get_summary(
    FP_results,
    "d_outcome",
    ascending=False,
    th_redundancy=v,
    abbreviations=abbreviations,
)
print("Divergence")
display(desc_th)

# ... and ranked by the weighted log ratio.
desc_th = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    ascending=False,
    th_redundancy=v,
    abbreviations=abbreviations,
)
print("Weighted log ratio")
display(desc_th)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
341,0.05,base,"MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.07,80986.277,62.3,2.242,0.054
1858,0.05,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male",0.05,90204.019,60.6,2.384,0.045
1285,0.025,base,SCHL=Prof beyond bachelor,0.03,105256.743,46.7,2.615,0.028
6602,0.025,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male, WKHP=>=44.0",0.03,119340.209,50.6,2.831,0.026
4388,0.01,base,"SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,163479.862,40.3,3.508,0.015
25541,0.01,generalized,"AGEP=>=35.0, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=40.0",0.01,172295.97,39.3,3.643,0.013


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
4,0.05,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
7,0.05,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166
4,0.025,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
7,0.025,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166
4,0.01,base,MAR=Married,0.52,17200.215,46.8,1.264,0.123
7,0.01,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26585.978,69.4,1.408,0.166


## Highest divergence  - ITP

In [273]:
# key=abs with ascending=False ranks by |d_outcome|, largest first.
desc = get_summary(
    FP_results,
    "d_outcome",
    key=abs,
    ascending=False,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Divergence")

# Presentation only: divergence in thousands with a "k" suffix, ratio to 2 decimals.
desc["Δ_outcome"] = desc["Δ_outcome"].div(1000).round(1).astype(str) + "k"
desc["ratio"] = desc["ratio"].round(2)

display(desc)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
341,0.05,base,"MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.07,81.0k,62.3,2.24,0.054
1858,0.05,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male",0.05,90.2k,60.6,2.38,0.045
1285,0.025,base,SCHL=Prof beyond bachelor,0.03,105.3k,46.7,2.62,0.028
6602,0.025,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male, WKHP=>=44.0",0.03,119.3k,50.6,2.83,0.026
4388,0.01,base,"SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,163.5k,40.3,3.51,0.015
25541,0.01,generalized,"AGEP=>=35.0, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=40.0",0.01,172.3k,39.3,3.64,0.013


In [274]:
# Largest weighted log ratio per (min_sup, type), no redundancy pruning.
desc = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    ascending=False,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Weighted log ratio")

# Presentation only: divergence in thousands with a "k" suffix.
desc["Δ_outcome"] = desc["Δ_outcome"].div(1000).round(1).astype(str) + "k"
display(desc)

Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
4,0.05,base,MAR=Married,0.52,17.2k,46.8,1.264,0.123
7,0.05,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26.6k,69.4,1.408,0.166
4,0.025,base,MAR=Married,0.52,17.2k,46.8,1.264,0.123
7,0.025,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26.6k,69.4,1.408,0.166
4,0.01,base,MAR=Married,0.52,17.2k,46.8,1.264,0.123
7,0.01,generalized,"AGEP=>=35.0, WKHP=>=40.0",0.49,26.6k,69.4,1.408,0.166


## Ascending

In [275]:
# Lowest d_outcome per (min_sup, type), no redundancy pruning.
asc = get_summary(
    FP_results,
    "d_outcome",
    k=1,
    ascending=True,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Divergence")

# Presentation only: divergence in thousands with a "k" suffix.
asc["Δ_outcome"] = asc["Δ_outcome"].div(1000).round(1).astype(str) + "k"
display(asc)

Divergence


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
528,0.05,base,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57.3k,238.9,0.122,-0.108
1910,0.05,generalized,"AGEP=<=26.0, POBP=CA, WKHP=<=29.0",0.05,-57.3k,238.9,0.122,-0.108
1276,0.025,base,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57.8k,243.9,0.113,-0.063
5206,0.025,generalized,"AGEP=<=26.0, COW=Empl. for-profit-c, MAR=Never married/<15yrs, RELP=Son/daughter, WKHP=<=29.0",0.03,-57.8k,243.9,0.113,-0.063
5265,0.01,base,"MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=29.0",0.01,-58.4k,200.3,0.104,-0.023
23642,0.01,generalized,"AGEP=<=26.0, MAR=Never married/<15yrs, RELP=Noninstit. GQs, WKHP=<=39.0",0.01,-58.5k,203.8,0.103,-0.025


In [276]:
# Smallest weighted log ratio per (min_sup, type), no redundancy pruning.
asc = get_summary(
    FP_results,
    "d_outcome",
    sort_by_value="wlogr",
    ascending=True,
    th_redundancy=0,
    abbreviations=abbreviations,
)
print("Weighted log ratio")

# Presentation only: divergence in thousands with a "k" suffix.
asc["Δ_outcome"] = asc["Δ_outcome"].div(1000).round(1).astype(str) + "k"
display(asc)

Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,0.05,base,AGEP=<=26.0,0.17,-43.9k,173.8,0.326,-0.191
40,0.05,generalized,WKHP=<=39.0,0.31,-33.0k,108.5,0.493,-0.216
58,0.025,base,AGEP=<=26.0,0.17,-43.9k,173.8,0.326,-0.191
40,0.025,generalized,WKHP=<=39.0,0.31,-33.0k,108.5,0.493,-0.216
58,0.01,base,AGEP=<=26.0,0.17,-43.9k,173.8,0.326,-0.191
40,0.01,generalized,WKHP=<=39.0,0.31,-33.0k,108.5,0.493,-0.216


## Highest wlogr2

In [280]:
desc = get_summary(FP_results, "d_outcome", sort_by_value="wlogr", sups = [0.01], k = 3, key = abs, th_redundancy = 0, ascending = False, abbreviations = abbreviations)
display(desc)
print("Weighted log ratio")
desc["Δ_outcome"] =  (desc["Δ_outcome"]/1000).round(1).astype(str) + "k"
display(desc)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome,wlogr,wlogr2
4388,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922,0.014626,-0.014626
4366,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566,0.014315,-0.014315
5235,0.010151,"(SCHL=Professional degree beyond a bachelor's degree, RELP=Reference person, SEX=Male)",3,1985.0,212307.662469,147122.592044,34.782031,0.011986,-0.011986
4651,0.011127,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SCHL=Master's degree, WKHP=>=44.0, SEX=Male)",4,2176.0,209515.284926,144330.214502,39.830737,0.012992,-0.012992
4472,0.01146,"(RAC1P=White alone, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2241.0,208800.406069,143615.335644,36.158773,0.013341,-0.013341


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome,wlogr,wlogr2
58,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073,-0.190545,0.190545
75,0.152897,"(AGEP=<=26.0, MAR=Never married or under 15 years old)",2,29900.0,20239.948629,-44945.121796,175.870942,-0.178825,0.178825
51,0.184622,(WKHP=<=29.0),1,36104.0,24790.764015,-40394.30641,122.486801,-0.178485,0.178485
11,0.34967,(MAR=Never married or under 15 years old),1,68380.0,39469.039544,-25716.030881,91.255149,-0.175434,0.175434
201,0.089739,"(WKHP=<=29.0, MAR=Never married or under 15 years old)",2,17549.0,11808.095846,-53376.974579,206.363504,-0.153314,0.153314


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
75,"AGEP=<=26.0, MAR=Never married/<15yrs",0.15,-44945.122,175.9,0.31,-0.179
51,WKHP=<=29.0,0.18,-40394.306,122.5,0.38,-0.178


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome,wlogr,wlogr2
25541,0.010268,"(WKHP=>=40.0, SCHL=Professional degree beyond a bachelor's degree, AGEP=>=35.0, SEX=Male)",4,2008.0,237481.040837,172295.970412,39.344466,0.013275,-0.013275
24251,0.010652,"(MAR=Married, AGEP=>=35.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",4,2083.0,228965.972156,163780.901731,38.327022,0.013382,-0.013382
21275,0.011654,"(SCHL=Professional degree beyond a bachelor's degree, WKHP=>=44.0)",2,2279.0,228664.931988,163479.861563,40.290922,0.014626,-0.014626
26256,0.010094,"(AGEP=>=46.0, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,1974.0,221792.608916,156607.538491,35.238657,0.012361,-0.012361
21164,0.011695,"(MAR=Married, SCHL=Professional degree beyond a bachelor's degree, SEX=Male)",3,2287.0,221697.415829,156512.345404,39.083566,0.014315,-0.014315


Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome,wlogr,wlogr2
40,0.305017,(WKHP=<=39.0),1,59648.0,32154.421171,-33030.649254,108.532595,-0.21555,0.21555
285,0.135935,"(WKHP=<=39.0, AGEP=<=34.0)",2,26583.0,15309.197382,-49875.873043,203.613355,-0.19694,0.19694
26,0.344602,(AGEP=<=34.0),1,67389.0,37227.720756,-27957.349669,104.272437,-0.193038,0.193038
174,0.170227,(AGEP=<=26.0),1,33289.0,21282.329118,-43902.741307,173.81073,-0.190545,0.190545
72,0.245613,"(MAR=Never married or under 15 years old, AGEP=<=34.0)",2,48031.0,30080.525994,-35104.544431,133.21964,-0.189945,0.189945


Unnamed: 0,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
40,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
285,"AGEP=<=34.0, WKHP=<=39.0",0.14,-49875.873,203.6,0.235,-0.197
26,AGEP=<=34.0,0.34,-27957.35,104.3,0.571,-0.193


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,0.01,base,AGEP=<=26.0,0.17,-43902.741,173.8,0.326,-0.191
75,0.01,base,"AGEP=<=26.0, MAR=Never married/<15yrs",0.15,-44945.122,175.9,0.31,-0.179
51,0.01,base,WKHP=<=29.0,0.18,-40394.306,122.5,0.38,-0.178
40,0.01,generalized,WKHP=<=39.0,0.31,-33030.649,108.5,0.493,-0.216
285,0.01,generalized,"AGEP=<=34.0, WKHP=<=39.0",0.14,-49875.873,203.6,0.235,-0.197
26,0.01,generalized,AGEP=<=34.0,0.34,-27957.35,104.3,0.571,-0.193


Weighted log ratio


Unnamed: 0,min_sup,type,itemsets,sup,Δ_outcome,t_outcome,ratio,wlogr
58,0.01,base,AGEP=<=26.0,0.17,-43.9k,173.8,0.326,-0.191
75,0.01,base,"AGEP=<=26.0, MAR=Never married/<15yrs",0.15,-44.9k,175.9,0.31,-0.179
51,0.01,base,WKHP=<=29.0,0.18,-40.4k,122.5,0.38,-0.178
40,0.01,generalized,WKHP=<=39.0,0.31,-33.0k,108.5,0.493,-0.216
285,0.01,generalized,"AGEP=<=34.0, WKHP=<=39.0",0.14,-49.9k,203.6,0.235,-0.197
26,0.01,generalized,AGEP=<=34.0,0.34,-28.0k,104.3,0.571,-0.193


## Highest wlogr

In [None]:
desc = get_summary(FP_results, "d_outcome", sort_by_value="wlogr", sups = [0.01], k = 3, key = abs, th_redundancy = 0, ascending = False, abbreviations = abbreviations)

print("Weighted log ratio")
desc["Δ_outcome"] =  (desc["Δ_outcome"]/1000).round(1).astype(str) + "k"
display(desc)

# Tree divergence - income - combined

In [271]:
FP_results["generalized"][0.05].sort_values(by = "d_outcome", key=abs, ascending = True)

Unnamed: 0,support,itemsets,length,support_count,outcome,d_outcome,t_value_outcome
0,1.000000,(),0,195556.0,65185.070425,0.000000,0.000000
379,0.117358,"(WKHP=[40.0-43.0], POBP=California/CA, SEX=Male)",3,22950.0,65150.460741,-34.609684,0.069512
231,0.148996,"(WKHP=[40.0-43.0], RAC1P=White alone, POBP=California/CA)",3,29137.0,65113.524179,-71.546246,0.162939
495,0.104241,"(COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, AGEP=>=35.0, SEX=Female, MAR=Married)",4,20385.0,65259.402011,74.331586,0.123213
135,0.188493,"(AGEP=>=35.0, SEX=Female, MAR=Married)",3,36861.0,65104.250292,-80.820133,0.179588
...,...,...,...,...,...,...,...
1767,0.053729,"(AGEP=>=46.0, MAR=Married, WKHP=>=44.0, SEX=Male)",4,10507.0,150511.681165,85326.610740,56.369136
1950,0.050983,"(AGEP=>=46.0, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",4,9970.0,152503.482849,87318.412424,56.646643
1745,0.054296,"(WKHP=>=40.0, AGEP=>=35.0, OCCP=MGR, MAR=Married)",4,10618.0,155110.010360,89924.939935,64.128105
1567,0.057298,"(AGEP=>=35.0, MAR=Married, RAC1P=White alone, WKHP=>=44.0, SEX=Male)",5,11205.0,155344.132441,90159.062016,62.025598


In [272]:
# NOTE(review): `aaaa` is an undefined name, so this cell always raises NameError.
# Presumably a deliberate stopper so "Run All" halts before the cells below —
# confirm, or remove it before sharing the notebook.
aaaa

NameError: name 'aaaa' is not defined

## Tree

In [None]:
df_analysis_proc.head()

In [None]:
generalization_dict_all.keys()

In [None]:
continuous_attributes = ['AGEP', 'WKHP']

In [None]:
min_support = 0.1

In [None]:
metric = "d_outcome"

target = "income"

In [None]:
minimal_gain = 0

In [None]:
from tree_discretization_ranking import TreeDiscretization_ranking

tree_discr = TreeDiscretization_ranking()

# `type_experiment` is passed as `type_splitting` here and reused by the
# divergence-extraction cells further down; `type_criterion` is forwarded as-is.
type_experiment = "all_attributes"
type_criterion = "weighted_sum_abs_reference_s"

# Pass the notebook-level `min_support` setting (defined in its own cell above)
# instead of repeating the 0.1 literal, so changing the setting actually
# changes the discretization.
generalization_dict, discretizations = tree_discr.get_tree_discretization(
    df_analysis_proc,
    type_splitting=type_experiment,
    min_support=min_support,
    metric=metric,
    continuous_attributes=list(continuous_attributes),
    storeTree=True,
    type_criterion=type_criterion,
    minimal_gain=minimal_gain,
    target_col=target,
)
tree_discr.printDiscretizationTrees()

tree_discr.trees.visualizeTreeDiGraph()

In [None]:
tree_discr.trees.visualizeTreeDiGraph(all_info = False)

In [None]:
tree_discr.get_number_nodes()

## min_sup_divergence  = 0.1

In [None]:
df_analysis["income"].describe()["mean"]

In [None]:
# Overlap is switched on only for the "all_attributes" experiment type.
allow_overalp = type_experiment == "all_attributes"
allow_overalp

### With Generalization 

In [None]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Generalized pattern extraction at a minimum support of 0.1.
FP_fm_True = extract_divergence_generalized(
    df_analysis,
    discretizations,
    generalization_dict,
    continuous_attributes,
    min_sup_divergence=0.1,
    apply_generalization=True,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # True exactly when the experiment type is "all_attributes".
    allow_overalp=(type_experiment == "all_attributes"),
)

In [None]:
len(FP_fm_True)

In [None]:
FP_fm_True.sort_values(metric, ascending = False).head()

In [None]:
len(FP_fm_True)

## min_sup_divergence  = 0.05

In [None]:
df_analysis["income"].describe()["mean"]

In [None]:
# The comparison already yields the boolean; shown as the cell output below.
allow_overalp = type_experiment == "all_attributes"
allow_overalp

### With Generalization 

In [None]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

# Same extraction as the 0.1 section, with the minimum support lowered to 0.05.
# Note that this rebinds `FP_fm_True`.
FP_fm_True = extract_divergence_generalized(
    df_analysis,
    discretizations,
    generalization_dict,
    continuous_attributes,
    min_sup_divergence=0.05,
    apply_generalization=True,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    allow_overalp=(type_experiment == "all_attributes"),
)

In [None]:
FP_fm_True.sort_values(metric, ascending = False).head()

In [None]:
len(FP_fm_True)

# Quantile

In [None]:
def check_ranges_validity(dfI_discr_quantile, bins, continuous_attributes):
    """Verify that each listed attribute takes exactly `bins` distinct values.

    For every attribute one summary line is printed: the attribute name, the
    number of distinct values, a value -> count dict and the total count.

    Args:
        dfI_discr_quantile: frame holding the discretized attributes.
        bins: expected number of distinct values per attribute.
        continuous_attributes: names of the columns to check.

    Raises:
        ValueError: if an attribute's number of distinct values differs from
            `bins`; the message names the attribute and lists its counts.
    """
    for cont_attr in continuous_attributes:
        # Count once and reuse; the counts feed the summary, the check and
        # the error message.
        counts = dfI_discr_quantile[cont_attr].value_counts()
        counts_by_value = dict(counts)
        n_values = len(counts)
        print(cont_attr, n_values, counts_by_value, sum(counts.values))
        if n_values != bins:
            raise ValueError(
                f"{cont_attr}: expected {bins} distinct values, "
                f"found {n_values}: {counts_by_value}"
            )


In [None]:
attrs_discretize = [a for a in df_analysis_proc.columns if a not in [target]]

In [None]:
from import_datasets import discretize

bins = 5


dfI_discr_quantile = discretize(
    df_analysis_proc,  bins=bins, strategy="quantile", adaptive=True, attributes=attrs_discretize
)

check_ranges_validity(dfI_discr_quantile, bins, continuous_attributes)

In [None]:
# ### Extract divergence

from divexplorer_generalized_ranking.FP_DivergenceExplorer import FP_DivergenceExplorer_ranking
    
fp_diver = FP_DivergenceExplorer_ranking(
    dfI_discr_quantile,
    target_name=target
)

FP_fm = fp_diver.getFrequentPatternDivergence(
                min_support=0.01, metrics=["d_outcome"]
            )

FP_fm.sort_values("d_outcome", ascending = False).head()

In [None]:
from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")

# Both rankings are built from the same getDivergence(th_redundancy=0) result,
# so compute it once and sort it twice.
fpdiv_all = fp_divergence_o.getDivergence(th_redundancy=0)

# printable_with_ratio assigns a "ratio" column on the frame it receives and
# reads the notebook-global `mean_outcome` (which must already be set).
# Passing an explicit copy of the top rows keeps that assignment off a
# `.head()` slice, which pandas would report as SettingWithCopyWarning.

# Three patterns with the lowest value of the metric.
fpdiv = fpdiv_all.sort_values(fp_divergence_o.metric, ascending=True)
display(printable_with_ratio(fpdiv.head(3).copy(), abbreviations))

# Three patterns with the highest value of the metric.
fpdiv = fpdiv_all.sort_values(fp_divergence_o.metric, ascending=False)
printable_with_ratio(fpdiv.head(3).copy(), abbreviations)

# Exp

In [None]:
import pandas as pd
import os
filename = os.path.join(os.path.curdir, "datasets", "ACSPUMS", "PUMS_Data_Dictionary_2018.csv")

In [None]:
df_mappings = pd.read_csv(filename)

In [None]:
remapping_cols = {}

In [None]:
# NOTE(review): `col_name` and `cols_i` are first assigned in the next cell of
# this section. Unless they are also defined earlier in the notebook, this cell
# raises NameError on a fresh kernel — confirm the intended cell order.
len(df_mappings.loc["VAL"].loc[col_name][cols_i])

In [None]:
# Build, for each column of `acs_data`, a dict that is stored in `remapping_cols`
# (created in an earlier cell) and later used to replace the column's values.
# `orig_col` and `new_col` are column labels of `df_mappings`: `orig_col` supplies
# the dict keys and `new_col` the dict values.
# NOTE(review): presumably raw codes and their descriptions — confirm against the CSV.
orig_col = "1"
new_col = "Record Type"
cols_i = [orig_col, new_col]


# Collects (column name, length of its `df_mappings` selection) for every
# column that did not get an entry in `remapping_cols`.
not_conv = []
#col_name = "OCCP"
for col_name in acs_data.columns:
    # `c` flips to True once a mapping has been stored for this column.
    c = False
    # NOTE(review): `.loc[col_name]` presumably raises KeyError, rather than
    # returning an empty selection, when a column has no "VAL" entry — confirm.
    if len(df_mappings.loc["VAL"].loc[col_name])>0:
        # Only columns whose first value is not a `str` are remapped.
        if type(acs_data[col_name].values[0])!=str:            
            # The list selector (`[[col_name]]`) keeps the result tabular, so
            # `len` here counts matching rows; more than one row is required.
            if (len(df_mappings.loc["VAL"].loc[[col_name]]))>1:
                dict_i = df_mappings.loc["VAL"].loc[col_name][cols_i].set_index(orig_col).to_dict()[new_col]
                # Keys consisting of one to seven "b" characters become -1;
                # every other key is converted with float().
                dict_i = {float(k) if (k not in ["b", "bb", 'bbb', 'bbbb', "bbbbb", "bbbbbb", "bbbbbbb"]) else -1 :v for k,v in dict_i.items()}
                remapping_cols[col_name] = dict_i
                c = True
    if c == False:
        not_conv.append((col_name, len(df_mappings.loc["VAL"].loc[col_name][cols_i])))

In [None]:
from copy import deepcopy

# Work on a deep copy so `acs_data` itself is left untouched.
acs_data_renamed = deepcopy(acs_data)

# Replace the values of every column that has an entry in `remapping_cols`.
for column_name, mapping in remapping_cols.items():
    acs_data_renamed[column_name] = acs_data[column_name].replace(mapping)

In [None]:
v, i = 10, 0
for i in range(20,30):
    cols = acs_data_renamed.columns[(i*v):(i*v)+v]
    display(acs_data_renamed[cols].head(7))

In [None]:
remapping_cols["OCCP"]

In [None]:
attr = "OCCP"
{x: x.split("-")[0] for x in acs_data_renamed[attr].values if "-" in x}

In [None]:
acs_data_renamed["JWTR"].value_counts()#.sum()/len(acs_data_renamed)

acs_data_renamed["JWTR"].value_counts()/len(acs_data_renamed)

In [None]:
FP_fm_s = deepcopy(FP_fm)

In [None]:
import math

# The outcome of the row labelled 0 is the reference every ratio is taken against.
# NOTE(review): presumably the empty itemset, i.e. the whole dataset — confirm.
mean_outcome = FP_fm_s.loc[0, "outcome"]

# ratio = outcome / reference; wlogr = support * ln(ratio).
FP_fm_s["ratio"] = FP_fm_s["outcome"] / mean_outcome
FP_fm_s["wlogr"] = FP_fm_s["support"] * FP_fm_s["ratio"].map(math.log)

In [None]:
FP_fm_s.sort_values("wlogr", ascending = False)

In [None]:
import math


def _with_ratio_and_wlogr(patterns, reference_outcome):
    """Add `ratio` and `wlogr` columns to `patterns` in place and return it.

    ratio = outcome / reference_outcome
    wlogr = support * ln(ratio)
    """
    patterns["ratio"] = patterns["outcome"] / reference_outcome
    patterns["wlogr"] = patterns["support"] * patterns["ratio"].map(math.log)
    return patterns


# The outcome of the row labelled 0 in the base patterns is the reference
# for both frames. `mean_outcome` stays a notebook-level name on purpose:
# `printable_with_ratio` reads it as a global.
mean_outcome = FP_results["base"][0.05].loc[0]["outcome"]

# The base and generalized frames go through exactly the same steps, so they
# share one helper instead of two copy-pasted stanzas.
b, g = (
    _with_ratio_and_wlogr(
        FP_Divergence(FP_results[pattern_type_i][0.05], "d_outcome").getDivergence(
            th_redundancy=0
        ),
        mean_outcome,
    )
    for pattern_type_i in ("base", "generalized")
)

display(b[["ratio", "wlogr"]].describe())
display(g[["ratio", "wlogr"]].describe())

In [None]:
import math

# Reference outcome: the row labelled 0 of the base patterns at support 0.01.
mean_outcome = FP_results["base"][0.01].loc[0]["outcome"]

# Base patterns: divergence frame plus ratio / weighted log ratio columns.
fp_i = FP_Divergence(FP_results["base"][0.01], "d_outcome")
b = fp_i.getDivergence(th_redundancy=0)
b["ratio"] = b["outcome"] / mean_outcome
b["wlogr"] = b["support"] * b["ratio"].map(math.log)

# Generalized patterns: same columns, measured against the same reference.
fp_i = FP_Divergence(FP_results["generalized"][0.01], "d_outcome")
g = fp_i.getDivergence(th_redundancy=0)
g["ratio"] = g["outcome"] / mean_outcome
g["wlogr"] = g["support"] * g["ratio"].map(math.log)

for summary_frame in (b, g):
    display(summary_frame[["ratio", "wlogr"]].describe())

In [None]:
# df_analysis["OCCP-hier"] = df_analysis["OCCP"].str.split("-").apply(lambda x: x[0])

In [51]:
import time

from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

apply_generalization = True
min_sup_divergence = 0.01
pattern_type = "generalized" if apply_generalization else "base"

# Time the extraction with wall-clock timestamps; `s` and `e` stay in the
# namespace so the elapsed time can be inspected afterwards.
s = time.time()
FP_results[pattern_type][min_sup_divergence] = extract_divergence_generalized(
    df_analysis_proc,
    discretizations,
    generalization_dict_all,
    continuous_attributes,
    min_sup_divergence=min_sup_divergence,
    apply_generalization=apply_generalization,
    target_name=target,
    FPM_type="fpgrowth",
    metrics_divergence=["d_outcome"],
    type_experiment=type_experiment,
    # True exactly when the experiment type is "all_attributes".
    allow_overalp=(type_experiment == "all_attributes"),
)
e = time.time()

1 10000
2 20000


In [52]:
e-s

43.469666957855225

In [53]:
22.5+14

36.5

In [55]:
from utils_discretize_df import (
    discretize_df_via_discretizations,
)

df_discreted_d = discretize_df_via_discretizations(
    df_analysis_proc,
    discretizations,
    allow_overalp = False
)

In [57]:
from divexplorer_generalized_ranking.FP_DivergenceExplorer import (
        FP_DivergenceExplorer_ranking,
    )

s = time.time()
fp_diver = FP_DivergenceExplorer_ranking(
    df_discreted_d,
    true_class_name=None,
    predicted_class_name=None,
    target_name=target,
    class_map={"N": 0, "P": 1},
    generalizations_obj=None,
    preserve_interval=None,
    already_in_one_hot_encoding=False,
)

FP_fm_input = fp_diver.getFrequentPatternDivergence(
    min_support=min_sup_divergence,
    metrics = ["d_outcome"],
    FPM_type="fpgrowth"
)
e = time.time()

In [58]:
e-s

15.331115961074829