In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

from utils_data import import_process_folktables
from utils import discretize


In [3]:
# Load the cached folktables CSV when it exists; otherwise build the dataset
# through the project helper (called with store_data=True).
year = '2023'
filename = os.path.join("./data", f"folktables_{year}.csv")
if not os.path.exists(filename):
    df, target_col = import_process_folktables(input_dir="./data", store_data=True, year=year)
else:
    df = pd.read_csv(filename)
    target_col = 'income'

In [4]:
# Keep only rows with AGEP >= 18.
df = df[df['AGEP'] >= 18]

In [5]:
# Discretize every column except the target.
attributes = df.columns.tolist()
attributes.remove(target_col)
df_discretized = discretize(df, attributes=attributes)

# Divergence analysis

In [6]:
# Minimum subgroup size in rows, converted to a fraction of the dataset.
min_support_count = 100
min_support = min_support_count / len(df_discretized)

In [7]:
from divexplorer import DivergenceExplorer


# Explorer built on the discretized frame; it is queried for pattern divergence below.
fp_diver = DivergenceExplorer(df_discretized)

In [8]:
import resource

def limit_memory(max_mem_mb):
    """Cap the address space (RLIMIT_AS) of the current notebook process.

    Only the soft limit is changed; the existing hard limit is kept. Lowering
    the hard limit as well is irreversible for an unprivileged process, so
    re-running the cell with a larger value would raise ValueError until the
    kernel is restarted.

    Args:
        max_mem_mb: Requested limit in mebibytes.
    """
    requested = int(max_mem_mb * 1024 * 1024)
    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    # The soft limit may never exceed the hard limit: clamp instead of failing.
    if hard != resource.RLIM_INFINITY:
        requested = min(requested, hard)
    resource.setrlimit(resource.RLIMIT_AS, (requested, hard))

# Limit the notebook to 10 GB of memory (the argument is in MB: 10 * 1024)
limit_memory(10*1024)

In [9]:
import time
try:
    # Time the divergence extraction for the quantitative target.
    start_time = time.time()
    subgroups = fp_diver.get_pattern_divergence(
        min_support=min_support,
        quantitative_outcomes=[target_col],
    )
    print("--- %s seconds ---" % (time.time() - start_time,))
except MemoryError:
    # Raised when the process memory limit is hit during extraction.
    print("Memory limit exceeded!")

--- 42.78594970703125 seconds ---


In [10]:
# Do not truncate itemset strings, then show the five most divergent subgroups.
pd.set_option("display.max_colwidth", None)
subgroups.sort_values(f"{target_col}_div", ascending=False).head(5)

Unnamed: 0,support,itemset,income,income_div,income_t,length,support_count
138370,0.000658,"(WKHP=> 40, MAR=Married, AGEP=(34 - 51], OCCP=MED-Physicians, SEX=Male)",434180.75188,352551.537817,16.862418,5,133.0
186271,0.0005,"(WKHP=> 40, AGEP=> 51, COW=Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions, SCHL=Master's degree, OCCP=MGR-Chief Executives And Legislators)",427162.970297,345533.756234,15.292203,5,101.0
146892,0.000623,"(WKHP=> 40, RAC1P=White alone, RELSHIPP=Reference person, OCCP=MED-Physicians, SEX=Male)",424946.428571,343317.214508,14.450744,5,126.0
165946,0.000559,"(WKHP=> 40, AGEP=(34 - 51], OCCP=MED-Physicians, RELSHIPP=Reference person, SEX=Male)",421460.0,339830.785937,14.203841,5,113.0
174763,0.000529,"(WKHP=> 40, MAR=Married, AGEP=(34 - 51], SCHL=Professional degree beyond a bachelor's degree, OCCP=MED-Physicians, SEX=Male)",420501.308411,338872.094348,15.167998,6,107.0


In [50]:
# Label abbreviations handed to `printable` (and from there to `sort_itemset`).
# NOTE(review): several entries chain (e.g. "RAC1P" -> "RAC" -> "eth", and
# "Noninstitutionalized group quarters population" -> "... group quarters" ->
# "NonInst-GQ"), so insertion order is presumably significant — confirm against
# utils.sort_itemset before reordering.
abbreviations = {
    "RAC1P": "RAC",
    "White alone": "White",
    "Reference person": "Ref person",
    'Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions': 'Empl-prv',
    "Self-employed in own not incorporated business, professional practice, or farm": "Self-Empl. not incorp",
    "Self-employed in own incorporated business, professional practice or farm": "Self-Empl. incorp",
    "Local government employee (city, county, etc.)": "Local gov. Empl. ",
    "Federal government employee": "Federal gov.  Empl.",
    "California/CA": "CA",
    "Never married or under 15 years old": "Never married/<15yrs",
    "Biological son or daughter": "Son/daughter",
    "Regular high school diploma": "HS",
    "Asian alone": "Asian",
    # Typo fix: the emitted labels previously read "collage".
    "1 or more years of college credit": "1+ college cr",
    "Some Other Race alone": "Other",
    "Bachelor's degree": "Bachelor",
    "Master's degree": "Master",
    "Some college, but less than 1 year": "<1y college",
    "Associate's degree": "Associate",
    "Noninstitutionalized group quarters population": "Noninstitutionalized group quarters",
    "OCCP=CMM-Software Developers": "OCCP=CMM-SW Dev",
    "Professional degree beyond a bachelor's degree": "Prof beyond bachelor",
    'AGEP': 'age',
    'SEX': 'gender',
    '=<=': '$\\le$',
    '=> ': '>',
    'OCCP': 'occ',
    'RAC': 'eth',
    'WKHP': 'wkh',
    'SCHL': 'schl',
    'MAR': 'mar',
    'RELP': 'relp',
    'POBP': 'pob',
    'RELSHIPP': 'relp',
    'Noninstitutionalized group quarters': 'NonInst-GQ',
    'Executives And Legislators': 'Execs/Legislators',
    'Fast Food And Counter Workers': 'FastFood/Counter',
    'COW': 'cow',
    'Preschool And Kindergarten Teachers': 'Preschool/Kinderg. Tchrs',
    'Opposite-sex husband/wife/spouse': 'Opposite-sex-rel',
}


In [25]:

def printable(subgroups_print, target_col, abbreviations=None, use_k=False):
    """Return a display-ready copy of a divergence table.

    The input frame is left untouched (the previous version overwrote the
    caller's columns in place).

    Args:
        subgroups_print: DataFrame with at least the columns 'itemset',
            'support', f"{target_col}_div" and f"{target_col}_t".
        target_col: Name of the outcome column the divergence refers to.
        abbreviations: Mapping forwarded to `utils.sort_itemset`; defaults to
            an empty mapping.
        use_k: When True, the divergence column and every remaining column are
            divided by 1000 and rendered as strings such as '352.55k'.
            When False, the remaining columns are rounded to 3 decimals and
            the divergence column is left as is.

    Returns:
        A new DataFrame with formatted values.
    """
    from utils import sort_itemset

    if abbreviations is None:
        abbreviations = {}

    div_col = f"{target_col}_div"
    t_col = f"{target_col}_t"

    def _in_thousands(value):
        # Same result as dividing by 1000 first and formatting afterwards.
        return f'{round(value / 1000, 2)}k'

    formatted = subgroups_print.copy()
    formatted['itemset'] = formatted['itemset'].apply(
        lambda items: sort_itemset(items, abbreviations=abbreviations)
    )
    formatted['support'] = formatted['support'].apply(lambda value: round(value, 3))
    formatted[t_col] = formatted[t_col].apply(lambda value: round(value, 1))

    if use_k:
        formatted[div_col] = formatted[div_col].apply(_in_thousands)

    already_handled = {'itemset', 'support', t_col, div_col}
    remaining_cols = [col for col in formatted.columns if col not in already_handled]
    for col in remaining_cols:
        if use_k:
            formatted[col] = formatted[col].apply(_in_thousands)
        else:
            formatted[col] = formatted[col].apply(lambda value: round(value, 3))
    return formatted

In [26]:
# Show full itemset strings and fix the columns reported in the result tables.
pd.set_option("display.max_colwidth", None)
cols = ["itemset", "support", target_col, f"{target_col}_div", f"{target_col}_t"]

In [27]:
from divexplorer import DivergencePatternProcessor


# Processor over the extracted subgroups for the current target; used below via get_patterns().
fp_details = DivergencePatternProcessor(subgroups, target_col)


In [None]:
import time
try:
    # Time pattern retrieval with the redundancy threshold set to 0.
    start_time = time.time()
    patterns = fp_details.get_patterns(th_redundancy=0)
    print("--- %s seconds ---" % (time.time() - start_time,))
except MemoryError:
    print("Memory limit exceeded!")

--- 1.163111686706543 seconds ---


In [29]:
# Three patterns with the highest divergence, shown as a table and as LaTeX.
subgroups_print = (
    patterns[cols]
    .sort_values(by=f"{target_col}_div", ascending=False)
    .head(3)
    .copy()
)
subgroups_print = printable(
    subgroups_print, target_col, abbreviations=abbreviations, use_k=True
)

display(subgroups_print)
print(subgroups_print.to_latex(index=False))

Unnamed: 0,itemset,support,income,income_div,income_t
138370,"age=(34 - 51], mar=Married, occ=MED-Physicians, gender=Male, wkh>40",0.001,434.18k,352.55k,16.9
186271,"age>51, COW=Empl-prv, occ=MGR-Chief Execs/Legislators, schl=Master, wkh>40",0.0,427.16k,345.53k,15.3
146892,"occ=MED-Physicians, eth=White, relp=Ref person, gender=Male, wkh>40",0.001,424.95k,343.32k,14.5


\begin{tabular}{lrllr}
\toprule
itemset & support & income & income_div & income_t \\
\midrule
age=(34 - 51], mar=Married, occ=MED-Physicians, gender=Male, wkh>40 & 0.001000 & 434.18k & 352.55k & 16.900000 \\
age>51, COW=Empl-prv, occ=MGR-Chief Execs/Legislators, schl=Master, wkh>40 & 0.000000 & 427.16k & 345.53k & 15.300000 \\
occ=MED-Physicians, eth=White, relp=Ref person, gender=Male, wkh>40 & 0.001000 & 424.95k & 343.32k & 14.500000 \\
\bottomrule
\end{tabular}



In [35]:
# Three patterns with the lowest divergence, shown as a table and as LaTeX.
subgroups_print = (
    patterns[cols]
    .sort_values(by=f"{target_col}_div", ascending=True)
    .head(3)
    .copy()
)
subgroups_print = printable(
    subgroups_print, target_col, abbreviations=abbreviations, use_k=True
)

display(subgroups_print)
print(subgroups_print.to_latex(index=False))

Unnamed: 0,itemset,support,income,income_div,income_t
149035,"cow=Empl-prv, occ=EAT-FastFood/Counter, pob=CA, relp=NonInst-GQ",0.001,4.93k,-76.69k,186.9
139748,"occ=EAT-FastFood/Counter, pob=CA, relp=NonInst-GQ",0.001,5.05k,-76.58k,188.7
134478,"age$\le$ 34, occ=EAT-FastFood/Counter, relp=NonInst-GQ, gender=Female",0.001,5.18k,-76.45k,158.0


\begin{tabular}{lrllr}
\toprule
itemset & support & income & income_div & income_t \\
\midrule
cow=Empl-prv, occ=EAT-FastFood/Counter, pob=CA, relp=NonInst-GQ & 0.001000 & 4.93k & -76.69k & 186.900000 \\
occ=EAT-FastFood/Counter, pob=CA, relp=NonInst-GQ & 0.001000 & 5.05k & -76.58k & 188.700000 \\
age$\le$ 34, occ=EAT-FastFood/Counter, relp=NonInst-GQ, gender=Female & 0.001000 & 5.18k & -76.45k & 158.000000 \\
\bottomrule
\end{tabular}



In [38]:
# 'income' value of the empty itemset (the row covering the whole dataset), in thousands.
patterns.loc[patterns['itemset'] == frozenset(), 'income'].iloc[0] / 1000

np.float64(81.6292140631029)

In [31]:
# Redundancy

In [None]:
# Redundancy threshold, in the units of the target column.
# A relative threshold (1% of the empty-itemset value of the target) used to be
# computed here and was overwritten on the very next line; that dead store is
# removed, and the fixed value actually used by the analysis is kept.
th_value = 5000

In [None]:
import time
try:
    # Time pattern retrieval with redundancy pruning at `th_value`.
    start_time = time.time()
    red_patterns = fp_details.get_patterns(th_redundancy=th_value)
    print("--- %s seconds ---" % (time.time() - start_time,))
except MemoryError:
    print("Memory limit exceeded!")

In [None]:
# Three redundancy-pruned patterns with the highest divergence.
subgroups_print = (
    red_patterns[cols]
    .sort_values(by=f"{target_col}_div", ascending=False)
    .head(3)
    .copy()
)
subgroups_print = printable(
    subgroups_print, target_col, abbreviations=abbreviations, use_k=True
)

display(subgroups_print)
print(subgroups_print.to_latex(index=False))

In [None]:
# Three redundancy-pruned patterns with the lowest divergence.
subgroups_print = (
    red_patterns[cols]
    .sort_values(by=f"{target_col}_div", ascending=True)
    .head(3)
    .copy()
)
subgroups_print = printable(
    subgroups_print, target_col, abbreviations=abbreviations, use_k=True
)

display(subgroups_print)
print(subgroups_print.to_latex(index=False))

In [None]:
# Row for the empty itemset.
subgroups[subgroups['itemset'] == frozenset()]

# Gender - Attribute categorical divergence

In [39]:
# Switch the outcome to SEX: discretize all columns, then encode the target
# as a 0/1 indicator where 1 means 'Female'.
target_col = 'SEX'

attributes = df.columns.tolist()
df_discretized_all = discretize(df, attributes=attributes)

df_discretized_all[target_col] = df_discretized_all[target_col].apply(
    lambda label: 1 if label == 'Female' else 0
)

fp_diver = DivergenceExplorer(df_discretized_all)

In [40]:
import time
try:
    # Time the divergence extraction for the boolean target.
    start_time = time.time()
    subgroups = fp_diver.get_pattern_divergence(
        min_support=min_support,
        boolean_outcomes=[target_col],
    )
    print("--- %s seconds ---" % (time.time() - start_time,))
except MemoryError:
    print("Memory limit exceeded!")

--- 47.01238179206848 seconds ---


In [41]:
# Columns reported in the result tables for the current target.
cols = ["itemset", "support", target_col, f"{target_col}_div", f"{target_col}_t"]

In [42]:
from divexplorer import DivergencePatternProcessor


# Processor over the subgroups extracted for the current target; used below via get_patterns().
fp_details = DivergencePatternProcessor(subgroups, target_col)

import time
try:
    # Time pattern retrieval with the redundancy threshold set to 0.
    start_time = time.time()
    patterns = fp_details.get_patterns(th_redundancy=0)
    print("--- %s seconds ---" % (time.time() - start_time,))
except MemoryError:
    print("Memory limit exceeded!")


--- 1.140322208404541 seconds ---


In [51]:
# Three patterns with the highest divergence; ties are broken by the larger t value.
subgroups_print = (
    patterns[cols]
    .sort_values(
        by=[f"{target_col}_div", f"{target_col}_t"],
        ascending=[False, False],
    )
    .head(3)
    .copy()
)
subgroups_print = printable(subgroups_print, target_col, abbreviations=abbreviations)

display(subgroups_print)
print(subgroups_print.to_latex(index=False))

Unnamed: 0,itemset,support,SEX,SEX_div,SEX_t
135129,"occ=EDU-Preschool/Kinderg. Tchrs, relp=Opposite-sex-rel, income$\le$ 33800",0.001,1.0,0.527243,70.9
151329,"cow=Empl-prv, occ=EDU-Preschool/Kinderg. Tchrs, relp=Opposite-sex-rel",0.001,1.0,0.527243,63.8
158312,"mar=Married, occ=PRS-Childcare Workers, eth=Two or More Races",0.001,1.0,0.527243,61.2


\begin{tabular}{lrrrr}
\toprule
itemset & support & SEX & SEX_div & SEX_t \\
\midrule
occ=EDU-Preschool/Kinderg. Tchrs, relp=Opposite-sex-rel, income$\le$ 33800 & 0.001000 & 1.000000 & 0.527243 & 70.900000 \\
cow=Empl-prv, occ=EDU-Preschool/Kinderg. Tchrs, relp=Opposite-sex-rel & 0.001000 & 1.000000 & 0.527243 & 63.800000 \\
mar=Married, occ=PRS-Childcare Workers, eth=Two or More Races & 0.001000 & 1.000000 & 0.527243 & 61.200000 \\
\bottomrule
\end{tabular}



In [52]:
# Three patterns with the lowest divergence; ties are broken by the larger t value.
subgroups_print = (
    patterns[cols]
    .sort_values(
        by=[f"{target_col}_div", f"{target_col}_t"],
        ascending=[True, False],
    )
    .head(3)
    .copy()
)
subgroups_print = printable(
    subgroups_print, target_col, abbreviations=abbreviations, use_k=False
)

display(subgroups_print)
print(subgroups_print.to_latex(index=False))

Unnamed: 0,itemset,support,SEX,SEX_div,SEX_t
61133,"cow=Empl-prv, occ=CON-Plumbers, Pipefitters, And Steamfitters, income=(33800 - 80000]",0.001,0.0,-0.472757,125.4
78823,"age$\le$ 34, occ=CON-Construction Laborers, wkh$\le$ 40, income=(33800 - 80000]",0.001,0.0,-0.472757,101.4
79364,"age$\le$ 34, cow=Empl-prv, mar=Never married/<15yrs, occ=CON-Construction Laborers, pob=CA, wkh$\le$ 40",0.001,0.0,-0.472757,100.9


\begin{tabular}{lrrrr}
\toprule
itemset & support & SEX & SEX_div & SEX_t \\
\midrule
cow=Empl-prv, occ=CON-Plumbers, Pipefitters, And Steamfitters, income=(33800 - 80000] & 0.001000 & 0.000000 & -0.472757 & 125.400000 \\
age$\le$ 34, occ=CON-Construction Laborers, wkh$\le$ 40, income=(33800 - 80000] & 0.001000 & 0.000000 & -0.472757 & 101.400000 \\
age$\le$ 34, cow=Empl-prv, mar=Never married/<15yrs, occ=CON-Construction Laborers, pob=CA, wkh$\le$ 40 & 0.001000 & 0.000000 & -0.472757 & 100.900000 \\
\bottomrule
\end{tabular}



In [45]:
# Row for the empty itemset. The mask is built from `patterns` itself: a mask
# taken from `subgroups` only works while both frames share the same index.
patterns.loc[patterns['itemset'] == frozenset()]

Unnamed: 0,support,itemset,length,support_count,SEX,SEX_div,SEX_t
0,1.0,(),0,202146.0,0.472757,0.0,0.0
