In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

import pandas as pd
import os

In [3]:
from utils_printable import printable



# Short display labels for long attribute names/values, passed to `printable`
# when rendering result tables.
# NOTE(review): "collage" in two labels is presumably a typo for "college";
# the labels are kept verbatim so rendered tables do not change.
abbreviations = {
    "RAC1P": "RAC",
    "White alone": "White",
    "Reference person": "Ref person",
    "Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions": "Empl. for-profit-c",
    "Employee of a private not-for-profit, tax-exempt, or charitable organization": "Empl. no-profit-c",
    "Self-employed in own not incorporated business, professional practice, or farm": "Self-Empl. not incorp",
    "Self-employed in own incorporated business, professional practice or farm": "Self-Empl. incorp",
    "Local government employee (city, county, etc.)": "Local gov. Empl. ",
    "Federal government employee": "Federal gov.  Empl.",
    "California/CA": "CA",
    "Never married or under 15 years old": "Never married/<15yrs",
    "Biological son or daughter": "Son/daughter",
    "Regular high school diploma": "HS",
    "Asian alone": "Asian",
    "1 or more years of college credit": "1+ collage cr",
    "Some Other Race alone": "Other",
    "Bachelor's degree": "Bachelor",
    "Master's degree": "Master",
    "Some college, but less than 1 year": "<1y collage",
    "Associate's degree": "Associate",
    "Noninstitutionalized group quarters population": "Noninstit. GQs",
    "OCCP=CMM-Software Developers": "OCCP=CMM-SW Dev",
    "Professional degree beyond a bachelor's degree": "Prof beyond bachelor",
}

In [4]:
def printable_with_ratio(df_pr, abbreviations=None, show_weighted_ratio=False):
    """Format a pattern table for display, adding an outcome-to-mean ratio.

    Parameters
    ----------
    df_pr : pandas.DataFrame
        Pattern table. Must contain an ``outcome`` column, plus a ``wlogr``
        column when ``show_weighted_ratio`` is true. The frame is not modified.
    abbreviations : dict, optional
        Forwarded to ``printable``. ``None`` (the default) means no
        abbreviations.
    show_weighted_ratio : bool, default False
        Also include the ``wlogr`` column in the result.

    Returns
    -------
    The frame produced by ``printable``, restricted to the display columns
    ``itemsets``, ``sup``, ``Δ_outcome``, ``t_outcome``, ``ratio`` (and
    ``wlogr`` if requested).

    Notes
    -----
    Reads the notebook-level variable ``mean_outcome``, which must be defined
    before this function is called; a ``NameError`` is raised otherwise.
    """
    # A fresh dict per call avoids sharing one mutable default across calls.
    if abbreviations is None:
        abbreviations = {}

    out_cols = ["itemsets", "sup", "Δ_outcome", "t_outcome", "ratio"]
    if show_weighted_ratio:
        out_cols.append("wlogr")

    # assign() returns a new frame, so the caller's table does not silently
    # gain a "ratio" column (and no SettingWithCopyWarning on slices).
    df_with_ratio = df_pr.assign(ratio=df_pr["outcome"] / mean_outcome)
    return printable(df_with_ratio, abbreviations=abbreviations)[out_cols]



# Load data

In [5]:
from folktables import ACSDataSource, ACSIncome

# Load the 2018 1-Year ACS person survey for California
# (download=True is passed so folktables may fetch the files).
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=True)
# The former `ACSIncome.df_to_numpy(acs_data)` call was removed: its three
# results were overwritten by `ACSIncome_outcome.df_to_numpy(acs_data)` further
# down before ever being read, so it only cost time and memory.

In [6]:
from folktables.acs import adult_filter
from folktables import BasicProblem
import numpy as np

# ACS columns used as features, in the order they appear in the feature matrix.
feature_names = [
    "AGEP",
    "COW",
    "SCHL",
    "MAR",
    "OCCP",
    "POBP",
    "RELP",
    "WKHP",
    "SEX",
    "RAC1P",
]

# ACS column used as the prediction target. No target transform is passed, so
# the raw column values are kept.
target = "PINCP"

# Custom folktables problem: same filter as the adult task, grouped by RAC1P.
ACSIncome_outcome = BasicProblem(
    features=feature_names,
    target=target,
    group="RAC1P",
    preprocess=adult_filter,
)

In [7]:
# Extract the feature array, the target values and the group values from the
# custom problem; `features` and `label` become the DataFrame `df` below.
features, label, group = ACSIncome_outcome.df_to_numpy(acs_data)

In [8]:
# Attributes treated as continuous; they are discretized by the tree step later
# and excluded from the categorical remapping.
continuous_attributes = ['AGEP', 'WKHP']

In [9]:
# Every feature that is not continuous, in the order given by `feature_names`.
# A list comprehension is used instead of a set difference because set
# iteration order over strings changes between interpreter runs, which made the
# order of `remapping_cols` (and of the pickled mapping) non-reproducible.
categorical_attributes = [
    name for name in feature_names if name not in continuous_attributes
]

In [10]:
import pandas as pd


# One row per person: the selected features plus the target as "income".
df = pd.DataFrame(features, columns=feature_names).assign(income=label)
df.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,income
0,30.0,6.0,14.0,1.0,9610.0,6.0,16.0,40.0,1.0,8.0,48500.0
1,21.0,4.0,16.0,5.0,1970.0,6.0,17.0,20.0,1.0,1.0,7700.0
2,65.0,2.0,22.0,5.0,2040.0,6.0,17.0,8.0,1.0,1.0,17200.0
3,33.0,1.0,14.0,3.0,9610.0,36.0,16.0,40.0,1.0,1.0,12000.0
4,18.0,2.0,19.0,5.0,1021.0,6.0,17.0,18.0,2.0,1.0,300.0


# Remap columns

From label encoding to interpretable notation: e.g., the SEX codes 1, 2 become "Male", "Female".

In [11]:
# Path of the 2018 PUMS data dictionary, relative to the current directory.
filename = os.path.join(os.path.curdir, "datasets", "ACSPUMS", "PUMS_Data_Dictionary_2018.csv")
# Read with pandas defaults; the cell below looks rows up by index label.
df_mappings = pd.read_csv(filename)

In [12]:
# Build, for each categorical attribute, a dict {numeric code -> description}
# from the PUMS data dictionary.
remapping_cols = {}

# Column labels of `df_mappings` holding the code and its description.
# NOTE(review): these labels look like fields of the file's first line being
# used as header names — confirm against the CSV.
orig_col = "1"
new_col = "Record Type"
cols_i = [orig_col, new_col]

#col_name = "OCCP"
for col_name in categorical_attributes:
    # Rows are selected by index label ("VAL", then the attribute name), so
    # `df_mappings` must carry a two-level index.
    # NOTE(review): presumably pandas builds that index implicitly because data
    # rows have more fields than the header row — confirm.
    dict_i = df_mappings.loc["VAL"].loc[col_name][cols_i].set_index(orig_col).to_dict()[new_col]
    # Codes are converted to float to match the float-valued feature columns;
    # the placeholders "b", "bb", "bbb", "bbbb" are all mapped to -1.
    dict_i = {float(k) if (k not in ["b", "bb", 'bbb', 'bbbb']) else -1 :v for k,v in dict_i.items()}
    remapping_cols[col_name] = dict_i

In [13]:
from copy import deepcopy

# Work on a copy so the numerically encoded `df` stays available.
df_analysis = deepcopy(df)

# Replace numeric codes with their descriptions, attribute by attribute.
# Codes missing from a mapping are left unchanged by `replace`.
for column_name, code_to_label in remapping_cols.items():
    df_analysis[column_name] = df[column_name].replace(code_to_label)

We substitute NaN values with the string "NaN".

In [14]:
# Replace missing values with the literal string 'NaN', printing the name of
# every column that contained at least one missing value.
for c in df_analysis.columns:
    if df_analysis[c].isna().any():
        print(c)
        # Assign the result back instead of `df_analysis[c].fillna(..., inplace=True)`:
        # that form is chained in-place assignment on a column selection, which
        # raises a FutureWarning in recent pandas and does not update the frame
        # once Copy-on-Write is enabled.
        df_analysis[c] = df_analysis[c].fillna('NaN')

In [15]:
df_analysis.head(3)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,income
0,30.0,Self-employed in own not incorporated business...,Grade 11,Married,TRN-Cleaners Of Vehicles And Equipment,California/CA,Institutionalized group quarters population,40.0,Male,Some Other Race alone,48500.0
1,21.0,State government employee,Regular high school diploma,Never married or under 15 years old,"SCI-Other Life, Physical, And Social Science T...",California/CA,Noninstitutionalized group quarters population,20.0,Male,White alone,7700.0
2,65.0,"Employee of a private not-for-profit, tax-exem...",Master's degree,Never married or under 15 years old,CMS-Clergy,California/CA,Noninstitutionalized group quarters population,8.0,Male,White alone,17200.0


In [16]:
import pickle

# Persist the {attribute -> {code -> description}} mapping for later reuse.
with open(
    os.path.join(os.path.curdir, "datasets", "ACSPUMS", "census_column_mapping.pickle"),
    "wb",
) as fp:
    pickle.dump(remapping_cols, fp)

In [17]:
# Save the remapped dataset; the index is not written.
df_analysis.to_csv(os.path.join(os.path.curdir, "datasets", "ACSPUMS", "adult_dataset_income.csv"), index=False)

In [18]:
# Same value as the earlier definition of `continuous_attributes`; repeated here
# so the cells below can run from this point.
continuous_attributes = ['AGEP', 'WKHP']

In [19]:
df_analysis.shape

(195665, 11)

# Taxonomy

In [20]:
# Independent copy used for the taxonomy-based processing, so `df_analysis`
# itself is left untouched by the steps below.
df_analysis_proc = deepcopy(df_analysis)

### Occupation

In [21]:
attr = "OCCP"

# Taxonomy for occupations: each value of the form "<PREFIX>-<description>"
# (e.g. "TRN-Cleaners Of Vehicles And Equipment") is generalized to its prefix
# ("TRN"). Values without "-" get no parent.
# Iterating over the distinct values gives the same dictionary as iterating
# over every row, without re-splitting the same string once per row.
generalization_dict_tax = {}
generalization_dict_tax[attr] = {
    occupation: occupation.split("-")[0]
    for occupation in df_analysis_proc[attr].unique()
    if "-" in occupation
}

### Country

We rename some of the country names for visualization and mapping reasons.

In [22]:
# Harmonize place-of-birth names with the names used by the country taxonomy.
if "POBP" in df_analysis_proc.columns:
    df_analysis_proc["POBP"] = df_analysis_proc["POBP"].replace(
        {
            # Constituent countries / unspecified UK -> United Kingdom
            "England": "United Kingdom",
            "Scotland": "United Kingdom",
            "United Kingdom, Not Specified": "United Kingdom",
            "Northern Ireland": "United Kingdom",
            # Naming differences
            "USSR": "Russia",
            "Hong Kong": "Hong Kong SAR",
            "Democratic Republic of Congo (Zaire)": "Democratic Republic of the Congo",
            "Trinidad & Tobago": "Trinidad and Tobago",
            "Macedonia": "North Macedonia",
            "Antigua & Barbuda": "Antigua and Barbuda",
            "Commonwealth of the Northern Mariana Islands": "Northern Mariana Islands",
            "St. Vincent & the Grenadines": "St. Vincent and the Grenadines",
            # Simplification
            "Azores Islands": "Portugal",
        }
    )

In [23]:
# https://statisticstimes.com/geography/countries-by-continents.php
# Country -> region -> continent table; the file has no header row and its
# first field is used as the index.
df_taxonomy_country = pd.read_csv("country_hierarchies.csv", header=None, index_col=0)

df_taxonomy_country.columns = [
    "Country or Area",
    "ISO-alpha3 Code",
    "M49 Code",
    "Region 1",
    "Region 2",
    "Continent",
]

# Rename official country names to the shorter names used in the POBP column.
df_taxonomy_country["Country or Area"] = df_taxonomy_country["Country or Area"].replace(
    {
        "Viet Nam": "Vietnam",
        "Bolivia (Plurinational State of)": "Bolivia",
        "Russian Federation": "Russia",
        "Iran (Islamic Republic of)": "Iran",
        "Lao People's Democratic Republic": "Laos",
        "Syrian Arab Republic": "Syria",
        "Venezuela (Bolivarian Republic of)": "Venezuela",
        "Republic of Moldova": "Moldova",
        "Czechia": "Czech Republic",
        "United Republic of Tanzania": "Tanzania",
        "Côte d’Ivoire": "Ivory Coast",
        "United States Virgin Islands": "US Virgin Islands",
        "Saint Lucia": "St. Lucia",
        "Saint Kitts and Nevis": "St. Kitts-Nevis",
        "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",
        "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
        "China, Hong Kong Special Administrative Region": "Hong Kong SAR",
    }
)


# Child -> parent map built from the taxonomy table, linking each non-null
# level to the next one: country -> region -> continent.
g_hierarchy = {}
target_cols = ["Country or Area", "Region 1", "Continent"]
for _idx, row in df_taxonomy_country.iterrows():
    # Levels available for this row, from most specific to most general.
    levels = [col for col in target_cols if pd.notnull(row[col])]

    for child_col, parent_col in zip(levels, levels[1:]):
        tax_c, tax_p = row[child_col], row[parent_col]
        # Any name containing "Korea" is collapsed into the single node "Korea".
        if "Korea" in tax_c:
            tax_c = "Korea"

        # A node may have only one parent: fail with a message that names the
        # conflict instead of printing the row and raising an empty ValueError.
        if tax_c in g_hierarchy and tax_p != g_hierarchy[tax_c]:
            raise ValueError(
                f"Conflicting parents for {tax_c!r}: "
                f"{g_hierarchy[tax_c]!r} vs {tax_p!r} (row: {row.to_dict()})"
            )
        # Skip self-loops (a level named like its parent).
        if tax_c == tax_p:
            continue
        g_hierarchy[tax_c] = tax_p
attr = "POBP"

# Manual additions: extra parents for the two American regions, the
# "Not Specified" areas, and places missing from the table.
g_hierarchy.update(
    {
        "North America": "America",
        "South America": "America",
        "Americas, Not Specified": "America",
        "Other Europe, Not Specified": "Europe",
        "Eastern Africa, Not Specified": "Africa",
        "Other Asia, Not Specified": "Asia",
        "South Central Asia, Not Specified": "Asia",
        "Caribbean, Not Specified": "North America",
        "Other US Island Areas, Oceania, Not Specified, or at Sea": "Oceania",
        "Other Africa, Not Specified": "Africa",
        "Western Africa, Not Specified": "Africa",
        "Taiwan": "Eastern Asia",
        "Czechoslovakia": "Eastern Europe",
        "Yugoslavia": "Southern Europe",
    }
)


In [24]:
# Snapshot of the hierarchy that is stored as the POBP taxonomy further down.
# NOTE(review): the snapshot is taken before the loop below adds the
# "<state>/<code>" -> "United States of America" entries to `g_hierarchy`, so
# those entries are not part of `generalization_dict_tax_u` — confirm this is
# intended.
generalization_dict_tax_u = deepcopy(g_hierarchy)

# Values with no parent in the hierarchy: US states (written "<name>/<code>")
# are attached to the USA; everything else is scheduled for deletion.
delete_values = []
for pobp_value in df_analysis_proc["POBP"].value_counts().index:
    if pobp_value in g_hierarchy:
        continue
    if "/" in pobp_value:
        g_hierarchy[pobp_value] = "United States of America"
    else:
        delete_values.append(pobp_value)

# These three values are removed as well, although they have a parent.
delete_values.extend(["North America", "South America", "Micronesia"])

# Report how many rows each deleted value accounts for, then the total and its
# share of the dataset.
tot_deleted = 0
for pobp_value in delete_values:
    n_rows = len(df_analysis_proc.loc[df_analysis_proc["POBP"] == pobp_value])
    print(pobp_value, n_rows)
    tot_deleted += n_rows
print(tot_deleted, round(tot_deleted/len(df_analysis_proc), 5))

Asia 54
Europe 23
Africa 18
West Indies 2
North America 0
South America 4
Micronesia 8
109 0.00056


Delete rows for which the detailed information is not available

In [25]:
# Keep only the rows whose place of birth is not in `delete_values`.
df_analysis_proc = df_analysis_proc.loc[~df_analysis_proc["POBP"].isin(delete_values)]

In [26]:
# Register the country hierarchy snapshot as the taxonomy of `attr`
# (set to 'POBP' in the cell that builds the hierarchy).
generalization_dict_tax[attr] = generalization_dict_tax_u

In [27]:
# Save the dataset after the POBP renaming and row filtering; the index is not written.
df_analysis_proc.to_csv(os.path.join(os.path.curdir, "datasets", "ACSPUMS", "adult_dataset_income_tax.csv"), index=False)

In [28]:
import json

# Save the per-attribute taxonomies ({attribute -> {child -> parent}}) as JSON.
with open(os.path.join(os.path.curdir, "datasets", "ACSPUMS", "adult_taxonomies.json"), "w") as fp:
    json.dump(generalization_dict_tax, fp)

# Tree divergence - income - individual

## Tree

In [29]:
# Minimum support passed to the tree discretization below.
min_support = 0.1

In [30]:
# Divergence metric used by the tree discretization and the result tables.
metric = "d_outcome"
# Name of the outcome column in the DataFrame. This rebinds `target`, which
# earlier held the ACS column name 'PINCP' used to define the problem.
target = "income"

In [31]:
from tree_discretization_ranking import TreeDiscretization_ranking

# Settings for the tree discretization; `type_experiment` is reused by the
# divergence extraction cells.
type_experiment = "one_at_time"
type_criterion = "divergence_criterion"

tree_discr = TreeDiscretization_ranking()

# Returns the hierarchy learned for the continuous attributes together with the
# corresponding discretizations.
generalization_dict, discretizations = tree_discr.get_tree_discretization(
    df_analysis_proc,
    type_splitting=type_experiment,
    min_support=min_support,
    metric=metric,
    continuous_attributes=list(continuous_attributes),
    storeTree=True,
    type_criterion=type_criterion,
    minimal_gain=0.0,
    target_col=target,
)

In [32]:
# Container keyed by exploration approach.
# NOTE(review): not referenced again in the visible part of the notebook —
# confirm whether it is still needed.
FP_results = {"base": {}, "generalized": {}}

## min_sup_divergence  = 0.05

In [33]:
# Merge the tree-derived hierarchies with the taxonomies (OCCP, POBP) on a copy,
# so `generalization_dict` is not modified. On a shared attribute key the
# taxonomy entry would win.
generalization_dict_all = deepcopy(generalization_dict)
generalization_dict_all.update(generalization_dict_tax)

In [34]:
print(generalization_dict_all.keys())

dict_keys(['AGEP', 'WKHP', 'OCCP', 'POBP'])


In [35]:
min_sup_divergence = 0.05

In [36]:
import pandas as pd
pd.set_option('max_colwidth', None)

In [37]:
INFO = ['support', 'itemsets', 'tn', 'fp', 'fn', 'tp']

In [38]:
# Minimum support thresholds explored by the extraction loops below.
min_sup_divergences = [0.05, 0.025, 0.01]

In [39]:
# Results per support threshold and approach:
# {min_sup_divergence: {"base" | "generalized": DataFrame}}.
# out_support holds the most divergent pattern, out_support_wlogr the patterns
# with the lowest weighted log-ratio.
out_support = {}
out_support_wlogr = {}

### Without Generalization 

In [40]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

import math
import time

from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Baseline exploration: patterns are extracted without generalization.
apply_generalization = False
pattern_type = "generalized" if apply_generalization else "base"

for min_sup_divergence in min_sup_divergences:
    print(min_sup_divergence)
    st = time.time()
    FP_fm = extract_divergence_generalized(
        df_analysis_proc,
        discretizations,
        generalization_dict_all,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=apply_generalization,
        target_name=target,
        FPM_type="fpgrowth",
        metrics_divergence=["d_outcome"],
        type_experiment=type_experiment,
        # Overlap is allowed only for the "all_attributes" experiment.
        allow_overalp=(type_experiment == "all_attributes"),
    )
    print(f"Time: {round(time.time()-st, 2)}")

    fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")
    fp_div = fp_divergence_o.getDivergence(th_redundancy=0)
    # Single pattern with the highest divergence.
    most_divergent = fp_div.sort_values(fp_divergence_o.metric, ascending=False).head(1)

    # The weighted log-ratio ranking is computed only at the lowest threshold.
    if min_sup_divergence == 0.01:
        # NOTE(review): presumably row label 0 is the empty itemset, i.e. the
        # whole dataset — confirm.
        mean_outcome = fp_div.loc[0]["outcome"]
        fp_div["wlogr"] = fp_div["support"] * (fp_div["outcome"] / mean_outcome).apply(math.log)
        # Three patterns with the lowest support-weighted log-ratio.
        most_divergent_wlogr = fp_div.sort_values("wlogr", ascending=True).head(3)
        out_support_wlogr.setdefault(min_sup_divergence, {})[pattern_type] = most_divergent_wlogr

    out_support.setdefault(min_sup_divergence, {})[pattern_type] = most_divergent
    

0.05
Time: 15.35
0.025
Time: 14.72
0.01
Time: 15.95


### With Generalization 

In [41]:
from utils_extract_divergence_generalized_ranking import (
    extract_divergence_generalized,
)

import math
import time

from divexplorer_generalized_ranking.FP_Divergence import FP_Divergence

# Generalized exploration: patterns may also use the hierarchy items.
apply_generalization = True
pattern_type = "generalized" if apply_generalization else "base"

for min_sup_divergence in min_sup_divergences:
    print(min_sup_divergence)
    st = time.time()
    FP_fm = extract_divergence_generalized(
        df_analysis_proc,
        discretizations,
        generalization_dict_all,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=apply_generalization,
        target_name=target,
        FPM_type="fpgrowth",
        metrics_divergence=["d_outcome"],
        type_experiment=type_experiment,
        considerOnlyContinuos=False,
        # Overlap is allowed only for the "all_attributes" experiment.
        allow_overalp=(type_experiment == "all_attributes"),
    )
    print(f"Time: {round(time.time()-st, 2)}")

    fp_divergence_o = FP_Divergence(FP_fm, "d_outcome")
    fp_div = fp_divergence_o.getDivergence(th_redundancy=0)
    # Single pattern with the highest divergence.
    most_divergent = fp_div.sort_values(fp_divergence_o.metric, ascending=False).head(1)
    out_support.setdefault(min_sup_divergence, {})[pattern_type] = most_divergent

    # The weighted log-ratio ranking is computed only at the lowest threshold.
    if min_sup_divergence == 0.01:
        # NOTE(review): presumably row label 0 is the empty itemset, i.e. the
        # whole dataset — confirm.
        mean_outcome = fp_div.loc[0]["outcome"]
        fp_div["wlogr"] = fp_div["support"] * (fp_div["outcome"] / mean_outcome).apply(math.log)
        # Three patterns with the lowest support-weighted log-ratio.
        most_divergent_wlogr = fp_div.sort_values("wlogr", ascending=True).head(3)
        out_support_wlogr.setdefault(min_sup_divergence, {})[pattern_type] = most_divergent_wlogr

0.05
Time: 25.72
0.025
Time: 30.45
0.01
1 10000
2 20000
Time: 46.66


# Print results

## Divergence

In [42]:
# Column holding the t-value of the outcome divergence in the result frames.
t_value_col = "t_value_outcome"

In [43]:
from utils_printable import printable

method_name = "Exploration approach"
COLS = ["s", method_name, "itemsets", "support", metric, t_value_col]

# One row per (support threshold, approach): the most divergent pattern found.
res_acc = []
for min_sup_divergence in min_sup_divergences:
    # Stack the stored one-row frames in insertion order and label each row
    # with the approach it comes from.
    res = pd.concat(list(out_support[min_sup_divergence].values()))
    res[method_name] = list(out_support[min_sup_divergence])
    res["s"] = min_sup_divergence

    res_pr = printable(res[COLS], abbreviations=abbreviations, resort_cols=False)
    res_acc.append(res_pr)

pd.concat(res_acc)

Unnamed: 0,s,Exploration approach,itemsets,sup,Δ_outcome,t_outcome
341,0.05,base,"MAR=Married, RAC=White, SEX=Male, WKHP=>=44.0",0.07,80986.277,62.3
1858,0.05,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male",0.05,90204.019,60.6
1285,0.025,base,SCHL=Prof beyond bachelor,0.03,105256.743,46.7
6602,0.025,generalized,"AGEP=>=35.0, OCCP=MGR, SEX=Male, WKHP=>=44.0",0.03,119340.209,50.6
4388,0.01,base,"SCHL=Prof beyond bachelor, WKHP=>=44.0",0.01,163479.862,40.3
25541,0.01,generalized,"AGEP=>=35.0, SCHL=Prof beyond bachelor, SEX=Male, WKHP=>=40.0",0.01,172295.97,39.3


## Wlogr

In [60]:
from utils_printable import printable

method_name = "Exploration approach"

# The weighted log-ratio ranking is only stored for this threshold.
min_sup_divergence = 0.01

# Label every row with its exploration approach. `assign` returns new frames,
# so the frames cached in `out_support_wlogr` are no longer modified in place
# each time this cell runs.
res = pd.concat(
    [
        frame.assign(**{method_name: k})
        for k, frame in out_support_wlogr[min_sup_divergence].items()
    ]
)

COLS = [method_name, "itemsets", "support", metric, t_value_col, "wlogr"]
# Show the divergence in thousands, rounded to one decimal.
res[metric] = (res[metric] / 1000).round(1)
res_pr = printable(res[COLS], abbreviations=abbreviations, resort_cols=False)

print(f"Minimum support divergence: {min_sup_divergence}")
display(res_pr)

Minimum support divergence: 0.01


Unnamed: 0,Exploration approach,itemsets,sup,Δ_outcome,t_outcome,wlogr
58,base,AGEP=<=26.0,0.17,-43.9,173.8,-0.191
75,base,"AGEP=<=26.0, MAR=Never married/<15yrs",0.15,-44.9,175.9,-0.179
51,base,WKHP=<=29.0,0.18,-40.4,122.5,-0.178
40,generalized,WKHP=<=39.0,0.31,-33.0,108.5,-0.216
285,generalized,"AGEP=<=34.0, WKHP=<=39.0",0.14,-49.9,203.6,-0.197
26,generalized,AGEP=<=34.0,0.34,-28.0,104.3,-0.193
