In [1]:
# Import all required libraries and functions

from itertools import combinations, product
import pandas as pd
import random

In [2]:
# Load the census records from the CSV file into a DataFrame.
# header=None makes pandas read the first line as data and label the columns 0, 1, 2, ...

census = pd.read_csv('census.csv', header=None)
census.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,age=Middle-aged,sex=Male,education=Bachelors,native-country=United-States,race=White,marital-status=Never-married,workclass=State-gov,occupation=Adm-clerical,hours-per-week=Full-time,income=Small,capital-gain=Low,capital-loss=None
1,age=Senior,sex=Male,education=Bachelors,native-country=United-States,race=White,marital-status=Married-civ-spouse,workclass=Self-emp-not-inc,occupation=Exec-managerial,hours-per-week=Part-time,income=Small,capital-gain=None,capital-loss=None
2,age=Middle-aged,sex=Male,education=HS-grad,native-country=United-States,race=White,marital-status=Divorced,workclass=Private,occupation=Handlers-cleaners,hours-per-week=Full-time,income=Small,capital-gain=None,capital-loss=None
3,age=Senior,sex=Male,education=11th,native-country=United-States,race=Black,marital-status=Married-civ-spouse,workclass=Private,occupation=Handlers-cleaners,hours-per-week=Full-time,income=Small,capital-gain=None,capital-loss=None
4,age=Middle-aged,sex=Female,education=Bachelors,native-country=Cuba,race=Black,marital-status=Married-civ-spouse,workclass=Private,occupation=Prof-specialty,hours-per-week=Full-time,income=Small,capital-gain=None,capital-loss=None


In [3]:
def _frequentSets(data, numOfAttributes, threshold):
    """Return the attribute-value sets whose relative frequency exceeds ``threshold``.

    Each result is a tuple holding one value per column of a column
    combination of size ``numOfAttributes``. Results are ordered by column
    combination (in ``itertools.combinations`` order over ``data.columns``)
    and, within a combination, lexicographically by the order in which each
    value first appears in its column.

    Only value sets that actually occur in ``data`` are considered; a set that
    never occurs has frequency 0 and cannot exceed a non-negative threshold.
    Rows with a missing value in any of the grouped columns are not counted
    (pandas ``groupby`` drops them by default).
    """
    datarows = len(data.index)
    if datarows == 0:
        # Nothing to count, and it avoids dividing by zero below.
        return []

    cols = list(data.columns)
    # Position of every value in its column's order of first appearance;
    # used to emit results in a stable, predictable order.
    rank = {
        col: {value: pos for pos, value in enumerate(data[col].unique())}
        for col in cols
    }

    frequent = []
    for comb in combinations(cols, numOfAttributes):
        # One pass over the data counts every observed value set for this
        # column combination.
        sizes = data.groupby(list(comb), sort=False).size()
        hits = []
        for key, count in sizes.items():
            # Grouping by a single column yields scalar keys; normalise to tuples.
            values = key if isinstance(key, tuple) else (key,)
            if count / datarows > threshold:
                hits.append(values)
        hits.sort(key=lambda values: tuple(rank[col][value] for col, value in zip(comb, values)))
        frequent.extend(hits)
    return frequent


def censusAttribute(numOfAttributes, threshold, numOfData=None, data=None):
    """Print every attribute-value set that appears in more than ``threshold`` of the rows.

    Parameters
    ----------
    numOfAttributes : int
        Number of attributes (columns) in each set; must be at least 1.
        Values larger than the number of columns produce no output.
    threshold : float
        Appearance-frequency threshold, expected between 0 and 1. A set is
        printed only when its frequency is strictly greater than this value.
    numOfData : int, optional
        Number of rows to sample (without replacement, ``random_state=1``)
        before counting. When omitted, all rows are used.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the module-level ``census`` frame.

    Returns
    -------
    None
        Results are printed, one tuple per line.

    Raises
    ------
    ValueError
        If ``numOfAttributes`` is smaller than 1, or (raised by pandas) if
        ``numOfData`` exceeds the number of available rows.
    """
    if numOfAttributes < 1:
        raise ValueError("numOfAttributes must be at least 1")

    if data is None:
        data = census

    if numOfData is not None:
        # Sample exactly the requested number of rows; frequencies are then
        # computed against the size of that sample.
        data = data.sample(n=numOfData, random_state=1)

    for attributeSet in _frequentSets(data, numOfAttributes, threshold):
        print(attributeSet)

In [4]:
# input value
numOfAttributes, threshold = 2, 0.8

# Report every pair of attribute values appearing in more than 80% of all rows
censusAttribute(numOfAttributes, threshold)

('native-country=United-States', 'race=White')
('native-country=United-States', 'capital-gain=None')
('native-country=United-States', 'capital-loss=None')
('race=White', 'capital-loss=None')
('capital-gain=None', 'capital-loss=None')


In [5]:
# if only several data to be used
numOfData = 200
censusAttribute(numOfAttributes, threshold, numOfData=numOfData)

('native-country=United-States', 'race=White')
('native-country=United-States', 'capital-gain=None')
('native-country=United-States', 'capital-loss=None')
('race=White', 'capital-gain=None')
('race=White', 'capital-loss=None')
('capital-gain=None', 'capital-loss=None')
