In [35]:
# Data libraries.
import numpy as np
import scipy as sp
import pandas as pd

# Data vizualization libraries.
from tqdm import tqdm
import matplotlib.pyplot as plt

# Association rules libraries.
from efficient_apriori import apriori

# Machine learning libraries.
import datawig

# Utilities we wrote for this project.
import utils
from tqdm import tqdm

## 1. Load Dataset

In [31]:
# Read data from files.
df = pd.read_csv('./data/adultsIncome/raw/raw_adultsIncome_no_nan.csv')
df_with_missing = pd.read_csv('./data/adultsIncome/10percent/raw_adultsIncome_0.1nan.csv')
df_with_missing_ar = df_with_missing.copy()
df_with_missing_ml = df_with_missing.copy()
df_with_missing

Unnamed: 0,age,workclass,observation-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39.0,State-gov,77516.0,Bachelors,,Never-married,Adm-clerical,Not-in-family,White,,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,,,Handlers-cleaners,Not-in-family,White,Male,0.0,,40.0,,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,,<=50K
32557,,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,,White,Female,0.0,0.0,40.0,United-States,<=50K
32559,22.0,Private,201490.0,HS-grad,,Never-married,Adm-clerical,Own-child,White,,0.0,0.0,20.0,United-States,<=50K


In [32]:
num_of_missing_before = df_with_missing_ar.isna().any(axis=1).sum()
print(f'Number of missing values: {num_of_missing_before}')


Number of missing values: 25263


## Association Rules algorithm to fill missing values

In [7]:
df_for_apriori = df_with_missing_ar.copy()

# go through each column and replace the values with a string
for col in df_for_apriori.columns:
    df_for_apriori[col] = df_for_apriori[col].astype(str)
# convert the dataframe to a list of lists
dict_for_apriori = df_for_apriori.to_dict(orient='records')
transactions = [list(item.items()) for item in dict_for_apriori]

Finding the association rules using the Apriori algorithm

In [29]:
itemsets, rules = apriori(transactions, min_support=0.3, min_confidence=0.6, output_transaction_ids=False)
print(f'We have {len(rules)} rules and here are the first 10:')
rules[:10]

We have 311 rules and here are the first 10:


[{('capital-loss', '0.0')} -> {('capital-gain', '0.0')},
 {('capital-gain', '0.0')} -> {('capital-loss', '0.0')},
 {('class', ' <=50K')} -> {('capital-gain', '0.0')},
 {('capital-gain', '0.0')} -> {('class', ' <=50K')},
 {('hours-per-week', '40.0')} -> {('capital-gain', '0.0')},
 {('marital-status', ' Married-civ-spouse')} -> {('capital-gain', '0.0')},
 {('native-country', ' United-States')} -> {('capital-gain', '0.0')},
 {('capital-gain', '0.0')} -> {('native-country', ' United-States')},
 {('race', ' White')} -> {('capital-gain', '0.0')},
 {('capital-gain', '0.0')} -> {('race', ' White')}]

In [28]:
# sort rules by lift value
sorted_rules = sorted(rules, key=lambda x: x.lift, reverse=True)
for rule in sorted_rules[:10]:
    print(f'Rule: {rule.lhs} -> {rule.rhs}, Lift: {rule.lift}')

Rule: (('relationship', ' Husband'),) -> (('marital-status', ' Married-civ-spouse'),), Lift: 2.1704298119741976
Rule: (('marital-status', ' Married-civ-spouse'),) -> (('relationship', ' Husband'),), Lift: 2.1704298119741976
Rule: (('relationship', ' Husband'),) -> (('sex', ' Male'),), Lift: 1.4940869263906227
Rule: (('marital-status', ' Married-civ-spouse'),) -> (('sex', ' Male'),), Lift: 1.3249761103300426
Rule: (('class', ' <=50K'), ('race', ' White')) -> (('capital-gain', '0.0'), ('capital-loss', '0.0'), ('native-country', ' United-States')), Lift: 1.0913095646708846
Rule: (('capital-gain', '0.0'), ('capital-loss', '0.0'), ('workclass', ' Private')) -> (('class', ' <=50K'),), Lift: 1.0885235124178607
Rule: (('class', ' <=50K'), ('workclass', ' Private')) -> (('capital-gain', '0.0'), ('capital-loss', '0.0')), Lift: 1.0747848688333201
Rule: (('class', ' <=50K'), ('native-country', ' United-States')) -> (('capital-gain', '0.0'), ('capital-loss', '0.0')), Lift: 1.0710700712799093
Rule: 

In [10]:
df_missing_index_rows = df_with_missing_ar.index[df_with_missing_ar.isna().any(axis=1)]

### Fill missing values algorithm

In [37]:
# create a dictionary with the column names and the index of the column
col_names = df_with_missing_ar.columns
col_names_dict = {}
for i, col in enumerate(col_names):
    col_names_dict[col] = i


# algortihm to fill missing values
for index in tqdm(df_missing_index_rows):
    # print the current row on the same line (replacing the previous line)
    row = df_with_missing_ar.iloc[index]
    rhs = []
    lhs = [] 
    for col in row.index:
        if row[col] != row[col]:
            rhs.append(col)
        else:
            lhs.append(col)
    relevant_rules = []
    for rule in sorted_rules:
        # check if [col[0] for col in rule.rhs] is a subset of rhs
        if set([col[0] for col in rule.rhs]).issubset(set(rhs)):
            relevant_rules.append(rule)
    for rule in relevant_rules:
        # check if [keyval[0] for keyval in rule.lhs] is a subset of lhs
        if set([keyval for keyval in rule.lhs]).issubset(set([(col, row[col]) for col in lhs])):
            should_fill = True
            for keyval in rule.rhs:
                if row[keyval[0]] == row[keyval[0]] and keyval[1] != row[keyval[0]]:
                    should_fill = False
                    break       
            if should_fill:
                for keyval in rule.rhs:
                    df_with_missing_ar.iloc[index,col_names_dict[keyval[0]]] = keyval[1]

 31%|███       | 7894/25263 [00:23<00:52, 332.43it/s] 


KeyboardInterrupt: 

In [21]:
num_of_missing_after = df_with_missing_ar.isna().any(axis=1).sum()
print(f'Number of filled values: {num_of_missing_before- num_of_missing_after}')
print(f'Number of left missing values: {num_of_missing_after}')

8544


16719

In [22]:
print(f'With NaN = {utils.check_accuracy(df,df_with_missing_ar,0.1,True)}%')
print(f'Without NaN = {utils.check_accuracy(df,df_with_missing_ar,0.1,False)}%')

With NaN = 33.78376994973531
 Without NaN = 79.0672462834755


## Machine learning algorithm to fill missing values

In [None]:
# Impute missing values using datawig.
df_with_missing_imputed = datawig.SimpleImputer.complete(df_with_missing_ml, precision_threshold = 0.05, num_epochs=100)

In [None]:
print(f'With NaN = {utils.check_accuracy(df, df_with_missing_imputed, 0.1, True)}%')
print(f'Without NaN = {utils.check_accuracy(df, df_with_missing_imputed, 0.1, False)}%')

In [122]:
# D, E, F, G
# A -> B
# A -> D
# A:1 -> D:4
# A, B, C -> D, E
# A:1, B:2, C:3 -> D:5, E:6
# Impute missing values using datawig.
# df_with_missing_imputed = datawig.SimpleImputer.complete(df_with_missing, precision_threshold = 0.05, num_epochs=100)

In [104]:
# print(df_with_missing.isna().sum().sum())
# print(df_with_missing_imputed.isna().sum().sum())

In [None]:
# print(f'With Nan = {check_accuracy(df, df_with_missing_imputed, True)}\n Without Nan = {check_accuracy(df, df_with_missing_imputed, False)}')