In [4]:
# Data libraries.
import numpy as np
import scipy as sp
import pandas as pd

# Data vizualization libraries.
from tqdm import tqdm
import matplotlib.pyplot as plt

# Association rules libraries.
from efficient_apriori import apriori

# Machine learning libraries.
import datawig

# Utilities we wrote for this project.
import utils

## 1. Load Dataset

In [17]:
# Read data from files.
df = pd.read_csv('./data/adultsIncome/raw/adultsIncome_no_nan.csv')
df_with_missing = pd.read_csv('./data/adultsIncome/10percent/adultsIncome_0.1_nan.csv')
df_with_missing_ar = df_with_missing.copy(deep=True)
df_with_missing_ml = df_with_missing.copy(deep=True)
df_with_missing

Unnamed: 0,age,workclass,observation-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39.0,,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,,,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,,<=50K
3,53.0,Private,234721.0,11th,7.0,,Handlers-cleaners,,Black,Male,,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,,Married-civ-spouse,Tech-support,,White,Female,0.0,,38.0,United-States,<=50K
32557,40.0,Private,,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,,White,Male,0.0,0.0,,United-States,<=50K


In [18]:
num_of_missing_before = df_with_missing_ar.isnull().sum().sum()
print(f'Number of missing values: {num_of_missing_before}')

Number of missing values: 48647


## Association Rules algorithm to fill missing values

In [7]:
df_for_apriori = df_with_missing_ar.copy(deep=True)

# go through each column and replace the values with a string
for col in df_for_apriori.columns:
    df_for_apriori[col] = df_for_apriori[col].astype(str)
# convert the dataframe to a list of lists
dict_for_apriori = df_for_apriori.to_dict(orient='records')
transactions = [list(item.items()) for item in dict_for_apriori]

Finding the association rules using the Apriori algorithm

In [8]:
itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=0.55, output_transaction_ids=False)
print(f'We have {len(rules)} rules and here are the first 10:')
rules[:10]

We have 1143 rules and here are the first 10:


[{('capital-loss', '0.0')} -> {('capital-gain', '0.0')},
 {('capital-gain', '0.0')} -> {('capital-loss', '0.0')},
 {('class', ' <=50K')} -> {('capital-gain', '0.0')},
 {('capital-gain', '0.0')} -> {('class', ' <=50K')},
 {('education', ' HS-grad')} -> {('capital-gain', '0.0')},
 {('education-num', '9.0')} -> {('capital-gain', '0.0')},
 {('hours-per-week', '40.0')} -> {('capital-gain', '0.0')},
 {('marital-status', ' Married-civ-spouse')} -> {('capital-gain', '0.0')},
 {('marital-status', ' Never-married')} -> {('capital-gain', '0.0')},
 {('native-country', ' United-States')} -> {('capital-gain', '0.0')}]

In [9]:
# sort rules by lift value
sorted_rules = sorted(rules, key=lambda x: x.lift, reverse=True)
for rule in sorted_rules[:10]:
    print(f'Rule: {rule.lhs} -> {rule.rhs}, Lift: {rule.lift}')

Rule: (('capital-loss', '0.0'), ('education-num', '9.0')) -> (('education', ' HS-grad'),), Lift: 3.1010656863144934
Rule: (('education', ' HS-grad'),) -> (('capital-loss', '0.0'), ('education-num', '9.0')), Lift: 3.1010656863144934
Rule: (('capital-gain', '0.0'), ('education', ' HS-grad')) -> (('education-num', '9.0'),), Lift: 3.0999415464714577
Rule: (('education-num', '9.0'),) -> (('capital-gain', '0.0'), ('education', ' HS-grad')), Lift: 3.0999415464714577
Rule: (('capital-gain', '0.0'), ('education-num', '9.0')) -> (('education', ' HS-grad'),), Lift: 3.0992545221324486
Rule: (('education', ' HS-grad'),) -> (('capital-gain', '0.0'), ('education-num', '9.0')), Lift: 3.0992545221324486
Rule: (('education-num', '9.0'), ('native-country', ' United-States')) -> (('education', ' HS-grad'),), Lift: 3.0990446401987515
Rule: (('education', ' HS-grad'),) -> (('education-num', '9.0'), ('native-country', ' United-States')), Lift: 3.0990446401987515
Rule: (('education', ' HS-grad'), ('native-cou

In [10]:
df_missing_index_rows = df_with_missing_ar.index[df_with_missing_ar.isna().any(axis=1)]

### Fill missing values algorithm

In [11]:
# create a dictionary with the column names and the index of the column
col_names = df_with_missing.columns
col_names_dict = {}
for i, col in enumerate(col_names):
    col_names_dict[col] = i

In [21]:
# algortihm to fill missing values
for index in tqdm(df_missing_index_rows):
    # print the current row on the same line (replacing the previous line)
    row = df_with_missing_ar.iloc[index]
    rhs, lhs = [], []

    for col in row.index:
        rhs.append(col) if row[col] != row[col] else lhs.append(col)

    relevant_rules = []
    for rule in sorted_rules:
        # check if [col[0] for col in rule.rhs] is a subset of rhs
        if set([col[0] for col in rule.rhs]).issubset(set(rhs)): relevant_rules.append(rule)

    for rule in relevant_rules:
        # check if [keyval[0] for keyval in rule.lhs] is a subset of lhs
        if set([keyval for keyval in rule.lhs]).issubset(set([(col, row[col]) for col in lhs])):
            should_fill = True
            for keyval in rule.rhs:
                if row[keyval[0]] == row[keyval[0]] and keyval[1] != row[keyval[0]]:
                    should_fill = False
                    break
            if should_fill:
                for keyval in rule.rhs:
                    df_with_missing_ar.iloc[index, col_names_dict[keyval[0]]] = keyval[1]

100%|██████████| 25890/25890 [00:56<00:00, 460.71it/s]


In [22]:
num_of_missing_after = df_with_missing_ar.isnull().sum().sum()
print(f'Number of filled values: {num_of_missing_before - num_of_missing_after}')
print(f'Number of left missing values: {num_of_missing_after}')

Number of filled values: 28145
Number of left missing values: 20502


In [23]:
real_percent = num_of_missing_before / df.size
print(f'With NaN = {utils.check_accuracy(df,df_with_missing_ar,real_percent,True)}%')
print(f'Without NaN = {utils.check_accuracy(df,df_with_missing_ar,real_percent,False)}%')

Checking accuracy: 100%|██████████| 32561/32561 [00:38<00:00, 848.59it/s]


With NaN = 31.4264%


Checking accuracy: 100%|██████████| 32561/32561 [01:13<00:00, 442.27it/s]

Without NaN = 72.41281%





## Machine learning algorithm to fill missing values

In [None]:
# Impute missing values using datawig.
df_with_missing_ml = datawig.SimpleImputer.complete(df_with_missing_ml, precision_threshold = 0.05, num_epochs=100)

In [None]:
real_percent = num_of_missing_before / df.size
print(f'With NaN = {utils.check_accuracy(df, df_with_missing_ml, real_percent, True)}%')
print(f'Without NaN = {utils.check_accuracy(df, df_with_missing_ml, real_percent, False)}%')