In [1]:
%load_ext autoreload
%autoreload 2

import lib.fd_imputer as fd_imputer
import pandas as pd
import numpy as np
from sklearn import metrics
import itertools
import matplotlib.pyplot as plt

### Set up all paths and labels needed in this notebook

In [2]:
# Central configuration for this notebook. The dataset title and the input directory are
# each written down once; DATA_PATH is derived from them so the three values cannot drift
# apart when another dataset is analysed.
DATA_TITLE = 'adult'
# Directory that holds the raw input CSV.
METANOME_DATA_PATH = 'MLFD_fd_detection/backend/WEB-INF/classes/inputData/'
# Raw dataset file: <input directory><title>.csv
DATA_PATH = METANOME_DATA_PATH + DATA_TITLE + '.csv'
# Directory passed to fd_imputer.split_df / fd_imputer.load_dataframes further down.
SPLITS_PATH = 'MLFD_fd_detection/data/'
# File passed to fd_imputer.read_fds further down.
FD_PATH = 'MLFD_fd_detection/results/HyFD-1.2-SNAPSHOT.jar2019-05-07T082200_fds'

### Load data, make definitions

In [64]:
# Read the raw dataset: semicolon-separated and without a header row, so pandas labels the
# columns with integers.
fd = pd.read_csv(DATA_PATH, sep=';', header=None)
# Hand the frame to fd_imputer.split_df together with the split fractions (presumably
# train/validate/test -- the recorded output lists files in that order) and the target
# directory. DATA_TITLE is used instead of a repeated 'adult' literal so the files are
# written under the same title that load_dataframes is given later.
fd_imputer.split_df(DATA_TITLE, fd, [0.8, 0.1, 0.1], SPLITS_PATH)

Dataset successfully written to MLFD_fd_detection/data/adult.csv
train set successfully written to MLFD_fd_detection/data/train/adult_train.csv
validate set successfully written to MLFD_fd_detection/data/validate/adult_validate.csv
test set successfully written to MLFD_fd_detection/data/test/adult_test.csv


In [158]:
# Load the three splits for DATA_TITLE from SPLITS_PATH; the missing-value token is
# forwarded unchanged to fd_imputer.load_dataframes.
df_train, df_validate, df_test = fd_imputer.load_dataframes(
    SPLITS_PATH,
    DATA_TITLE,
    missing_value_token='noValueSetHere123156456',
)

In [76]:
# Read the functional dependencies from FD_PATH. Judging by how fds is iterated further
# down (fds[rhs] yields the lhs lists), it maps an RHS column label to its LHS combinations.
fds = fd_imputer.read_fds(FD_PATH)
# Column labels that the evaluation below scores with MSE instead of precision/recall/F1.
continuous = [0, 1, 3, 11, 12, 13] # cols containing continuous numbers

In [82]:
# reset_index() moves the current index into a new leading column (named 'index' for an
# unnamed index, as the displayed frame shows). The guard makes this cell safe to re-run:
# without it every further execution would prepend yet another index column to df_test.
if 'index' not in df_test.columns:
    df_test = df_test.reset_index()

In [94]:
# Sanity check: the first column of df_test should hold exactly the values of its row index.
np.array_equal(np.asarray(df_test.index), df_test.iloc[:, 0].values)

True

In [92]:
# Show the row index of df_test as a plain NumPy array.
np.array(df_test.index)

array([   0,    1,    2, ..., 3253, 3254, 3255])

In [83]:
# Preview only the first rows; rendering the complete test split bloats the notebook
# without adding information.
df_test.head(10)

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0,6393,37,Private,196338,9th,5,Separated,Priv-house-serv,Unmarried,White,Female,0,0,16,Mexico,<=50K
1,1,16734,27,Private,116207,9th,5,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,32,United-States,<=50K
2,2,25235,36,Private,31438,HS-grad,9,Divorced,Transport-moving,Unmarried,White,Male,0,0,43,,<=50K
3,3,6871,35,Private,337286,Masters,14,Never-married,Exec-managerial,Not-in-family,Asian-Pac-Islander,Male,0,0,40,United-States,<=50K
4,4,19695,41,Self-emp-inc,114967,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,40,United-States,>50K
5,5,3785,20,Private,231981,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Male,0,0,32,United-States,<=50K
6,6,15754,47,Private,268022,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,55,United-States,>50K
7,7,4255,44,Private,228320,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,45,United-States,>50K
8,8,24653,36,Self-emp-not-inc,167691,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,50,United-States,<=50K
9,9,8671,33,State-gov,313729,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,60,United-States,<=50K


## FD_Imputer

In [159]:
# Run the FD imputer for every dependency in fds, using df_train and df_validate and
# passing the continuous column labels along; the call prints each FD it processes.
res = fd_imputer.run_fd_imputer_on_fd_set(
    df_train,
    df_validate,
    fds,
    continuous,
)

{3: [0]}
{11: [0]}
{13: [0]}
{12: [0]}
{12: [13, 1, 2, 3, 4, 7]}
{12: [13, 1, 3, 4, 6, 7]}
{12: [13, 1, 3, 4, 7, 8]}
{12: [13, 1, 2, 3, 5, 7]}
{12: [13, 1, 3, 5, 6, 7]}
{12: [13, 1, 3, 5, 7, 8]}
{1: [0]}
{14: [0]}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


{4: [0]}
{4: [5]}
{5: [0]}
{5: [4]}
{7: [0]}
{2: [0]}
{6: [0]}
{6: [11, 13, 1, 3, 7, 8]}
{6: [13, 1, 3, 4, 7, 8]}
{6: [13, 1, 3, 5, 7, 8]}
{8: [0]}
{9: [0]}
{9: [11, 12, 13, 3, 7, 8]}
{9: [12, 13, 15, 3, 4, 6, 8]}
{9: [12, 13, 15, 3, 5, 6, 8]}
{9: [13, 1, 3, 4]}
{9: [13, 1, 3, 5]}
{9: [13, 1, 2, 3, 7]}
{9: [13, 14, 3, 4, 7]}
{9: [13, 14, 3, 4, 6, 8]}
{9: [13, 14, 3, 5, 7]}
{9: [13, 14, 3, 5, 6, 8]}
{9: [13, 14, 2, 3, 7]}
{9: [13, 14, 3, 6, 7]}
{9: [13, 14, 3, 7, 8]}
{9: [10, 13, 14, 15, 3, 7]}
{9: [13, 15, 2, 3, 4, 6, 8]}
{9: [13, 15, 2, 3, 5, 6, 8]}
{9: [12, 14, 2, 3, 4, 6, 8]}
{9: [12, 14, 2, 3, 5, 6, 8]}
{9: [12, 15, 2, 3, 4, 6, 8]}
{9: [12, 15, 2, 3, 5, 6, 8]}
{9: [14, 1, 3, 4]}
{9: [14, 1, 3, 5]}
{9: [14, 1, 2, 3, 7]}
{9: [14, 1, 2, 3, 6]}
{9: [1, 3, 4, 6]}
{9: [1, 3, 4, 8]}
{9: [15, 1, 3, 4]}
{9: [1, 3, 5, 6]}
{9: [1, 3, 5, 8]}
{9: [15, 1, 3, 5]}
{9: [1, 3, 6, 7]}
{9: [1, 3, 7, 8]}
{9: [15, 1, 3, 7]}
{9: [10, 14, 3, 4, 7]}
{9: [14, 15, 2, 3, 4, 6, 8]}
{9: [10, 14, 3, 5, 7]}
{9: [

In [156]:
# Score every functional dependency (FD) on its own: impute the RHS column of df_validate
# from df_train via fd_imputer.fd_imputer, then compare the '<rhs>_imputed' column with the
# true RHS column. fd_imputer_results maps each RHS label to a list holding one metrics
# dict per LHS combination.
# NOTE(review): the recorded run of this cell stopped with
# "KeyError: Int64Index([0, 3])" on the first FD (3 <- [0]); presumably the frames lacked
# a column labelled 0 at that point -- confirm before relying on the results.
fd_imputer_results = {}
for rhs in fds:
    results = []
    imputed_col = str(rhs)+'_imputed'
    for lhs in fds[rhs]:
        print(rhs, lhs)
        fd = {rhs: lhs}
        df_fd_imputed = fd_imputer.fd_imputer(df_validate, df_train, fd)

        # Count the rows without an imputation before anything is filled in. Doing this
        # for every FD keeps 'nans' defined (and never stale from a previous iteration)
        # regardless of which branch below is taken.
        na_selector = df_fd_imputed.loc[:, imputed_col].isna()
        nans = na_selector.sum()

        # Inspect the RHS column by label -- like every other access here -- and use its
        # first non-missing value, so neither a shifted column position nor a missing
        # value in the first row (nor an empty frame) can derail the type check.
        known_rhs = df_fd_imputed.loc[:, rhs].dropna()
        rhs_is_text = len(known_rhs) > 0 and isinstance(known_rhs.iloc[0], str)

        if rhs_is_text:
            # make sure that value for missing data is of same type as row to be imputed
            # to avoid mix of labels with scikit.metrics
            df_fd_imputed = df_fd_imputed.fillna('no value')
            y_pred = df_fd_imputed.loc[:, imputed_col]
            y_true = df_fd_imputed.loc[:, rhs]
        else:
            # only retrieve successfully imputed values to compute MSE
            y_pred = df_fd_imputed.loc[~na_selector, imputed_col]
            y_true = df_fd_imputed.loc[~na_selector, rhs]

        if rhs in continuous:
            # '' is kept as the placeholder meaning "no row could be imputed, no MSE"
            mse = ''

            if len(y_pred) > 0:
                mse = metrics.mean_squared_error(y_true, y_pred)

            result = {
                'nans': nans,
                'lhs': lhs,
                'mse': mse
            }
        else:
            # Categorical RHS: class-frequency-weighted precision, recall and F1.
            result = {
                'lhs': lhs,
                'precision': metrics.precision_score(y_true, y_pred, average='weighted'),
                'recall': metrics.recall_score(y_true, y_pred, average='weighted'),
                'f1': metrics.f1_score(y_true, y_pred, average='weighted')
            }
        results.append(result)
    fd_imputer_results[rhs] = results

3 [0]


KeyError: Int64Index([0, 3], dtype='int64')

In [81]:
# (rows, columns) of the validation split, for comparison with the test split.
df_validate.shape

(3256, 16)

In [80]:
# (rows, columns) of the test split.
df_test.shape

(3256, 16)

In [105]:
# Display test_df with its index moved into a column; the result is not assigned, so
# test_df itself stays unchanged.
# NOTE: test_df is only created in a cell further down -- on a fresh kernel that cell has
# to run before this one.
test_df.reset_index()

Unnamed: 0,index,0,1,2
0,0,1.0,2.0,3
1,1,4.0,5.0,9
2,2,5.0,7.0,12
3,0,1.0,2.0,4
4,1,7.0,8.0,15
5,2,11.0,13.0,24
6,3,,,54


In [199]:
# Tiny hand-made frames for checking the imputer by eye. In every complete row column 2
# is the sum of columns 0 and 1, off by one in the training rows.
# NOTE: this rebinds df_validate and df_train, replacing whatever those names held before.
df_validate = pd.DataFrame({
    0: [1, 4, 5],
    1: [2, 5, 7],
    2: [3, 9, 12],
})
df_train = pd.DataFrame({
    0: [1, 7, 11, np.nan],
    1: [2, 8, 13, np.nan],
    2: [4, 15, 24, 54],
})
# Stack validation rows on top of training rows with a fresh 0..n-1 index.
test_df = pd.concat([df_validate, df_train], ignore_index=True)

In [200]:
# Run the imputer on the current df_train / df_validate for a single FD and keep the
# returned metrics for inspection.
result = fd_imputer.run_fd_imputer_on_fd_set(
    df_train,
    df_validate,
    {2: [[1, 0]]},  # one FD: columns 1 and 0 determine column 2
    [0, 1, 2, 3],   # column labels passed as continuous
)

{2: [1, 0]}


In [201]:
# Show the complete metrics structure returned for the toy FD.
result

{2: [{'nans': 2, 'lhs': [1, 0], 'mse': 1.0}]}

In [210]:
# 'nans' entry of the first (and only) result recorded for RHS column 2.
result[2][0]['nans']

2

In [189]:
# Split the toy frame with fractions 0.6 / 0.2 / 0.2 and unpack the three returned frames.
# NOTE(review): the split_df call near the top of the notebook passes SPLITS_PATH as a
# fourth argument and ignores the return value -- confirm which signature the current
# fd_imputer version provides, and whether the title 'adult' is intended for toy data.
train, validate, test = fd_imputer.split_df('adult', test_df, [0.6, 0.2, 0.2])

kein doppelter index


In [190]:
# Inspect the training part returned by the toy split.
train

Unnamed: 0,0,1,2,3
2,2,5.0,7.0,12
6,6,,,54
0,0,1.0,2.0,3
3,3,1.0,2.0,4


In [191]:
# Single functional dependency for manual experiments: columns 2 and 1 determine column 3.
fd = {3: [[2, 1]]}

In [195]:
# Pass both frames through fd_imputer.index_as_first_column (training frame first, then
# validation frame) and rebind the names to the returned objects.
df_train = fd_imputer.index_as_first_column(df_train)
df_validate = fd_imputer.index_as_first_column(df_validate)