# Preprocess Datasets

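Final notebook: for every dataset it creates a train/test split, generates the default queries, and writes one test set per query, in both ARFF and CSV format.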

## Preliminaries

### Imports

In [1]:
import pandas as pd
import arff
import os
import numpy as np

from os.path import dirname
from aaai20.io import filename_dataset, filename_query, original_filename
from aaai20.wrangling import arff_to_df
from sklearn.model_selection import train_test_split
from modulo.utils.encoding import query_to_code, code_to_query

RANDOM_STATE = 42

#### Helpers

In [2]:
def generate_query(nb_atts, targ_idx=-1, nb_qry=10, random_state=42):
    """
    Build a series of queries with a growing set of missing attributes.

    The first query uses every non-target attribute as a descriptive
    attribute. Every next query moves some descriptive attributes to the
    missing set; how many is given by the differences between `nb_qry`
    evenly spaced, integer-truncated counts in [0, nb_atts-1).

    Parameters
    ----------
    nb_atts : int
        Total number of attributes, target included.
    targ_idx : int
        Index of the target attribute (last attribute by default).
    nb_qry : int
        Number of evenly spaced missing-attribute counts to generate.
    random_state : int
        Seed handed to `transfer_contents` at every step.

    Returns
    -------
    q_desc, q_targ, q_miss : list of list of int
        Per query: descriptive, target and missing attribute indices.
    """
    # Initial partition of the attribute indices
    all_ids = list(range(nb_atts))
    targ_ids = [all_ids[targ_idx]]
    desc_ids = [a for a in all_ids if a not in targ_ids]
    miss_ids = []

    q_desc, q_targ, q_miss = [desc_ids], [targ_ids], [miss_ids]

    # Number of missing attributes per query, and the increments between them
    missing_counts = np.linspace(0, nb_atts - 1, nb_qry, endpoint=False, dtype=int)

    for step in np.ediff1d(missing_counts):
        # Each step returns fresh lists, so earlier queries stay untouched
        desc_ids, miss_ids = transfer_contents(
            desc_ids, miss_ids, nb_items_to_transfer=step, random_state=random_state
        )

        q_desc.append(desc_ids)
        q_targ.append(targ_ids)
        q_miss.append(miss_ids)

    return q_desc, q_targ, q_miss

def transfer_contents(list_one, list_two, nb_items_to_transfer=1, random_state=42):
    """
    Move randomly chosen items from `list_one` to `list_two`.

    The inputs are never modified; new lists are returned. Items are picked
    by position, so duplicate values are handled correctly, and the moved
    items are appended to `list_two` in the order they had in `list_one`.

    Parameters
    ----------
    list_one : list
        Source list.
    list_two : list
        Destination list.
    nb_items_to_transfer : int
        Number of items to move.
    random_state : int
        Seed of the random selection.

    Returns
    -------
    (list, list)
        The remaining items of `list_one` and the extended `list_two`.

    Raises
    ------
    ValueError
        Raised by NumPy when more items are requested than `list_one` holds.
    """
    list_one, list_two = list(list_one), list(list_two)

    # Nothing to move: skip the draw (also covers an empty source list)
    if nb_items_to_transfer == 0:
        return list_one, list_two

    # A private RandomState draws the same numbers as seeding the global
    # generator would, without clobbering the state of np.random for callers.
    rng = np.random.RandomState(random_state)
    chosen = set(rng.choice(len(list_one), nb_items_to_transfer, replace=False).tolist())

    # Partition by position rather than by value
    kept = [e for idx, e in enumerate(list_one) if idx not in chosen]
    moved = [e for idx, e in enumerate(list_one) if idx in chosen]

    return kept, list_two + moved

### Functions

In [10]:
def generate_train_test_split(ds, target_idx=-1, random_state=42, extension='arff'):
    """
    Split the original dataset into a train and a test file.

    The split is 80/20, stratified on the column at `target_idx`.

    Parameters
    ----------
    ds : str
        Name of the dataset.
    target_idx : int
        Position of the column used for stratification.
    random_state : int
        Seed passed to `train_test_split`.
    extension : str
        'arff' writes ARFF files with nominal values left unencoded; any
        other value writes header-less, index-less CSV files with nominal
        values encoded.
    """
    # Only non-ARFF output gets its nominal attributes encoded
    encode = extension != 'arff'

    # Input and output locations
    fn = original_filename(ds)
    fn_out = [
        filename_dataset(ds, step=1, suffix=part_name, extension=extension)
        for part_name in ('train', 'test')
    ]

    # Load and split
    df, af = arff_to_df(fn, encode_nominal=encode, return_af=True)
    parts = train_test_split(
        df, test_size=0.2, random_state=random_state, stratify=df.iloc[:, target_idx]
    )

    # Save train first, then test
    for part, path in zip(parts, fn_out):
        if encode:
            part.to_csv(path, index=False, header=False)
        else:
            # Reuse the ARFF metadata of the original file, swap in the rows
            af_part = af.copy()
            af_part['data'] = part.values
            with open(path, 'w') as f:
                arff.dump(af_part, f)

    return

def generate_queries(dataset, max_nb_queries=10, target_idx=-1, random_state=42):
    """
    Generate the default queries of a dataset and save their codes.

    The number of attributes is read from the ARFF test set of the dataset.
    At most `max_nb_queries` queries are generated, and never more than the
    number of non-target attributes.

    Parameters
    ----------
    dataset : str
        Name of the dataset.
    max_nb_queries : int
        Upper bound on the number of queries.
    target_idx : int
        Index of the target attribute (last attribute by default).
    random_state : int
        Seed used when choosing which attributes become missing.
    """
    # Derive parameters from the test set
    fn_test = filename_dataset(dataset, step=1, suffix='test')
    df_test, _ = arff_to_df(fn_test, encode_nominal=False, return_af=True)

    nb_atts = len(df_test.columns)
    nb_qry = min(nb_atts - 1, max_nb_queries)

    # Generate queries; the caller's target index is forwarded (it used to be
    # ignored in favour of a hard-coded -1)
    q_desc, q_targ, q_miss = generate_query(
        nb_atts, targ_idx=target_idx, nb_qry=nb_qry, random_state=random_state
    )

    q_codes = [
        query_to_code(q_desc[q_idx], q_targ[q_idx], q_miss[q_idx])
        for q_idx in range(nb_qry)
    ]
    q_codes = np.r_[q_codes]  # Convert to proper np.ndarray

    # Save
    fn_qry = filename_query(dataset, suffix="default")
    np.save(fn_qry, q_codes)

    return

def generate_query_testsets(dataset, extension='arff'):
    """
    Write one copy of the test set per saved query, with the query's
    missing attributes blanked out.

    Parameters
    ----------
    dataset : str
        Name of the dataset.
    extension : str
        'arff' reads and writes ARFF files; any other value reads and writes
        header-less CSV files.
    """
    # Locate and load the test set and the query codes
    fn_test = filename_dataset(dataset, step=1, suffix='test', extension=extension)
    fn_q_codes = filename_query(dataset, suffix="default")
    q_codes = np.load(fn_q_codes)

    as_arff = extension == 'arff'
    if as_arff:
        df_test, af_test = arff_to_df(fn_test, encode_nominal=False, return_af=True)
    else:
        df_test = pd.read_csv(fn_test, header=None, index_col=None)

    for nr, code in enumerate(q_codes):
        _, _, miss_ids = code_to_query(code)

        # Query files are numbered with three digits: q_000, q_001, ...
        suffix = 'q_{}'.format(str(nr).zfill(3))
        fn_qry = filename_dataset(dataset, step=2, suffix=suffix, extension=extension)

        # Blank out the missing attributes in a fresh copy of the test set
        masked = df_test.copy()
        masked.iloc[:, miss_ids] = np.nan

        if not as_arff:
            masked.to_csv(fn_qry, index=False, header=False)
            continue

        # Reuse the ARFF metadata of the test set, swap in the masked rows
        af_qry = af_test.copy()
        af_qry['data'] = masked.values
        with open(fn_qry, 'w') as f:
            arff.dump(af_qry, f)

    return

def check_nan(dataset):
    """
    Tell whether the original file of a dataset is free of NaN values.

    Prints the dataset name and, when NaN values are found, a warning.

    Parameters
    ----------
    dataset : str
        Name of the dataset.

    Returns
    -------
    bool
        True when the dataset contains no NaN values, False otherwise.
    """
    fn = original_filename(dataset)
    print(dataset)

    df, _ = arff_to_df(fn, encode_nominal=False, return_af=True)
    has_nan = df.isnull().any().any()

    if has_nan:
        msg = """
        Dataset: {} has NaN values from the start. We don't deal with this well in sklearn yet.
        """.format(dataset)
        print(msg)

    return not has_nan

## Actual Work

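Here I run the actual workflow: for each dataset, create the train/test split, generate the queries, and write the per-query test sets — first as ARFF, then as CSV.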

In [7]:
# Datasets to preprocess
datasets = [
    'glass',
    'credit-g',
    'ionosphere',
    'lymph',
    'vehicle',
    'iris',
    'splice',
    'sonar',
    'vowel',
    'segment',
    'zoo',
    'heart-statlog',
    'waveform-5000',
    'kr-vs-kp',
    'diabetes',
    'letter',
    'balance-scale',
]

print(len(datasets))

# ARFF pass: split, generate the queries, then write one test set per query
for ds in datasets:
    msg = """
    Starting dataset {}.
    """.format(ds)
    print(msg)

    generate_train_test_split(ds, random_state=RANDOM_STATE, extension='arff')
    generate_queries(ds, random_state=RANDOM_STATE)
    generate_query_testsets(ds, extension='arff')

print("Done")

17

    Starting dataset glass.
    

    Starting dataset credit-g.
    

    Starting dataset ionosphere.
    

    Starting dataset lymph.
    

    Starting dataset vehicle.
    

    Starting dataset iris.
    

    Starting dataset splice.
    

    Starting dataset sonar.
    

    Starting dataset vowel.
    

    Starting dataset segment.
    

    Starting dataset zoo.
    

    Starting dataset heart-statlog.
    

    Starting dataset waveform-5000.
    

    Starting dataset kr-vs-kp.
    

    Starting dataset diabetes.
    

    Starting dataset letter.
    

    Starting dataset balance-scale.
    
Done


In [11]:
# CSV pass: same three steps as the ARFF pass, written as CSV files
for ds in datasets:
    msg = """
    Starting csv dataset {}.
    """.format(ds)
    print(msg)

    generate_train_test_split(ds, random_state=RANDOM_STATE, extension='csv')
    generate_queries(ds, random_state=RANDOM_STATE)
    generate_query_testsets(ds, extension='csv')

print("Done")


    Starting csv dataset glass.
    

    Starting csv dataset credit-g.
    

    Starting csv dataset ionosphere.
    

    Starting csv dataset lymph.
    

    Starting csv dataset vehicle.
    

    Starting csv dataset iris.
    

    Starting csv dataset splice.
    

    Starting csv dataset sonar.
    

    Starting csv dataset vowel.
    

    Starting csv dataset segment.
    

    Starting csv dataset zoo.
    

    Starting csv dataset heart-statlog.
    

    Starting csv dataset waveform-5000.
    

    Starting csv dataset kr-vs-kp.
    

    Starting csv dataset diabetes.
    

    Starting csv dataset letter.
    

    Starting csv dataset balance-scale.
    
Done


## Future Work

Which datasets can we conceivably use? 

In [None]:
# Candidate datasets: every entry of the raw UCI folder (extension stripped),
# except 'breast-w', that passes the NaN check.
# os.listdir returns entries in arbitrary order, so sort for a reproducible list.
datasets = sorted(os.path.splitext(f)[0] for f in os.listdir('../data/raw/datasets-UCI/UCI/'))
datasets = [name for name in datasets if name not in {'breast-w'}]
datasets = [name for name in datasets if check_nan(name)]
datasets

In [None]:
# Number of datasets currently held in `datasets`
len(datasets)