# In [1]:

def remove_missing_values(df):
    """
    Filter rows with missing value

    A row is discarded when it holds a real NaN/None in any column, or when
    any of the columns ``node_caps``, ``breast_quad`` or ``irradiat`` holds
    its textual missing-value marker ('?', '?' and 'NaN' respectively).

    Note: the NaN rows are dropped from the passed ``df`` in place, so the
    caller's frame is modified; the marker-based filtering is only visible
    in the returned frame.

    Returns the filtered DataFrame.
    """
    # Bind the alias actually used below (a plain `import pandas` leaves
    # `pd` undefined inside this function).
    import pandas as pd

    df.dropna(inplace=True)

    # Boolean mask, True for every row that must be discarded.
    m = pd.Series(False, index=df.index)
    # A single marker is enough to make the row incomplete, hence `|`:
    # requiring all three markers at once would keep rows that are
    # missing only one or two of these values.
    m |= (
        (df.node_caps == '?') |
        (df.breast_quad == '?') |
        (df.irradiat == 'NaN'))
    df = df[~m]
    return df


def encode_df(input_df):
    """
    Label encoding; categorical (str) variable to int

    Every value of ``input_df`` is cast to str, then each column is
    label-encoded to integer codes.

    Returns a tuple ``(df_encoded, lb)``:
    - ``df_encoded``: the integer-encoded copy of ``input_df``
    - ``lb``: the LabelEncoder used for the encoding

    Note: one single encoder is refit on each column in turn, so after the
    call ``lb`` only remembers the classes of the column it was fitted on
    last; ``lb.inverse_transform(encoded_array)`` therefore decodes that
    column only.
    """
    from sklearn.preprocessing import LabelEncoder
    lb = LabelEncoder()

    # Encode categorical data to int values.
    # Uses the `input_df` argument (the body previously read an unrelated
    # name `df`, i.e. a global or a NameError).
    df_encoded = (input_df
                  .astype(str)  # converts all values to str
                  .apply(lb.fit_transform))  # encodes each column to int

    return df_encoded, lb

# In [3]:
def generate_XOR_example(attr_dict_sample):
    """
    Build one random record labelled by the parity of its attributes.

    attr_dict_sample;
    dict containing attributes and attributes value range for sample dataset
    {arg_1 : [values], ...}

    returns a dict with one entry per attribute plus a 'Class' entry.
    'Class' is drawn first (0 or 1); then a number of attributes of matching
    parity is set to 1 (even count for Class 1, odd count for Class 0) and
    every other attribute gets a random value != 1 from its value range.

    Raises IndexError (from random.choice) when an attribute that is not set
    to 1 has no value other than 1 in its range, or when no attribute count
    of the required parity exists (empty attr_dict_sample with Class 0).
    """
    import random

    # Materialise the keys: random.sample() needs a sequence, and dict key
    # views are rejected with TypeError since Python 3.11.
    attributes = list(attr_dict_sample)
    record = {k: None for k in attributes}
    record['Class'] = random.choice([0, 1])

    # NOTE(review): classic XOR/parity outputs 1 for an ODD number of 1s;
    # here Class 1 is paired with an EVEN count - confirm this is intended.
    wanted_parity = 0 if record['Class'] == 1 else 1
    attr_pos_count = random.choice(
        [n for n in range(len(attributes) + 1) if n % 2 == wanted_parity])
    attr_pos = set(random.sample(attributes, attr_pos_count))

    for a in attributes:
        if a in attr_pos:
            record[a] = 1
        else:
            # Any value except 1, so the count of 1s stays as drawn above.
            possible_values = [v for v in attr_dict_sample[a] if v != 1]
            record[a] = random.choice(possible_values)

    return record


def generate_XOR_dataset(n_examples,
                         attr_dict,
                         n_attributes,
                         no_duplicate=True):
    """
    Generates a list of examples labelled as per the XOR function,
    using attributes defined in attr_dict

    n_examples; number of examples requested
    attr_dict; {attribute : [values], ...}, a 'Class' key is ignored
    n_attributes; number of attributes randomly picked from attr_dict
                  (capped at the number of available attributes)
    no_duplicate; when True every example is unique and n_examples is
                  capped at the number of distinct examples that exist

    returns a pandas DataFrame with one row per generated example
    """
    import random
    import pandas as pd

    attributes = [a for a in attr_dict if a != 'Class']

    attributes_sample = set(random.sample(attributes,
                                          min(n_attributes, len(attributes))))
    # Keep attr_dict's own key order for the sampled attributes.
    attr_dict_sample = {
        k: v
        for k, v in attr_dict.items()
        if k in attributes_sample
    }

    dataset = []
    seen = set()  # hashable fingerprints of the examples already kept
    if no_duplicate:
        # Number of distinct records that can be generated: an attribute
        # takes either 1 or one of its values other than 1, so it has
        # len(set(values) | {1}) distinct outcomes. Plain Python ints are
        # used so that a large space cannot overflow, and duplicated values
        # are not over-counted (an over-estimate would make the loop below
        # never terminate).
        example_space_size = 1
        for values in attr_dict_sample.values():
            example_space_size *= len(set(values) | {1})
        n_examples = min(n_examples, example_space_size)

        print("Space size:", example_space_size)

    while len(dataset) < n_examples:
        new_example = generate_XOR_example(attr_dict_sample)
        if no_duplicate:
            # Records are always built with the same key order, so the
            # item tuple identifies a record; set lookup replaces the
            # linear scan of the dataset.
            fingerprint = tuple(new_example.items())
            if fingerprint in seen:
                continue
            seen.add(fingerprint)
        dataset.append(new_example)

    dataset_df = pd.DataFrame(dataset)
    print("Dataset size:", len(dataset))
    return dataset_df