# ARFF

For comparative experiments, we will use Weka to see how we can cope with missing values. The question is: does the MERCS approach survive better?

1. Get iris dataset in ARFF
2. Run Weka on this dataset
3. Run Weka on this dataset, from Python
4. Introduce missing values
5. Run same model again.

# Iris Dataset to ARFF

## Code

In [1]:
import numpy as np
import arff
from sklearn import datasets
import pandas as pd

Some helper functions to convert a pandas DataFrame into the dictionary that the `arff` library can dump.

In [24]:
def convert_dtype_to_arff_string(t):
    """
    Convert a Pandas dtype to an arff dtype.

    In arff, the dtypes are just strings, with one quirk: a nominal
    attribute is described by the list of its allowed values.

    Parameters
    ----------
    t : dtype
        A single column dtype, e.g. one entry of ``df.dtypes``.

    Returns
    -------
    str or list of str
        ``'NUMERIC'`` for numeric dtypes, ``['1', '0']`` for boolean dtypes
        and the stringified categories for categorical dtypes.

    Raises
    ------
    NotImplementedError
        If there is no conversion strategy for the dtype yet.
    """

    # Booleans are tested before numerics: pandas reports bool dtypes as
    # numeric as well, so the numeric test would otherwise swallow them.
    # NOTE: only the attribute declaration is produced here; the values of
    # the column are not re-encoded to '1'/'0' by this function.
    if pd.api.types.is_bool_dtype(t):
        return ['1', '0']

    # isinstance instead of the deprecated pd.api.types.is_categorical_dtype
    if isinstance(t, pd.api.types.CategoricalDtype):
        # Nominal values are declared as strings
        return [str(e) for e in t.categories.values]

    if pd.api.types.is_numeric_dtype(t):
        return 'NUMERIC'

    msg = """
        Did not yet implement strategy to cope with this dtype: {}
        """.format(t)
    # NotImplementedError is the exception; NotImplemented is a non-callable
    # constant and calling it would raise an unrelated TypeError.
    raise NotImplementedError(msg)
        
def compile_attribute_tuple_list(df):
    """
    Build the list of ``(column name, arff dtype)`` pairs for a DataFrame.

    Every column dtype of `df` is translated with
    `convert_dtype_to_arff_string`; the pairs follow the order of
    ``df.dtypes``.
    """
    dtype_by_column = dict(df.dtypes)

    return [
        (column, convert_dtype_to_arff_string(dtype))
        for column, dtype in dtype_by_column.items()
    ]

def df_to_arff(df, relation='DataFrame', description=None):
    """
    Pack a DataFrame into the dictionary that is handed to ``arff.dump``.

    Parameters
    ----------
    df : DataFrame
        Source of the data rows (``df.values``) and of the attributes.
    relation : str
        Relation name (i.e. table name).
    description : str, optional
        Free-text description; the key is left out when this is None.

    Returns
    -------
    dict
        Keys 'relation', optionally 'description', 'data' and 'attributes'.
    """
    assert isinstance(relation, str)
    assert description is None or isinstance(description, str)

    # Start with the relation name (i.e. table name)
    container = {'relation': relation}

    # The description is optional
    if description is not None:
        container['description'] = description

    # Data first, attributes second
    container.update(
        data=df.values,
        attributes=compile_attribute_tuple_list(df),
    )

    return container

## Basic Test

Now we do the work

In [3]:
# Load the iris dataset through sklearn.datasets
iris = datasets.load_iris()

# Feature matrix and the names of its columns
features = iris.get('data')
feature_names = iris.get('feature_names')

# Target column and the names that belong to the target values
target = iris.get('target')
target_names = iris.get('target_names')

In [4]:
# Throw everything in big matrix
# (np.c_ appends target as one extra column next to the features)
data = np.c_[features, target]
cols = [*feature_names, 'target']

# One DataFrame with named columns; note that target shows up as a float
# (0.0) in the preview of this frame
df = pd.DataFrame(data, columns=cols)

In [5]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


So that looks good to me. I think it is reasonable to require that people build a well-formed DataFrame first; after that, we only have to manage the transition from DataFrame to ARFF.

Let's try that transition

In [6]:
# Tryout: build the ARFF dictionary with the default relation name
arff_dict = df_to_arff(df)

And now we have to see if this actually results in a good arff file.

In [7]:
# Serialize the dictionary to ARFF text and print it for inspection
print(arff.dumps(arff_dict))

@RELATION DataFrame

@ATTRIBUTE "sepal length (cm)" NUMERIC
@ATTRIBUTE "sepal width (cm)" NUMERIC
@ATTRIBUTE "petal length (cm)" NUMERIC
@ATTRIBUTE "petal width (cm)" NUMERIC
@ATTRIBUTE target NUMERIC

@DATA
5.1,3.5,1.4,0.2,0.0
4.9,3.0,1.4,0.2,0.0
4.7,3.2,1.3,0.2,0.0
4.6,3.1,1.5,0.2,0.0
5.0,3.6,1.4,0.2,0.0
5.4,3.9,1.7,0.4,0.0
4.6,3.4,1.4,0.3,0.0
5.0,3.4,1.5,0.2,0.0
4.4,2.9,1.4,0.2,0.0
4.9,3.1,1.5,0.1,0.0
5.4,3.7,1.5,0.2,0.0
4.8,3.4,1.6,0.2,0.0
4.8,3.0,1.4,0.1,0.0
4.3,3.0,1.1,0.1,0.0
5.8,4.0,1.2,0.2,0.0
5.7,4.4,1.5,0.4,0.0
5.4,3.9,1.3,0.4,0.0
5.1,3.5,1.4,0.3,0.0
5.7,3.8,1.7,0.3,0.0
5.1,3.8,1.5,0.3,0.0
5.4,3.4,1.7,0.2,0.0
5.1,3.7,1.5,0.4,0.0
4.6,3.6,1.0,0.2,0.0
5.1,3.3,1.7,0.5,0.0
4.8,3.4,1.9,0.2,0.0
5.0,3.0,1.6,0.2,0.0
5.0,3.4,1.6,0.4,0.0
5.2,3.5,1.5,0.2,0.0
5.2,3.4,1.4,0.2,0.0
4.7,3.2,1.6,0.2,0.0
4.8,3.1,1.6,0.2,0.0
5.4,3.4,1.5,0.4,0.0
5.2,4.1,1.5,0.1,0.0
5.5,4.2,1.4,0.2,0.0
4.9,3.1,1.5,0.2,0.0
5.0,3.2,1.2,0.2,0.0
5.5,3.5,1.3,0.2,0.0
4.9,3.6,1.4,0.1,0.0
4.4,3.0,1.3,0.2,0.0
5.1,3.4,1.5,

## Introducing missing values

So that looks moderately OK. Let us see if it can take missing values.

In [8]:
# Overwrite the first cell (row 0, first column) with NaN: one missing value
df.iloc[0,0] = np.nan
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [9]:
# Tryout: same conversion as before, now on the DataFrame that holds a NaN
arff_dict = df_to_arff(df)
print(arff.dumps(arff_dict))

@RELATION DataFrame

@ATTRIBUTE "sepal length (cm)" NUMERIC
@ATTRIBUTE "sepal width (cm)" NUMERIC
@ATTRIBUTE "petal length (cm)" NUMERIC
@ATTRIBUTE "petal width (cm)" NUMERIC
@ATTRIBUTE target NUMERIC

@DATA
?,3.5,1.4,0.2,0.0
4.9,3.0,1.4,0.2,0.0
4.7,3.2,1.3,0.2,0.0
4.6,3.1,1.5,0.2,0.0
5.0,3.6,1.4,0.2,0.0
5.4,3.9,1.7,0.4,0.0
4.6,3.4,1.4,0.3,0.0
5.0,3.4,1.5,0.2,0.0
4.4,2.9,1.4,0.2,0.0
4.9,3.1,1.5,0.1,0.0
5.4,3.7,1.5,0.2,0.0
4.8,3.4,1.6,0.2,0.0
4.8,3.0,1.4,0.1,0.0
4.3,3.0,1.1,0.1,0.0
5.8,4.0,1.2,0.2,0.0
5.7,4.4,1.5,0.4,0.0
5.4,3.9,1.3,0.4,0.0
5.1,3.5,1.4,0.3,0.0
5.7,3.8,1.7,0.3,0.0
5.1,3.8,1.5,0.3,0.0
5.4,3.4,1.7,0.2,0.0
5.1,3.7,1.5,0.4,0.0
4.6,3.6,1.0,0.2,0.0
5.1,3.3,1.7,0.5,0.0
4.8,3.4,1.9,0.2,0.0
5.0,3.0,1.6,0.2,0.0
5.0,3.4,1.6,0.4,0.0
5.2,3.5,1.5,0.2,0.0
5.2,3.4,1.4,0.2,0.0
4.7,3.2,1.6,0.2,0.0
4.8,3.1,1.6,0.2,0.0
5.4,3.4,1.5,0.4,0.0
5.2,4.1,1.5,0.1,0.0
5.5,4.2,1.4,0.2,0.0
4.9,3.1,1.5,0.2,0.0
5.0,3.2,1.2,0.2,0.0
5.5,3.5,1.3,0.2,0.0
4.9,3.6,1.4,0.1,0.0
4.4,3.0,1.3,0.2,0.0
5.1,3.4,1.5,0.

## Save to disk

Amazing, that worked! So the library is actually capable, at least to some extent. The next step is to really generate the file on disk, which should be trivial.

In [13]:
# Rebuild the ARFF dictionary and write it to iris.arff in the working directory
arff_dict = df_to_arff(df)
with open('iris.arff', 'w') as f:
    arff.dump(arff_dict, f)

Let's try loading as well

In [11]:
# Read the file that was just written back in
with open('iris.arff', 'r') as f:
    iris_arff = arff.load(f)

In [12]:
# Only the 'data' entry is used here, so the columns get the default
# integer labels instead of the attribute names
df_new = pd.DataFrame(iris_arff['data'])
# The first cell should still be flagged as missing after the round trip
df_new.isna().head() # Alright, this works

Unnamed: 0,0,1,2,3,4
0,True,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


# Generate raw dataset and run Weka

In [28]:
# Get clean data
# (reload, because the earlier df had a NaN written into it)
iris = datasets.load_iris()

features = iris.get('data')
feature_names = iris.get('feature_names')

target = iris.get('target')
target_names = iris.get('target_names')

# To dataframe (user decides how this happens)
# Features and target are stacked column-wise into one matrix
data = np.c_[features, target]
cols = [*feature_names, 'target']

df = pd.DataFrame(data, columns=cols)



In [29]:
# Make target categorical, so that convert_dtype_to_arff_string declares it
# as a nominal attribute (list of categories) instead of NUMERIC.
# NOTE(review): the categories presumably are the float labels, which would
# give the nominal values '0.0', '1.0', '2.0' -- confirm in the output file.
df['target'] = df['target'].astype('category')

In [31]:


# To arff container
af = df_to_arff(df) # af as opposed to df

# I/O
# Relative path: written to ../data/raw/ as seen from the working directory
with open('../data/raw/iris.arff', 'w') as f:
    arff.dump(af, f)