In [13]:
import cv2
import pandas as pd
import numpy as np

def preprocessing(val=True):
    
    data = pd.read_csv('data/x_train_gr_smpl.csv')
    labels = pd.read_csv('data/y_train_smpl.csv')

    if(val):
        ## ---------------- Data preparation ---------------- ##
        X_train = []
        for i in range(data.shape[0]):
            img = np.uint8(data.iloc[i])
            edited = cv2.Canny(img, 10, 30)
            edited = cv2.GaussianBlur(edited, (5, 5), 0)
            X_train.append(edited.reshape((1,-1))[0])

        data = pd.DataFrame(X_train)
        ## -------------------------------------------------- ##

    return data, labels

In [14]:
# Generate reduced dataset to work with Weka

# data = pd.read_csv('data/x_train_gr_smpl.csv')
# labels = pd.read_csv('data/y_train_smpl.csv')
# data['label'] = labels

top_10_array = set()

data, _ = preprocessing()

for i in range(10):
    data = data.reindex(np.arange(data.shape[0]))
    labels = pd.read_csv(f'data/y_train_smpl_{i}.csv')
    data['label'] = labels
    data = data.sample(frac=1)
    
    corr_label = data.drop("label", axis=1).apply(lambda x: x.corr(data.label))
    corr_label = [(index, abs(corr_val), i) for index, corr_val in enumerate(corr_label)]
    corr_label = sorted(corr_label, key=lambda tup: tup[1], reverse=True)  # Order by correlation value
            
    for tup in corr_label[:10]:
        top_10_array.add(tup[0])
        
data, labels = preprocessing()
data_top_10 = data[data.columns[list(top_10_array)]].copy(deep=True)

In [15]:
data_top_10['label'] = labels
data_top_10['label'] = data_top_10['label'].map({0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine'})
data_top_10

Unnamed: 0,1025,1034,2063,2064,1039,1078,1082,1083,2111,2112,...,888,1415,1416,1417,936,1967,1464,985,986,label
0,96,64,0,0,16,80,64,16,0,0,...,16,0,0,0,0,0,0,16,64,zero
1,64,128,0,0,0,80,128,112,0,0,...,32,96,64,16,80,0,16,80,80,zero
2,32,64,0,0,64,80,80,96,0,0,...,64,112,64,32,80,64,64,96,64,zero
3,32,64,0,0,80,96,128,112,0,0,...,0,0,0,16,64,0,112,96,64,zero
4,64,64,64,96,32,80,16,0,64,96,...,0,0,16,64,0,64,0,96,64,zero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12655,0,16,0,0,16,16,32,64,0,0,...,16,96,64,16,16,0,16,96,64,nine
12656,0,64,0,0,0,64,96,64,0,0,...,96,16,0,0,64,0,0,32,64,nine
12657,0,64,0,0,0,96,64,96,0,0,...,64,64,16,0,96,0,0,64,32,nine
12658,0,96,0,0,0,64,96,64,0,0,...,96,16,0,0,64,0,0,16,64,nine


In [16]:
data = data_top_10.sample(frac=1).reset_index(drop=True)
data[:500].to_csv('data/reduce_dataset.csv', encoding='utf-8', sep=',', index=False)

In [17]:
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
import weka.plot.graph as graph

jvm.start(max_heap_size="2048m")

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/javabridge/jars/rhino-1.7R4.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/javabridge/jars/runnablequeue.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/javabridge/jars/cpython.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/weka/lib/python-weka-wrapper.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/weka/lib/weka.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/weka/lib/python-weka-wrapper.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/weka/lib/weka.jar', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/weka/lib/python-weka-wrapper.jar', '/Library/Frameworks/Python.framework/Versions/3.7/l

In [19]:
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file('data/reduce_dataset_500.arff')
data.class_is_last()   # set class attribute

In [20]:
cls = Classifier(
    classname='weka.classifiers.bayes.BayesNet',
    options=['-Q', 'weka.classifiers.bayes.net.search.local.K2', '--', '-P', '1', '-S', 'BAYES', '-E', 'weka.classifiers.bayes.net.estimate.SimpleEstimator', '--', '-A', '0.5']
)

In [21]:
cls.build_classifier(data)

JavaException: GC overhead limit exceeded

In [22]:
cls
#print(cls.graph)
#graph.plot_dot_graph(cls.graph)

In [23]:
jvm.stop()