In [1]:
import cv2
import pandas as pd
import numpy as np

def preprocessing(data, labels, val=True, image_shape=None):
    """Load a pixel CSV and a label CSV, optionally converting each row to an edge map.

    Parameters
    ----------
    data : str or path-like
        CSV file with one sample per row; read with ``pd.read_csv``.
    labels : str or path-like
        CSV file with the labels; read with ``pd.read_csv`` and returned as is.
    val : bool, default True
        When True, every row is cast to ``uint8``, run through
        ``cv2.Canny(img, 10, 30)`` followed by ``cv2.GaussianBlur(img, (5, 5), 0)``
        and flattened back into a single row. When False, the raw table is
        returned untouched.
    image_shape : tuple of int, optional
        If given, each row is reshaped to this shape before edge detection so
        the filters operate on the 2-D image; the result is flattened again,
        so the number of output columns is unchanged. When omitted, rows are
        handed to OpenCV as 1-D arrays exactly as before.
        NOTE(review): the rows are presumably flattened 2-D images -- confirm
        and pass ``image_shape`` from the call sites if so.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The (possibly edge-filtered) feature table and the label table.
    """
    data = pd.read_csv(data)
    labels = pd.read_csv(labels)

    if val:
        ## ---------------- Data preparation ---------------- ##
        # Cast the whole table once (C-contiguous, so each row is a contiguous
        # buffer) instead of doing a slow per-row ``iloc`` lookup and cast.
        pixels = np.ascontiguousarray(data.to_numpy(), dtype=np.uint8)

        edge_rows = []
        for img in pixels:
            if image_shape is not None:
                img = img.reshape(image_shape)
            edges = cv2.Canny(img, 10, 30)
            edges = cv2.GaussianBlur(edges, (5, 5), 0)
            # Flatten back to one feature vector per sample.
            edge_rows.append(edges.ravel())

        data = pd.DataFrame(edge_rows)
        ## -------------------------------------------------- ##

    return data, labels

In [2]:
# 1. Training set

x = 'data/x_train_gr_smpl.csv'
y = 'data/y_train_smpl.csv'

# Column positions that rank among the 10 most label-correlated features for
# at least one of the ten label files.
top_10_array = set()

# Preprocess once: the frame is never mutated below, so the same result serves
# both the feature ranking and the final export.
data, labels = preprocessing(data=x, labels=y)

for label_set in range(10):
    # First column of the label file (presumably one-vs-rest labels for class
    # `label_set` -- confirm); rows are matched to `data` by index.
    set_labels = pd.read_csv(f'data/y_train_smpl_{label_set}.csv').iloc[:, 0]

    # Absolute Pearson correlation of every feature column with these labels.
    # A zero-variance column has an undefined (NaN) correlation; drop those so
    # they cannot corrupt the ranking.
    abs_corr = data.corrwith(set_labels).abs().dropna()
    best_columns = abs_corr.nlargest(10).index

    # Record column positions, since the selection below indexes
    # `data.columns` positionally.
    top_10_array.update(data.columns.get_indexer(best_columns).tolist())

trainingData_top10 = data[data.columns[list(top_10_array)]].copy(deep=True)
trainingData_top10['label'] = labels
trainingData_top10['label'] = trainingData_top10['label'].map({0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine'})
# Fixed seed: the shuffle decides which rows land in the tail slices taken from
# this frame later on, so it must be reproducible across kernel restarts.
trainingData_top10 = trainingData_top10.sample(frac=1, random_state=42).reset_index(drop=True)
trainingData_top10.to_csv('DecisionTrees/src/main/resources/trainingData_top10.csv', encoding='utf-8', sep=',', index=False)

In [3]:
# 2. Testing set

x = 'data/x_test_gr_smpl.csv'
y = 'data/y_test_smpl.csv'

data, labels = preprocessing(data=x, labels=y)

# Keep exactly the feature columns selected on the training set
# (`top_10_array` holds column positions).
testingData_top10 = data[data.columns[list(top_10_array)]].copy(deep=True)
testingData_top10['label'] = labels
testingData_top10['label'] = testingData_top10['label'].map({0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine'})
# Fixed seed so the exported row order is identical on every run.
testingData_top10 = testingData_top10.sample(frac=1, random_state=42).reset_index(drop=True)
testingData_top10.to_csv('DecisionTrees/src/main/resources/testingData_top10.csv', encoding='utf-8', sep=',', index=False)

In [4]:
# Move the last 4000 rows of the shuffled training set into the test set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
testingData_top10_4000 = pd.concat([testingData_top10, trainingData_top10.iloc[-4000:]], ignore_index=True)
trainingData_top10_4000 = trainingData_top10.iloc[:-4000]
testingData_top10_4000.to_csv('DecisionTrees/src/main/resources/testingData_top10_4000.csv', encoding='utf-8', sep=',', index=False)
trainingData_top10_4000.to_csv('DecisionTrees/src/main/resources/trainingData_top10_4000.csv', encoding='utf-8', sep=',', index=False)

In [5]:
# Move the last 9000 rows of the shuffled training set into the test set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
testingData_top10_9000 = pd.concat([testingData_top10, trainingData_top10.iloc[-9000:]], ignore_index=True)
trainingData_top10_9000 = trainingData_top10.iloc[:-9000]
testingData_top10_9000.to_csv('DecisionTrees/src/main/resources/testingData_top10_9000.csv', encoding='utf-8', sep=',', index=False)
trainingData_top10_9000.to_csv('DecisionTrees/src/main/resources/trainingData_top10_9000.csv', encoding='utf-8', sep=',', index=False)