In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

In [15]:
# Train datasets
# Each CSV is read and split right away: the last column becomes the target,
# every other column is kept as the feature matrix ("dataset without classes").

# Original data
original_train = pd.read_csv('../datasets/covertype_train.csv')
data_original_train, target_original_train = (
    original_train.iloc[:, :-1],
    original_train.iloc[:, -1],
)

# Original data, "norm" file
original_norm_train = pd.read_csv('../datasets/covertype_norm_train.csv')
data_original_norm_train, target_original_norm_train = (
    original_norm_train.iloc[:, :-1],
    original_norm_train.iloc[:, -1],
)

# LDA data (read from the "*_raw" file)
lda_train = pd.read_csv('../datasets/covertype_lda_train_raw.csv')
data_lda_train, target_lda_train = (
    lda_train.iloc[:, :-1],
    lda_train.iloc[:, -1],
)

# LDA data, "norm" variant (read from the file without the "_raw" suffix)
lda_norm_train = pd.read_csv('../datasets/covertype_lda_train.csv')
data_lda_norm_train, target_lda_norm_train = (
    lda_norm_train.iloc[:, :-1],
    lda_norm_train.iloc[:, -1],
)

In [16]:
# Test datasets
# Each CSV is read and split right away: the last column becomes the target,
# every other column is kept as the feature matrix ("dataset without classes").

# Original data
original_test = pd.read_csv('../datasets/covertype_test.csv')
data_original_test, target_original_test = (
    original_test.iloc[:, :-1],
    original_test.iloc[:, -1],
)

# Original data, "norm" file
original_norm_test = pd.read_csv('../datasets/covertype_norm_test.csv')
data_original_norm_test, target_original_norm_test = (
    original_norm_test.iloc[:, :-1],
    original_norm_test.iloc[:, -1],
)

# LDA data (read from the "*_raw" file)
lda_test = pd.read_csv('../datasets/covertype_lda_test_raw.csv')
data_lda_test, target_lda_test = (
    lda_test.iloc[:, :-1],
    lda_test.iloc[:, -1],
)

# LDA data, "norm" variant (read from the file without the "_raw" suffix)
lda_norm_test = pd.read_csv('../datasets/covertype_lda_test.csv')
data_lda_norm_test, target_lda_norm_test = (
    lda_norm_test.iloc[:, :-1],
    lda_norm_test.iloc[:, -1],
)

In [17]:
def perform_decision_tree(train, test, target_col='cover_type', cv=10,
                          random_state=0, selection='test'):
    '''
    Cross-validates a decision tree on ``train`` and returns the best fold
    estimator together with its accuracy on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the feature columns plus the ``target_col`` column.
    target_col : str, default 'cover_type'
        Name of the column holding the class labels; every other column is
        used as a feature.
    cv : int, default 10
        Value forwarded to ``cross_validate`` as ``cv``.
    random_state : int, default 0
        Seed forwarded to ``DecisionTreeClassifier``.
    selection : {'test', 'cv'}, default 'test'
        How the returned estimator is chosen among the fold estimators:

        * ``'test'`` - highest accuracy on ``test`` (the original behaviour).
          NOTE: the returned accuracy is then a maximum taken over several
          candidates scored on that same test set, so it is an optimistic
          estimate.
        * ``'cv'`` - highest cross-validation fold score (``test_score`` as
          returned by ``cross_validate``); ``test`` is only used to report
          the accuracy of the chosen estimator.

    Returns
    -------
    list
        ``[accuracy_on_test, best_estimator]``.

    Raises
    ------
    ValueError
        If ``selection`` is neither ``'test'`` nor ``'cv'``.
    '''
    if selection not in ('test', 'cv'):
        raise ValueError(
            "selection must be 'test' or 'cv', got {!r}".format(selection)
        )

    # Split the datasets in data and target
    train_target = train[target_col]
    train_data   = train.loc[:, train.columns != target_col]
    test_target  = test[target_col]
    test_data    = test.loc[:, test.columns != target_col]

    d_tree = DecisionTreeClassifier(random_state=random_state)
    result = cross_validate(d_tree, train_data, train_target, cv=cv,
                            return_estimator=True)
    estimators = result['estimator']

    # Accuracy of every fold estimator on the test set.
    test_scores = [est.score(test_data, test_target) for est in estimators]

    # Scores used to rank the fold estimators against each other.
    ranking = result['test_score'] if selection == 'cv' else test_scores

    # max() keeps the first index on ties, like a strict ">" comparison loop.
    best_idx = max(range(len(estimators)), key=lambda i: ranking[i])

    return [test_scores[best_idx], estimators[best_idx]]

In [18]:
# Evaluate the decision tree once per dataset variant, always pairing a
# training frame with its matching test frame.
result_original = perform_decision_tree(
    train=original_train,
    test=original_test,
)
result_original_norm = perform_decision_tree(
    train=original_norm_train,
    test=original_norm_test,
)
result_lda = perform_decision_tree(
    train=lda_train,
    test=lda_test,
)
result_lda_norm = perform_decision_tree(
    train=lda_norm_train,
    test=lda_norm_test,
)

In [19]:
# Report the accuracy (first element of each result) for every dataset variant.
for report_label, report_result in (
    ("Original: ", result_original),
    ("Original norm: ", result_original_norm),
    ("LDA: ", result_lda),
    ("LDA norm: ", result_lda_norm),
):
    print(report_label, report_result[0])

Original:  0.8136439267886856
Original norm:  0.8136439267886856
LDA:  0.7680948419301165
LDA norm:  0.7680948419301165
