In [1]:
from tasks.preprocessing import TaskTrainTestSplit, ProblemType, TaskPrepareXY
import d6tflow
from collections import Counter

# Set the d6tflow log level for this notebook to 'WARNING'; the trailing
# comment lists the other level names that can be used here.
# NOTE(review): presumably chosen to keep task-run logging out of the cell
# outputs below - confirm.
d6tflow.settings.log_level = 'WARNING' # 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'

Loading postgres module without psycopg2 installed. Will crash at runtime if postgres functionality is used.
Welcome to d6tflow!


In [2]:
def print_class_distribution(problem_type, input_path):
    """Print the label distribution of the train, train_dev and test targets.

    Builds a ``TaskTrainTestSplit`` for ``input_path`` / ``problem_type`` with
    over- and undersampling disabled, runs it through d6tflow, and prints, for
    each of the ``y_train``, ``y_train_dev`` and ``y_test`` outputs, the
    percentage of samples carrying each label.

    Every percentage is reported under the label value it actually belongs to
    (labels are listed in sorted order). The numbers therefore stay correctly
    attributed when label 1 is the majority class, and a split that holds only
    one class - or no samples at all - is reported instead of raising.

    Args:
        problem_type: Problem type forwarded to ``TaskTrainTestSplit``
            (a ``ProblemType`` member in this notebook).
        input_path: Value forwarded to the task as ``input_src_path``.

    Returns:
        None. The statistics are written to stdout.
    """
    t = TaskTrainTestSplit(
        input_src_path=input_path,
        problem_type=problem_type,
        oversampling_enabled=False,
        undersampling_enabled=False,
    )
    d6tflow.run(t)

    # Load all three target collections up front, before anything is printed.
    outputs = t.output()
    splits = [outputs[key].load() for key in ("y_train", "y_train_dev", "y_test")]

    print("\n",problem_type)
    print ("Statistics for Train, train_dev and test")
    for data in splits:
        counts = Counter(data)
        total = len(data)
        # Key each percentage by the real label value rather than by its rank
        # in most_common(): rank order silently swaps the two labels whenever
        # label 1 outnumbers label 0, and indexing rank [1] fails when only a
        # single class is present. An empty split has no labels, so the loop
        # body (and its division) is simply skipped.
        for label in sorted(counts):
            print(f"Label {label}: {counts[label] / total * 100}%")
        # Blank separator line after each split's statistics.
        print()

In [3]:
# Print the train / train_dev / test label percentages of
# "second_large_dataset" once per problem type.
for problem_type in (
    ProblemType.RETURN_NONE,
    ProblemType.CONDITION_COMPARISON_SIMPLE,
    ProblemType.CONDITION_COMPARISON,
):
    print_class_distribution(problem_type, "second_large_dataset")


 ProblemType.RETURN_NONE
Statistics for Train, train_dev and test
Label 0: 99.7855871515693%
Label 1: 0.21441284843069855%

Label 0: 99.78755111879357%
Label 1: 0.21244888120643102%

Label 0: 99.78333533439589%
Label 1: 0.21666466560411773%


 ProblemType.CONDITION_COMPARISON_SIMPLE
Statistics for Train, train_dev and test
Label 0: 97.32605508004417%
Label 1: 2.6739449199558267%

Label 0: 97.30596206459668%
Label 1: 2.694037935403316%

Label 0: 97.30963688727999%
Label 1: 2.690363112720008%


 ProblemType.CONDITION_COMPARISON
Statistics for Train, train_dev and test
Label 0: 95.10341073022659%
Label 1: 4.896589269773418%

Label 0: 95.12760680642955%
Label 1: 4.872393193570443%

Label 0: 95.13164394773285%
Label 1: 4.868356052267143%



In [4]:
def print_dataset_class_distribution(problem_type, src_path):
    """Print the label distribution of a whole prepared dataset.

    Builds a ``TaskPrepareXY`` for ``src_path`` / ``problem_type``, runs it
    through d6tflow, takes the second element returned by ``outputLoad()`` as
    the label collection, and prints the percentage of samples per label.

    Every percentage is reported under the label value it actually belongs to
    (labels are listed in sorted order). The numbers therefore stay correctly
    attributed when label 1 is the majority class, and a dataset that holds
    only one class - or no samples at all - is reported instead of raising.

    Args:
        problem_type: Problem type forwarded to ``TaskPrepareXY``
            (a ``ProblemType`` member in this notebook).
        src_path: Value forwarded to the task as ``input_src_path``.

    Returns:
        None. The statistics are written to stdout.
    """
    t = TaskPrepareXY(input_src_path=src_path, problem_type=problem_type)
    d6tflow.run(t)
    # Only the labels are needed here; the first loaded output is ignored.
    _, y = t.outputLoad()

    counts = Counter(y)
    total = len(y)

    print(problem_type)
    # Key each percentage by the real label value rather than by its rank in
    # most_common(): rank order silently swaps the two labels whenever label 1
    # outnumbers label 0, and indexing rank [1] fails when only a single class
    # is present. With no samples there are no labels, so nothing is divided.
    for label in sorted(counts):
        print(f"Label {label}: {counts[label] / total * 100}%")
    # Blank separator line after the statistics.
    print()


In [5]:
# Print the whole-dataset label percentages of "second_large_dataset" once per
# problem type.
for problem_type in (
    ProblemType.RETURN_NONE,
    ProblemType.CONDITION_COMPARISON_SIMPLE,
    ProblemType.CONDITION_COMPARISON,
):
    print_dataset_class_distribution(problem_type, "second_large_dataset")

ProblemType.RETURN_NONE
Label 0: 99.78533318449084%
Label 1: 0.21466681550916875%

ProblemType.CONDITION_COMPARISON_SIMPLE
Label 0: 97.3207621379038%
Label 1: 2.6792378620962043%

ProblemType.CONDITION_COMPARISON
Label 0: 95.1114769850448%
Label 1: 4.888523014955188%



In [6]:
# Print the whole-dataset label percentages of "validation" once per problem
# type.
for problem_type in (
    ProblemType.RETURN_NONE,
    ProblemType.CONDITION_COMPARISON_SIMPLE,
    ProblemType.CONDITION_COMPARISON,
):
    print_dataset_class_distribution(problem_type, "validation")

ProblemType.RETURN_NONE
Label 0: 99.78163439403347%
Label 1: 0.21836560596651672%

ProblemType.CONDITION_COMPARISON_SIMPLE
Label 0: 97.38235699993099%
Label 1: 2.6176430000689987%

ProblemType.CONDITION_COMPARISON
Label 0: 95.00942461250887%
Label 1: 4.990575387491139%



In [7]:
# Print the whole-dataset label percentages of "final_dataset" once per
# problem type.
for problem_type in (
    ProblemType.RETURN_NONE,
    ProblemType.CONDITION_COMPARISON_SIMPLE,
    ProblemType.CONDITION_COMPARISON,
):
    print_dataset_class_distribution(problem_type, "final_dataset")

ProblemType.RETURN_NONE
Label 0: 99.78535630062055%
Label 1: 0.21464369937946182%

ProblemType.CONDITION_COMPARISON_SIMPLE
Label 0: 97.35525733390531%
Label 1: 2.6447426660946767%

ProblemType.CONDITION_COMPARISON
Label 0: 95.13910212144158%
Label 1: 4.860897878558415%



In [8]:
# Print the whole-dataset label percentages of "final_validation" once per
# problem type.
for problem_type in (
    ProblemType.RETURN_NONE,
    ProblemType.CONDITION_COMPARISON_SIMPLE,
    ProblemType.CONDITION_COMPARISON,
):
    print_dataset_class_distribution(problem_type, "final_validation")

ProblemType.RETURN_NONE
Label 0: 99.78161427227163%
Label 1: 0.2183857277283813%

ProblemType.CONDITION_COMPARISON_SIMPLE
Label 0: 97.23273097382521%
Label 1: 2.767269026174793%

ProblemType.CONDITION_COMPARISON
Label 0: 94.89631862961612%
Label 1: 5.103681370383884%

