In [78]:
from time import perf_counter_ns
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [99]:
df = pd.read_csv("datasets/predict+students+dropout+and+academic+success/data.csv", delimiter=";")

In [100]:
classes = df.Target.unique().tolist()

In [101]:
df.Target = df.Target.apply(lambda x: classes.index(x))

In [102]:
X = df.drop("Target", axis=1)
y = df.Target

In [103]:
X[X.columns] = MinMaxScaler().fit_transform(X)

In [104]:
SEED = 42

In [105]:
def get_metrics(perf, time):
    result = {
        "precision": perf["macro avg"]["precision"],
        "recall": perf["macro avg"]["recall"],
        "f1-score": perf["macro avg"]["f1-score"],
        "accuracy": perf["accuracy"],
        "time (ms)": time
    }
    return pd.DataFrame([result])

def train_and_evaluate(estimator, X_train, y_train, X_test, y_test):
    train_start = perf_counter_ns()
    estimator.fit(X_train, y_train)
    train_end = perf_counter_ns()
    train_time = int((train_end - train_start)/1000000)
    
    test_start = perf_counter_ns()
    y_test_pred = estimator.predict(X_test)
    test_end = perf_counter_ns()
    test_time = int((test_end - test_start)/1000000)
    
    y_train_pred = estimator.predict(X_train)
    train_perf = classification_report(y_train, y_train_pred, target_names=classes, output_dict=True)
    test_perf = classification_report(y_test, y_test_pred, target_names=classes, output_dict=True)
    report = {
        "test": get_metrics(test_perf, test_time),
        "train": get_metrics(train_perf, train_time),
    }
    return report

In [106]:
def mean_performance(results):
    mean = results.mean(axis=0).values
    return pd.DataFrame([mean], columns=results.columns)

In [113]:
def train_single_model(X, y, Clf, estimator_params):
    train_results, test_results = pd.DataFrame(), pd.DataFrame()

    split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=SEED)
    for train_idx, test_idx in split.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        estimator = Clf(**estimator_params, random_state=SEED)
        report = train_and_evaluate(estimator, X_train, y_train, X_test, y_test)
        train_results = pd.concat([train_results, report["train"]], ignore_index=True)
        test_results = pd.concat([test_results, report["test"]], ignore_index=True)
    return train_results, test_results

In [114]:
Clf = DecisionTreeClassifier
clf_params = {}
train_res, test_res = train_single_model(X, y, Clf, clf_params)

In [115]:
mean_performance(test_res)

Unnamed: 0,precision,recall,f1-score,accuracy,time (ms)
0,0.617784,0.619724,0.618372,0.679443,1.2
