In [3]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
import time

In [4]:
def measure_decision_tree_grid_search(path_train, path_test, criterion = 'gini', max_depth = None, random_state = None):
  """Fit one decision tree on a train CSV and score it on a test CSV.

  Despite the name, no search is performed here: exactly one
  (criterion, max_depth) configuration is trained and evaluated per call.

  Parameters
  ----------
  path_train, path_test : path or file-like accepted by ``pd.read_csv``
      CSV files that each contain an ``AT_RISK`` column (used as the label);
      every other column is used as a feature.
  criterion : str, default 'gini'
      Forwarded to ``DecisionTreeClassifier``.
  max_depth : int or None, default None
      Forwarded to ``DecisionTreeClassifier``.
  random_state : int, RandomState or None, default None
      Forwarded to ``DecisionTreeClassifier``. Pass an int to make repeated
      runs reproducible; ``None`` keeps the original unseeded behaviour.

  Returns
  -------
  tuple
      ``(accuracy, precision, recall, f1, training_time)`` where the metrics
      are computed on the test set with the scikit-learn scorers' default
      arguments, and ``training_time`` is the duration of ``fit`` in seconds.
  """
  target = 'AT_RISK'

  data_train = pd.read_csv(path_train)
  data_test = pd.read_csv(path_test)

  # Features are every column except the label.
  x_train = data_train.loc[:, data_train.columns != target]
  y_train = data_train[target]
  x_test = data_test.loc[:, data_test.columns != target]
  y_test = data_test[target]

  clf = tree.DecisionTreeClassifier(criterion = criterion, max_depth = max_depth, random_state = random_state)

  # perf_counter is monotonic and high-resolution, so the measured interval
  # cannot be distorted by system clock adjustments (unlike time.time()).
  train_bg = time.perf_counter()
  clf = clf.fit(x_train, y_train)
  end_bg = time.perf_counter()
  training_time = end_bg - train_bg

  prediction = clf.predict(x_test)
  acc = accuracy_score(y_test, prediction)
  precision = precision_score(y_test, prediction)
  recall = recall_score(y_test, prediction)
  f1 = f1_score(y_test, prediction)

  return acc, precision, recall, f1, training_time

In [6]:
# Root folder holding the three train/test splits (lan1..lan3).
DATA_DIR = "/content/drive/MyDrive/Self/Covid 19 Risk Prediction"

train_1 = f"{DATA_DIR}/lan1/train.csv"
test_1 = f"{DATA_DIR}/lan1/test.csv"

train_2 = f"{DATA_DIR}/lan2/train.csv"
test_2 = f"{DATA_DIR}/lan2/test.csv"

train_3 = f"{DATA_DIR}/lan3/train.csv"
test_3 = f"{DATA_DIR}/lan3/test.csv"

# One (train, test) pair per split; add a pair here to include another split.
folds = [(train_1, test_1), (train_2, test_2), (train_3, test_3)]

# Evaluate every (criterion, max_depth) combination on all splits and report
# the per-metric mean across splits.
for criterion in ['gini', 'entropy']:
  for max_depth in [None, 15, 20, 25]:
    # Each entry is (acc, precision, recall, f1, training_time) for one split.
    fold_results = [
      measure_decision_tree_grid_search(path_train, path_test, criterion, max_depth)
      for path_train, path_test in folds
    ]

    # zip(*...) regroups the per-split tuples into one sequence per metric.
    mean_acc, mean_precision, mean_recall, mean_f1, mean_training = [
      sum(values) / len(values) for values in zip(*fold_results)
    ]

    print("----------------------------------------------------------")
    print(f"criterion = {criterion}, max_depth = {max_depth}")
    print("Mean accuracy: ", mean_acc)
    print("Mean precision: ", mean_precision)
    print("Mean recall: ", mean_recall)
    print("Mean f1: ", mean_f1)
    print("Mean traning time: ", mean_training)

----------------------------------------------------------
criterion = gini, max_depth = 15
Mean accuracy:  0.9191199653831017
Mean precision:  0.6446715380660147
Mean recall:  0.5253126187202111
Mean f1:  0.5788917613390261
Mean traning time:  2.721557378768921
----------------------------------------------------------
criterion = gini, max_depth = 20
Mean accuracy:  0.9199951727691996
Mean precision:  0.6364663658461057
Mean recall:  0.5689547227794273
Mean f1:  0.6008176814561151
Mean traning time:  3.237191677093506
----------------------------------------------------------
criterion = gini, max_depth = 25
Mean accuracy:  0.9186546377293751
Mean precision:  0.6283259640182829
Mean recall:  0.5663130731779238
Mean f1:  0.5957084957156062
Mean traning time:  3.4674110412597656
----------------------------------------------------------
criterion = gini, max_depth = None
Mean accuracy:  0.9181566936513216
Mean precision:  0.6262734882134756
Mean recall:  0.5620807777570808
Mean f1:  0.