In [11]:
%run ../include/util.ipynb

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline


# /Users/dduru/PythonProjects/data/Decision Trees/ad-dataset/ad.data
ad_data = read_csv_frame(path='/Users/dduru/PythonProjects/data/Decision Trees/ad-dataset/ad.data')

explanatory_variable_columns = list(ad_data.columns.values)
explanatory_variable_columns.remove(explanatory_variable_columns[len(explanatory_variable_columns) - 1])

response_variable_column = ad_data.iloc[:, -1]

y = [1 if e == 'ad.' else 0 for e in response_variable_column]
X = ad_data[list(explanatory_variable_columns)].copy()
X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)


pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}


grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)


best_parameters = grid_search.best_estimator_.get_params()
print('Best score: %.3f' % grid_search.best_score_)
print('Best parameters set:')

for param_name in sorted(parameters.keys()):
    print('t%s: %r' % (param_name, best_parameters[param_name]))


predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))



Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   13.2s finished
Best score: 0.884
Best parameters set:
tclf__max_depth: 155
tclf__min_samples_leaf: 1
tclf__min_samples_split: 3
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       706
           1       0.96      0.89      0.93       114

    accuracy                           0.98       820
   macro avg       0.97      0.94      0.96       820
weighted avg       0.98      0.98      0.98       820

