In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from rpart.DecisionTreeClassifier import DecisionTreeClassifier
from rpart.RandomForestClassifier import RandomForestClassifier

In [2]:
# Read the Adult dataset; the 'income' column is the prediction target and
# every remaining column is used as a feature.
adult = pd.read_csv('../data/adult.csv')
y = adult['income']
X = adult.drop(columns='income')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Shared accumulator: each benchmark cell appends one row per model.
results = []

In [3]:
## Decision Tree Models
# Each entry pairs a display name (also used as the row key in `results`)
# with an unfitted estimator from the project's `rpart` package.
models = [
    {'name': 'Decision Tree (Max Depth 5, Metrics Gini)', 'model': DecisionTreeClassifier(max_depth=5, metric='gini')},
    {'name': 'Decision Tree (Max Depth 5, Metrics Entropy)', 'model': DecisionTreeClassifier(max_depth=5, metric='entropy')},
]

for model in models:
    # Print the model name first so progress is visible while the fit runs.
    print(model['name'])
    clf = model['model']

    # Time the fit. perf_counter() is monotonic and high-resolution, so the
    # measured duration cannot be distorted by system clock adjustments the
    # way time.time() differences can.
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Time prediction on the held-out test set.
    start_time = time.perf_counter()
    y_pred = clf.predict(X_test)
    inference_time = time.perf_counter() - start_time

    # Evaluate the model against the true test labels.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Drop any row left by an earlier run of this cell for the same model so
    # that re-running the cell does not duplicate entries. The list is
    # updated in place so every reference to `results` stays valid.
    results[:] = [row for row in results if row[0] != model['name']]
    results.append([model['name'], training_time, inference_time, accuracy, report])

    # Print the headline numbers for this model.
    print(f'Training time: {training_time}')
    print(f'Inference time: {inference_time}')
    print(f'Accuracy: {accuracy}')
    print()


Decision Tree (Max Depth 5, Metrics Gini)
Training time: 370.3860499858856
Inference time: 0.5521750450134277
Accuracy: 0.853618589415498

Decision Tree (Max Depth 5, Metrics Entropy)
Training time: 366.383585691452
Inference time: 0.5480780601501465
Accuracy: 0.8512642030914116



In [None]:
# Tabulate the benchmark rows gathered so far, one row per model.
# Only rows already present in `results` when this cell runs are included,
# so re-run it after any later cell that appends more results.
df_results = pd.DataFrame(
    data=results,
    columns=[
        'Model Name',
        'Training Time',
        'Inference Time',
        'Accuracy',
        'Classification Report',
    ],
)

In [None]:
# Show the benchmark summary table as this cell's output.
df_results

In [None]:
# Persist the summary table to CSV, omitting the DataFrame index.
df_results.to_csv(path_or_buf='../data/results.csv', index=False)

In [None]:

## Random Forest Models
# Each entry pairs a display name (also used as the row key in `results`)
# with an unfitted estimator from the project's `rpart` package.
# NOTE(review): no seed is passed to the forests here; if rpart's
# RandomForestClassifier accepts one, set it so runs are reproducible — confirm.
models = [
    {'name': 'Random Forest (N Estimators 150, Max Depth 5, Metrics Gini)', 'model': RandomForestClassifier(n_estimators=150, max_depth=5, metric='gini')},
    {'name': 'Random Forest (N Estimators 150, Max Depth 5, Metrics Entropy)', 'model': RandomForestClassifier(n_estimators=150, max_depth=5, metric='entropy')},
]

for model in models:
    # Print the model name first so progress is visible while the fit runs.
    print(model['name'])
    clf = model['model']

    # Time the fit. perf_counter() is monotonic and high-resolution, so the
    # measured duration cannot be distorted by system clock adjustments the
    # way time.time() differences can.
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Time prediction on the held-out test set.
    start_time = time.perf_counter()
    y_pred = clf.predict(X_test)
    inference_time = time.perf_counter() - start_time

    # Evaluate the model against the true test labels.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Drop any row left by an earlier run of this cell for the same model so
    # that re-running the cell does not duplicate entries. The list is
    # updated in place so every reference to `results` stays valid.
    results[:] = [row for row in results if row[0] != model['name']]
    results.append([model['name'], training_time, inference_time, accuracy, report])

    # Print the headline numbers for this model.
    print(f'Training time: {training_time}')
    print(f'Inference time: {inference_time}')
    print(f'Accuracy: {accuracy}')
    print()
