In [1]:
from sklearn.datasets import load_iris, load_wine
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Ignore the warning messages from sklearn
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load both benchmark datasets as (features, labels) pairs
x_iris, y_iris = load_iris(return_X_y=True)
x_wine, y_wine = load_wine(return_X_y=True)

# (name, features, labels) triples iterated over by the experiments below
datasets = [
    ('iris', x_iris, y_iris),
    ('wine', x_wine, y_wine),
]

# One seed per model that gets trained in each experiment
random_seeds = [100, 500, 1000, 5000, 10000]

In [4]:
# Function to calculate COD
def cod(h1, h2, true):
    """Return the fraction of samples on which two prediction vectors disagree.

    The disagreements are counted in three mutually exclusive groups:
    only ``h1`` is correct, only ``h2`` is correct, or both are wrong but
    predict different labels.

    Parameters
    ----------
    h1, h2 : array-like of shape (n,)
        Predicted labels from the two models being compared.
    true : array-like of shape (n,)
        Ground-truth labels for the same samples.

    Returns
    -------
    float
        Value in [0, 1]; NaN when the inputs are empty (the score is
        undefined without samples).

    Raises
    ------
    AssertionError
        If the three inputs do not have the same length.
    """
    # Coerce to arrays so the comparisons below are element-wise; with plain
    # lists `h1 == true` would collapse to a single bool and `.sum()` would fail.
    h1, h2, true = np.asarray(h1), np.asarray(h2), np.asarray(true)
    assert len(h1) == len(h2) == len(true)

    n = len(true)
    if n == 0:
        return float('nan')

    differ = h1 != h2
    n10 = ((h1 == true) & differ).sum()                  # only h1 is right
    n01 = ((h2 == true) & differ).sum()                  # only h2 is right
    n00_d = ((h1 != true) & (h2 != true) & differ).sum()  # both wrong, differently

    return (n10 + n01 + n00_d) / n

### Part 1

In [5]:
dataset_results = []

# Seed for the outer train/test split and for the trees themselves. Fixing it
# makes a fresh "Restart & Run All" reproduce the same scores, and leaves the
# training subset as the only thing that differs between the compared models
# (sklearn's DecisionTreeClassifier is randomized unless random_state is set).
FIXED_SEED = 0

# Obtain results on each dataset
for dataset_name, x, y in datasets:
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.75, random_state=FIXED_SEED)

    # Train versions of DT on different training sets: each seed draws a
    # different 75% subset of the training data.
    trained_models, predictions = [], []

    for random_seed in random_seeds:
        x_train_new, _, y_train_new, _ = train_test_split(
            x_train, y_train, train_size=0.75, random_state=random_seed)
        dt = DecisionTreeClassifier(random_state=FIXED_SEED)
        dt.fit(x_train_new, y_train_new)

        trained_models.append(dt)
        # Predict once per model instead of once per pair
        predictions.append(dt.predict(x_test))

    # Get COD scores for each pair of trained DT models
    results = []

    for i in range(len(predictions) - 1):
        for j in range(i + 1, len(predictions)):
            result = cod(predictions[i], predictions[j], y_test)

            results.append((i, j, result))

    dataset_results.append((dataset_name, results))

In [6]:
# Print the pairwise COD scores, one group per dataset (model numbers are 1-based)
for name, pair_scores in dataset_results:
    print(f'Results on {name}:')

    for first, second, score in pair_scores:
        print(f'COD on models {first + 1} and {second + 1}: {score}')

    # Blank line between datasets
    print()

Results on iris:
COD on models 1 and 2: 0.05263157894736842
COD on models 1 and 3: 0.05263157894736842
COD on models 1 and 4: 0.05263157894736842
COD on models 1 and 5: 0.05263157894736842
COD on models 2 and 3: 0.0
COD on models 2 and 4: 0.0
COD on models 2 and 5: 0.0
COD on models 3 and 4: 0.0
COD on models 3 and 5: 0.0
COD on models 4 and 5: 0.0

Results on wine:
COD on models 1 and 2: 0.17777777777777778
COD on models 1 and 3: 0.1111111111111111
COD on models 1 and 4: 0.1111111111111111
COD on models 1 and 5: 0.08888888888888889
COD on models 2 and 3: 0.1111111111111111
COD on models 2 and 4: 0.1111111111111111
COD on models 2 and 5: 0.08888888888888889
COD on models 3 and 4: 0.0
COD on models 3 and 5: 0.022222222222222223
COD on models 4 and 5: 0.022222222222222223



### Part 2

In [7]:
# Similar approach as Part 1, but this time with 1 dataset and 2 different models.
# Every model is trained on the SAME training set; only its random_state differs,
# so the COD scores measure how much the algorithm's own randomness changes
# its predictions.

# Models we will use and final results (initialized to empty)
models, model_results = [('DT', DecisionTreeClassifier), ('MLP', MLPClassifier)], []

# Split the dataset (seeded so the notebook reproduces on a fresh kernel)
x_train, x_test, y_train, y_test = train_test_split(
    x_wine, y_wine, train_size=0.75, random_state=0)

# Obtain results for each model
for model_name, model_type in models:
    trained_models, predictions = [], []

    # Train versions of the model on the same training set, one per seed.
    # The seed is passed to the estimator so each run is reproducible.
    for random_seed in random_seeds:
        model = model_type(random_state=random_seed)
        model.fit(x_train, y_train)

        trained_models.append(model)
        # Predict once per model instead of once per pair
        predictions.append(model.predict(x_test))

    # Get COD scores for each pair of trained models
    results = []

    for i in range(len(predictions) - 1):
        for j in range(i + 1, len(predictions)):
            result = cod(predictions[i], predictions[j], y_test)

            results.append((i, j, result))

    model_results.append((model_name, results))

In [8]:
# Print the pairwise COD scores, one group per model type (model numbers are 1-based)
for name, pair_scores in model_results:
    print(f'Results for {name}:')

    for first, second, score in pair_scores:
        print(f'COD on models {first + 1} and {second + 1}: {score}')

    # Blank line between model types
    print()

Results for DT:
COD on models 1 and 2: 0.1111111111111111
COD on models 1 and 3: 0.0
COD on models 1 and 4: 0.1111111111111111
COD on models 1 and 5: 0.1111111111111111
COD on models 2 and 3: 0.1111111111111111
COD on models 2 and 4: 0.0
COD on models 2 and 5: 0.0
COD on models 3 and 4: 0.1111111111111111
COD on models 3 and 5: 0.1111111111111111
COD on models 4 and 5: 0.0

Results for MLP:
COD on models 1 and 2: 0.7555555555555555
COD on models 1 and 3: 0.1111111111111111
COD on models 1 and 4: 0.35555555555555557
COD on models 1 and 5: 0.7555555555555555
COD on models 2 and 3: 0.7777777777777778
COD on models 2 and 4: 0.8444444444444444
COD on models 2 and 5: 0.0
COD on models 3 and 4: 0.3333333333333333
COD on models 3 and 5: 0.7777777777777778
COD on models 4 and 5: 0.8444444444444444



### Discussion

For Part 1, I used a Decision Tree (DT) as the classification algorithm, the iris and wine datasets, and 5 different random seeds to draw the training subsets.  For each training subset, I fit a model and then used COD to compare every pair of models (as instructed).  From both the paper and intuition, COD reflects the probability that two models will make different predictions on the same input.  Looking at the results at the end of the code in Part 1, nearly all of the COD scores are low (at most about 0.18), meaning the predictions are stable (low variance) across training sets, which indicates that <b style="color:red;">DT likely has a good amount of bias</b> (i.e. regardless of the training set, DT tends to make the same predictions and errors). <br>

For Part 2, I used the wine dataset, with DT again as the non-stochastic algorithm and MLP as the stochastic algorithm.  This time, however, I fit multiple models on the same training set.  The output shows that the COD values for DT are low (every pair is either 0 or about 0.11), whereas the values for MLP are significantly higher (up to about 0.84).  This indicates that <b style="color:red;">DT probably has stronger bias than MLP and MLP has more variance</b> (which seems to make sense due to the more stochastic nature of MLP, combined with the results we saw for DT in Part 1).