In [1]:
import json
import os

import pandas as pd
import xgboost as xgb

from classes import Model, Node, Tree, SplitType, ParamT

In [2]:
base_dir = "./models/"
fivedays_dir = base_dir + "5days/"
incremental_dir = base_dir + "incremental/"
daily_dir = base_dir + "daily/"

In [3]:
def load_model(path: str) -> Model:
    with open(path) as f:
        model_json = json.load(f)
    return Model(model_json)

In [4]:
def get_model_dict(model_dir: str) -> dict:
    model_dict = {}

    for model_file in os.listdir(model_dir):
        model = load_model(os.path.join(model_dir, model_file))

        model_dict[model_file] = model
    
    return model_dict
    

In [5]:
def get_number_duplicate_trees(model1: Model, model2: Model):
    num_duplicates = 0
    
    for tree1 in model1.trees:
        for tree2 in model2.trees:
            if tree1 == tree2:
                num_duplicates += 1
    
    return num_duplicates


In [6]:
def get_dir_duplicate_stats(model_dict: dict) -> pd.DataFrame:
    match_stats = []

    for name1, model1 in model_dict.items():
        for name2, model2 in model_dict.items():
            if name1 == name2: continue # Skip trivial case
            
            num_duplicates = get_number_duplicate_trees(model1, model2)
            
            match_stats.append({
                "Model1": name1,
                "Model2": name2,
                "NumDuplicates": num_duplicates
            })

    df = pd.DataFrame(match_stats)
    return df


In [7]:
fivedays_duplicate_stats = get_dir_duplicate_stats(get_model_dict(fivedays_dir))
daily_duplicate_stats = get_dir_duplicate_stats(get_model_dict(daily_dir))
incremental_duplicate_stats = get_dir_duplicate_stats(get_model_dict(incremental_dir))

In [8]:
display(fivedays_duplicate_stats)
fivedays_duplicate_stats.describe()

Unnamed: 0,Model1,Model2,NumDuplicates
0,xgboost_1000_trees_5days_2016-01-24_2016-01-28...,xgboost_1000_trees_5days_2016-01-27_2016-01-31...,0
1,xgboost_1000_trees_5days_2016-01-24_2016-01-28...,xgboost_1000_trees_5days_2016-01-14_2016-01-18...,0
2,xgboost_1000_trees_5days_2016-01-24_2016-01-28...,xgboost_1000_trees_5days_2016-01-02_2016-01-06...,0
3,xgboost_1000_trees_5days_2016-01-24_2016-01-28...,xgboost_1000_trees_5days_2016-01-19_2016-01-23...,0
4,xgboost_1000_trees_5days_2016-01-24_2016-01-28...,xgboost_1000_trees_5days_2016-01-05_2016-01-09...,0
...,...,...,...
697,xgboost_1000_trees_5days_2016-01-21_2016-01-25...,xgboost_1000_trees_5days_2016-01-12_2016-01-16...,0
698,xgboost_1000_trees_5days_2016-01-21_2016-01-25...,xgboost_1000_trees_5days_2016-01-06_2016-01-10...,0
699,xgboost_1000_trees_5days_2016-01-21_2016-01-25...,xgboost_1000_trees_5days_2016-01-25_2016-01-29...,0
700,xgboost_1000_trees_5days_2016-01-21_2016-01-25...,xgboost_1000_trees_5days_2016-01-07_2016-01-11...,0


Unnamed: 0,NumDuplicates
count,702.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [9]:
display(daily_duplicate_stats)
daily_duplicate_stats.describe()

Unnamed: 0,Model1,Model2,NumDuplicates
0,xgboost_1000_trees_daily_2016-01-29.json,xgboost_1000_trees_daily_2016-01-09.json,0
1,xgboost_1000_trees_daily_2016-01-29.json,xgboost_1000_trees_daily_2016-01-08.json,0
2,xgboost_1000_trees_daily_2016-01-29.json,xgboost_1000_trees_daily_2016-01-02.json,0
3,xgboost_1000_trees_daily_2016-01-29.json,xgboost_1000_trees_daily_2016-01-31.json,0
4,xgboost_1000_trees_daily_2016-01-29.json,xgboost_1000_trees_daily_2016-01-04.json,0
...,...,...,...
925,xgboost_1000_trees_daily_2016-01-13.json,xgboost_1000_trees_daily_2016-01-23.json,0
926,xgboost_1000_trees_daily_2016-01-13.json,xgboost_1000_trees_daily_2016-01-20.json,0
927,xgboost_1000_trees_daily_2016-01-13.json,xgboost_1000_trees_daily_2016-01-01.json,0
928,xgboost_1000_trees_daily_2016-01-13.json,xgboost_1000_trees_daily_2016-01-07.json,0


Unnamed: 0,NumDuplicates
count,930.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [10]:
display(incremental_duplicate_stats)
incremental_duplicate_stats.describe()

Unnamed: 0,Model1,Model2,NumDuplicates
0,xgboost_4_trees_incremental_2016-01-04.json,xgboost_9_trees_incremental_2016-01-09.json,4
1,xgboost_4_trees_incremental_2016-01-04.json,xgboost_16_trees_incremental_2016-01-16.json,4
2,xgboost_4_trees_incremental_2016-01-04.json,xgboost_20_trees_incremental_2016-01-20.json,4
3,xgboost_4_trees_incremental_2016-01-04.json,xgboost_19_trees_incremental_2016-01-19.json,4
4,xgboost_4_trees_incremental_2016-01-04.json,xgboost_5_trees_incremental_2016-01-05.json,4
...,...,...,...
925,xgboost_22_trees_incremental_2016-01-22.json,xgboost_26_trees_incremental_2016-01-26.json,22
926,xgboost_22_trees_incremental_2016-01-22.json,xgboost_8_trees_incremental_2016-01-08.json,8
927,xgboost_22_trees_incremental_2016-01-22.json,xgboost_10_trees_incremental_2016-01-10.json,10
928,xgboost_22_trees_incremental_2016-01-22.json,xgboost_25_trees_incremental_2016-01-25.json,22


Unnamed: 0,NumDuplicates
count,930.0
mean,10.666667
std,7.184083
min,1.0
25%,5.0
50%,9.0
75%,16.0
max,30.0
