In [71]:
import json
import pickle 
import pandas 
import numpy as np
import scipy 

## Feature vectors


In [3]:
# Read the feature-cluster file; the parsed JSON is kept in `full_features`.
# presumably a mapping of cluster/group name -> list of feature names; confirm
# against the file contents.
with open("feature-clusters.json", mode="r") as clusters_file:
    full_features = json.loads(clusters_file.read())

In [5]:
# Flatten every group's features into one list, preserving group order and the
# order of features inside each group (duplicates, if any, are kept).
total_features = [
    feature
    for group in full_features
    for feature in full_features[group]
]

In [8]:
# Number of features collected across all groups (size of the full feature vector).
len(total_features)

53

## Reduced features


In [14]:
# Directories with the prediction-result JSON files of the three model families
# (o = original, m = mixed5050, e = everything).
oresults_path = '../predicted-results/original/nbins-10'
mresults_path = '../predicted-results/mixed5050/nbins-10'
eresults_path = '../predicted-results/everything/nbins-10'

# Directories with the pickled models and their pickled feature lists.
# NOTE(review): unlike the other paths, omodels_path has no 'nbins-10'
# component; confirm this is the intended directory layout.
omodels_path = '../models/original'
mmodels_path = '../models/mixed5050/nbins-10'
emodels_path = '../models/everything/nbins-10'

# Names of the test sets; they appear in the result file names as '_on_<name>'.
test_sets = ['5050', '2575', 'everything']

# Points available for the mixed5050 (mpoints) and everything (epoints) models;
# each point appears in the model/result file names as '<target>_<point>'.
# presumably training-set sizes, with 'all' meaning the full set; TODO confirm.
mpoints = [100, 200, 300, 500, 1000, 1500, 2000, 2500, 'all']
epoints = [100, 200, 300, 500, 1000, 1500, 2000, 2500, 4000, 6000, 'all']



In [50]:
# Load the original, mixed5050 and everything models (every point, not only the
# 2500 mark) together with their selected features and their prediction results
# on each test set.
#
# Resulting layout:
#   omodels[tset][target]          -> entry for the single original model
#   m5models[tset][target][point]  -> entry aggregated over the replicate sets
#   emodels[tset][target][point]   -> entry aggregated over the replicate sets
_TARGETS = ['COF', 'intercept']
_N_SETS = 5  # replicate training sets are stored in set_0 ... set_4


def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point this at model files produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _read_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path, 'r') as f:
        return json.load(f)


def _load_original(tset):
    """Load the original models and their results on test set ``tset``.

    Returns a dict ``{target: entry}`` where ``entry`` holds the unpickled
    ``model``, its ``features``, the parsed result ``data``, ``n_train``
    (length of ``model.oob_prediction_``) and the ``r_square`` value read from
    ``data[target]['r_square']``.
    """
    loaded = dict()
    for target in _TARGETS:
        model = _read_pickle(f'{omodels_path}/{target}.pickle')
        features = _read_pickle(f'{omodels_path}/{target}.ptxt')
        data = _read_json(f'{oresults_path}/{target}_on_{tset}.json')
        loaded[target] = {'model': model,
                          'features': features,
                          'data': data,
                          'n_train': len(model.oob_prediction_),
                          'r_square': data[target]['r_square']}
    return loaded


def _load_replicates(models_path, results_path, points, tset):
    """Load one model family that was trained on ``_N_SETS`` replicate sets.

    Parameters
    ----------
    models_path : str
        Directory containing ``set_<i>/<target>_<point>.pickle`` / ``.ptxt``.
    results_path : str
        Directory containing ``set_<i>/<target>_<point>_on_<tset>.json``.
    points : iterable
        Points (as used in the file names) to load.
    tset : str
        Name of the test set whose results are read.

    Returns
    -------
    dict
        ``{target: {point: entry}}``. In each entry ``features``, ``data`` and
        ``r_square`` are lists with one item per replicate set (in set order),
        while ``model`` and ``n_train`` come from ``set_0`` only.
    """
    loaded = dict()
    for point in points:
        for i in range(_N_SETS):
            for target in _TARGETS:
                stem = f'set_{i}/{target}_{point}'
                model = _read_pickle(f'{models_path}/{stem}.pickle')
                features = _read_pickle(f'{models_path}/{stem}.ptxt')
                data = _read_json(f'{results_path}/{stem}_on_{tset}.json')
                if i == 0:
                    # First replicate creates the entry; only its model object
                    # is retained.
                    loaded.setdefault(target, dict())[point] = {
                        'model': model,
                        'features': [features],
                        'data': [data],
                        'n_train': len(model.oob_prediction_),
                        'r_square': [data[target]['r_square']]}
                else:
                    entry = loaded[target][point]
                    entry['features'].append(features)
                    entry['data'].append(data)
                    entry['r_square'].append(data[target]['r_square'])
    return loaded


m5models = dict()
emodels = dict()
omodels = dict()
# NOTE(review): the model and feature pickles do not depend on the test set but
# are re-read for each one, so every entry owns independent objects.
for tset in test_sets:
    omodels[tset] = _load_original(tset)
    m5models[tset] = _load_replicates(mmodels_path, mresults_path, mpoints, tset)
    emodels[tset] = _load_replicates(emodels_path, eresults_path, epoints, tset)


In [76]:
# Original model: number of features kept for each target.
for target in ('COF', 'intercept'):
    original_entry = omodels["everything"][target]
    print(target, len(original_entry["features"]))


COF 41
intercept 41


In [64]:
# mixed5050 models: regroup the per-set feature lists as [point][target].
# The lists are read from the "everything" test-set entries.
mmodels_features = {
    point: {
        target: m5models["everything"][target][point]["features"]
        for target in ('COF', 'intercept')
    }
    for point in mpoints
}


In [73]:
# For every mixed5050 point, print per target: the feature count of each of the
# five sets, their mean, and their standard deviation rounded to 3 decimals.
for point in mpoints:
    print(point)
    for target in ("COF", "intercept"):
        per_set = mmodels_features[point][target]
        n_features = [len(per_set[i]) for i in range(5)]
        mean_count = np.average(n_features)
        std_count = round(np.std(n_features), 3)
        print(target, n_features, mean_count, std_count)
    print()

100
COF [41, 35, 40, 42, 39] 39.4 2.417
intercept [40, 43, 40, 37, 39] 39.8 1.939

200
COF [40, 40, 39, 38, 39] 39.2 0.748
intercept [38, 45, 42, 38, 37] 40.0 3.033

300
COF [41, 44, 38, 39, 40] 40.4 2.059
intercept [38, 41, 43, 40, 35] 39.4 2.728

500
COF [40, 41, 39, 39, 42] 40.2 1.166
intercept [37, 41, 39, 39, 38] 38.8 1.327

1000
COF [38, 39, 37, 39, 38] 38.2 0.748
intercept [36, 37, 40, 35, 37] 37.0 1.673

1500
COF [35, 38, 35, 35, 38] 36.2 1.47
intercept [35, 36, 35, 35, 37] 35.6 0.8

2000
COF [32, 34, 35, 33, 35] 33.8 1.166
intercept [33, 35, 35, 34, 36] 34.6 1.02

2500
COF [33, 33, 34, 32, 32] 32.8 0.748
intercept [33, 34, 34, 34, 36] 34.2 0.98

all
COF [32, 32, 32, 32, 32] 32.0 0.0
intercept [34, 34, 34, 34, 34] 34.0 0.0



In [88]:
# everything models: regroup the per-set feature lists as [point][target].
# The lists are read from the "everything" test-set entries.
emodels_features = {
    point: {
        target: emodels["everything"][target][point]["features"]
        for target in ('COF', 'intercept')
    }
    for point in epoints
}


In [89]:
# For every everything-model point, print per target: the feature count of each
# of the five sets, their mean, and their standard deviation rounded to 3 decimals.
for point in epoints:
    print(point)
    for target in ("COF", "intercept"):
        per_set = emodels_features[point][target]
        n_features = [len(per_set[i]) for i in range(5)]
        mean_count = np.average(n_features)
        std_count = round(np.std(n_features), 3)
        print(target, n_features, mean_count, std_count)
    print()

100
COF [41, 43, 40, 45, 39] 41.6 2.154
intercept [42, 44, 37, 39, 34] 39.2 3.544

200
COF [39, 38, 40, 38, 40] 39.0 0.894
intercept [41, 38, 37, 38, 34] 37.6 2.245

300
COF [40, 39, 39, 37, 40] 39.0 1.095
intercept [40, 39, 39, 37, 36] 38.2 1.47

500
COF [38, 38, 42, 40, 38] 39.2 1.6
intercept [40, 38, 37, 36, 38] 37.8 1.327

1000
COF [40, 37, 38, 38, 37] 38.0 1.095
intercept [38, 39, 36, 37, 37] 37.4 1.02

1500
COF [37, 37, 37, 36, 37] 36.8 0.4
intercept [38, 39, 34, 38, 37] 37.2 1.72

2000
COF [36, 36, 38, 35, 37] 36.4 1.02
intercept [39, 39, 34, 36, 37] 37.0 1.897

2500
COF [36, 37, 36, 37, 37] 36.6 0.49
intercept [38, 38, 34, 37, 36] 36.6 1.497

4000
COF [37, 37, 34, 37, 38] 36.6 1.356
intercept [38, 37, 34, 37, 36] 36.4 1.356

6000
COF [33, 36, 34, 36, 35] 34.8 1.166
intercept [36, 36, 34, 36, 35] 35.4 0.8

all
COF [34, 34, 34, 34, 34] 34.0 0.0
intercept [35, 35, 35, 35, 35] 35.0 0.0



In [92]:
# COF: collect the feature count of every COF model into one flat list.
target = "COF"

# original (single model)
COF_n_features = [len(omodels["everything"][target]["features"])]

# mix 50: five sets per point
COF_n_features.extend(
    len(mmodels_features[point][target][i])
    for point in mpoints
    for i in range(5)
)

# everything: five sets per point
COF_n_features.extend(
    len(emodels_features[point][target][i])
    for point in epoints
    for i in range(5)
)

In [96]:
# Spread of the feature counts over all COF models: range, mean, std.
cof_lo, cof_hi = min(COF_n_features), max(COF_n_features)
print(f"Range: {cof_lo} - {cof_hi}")
cof_mean, cof_std = np.average(COF_n_features), np.std(COF_n_features)
print(f"Average: {cof_mean}; std: {cof_std}")

Range: 32 - 45
Average: 37.24752475247525; std: 2.905801437118781


In [97]:
# Intercept: collect the feature count of every intercept model into one flat list.
target = "intercept"

# original (single model)
intercept_n_features = [len(omodels["everything"][target]["features"])]

# mix 50: five sets per point
intercept_n_features.extend(
    len(mmodels_features[point][target][i])
    for point in mpoints
    for i in range(5)
)

# everything: five sets per point
intercept_n_features.extend(
    len(emodels_features[point][target][i])
    for point in epoints
    for i in range(5)
)

In [98]:
# Spread of the feature counts over all intercept models: range, mean, std.
intercept_lo, intercept_hi = min(intercept_n_features), max(intercept_n_features)
print(f"Range: {intercept_lo} - {intercept_hi}")
intercept_mean, intercept_std = np.average(intercept_n_features), np.std(intercept_n_features)
print(f"Average: {intercept_mean}; std: {intercept_std}")

Range: 33 - 45
Average: 37.0990099009901; std: 2.531015334537574
