In [1]:
import json
import matplotlib
from matplotlib import pyplot as plt
import pickle
import sklearn
import numpy as np
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import statistics
%matplotlib inline

In [2]:
# --- Notebook configuration -------------------------------------------------
# Path prefixes are one-letter coded by training regime:
#   o = original, m = mixed5050, e = everything.

# Directories containing the pickled models.
omodels_path = "../models/original"
mmodels_path = "../models/mixed5050/nbins-10"
emodels_path = "../models/everything/nbins-10"

# Directories containing the JSON prediction results of those models.
oresults_path = "../predicted-results/original/nbins-10"
mresults_path = "../predicted-results/mixed5050/nbins-10"
eresults_path = "../predicted-results/everything/nbins-10"

# Names of the test sets each model was evaluated on.
test_sets = ["5050", "2575", "everything"]

# Training-set size labels used in the model/result file names
# ('all' is a literal label, the remaining entries are integers).
mpoints = [
    100, 200, 300, 500, 1000, 2000, 2500,
    "all",
]
epoints = [
    100, 200, 300, 500, 1000, 2000, 2500, 4000, 6000,
    "all",
]


In [3]:
# Load every model family together with its predictions on each test set.
#
# Layout produced by this cell:
#   omodels[tset][target]          -> entry for the single original model
#   m5models[tset][target][point]  -> entry for the mixed5050 models at `point`
#   emodels[tset][target][point]   -> entry for the everything models at `point`
#
# Every entry has the keys 'model', 'features', 'data', 'n_train', 'r_square'.
# For the original models 'data' and 'r_square' are single values; for the
# other two families they are lists with one item per replica (set_0..set_4),
# while 'model', 'features' and 'n_train' come from set_0 only.
#
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# point these paths at model files you trust.
targets = ('COF', 'intercept')
n_sets = 5  # number of set_<i> replica directories per model family

m5models = dict()
emodels = dict()
omodels = dict()

for tset in test_sets:
    m5models[tset] = dict()
    emodels[tset] = dict()
    omodels[tset] = dict()

    # First load the original models (one model per target, no replicas).
    for target in targets:
        with open(f'{omodels_path}/{target}.pickle', 'rb') as f:
            model = pickle.load(f)
        # The feature list lives in the companion .ptxt file, exactly as for
        # the mixed5050/everything models below.  This previously re-read the
        # .pickle file, which made 'features' a second copy of the model.
        # NOTE(review): assumes '{target}.ptxt' exists next to the original
        # model pickle -- confirm against the models directory.
        with open(f'{omodels_path}/{target}.ptxt', 'rb') as f:
            features = pickle.load(f)
        with open(f'{oresults_path}/{target}_on_{tset}.json', 'r') as f:
            data = json.load(f)
        omodels[tset][target] = {'model': model,
                                 'features': features,
                                 'data': data,
                                 # scikit-learn's oob_prediction_ holds one
                                 # value per training sample.
                                 'n_train': len(model.oob_prediction_),
                                 'r_square': data[target]['r_square']}

    # Then load the mixed5050 models and finally the everything models.
    # Both families share the same on-disk layout, so one loop serves both.
    for store, models_path, results_path, points in (
            (m5models, mmodels_path, mresults_path, mpoints),
            (emodels, emodels_path, eresults_path, epoints)):
        for point in points:
            for i in range(n_sets):
                for target in targets:
                    with open(f'{results_path}/set_{i}/{target}_{point}_on_{tset}.json', 'r') as f:
                        data = json.load(f)

                    if i == 0:
                        # Only the set_0 model and feature list are stored, so
                        # the (potentially large) pickles of the remaining
                        # replicas are not loaded at all.
                        with open(f'{models_path}/set_{i}/{target}_{point}.pickle', 'rb') as f:
                            model = pickle.load(f)
                        with open(f'{models_path}/set_{i}/{target}_{point}.ptxt', 'rb') as f:
                            features = pickle.load(f)
                        store[tset].setdefault(target, dict())[point] = {
                            'model': model,
                            'features': features,
                            'data': [data],
                            'n_train': len(model.oob_prediction_),
                            'r_square': [data[target]['r_square']]}
                    else:
                        # Later replicas only contribute results and scores.
                        entry = store[tset][target][point]
                        entry['data'].append(data)
                        entry['r_square'].append(data[target]['r_square'])


In [4]:
def _summarize_ensemble(entry, target):
    """Add per-sample ensemble statistics to one model entry, in place.

    Parameters
    ----------
    entry : dict
        Model entry whose ``'data'`` list holds one loaded results dict per
        replica.  Each results dict is indexed as
        ``results[target][sample_key][f'predicted-{target}']`` and
        ``...[f'simulated-{target}']``; the ``'r_square'`` key under
        ``results[target]`` is skipped.
    target : str
        Name of the predicted quantity (used both as a dict key and inside
        the ``predicted-``/``simulated-`` field names).

    The following keys are (re)created on ``entry``, each a dict keyed by
    sample key:

    * ``'ave_data'``  - mean of the replicas' predictions
    * ``'std_data'``  - sample standard deviation of the replicas' predictions
    * ``'ratio-to-simulated'`` - mean prediction divided by the simulated value
    * ``'error'``     - mean prediction minus the simulated value

    Raises
    ------
    statistics.StatisticsError
        If fewer than two replicas are present (``statistics.stdev``).
    ZeroDivisionError
        If a simulated value is zero.
    """
    runs = [results[target] for results in entry['data']]

    entry['ave_data'] = dict()
    entry['std_data'] = dict()
    entry['ratio-to-simulated'] = dict()
    entry['error'] = dict()

    # The sample keys are taken from the first replica.
    for key in runs[0]:
        if key == 'r_square':
            continue
        predictions = [run[key][f'predicted-{target}'] for run in runs]
        entry['std_data'][key] = statistics.stdev(predictions)
        mean_prediction = statistics.mean(predictions)
        entry['ave_data'][key] = mean_prediction

        # The simulated reference is read from the last replica (the value
        # the previous code picked up through its leftover loop index).
        # NOTE(review): presumably identical across replicas because they are
        # all evaluated on the same test set -- confirm.
        simulated = runs[-1][key][f'simulated-{target}']
        entry['ratio-to-simulated'][key] = mean_prediction / simulated
        entry['error'][key] = mean_prediction - simulated


targets = ('COF', 'intercept')

for tset in test_sets:
    # Replicated model families: aggregate over the replicas of every point.
    for store, points in ((m5models, mpoints), (emodels, epoints)):
        for point in points:
            for target in targets:
                _summarize_ensemble(store[tset][target][point], target)

    # The original models have a single prediction per sample, so only the
    # ratio and the error against the simulated value are computed.
    for target in targets:
        entry = omodels[tset][target]
        entry['ratio-to-simulated'] = dict()
        entry['error'] = dict()
        for key, sample in entry['data'][target].items():
            if key == 'r_square':
                continue
            predicted = sample[f'predicted-{target}']
            simulated = sample[f'simulated-{target}']
            entry['ratio-to-simulated'][key] = predicted / simulated
            entry['error'][key] = predicted - simulated