# Online Local Adaptive Model - Notebook 4


* Prior Probability Shift is one of the common problems encountered in Machine Learning algorithms.   
* There are some approaches for dealing with this problem in a 'static' scenario. But there are situations in which we need a model which deals with sequential data as input (e.g. a server which gets input from different users, with different data distributions).   
* In this project, we try to build a model which self adapts its predictions based on the local label distribution. 

### About notebook 4

In this notebook we build a testing framework: a prediction data flow that simulates an online testing scenario. We then analyze whether the models trained in Notebook3 are able to adapt to the local label distribution.

git log --pretty=tformat:'%h %an %ci' --numstat

## Notebook setup and data preparation


### Notebook setup

In [None]:
from IPython.core.display import display, HTML
from IPython.display import Image
display(HTML("<style>.container { width:100% !important; }</style>"))
% matplotlib inline
# % matplotlib notebook

# %matplotlib qt
% load_ext autoreload
% autoreload 2


### Imports

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation
plt.rcParams['animation.ffmpeg_path'] = 'C:/ffmpeg/bin/ffmpeg.exe'  # used for animation

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

import os
import pickle
import time

# os.chdir(r'C:\Users\diaco\Desktop\ML\Licenta\finalproject\mnist')
from dataset import MNISTDataset, Dataset, CIFAR10Dataset
import utils
from lenet5 import Lenet5
from lenet5_with_distr import Lenet5WithDistr
import PIL.Image

# numpy print options
np.set_printoptions(linewidth=150)
np.set_printoptions(edgeitems=10)
np.set_printoptions(precision=3)
pd.set_option('display.precision', 3)


### Set seed

In [None]:
# Set the class-level seed attribute of the project's Dataset class.
# NOTE(review): presumably used by Dataset for reproducible shuffling/sampling - confirm in dataset.py
Dataset.seed = 112358

### Import MNIST dataset

In [None]:
# Relative paths of the four MNIST files (train/test images and labels)
MNIST_TRAIN_IMG_PATH = 'MNIST_dataset/train-images.idx3-ubyte'
MNIST_TRAIN_LABELS_PATH = 'MNIST_dataset/train-labels.idx1-ubyte'
MNIST_TEST_IMG_PATH = 'MNIST_dataset/t10k-images.idx3-ubyte'
MNIST_TEST_LABELS_PATH = 'MNIST_dataset/t10k-labels.idx1-ubyte'

# Load MNIST through the project's MNISTDataset wrapper
mnist_ds = MNISTDataset(MNIST_TRAIN_IMG_PATH, MNIST_TRAIN_LABELS_PATH, MNIST_TEST_IMG_PATH, MNIST_TEST_LABELS_PATH)


### Import CIFAR10 dataset

In [None]:
# Directory holding the CIFAR-10 batches, relative to the working directory
CIFAR10_DATASET_DIR = './cifar10_dataset/cifar-10-batches-py'
# Load CIFAR-10 through the project's CIFAR10Dataset wrapper and print its summary attribute
cifar10_ds = CIFAR10Dataset(CIFAR10_DATASET_DIR)
print(cifar10_ds.summary)

In [None]:
def plot_results_distributions(test_model, test_ds, train_distr):
    """Evaluate `test_model` on `test_ds` and plot six label-distribution panels.

    Panels, left to right: the training label distribution, the test label
    distribution, histograms of the correctly predicted labels, of the wrongly
    predicted labels and of the actual labels of the misclassified examples,
    and the average softmax output over the misclassified examples.
    The figure title reports the test accuracy.

    Args:
        test_model: object whose `test_data(dataset, use_only_one_batch=True)`
            returns the 6-tuple (loss, accuracy, predicted labels,
            actual labels, wrongly predicted images, softmax outputs).
        test_ds: dataset to evaluate; its `num_examples` and `label_distr`
            attributes are read.
        train_distr: per-class frequencies the model was trained with; its
            length defines the number of classes shown on every x axis.
    """
    (_test_loss, test_acc, total_predict, total_actual,
     wrong_predict_images, total_softmax_output_probs) = test_model.test_data(test_ds, use_only_one_batch=True)

    num_classes = len(train_distr)
    class_ids = range(num_classes)
    label_bins = np.arange(num_classes + 1)  # one unit-wide bin per class label

    is_correct = total_actual == total_predict
    is_wrong = total_actual != total_predict
    correct_predict = total_predict[is_correct]
    wrong_predict = total_predict[is_wrong]
    wrong_actual = total_actual[is_wrong]
    wrong_predict_softmax_output_probs = total_softmax_output_probs[is_wrong]

    # With no misclassified example the average would be NaN (mean of an empty
    # slice); show an empty bar chart instead.
    if len(wrong_predict_softmax_output_probs) > 0:
        avg_wrong_softmax = np.average(wrong_predict_softmax_output_probs, axis=0)
    else:
        avg_wrong_softmax = np.zeros(num_classes)

    panels = [
        ('bar', train_distr, 'train label distr'),
        ('bar', test_ds.label_distr, 'test label distr'),
        ('hist', correct_predict, 'correct predicted label distr'),
        ('hist', wrong_predict, 'wrong predicted label distr'),
        ('hist', wrong_actual, 'wrong actual label distr'),
        ('bar', avg_wrong_softmax, 'wrong predicted: avg. of softmax output probs.'),
    ]

    num_wrong = len(wrong_predict_images)
    fig = plt.figure(figsize=(30, 3))
    fig.suptitle(y = 1.1, t = 'test_acc = {:.1f}% ({}/{})'.format(test_acc * 100, test_ds.num_examples - num_wrong, test_ds.num_examples), fontsize=18, fontweight='bold')

    for position, (kind, values, title) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        if kind == 'bar':
            plt.bar(class_ids, values)
        else:
            # `density` replaces the `normed` keyword, which Matplotlib removed
            plt.hist(values, bins=label_bins, rwidth=0.8, density=False)
        plt.xticks(class_ids)
        plt.title(title)

    plt.show()


### Build a color list which will be later used for seq. of distributions plot

In [None]:
# Color palettes filled in further down in this cell:
# one with 0-255 integer RGB triples, one with RGB triples scaled to [0, 1]
T10_palette_rgb = []
T10_palette_rgb_normed = []

def hex2rgb(rgb):
    """Convert a hex color string such as '#1f77b4' to an (r, g, b) tuple of ints in 0-255.

    Leading '#' characters are optional and are stripped before parsing.
    """
    digits = rgb.lstrip('#')
    channels = []
    for start in range(0, 6, 2):
        # each channel is encoded by two consecutive hexadecimal digits
        channels.append(int(digits[start:start + 2], 16))
    return tuple(channels)

def hex2rgb_normed(rgb):
    """Convert a hex color string such as '#1f77b4' to an (r, g, b) tuple of floats in [0, 1].

    Leading '#' characters are optional and are stripped before parsing.
    """
    digits = rgb.lstrip('#')
    channels = []
    for start in range(0, 6, 2):
        # two hexadecimal digits per channel, scaled from 0-255 down to 0-1
        channels.append(int(digits[start:start + 2], 16) / 255.0)
    return tuple(channels)

# Harvest ten colors from matplotlib: draw ten lines on the same axes and read
# back the color matplotlib assigned to each of them.
# NOTE(review): presumably this yields the default 'tab10' color cycle as hex
# strings (which hex2rgb expects) - confirm against the active rcParams.
x = np.arange(10)
y = np.arange(10)
for i in range(10):
    p = plt.plot(x,y*i, linewidth=5)  # plt.plot returns a list of lines; p[0] is the line just drawn
    T10_palette_rgb.append(hex2rgb(p[0].get_color()))
    T10_palette_rgb_normed.append(hex2rgb_normed(p[0].get_color()))
plt.show()

---
## Section 1 - testing using a list of distributions and a corresponding list of how many data to generate from each, sequentially
## Use Bayesian output probabilities adjusting
---

### Build a list of distributions

#### 1 - use those from previous notebooks

In [None]:
# Build a list of eight hand-crafted label distributions over the 10 classes.
# Every entry is a vector of unnormalized weights divided by its sum, so that it sums to 1.
distrs_used_for_training = []

# uniform distribution
distr = np.array([1,1,1,1,1,1,1,1,1,1])
distrs_used_for_training.append(distr/np.sum(distr))
# normal-like distribution centered about labels 4-5 (weights are powers of r)
r = 2
distr = [r**1,r**2,r**3,r**4,r**5,r**5,r**4,r**3,r**2,r**1]
distrs_used_for_training.append(distr/np.sum(distr))

# skewed normal-like distribution centered about label 2
distr = [r**3,r**4,r**5,r**4.5,r**4,r**3.5,r**3,r**2.5,r**2,r**1.5]
distrs_used_for_training.append(distr/np.sum(distr))

# skewed normal-like distribution centered about label 7
distr = [r**1.5,r**2,r**2.5,r**3,r**3.5,r**4,r**4.5,r**5,r**4,r**3]
distrs_used_for_training.append(distr/np.sum(distr))

# bimodal normal-like distribution (modes at labels 2 and 7)
distr = [r**1,r**2,r**3,r**2,r**1,r**1,r**2,r**3,r**2,r**1]
distrs_used_for_training.append(distr/np.sum(distr))

# bimodal skewed distribution (modes at labels 1 and 8)
distr = [r**3.5,r**4,r**3,r**2,r**1,r**1,r**2,r**3,r**4,r**3.5]
distrs_used_for_training.append(distr/np.sum(distr))


# exponential distribution, increasing with the label
r=1.4
distr = [r**1,r**2,r**3,r**4,r**5,r**6,r**7,r**8,r**9,r**10]
distrs_used_for_training.append(distr/np.sum(distr))

# exponential distribution, decreasing with the label
r=1.4
distr = [r**10,r**9,r**8,r**7,r**6,r**5,r**4,r**3,r**2,r**1]
distrs_used_for_training.append(distr/np.sum(distr))

# report what was built
print('#distributions used for training = {}'.format(len(distrs_used_for_training)))
for idx, distr in enumerate(distrs_used_for_training):
    print('idx = {}: distr = {}'.format(idx,distr))
#     plt.bar(range(10), distr)
#     plt.show()

#### 2 - the special case when data is ordered by label

In [None]:
# Replace the list of distributions with ten one-hot distributions, one per
# label in increasing order, i.e. the data will arrive sorted by label.
distrs_used_for_training = []

for k in range(10):
    distr = np.zeros(10)
    distr[k] = 1  # all the probability mass on label k
    distrs_used_for_training.append(distr)

# report what was built
print('#distributions used for training = {}'.format(len(distrs_used_for_training)))
for idx, distr in enumerate(distrs_used_for_training):
    print('idx = {}: distr = {}'.format(idx,distr))
#     plt.bar(range(10), distr)
#     plt.show()

#### 3 - build a sequence of random distributions, using Dirichlet distribution

In [None]:
# Replace the list of distributions with `no_distrs_to_generate` random label
# distributions. They are produced in chunks of `step` consecutive distributions;
# every chunk is sampled from one Dirichlet distribution whose 10 concentration
# parameters are random integers in [1, 9] multiplied by the scale factor.
distrs_used_for_training = []
no_distrs_to_generate = 25
# scale_factor_for_Dirichlet = 1.e-1
scale_factor_for_Dirichlet = 1.0
# scale_factor_for_Dirichlet = 1.e+1
shift_factor = 20 # in percents

# Chunk length = shift_factor percent of the sequence; at least 1, because a
# zero step would make range() below raise ValueError.
step = max(1, int(shift_factor * no_distrs_to_generate / 100))
print(step)
for k in range(0, no_distrs_to_generate, step):
    # The last chunk is truncated so that exactly no_distrs_to_generate
    # distributions are produced even when step does not divide that number.
    chunk_size = min(step, no_distrs_to_generate - k)
    dirichlet_params = scale_factor_for_Dirichlet * np.random.randint(1, 10, 10, dtype=np.int64)
    distrs_used_for_training += list(np.random.dirichlet(dirichlet_params, chunk_size))

### plot the sequence of distribution

In [None]:

# Preview of the sequence of imposed distributions: every distribution in the
# list is repeated for a fixed number of positions and the result is drawn as an image.
# NOTE(review): this first value of sequence_of_images_length is overwritten a few lines below
sequence_of_images_length = len(distrs_used_for_training)
INPUT_DISTR_LIST = distrs_used_for_training
NO_EXAMPLES_TO_EXTRACT_FROM_EACH_DISTR = 10
# number of positions assigned to each distribution (the same for all of them)
list_no_examples_from_each_distr = np.full(len(INPUT_DISTR_LIST), fill_value=NO_EXAMPLES_TO_EXTRACT_FROM_EACH_DISTR)

# total sequence length; one row per position, one column per class
sequence_of_images_length = np.sum(list_no_examples_from_each_distr)
sequence_of_imposed_distributions = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])

# fill consecutive slices of rows with the corresponding distribution
next_pos_to_write = 0
for id_distr, distr in enumerate(INPUT_DISTR_LIST):
    current_no_examples_to_extract = list_no_examples_from_each_distr[id_distr]
#     print('Current input distr. {:3d}                  = {}'.format(id_distr, distr))
    sequence_of_imposed_distributions[next_pos_to_write: next_pos_to_write + current_no_examples_to_extract] = distr
    next_pos_to_write += current_no_examples_to_extract

# NOTE(review): the next assignment repeats the one above with the same value,
# and figtitle is not used by the plotting calls that follow
list_no_examples_from_each_distr = np.full(len(INPUT_DISTR_LIST), fill_value=NO_EXAMPLES_TO_EXTRACT_FROM_EACH_DISTR)
figtitle = 'Sequence of imposed distributions (seq_length = {})'.format(sequence_of_images_length)
plt.figure(figsize=(80,5))
# presumably utils.distr_sequence_to_rgb_image renders each distribution as a colored strip - see utils
plt.imshow(utils.distr_sequence_to_rgb_image(T10_palette_rgb, sequence_of_imposed_distributions, width=300))
# the figure is also written to a PNG file whose name contains utils.now_as_str()
plt.savefig('test-{}.png'.format(utils.now_as_str()), )
plt.show()

### Check some empirical rules for estimating how much data we need to build a histogram

In [None]:
# For every distribution, invert the classic "number of bins" rules to get the
# sample size n for which the rule would give 10 bins of width 1 (one per label).
for idx, distr in enumerate(distrs_used_for_training):
    print('idx = {}: distr = {}'.format(idx,distr))
    cs = np.cumsum(distr)  # cumulative distribution over the labels
#     print(cs)
    # inter-quartile range in label units: first label whose cumulative mass exceeds 0.75 minus the first exceeding 0.25
    iqr = np.argmax(cs>0.75) - np.argmax(cs>0.25)
#     print(iqr)
    # bin width h = 2*IQR / n^(1/3) with h = 1  =>  n = (2*IQR)^3
    print('Freedman-Diaconis rule: n={}'.format(int((2*iqr)**3)))
    # standard deviation of the label under distr: sqrt(E[X^2] - E[X]^2)
    std = np.sqrt(np.sum((np.arange(10) ** 2) * distr) - (np.sum(np.arange(10) * distr))**2)
    # bin width h = 3.5*std / n^(1/3) with h = 1  =>  n = (3.5*std)^3
    print('Scott\'s normal ref. rule: n={}'.format(int((3.5 * std)**3)))
    # number of bins k = 2 * n^(1/3) with k = 10  =>  n = (k/2)^3
    print('Rice rule: n={}'.format((10//2)**3))
    # k = sqrt(n) with k = 10  =>  n = k^2
    print('Sqrt rule: n={}'.format(10 ** 2))
    # k = 1 + log2(n) with k = 10  =>  n = 2^(k-1)
    print('Sturges formula: n={}'.format(2**(10-1)))
    print()

---
### Build a sequence of images w.r.t. a list of distributions
---

In [None]:
# Build the test sequence: for each distribution in the list, pick a fixed
# number of MNIST test examples according to that distribution and concatenate
# the picks, keeping track of the imposed and of the actually obtained distributions.
INPUT_DISTR_LIST = distrs_used_for_training
NO_EXAMPLES_TO_EXTRACT_FROM_EACH_DISTR = 500
# number of examples drawn for each distribution (the same for all of them)
list_no_examples_from_each_distr = np.full(len(INPUT_DISTR_LIST), fill_value=NO_EXAMPLES_TO_EXTRACT_FROM_EACH_DISTR)
print(list_no_examples_from_each_distr)

# build a sequence of images which will be used for simulating an online testing
sequence_of_images_length = np.sum(list_no_examples_from_each_distr)
# indices into the MNIST test set, in sequence order
sequence_of_images_indices = np.empty(sequence_of_images_length, dtype=np.int32)
# per position: the distribution requested / the empirical distribution of its chunk
sequence_of_imposed_distributions = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
sequence_of_real_distributions = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])

next_pos_to_write = 0
for id_distr, distr in enumerate(INPUT_DISTR_LIST):
    current_no_examples_to_extract = list_no_examples_from_each_distr[id_distr]
    print('Current input distr. {:3d}                  = {}'.format(id_distr, distr))
    # the dataset is re-read from disk on every iteration
    # NOTE(review): presumably to start each draw from the unmodified test set - confirm whether this is needed
    mnist_ds = MNISTDataset(MNIST_TRAIN_IMG_PATH, MNIST_TRAIN_LABELS_PATH, MNIST_TEST_IMG_PATH, MNIST_TEST_LABELS_PATH)
    # labels are reduced with argmax over axis 1 (presumably one-hot encoded) before sampling
    # NOTE(review): the slice assignments below assume exactly current_no_examples_to_extract indices are returned - confirm in utils
    indices_wrt_distr = utils.get_indices_wrt_distr(labels=np.argmax(mnist_ds.test.labels, axis=1), weights=distr, max_no_examples=current_no_examples_to_extract)
    sequence_of_images_indices[next_pos_to_write: next_pos_to_write + current_no_examples_to_extract] = indices_wrt_distr
    sequence_of_imposed_distributions[next_pos_to_write: next_pos_to_write + current_no_examples_to_extract] = distr
    # empirical label frequencies of the examples actually drawn for this chunk
    real_distr = np.bincount(np.argmax(mnist_ds.test.labels[indices_wrt_distr], axis=1), minlength=10) / current_no_examples_to_extract
    sequence_of_real_distributions[next_pos_to_write: next_pos_to_write + current_no_examples_to_extract] = real_distr
    next_pos_to_write += current_no_examples_to_extract
    print('Label distr. after extracted {:3d} examples = {}\n'.format(current_no_examples_to_extract, real_distr))

print(sequence_of_images_indices)

# build a Dataset containing the above sequence
mnist_ds = MNISTDataset(MNIST_TRAIN_IMG_PATH, MNIST_TRAIN_LABELS_PATH, MNIST_TEST_IMG_PATH, MNIST_TEST_LABELS_PATH)  # use the sequence_indices on the original mnist_ds
sequence_test_ds = Dataset(images=mnist_ds.test.images[sequence_of_images_indices], labels=mnist_ds.test.labels[sequence_of_images_indices], num_classes=MNISTDataset.num_classes)
print(sequence_test_ds.num_examples)
print('Overall distribution: {}'.format(sequence_test_ds.label_distr))
print(np.argmax(sequence_test_ds.labels, axis=1))

 ** - plot sequence of imposed distributions (those used for generating the sequence of images); just for comparing it with the one based on actual/predicted labels and to analyze transition periods **

In [None]:
# figtitle = 'Sequence of imposed distributions (seq_length = {})'.format(sequence_of_images_length)
# utils.plot_sequence_of_distr(list_no_examples_from_each_distr, color_list=T10_palette_rgb, distr_sequence=sequence_of_imposed_distributions, method='bar_plot', fig_title=figtitle, window_length=None, save_to_file=True)

 ** - plot sequence of real distributions (those resulting after imposing distributions for generating the sequence of images)  **

In [None]:
# figtitle = 'Sequence of real distributions (seq_length = {})'.format(sequence_of_images_length)
# plot_sequence_of_distr(list_no_examples_from_each_distr, distr_sequence=sequence_of_real_distributions, method='bar_plot', fig_title=figtitle, window_length=None, save_to_file=True)

---
### I. Testing without outputs adjusting
#### This model must be trained on the same amount of data as the one used later, for local distribution adapting
---

In [None]:
# Select the experiment directory, the checkpoint and the dataset used for testing.
# Previously used settings are kept commented out.
# WORK_DIR = './results/Lenet5WithDistr_8distr_[]ex_4/1000ex/'
# ckpt_file = 'Lenet5_8distrs_distrPos_None_1000examples_50batchSize_45epochs_2018_06_05---12_09.model.ckpt'

# WORK_DIR = './results/PriorProbabilityShift_CIFAR10_2018_06_18---04_55/1000ex/'
WORK_DIR = './results/PriorProbabilityShift_experiment_D1_2018_06_10---01_30/1000ex/'
# checkpoint names found in WORK_DIR (files ending with "ckpt.meta", listed without extension)
ckpt_files = utils.get_all_files_from_dir_ending_with(WORK_DIR, "ckpt.meta", without_file_extension=True)
# NOTE(review): hard-coded position in the listing; which model this is depends on the order returned by utils
ckpt_file = ckpt_files[7]

# DATASET_NAME = 'CIFAR10'
DATASET_NAME = 'MNIST'

def read_dataset(dataset_name):
    """Load and return a freshly read dataset object selected by name.

    'MNIST' gives an MNISTDataset built from the MNIST_* path constants;
    any other value gives a CIFAR10Dataset read from CIFAR10_DATASET_DIR.
    """
    if dataset_name == 'MNIST':
        return MNISTDataset(
            MNIST_TRAIN_IMG_PATH,
            MNIST_TRAIN_LABELS_PATH,
            MNIST_TEST_IMG_PATH,
            MNIST_TEST_LABELS_PATH,
        )
    # every other name falls through to CIFAR-10
    return CIFAR10Dataset(CIFAR10_DATASET_DIR)

# Restore the selected model together with the metadata stored in its
# checkpoint, then evaluate it on the whole sequence without any adjusting.
ds = read_dataset(DATASET_NAME)
# values saved in the checkpoint under the given variable names
restored_distr_pos = utils.restore_variable_from_checkpoint(ckpt_dir=WORK_DIR, ckpt_file=ckpt_file, var_name='distr_pos')
current_model_train_distr = utils.restore_variable_from_checkpoint(ckpt_dir=WORK_DIR, ckpt_file=ckpt_file, var_name='train_distr')
current_model_test_distr = utils.restore_variable_from_checkpoint(ckpt_dir=WORK_DIR, ckpt_file=ckpt_file, var_name='test_distr')
train_num_examples = utils.restore_variable_from_checkpoint(ckpt_dir=WORK_DIR, ckpt_file=ckpt_file, var_name='train_num_examples')
test_model = Lenet5WithDistr(dataset=ds, verbose=False, distr_pos=restored_distr_pos)
test_model.restore_session(ckpt_dir=WORK_DIR, ckpt_filename=ckpt_file)

print('Testing model on the entire sequence, without adapting to local distribution')
# print(np.argmax(sequence_test_ds.labels, axis=1))

# re-read the dataset and rebuild the sequence dataset from the stored indices
# NOTE(review): sequence_of_images_indices were sampled from the MNIST test labels and
# num_classes is taken from MNISTDataset, whatever DATASET_NAME is - revisit before switching to CIFAR10
ds = read_dataset(DATASET_NAME)
sequence_test_ds = Dataset(images=ds.test.images[sequence_of_images_indices], labels=ds.test.labels[sequence_of_images_indices], num_classes=MNISTDataset.num_classes)

test_loss, test_acc, total_predict, total_actual, wrong_predict_images, output_probs = test_model.test_data(sequence_test_ds, use_only_one_batch=True)
# print(np.argmax(sequence_test_ds.labels, axis=1))

# keep the raw results; the adjusting experiments below start from them
sequence_of_predicted_labels = total_predict
sequence_of_output_probs = output_probs
sequence_of_actual_labels =  total_actual
acc_with_no_adj = test_acc
num_correct_predictions_with_no_adj = sequence_test_ds.num_examples - len(wrong_predict_images)

# note: plot_results_distributions calls test_model.test_data again on the same dataset
plot_results_distributions(test_model=test_model, test_ds=sequence_test_ds, train_distr=current_model_train_distr)

print('Accuracy with no ajusting: {:.3f}% ({}/{})'.format(acc_with_no_adj * 100, num_correct_predictions_with_no_adj, sequence_of_images_length))

---
### II a. Adjusting the output probabilities using the Bayesian rule - always estimating priors using the original model's predictions
#### Build the confusion matrix and show how it is used for prior estimation

In [None]:
# Estimate the class priors of the test sequence from the predicted labels,
# using the confusion matrix of the unadjusted model, and plot the result.

# cm[i, j] = number of examples with actual label i that were predicted as j
cm = confusion_matrix(total_actual, total_predict, labels=range(0, mnist_ds.num_classes))
counts_per_class = cm.sum(axis=1)
counts_per_class[counts_per_class == 0] = 1  # in order to prevent division by zero
# row-normalized: cm_normalized[i, j] is the fraction of actual label i predicted as j
cm_normalized = cm.astype('float') / counts_per_class[:, np.newaxis]

predicted_label_frequencies = np.bincount(sequence_of_predicted_labels, minlength=10) / sequence_of_images_length
real_priors = np.bincount(sequence_of_actual_labels, minlength=10) / sequence_of_images_length
# predicted frequencies = cm_normalized.T @ priors, so the priors are recovered by solving this linear system
# NOTE(review): the least-squares solution is not constrained to be non-negative or to sum to 1
estimated_priors, _ ,_ ,_ = np.linalg.lstsq(cm_normalized.T, predicted_label_frequencies, rcond=None)  # use lstsq instead of solve to prevent the case of singular matrices

# same estimation for the case in which the predicted labels were uniformly distributed
uniform_distr = np.full(shape = (10), fill_value=0.1, dtype=np.float32)
estimated_priord_for_uniform_distr, _ ,_ ,_ = np.linalg.lstsq(cm_normalized.T, uniform_distr, rcond=None)  # use lstsq instead of solve to prevent the case of singular matrices

# first row of plots: train label distribution vs. real test priors
fig = plt.figure(figsize=(15, 3))
fig.suptitle(y = 1.1, t = 'Priors estimation using confusion matrix', fontsize=18, fontweight='bold')

plt.subplot(1, 2, 1)
plt.bar(range(10), current_model_train_distr)
plt.xticks(range(0, 10))
plt.title('train label distr.', fontsize=14)

plt.subplot(1, 2, 2)
plt.bar(range(10), real_priors)
plt.xticks(range(0, 10))
plt.title('test label distr. (real priors)', fontsize=14)

plt.show()

# second row of plots: predicted label frequencies vs. the priors estimated from them
fig = plt.figure(figsize=(15, 3))
plt.subplot(1, 2, 1)
plt.bar(range(10), predicted_label_frequencies)
plt.xticks(range(0, 10))
plt.title('predicted label distr.', fontsize=14)

plt.subplot(1, 2, 2)
plt.bar(range(10), estimated_priors)
plt.xticks(range(0, 10))
plt.title('estimated priors', fontsize=14)

plt.show()

# third plot: priors estimated for uniform predictions; absolute values are drawn
fig = plt.figure(figsize=(15, 3))
plt.subplot(1, 2, 1)
plt.bar(range(10), np.abs(estimated_priord_for_uniform_distr))
plt.xticks(range(0, 10))
plt.title('estimated priors for uniform distr.', fontsize=14)

plt.show()


---
#### Build and plot the sequences of distributions which will be used in the adjusting operation
 1. adjusting using real priors: use what the model should predict, from the future; this case should give the best accuracy
 2. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past)
 3. adjusting using estimated priors by the recent predicted labels (what the model predicted in the recent past)
 4. adjusting using estimated priors by the recent predicted labels (what the model predicted in the recent past); estimate using confusion matrix method


In [None]:
# start time of this cell; the elapsed time is printed at the end of the cell
ts = time.time()
# build the sequence of distributions in order to maximize the correlation between them and the imposed distribution
# number of most recent labels from which the local label distribution is estimated
WINDOW_LENGTH = 100

def L1_distance(distr1, distr2):
    """Return the L1 distance between two arrays: the sum of absolute element-wise differences."""
    absolute_differences = np.abs(distr1 - distr2)
    return np.sum(absolute_differences)
def corr_coeff_distance(distr1, distr2):
    """Return 1 minus the Pearson correlation coefficient of the two arrays."""
    correlation_matrix = np.corrcoef(distr1, distr2)
    # the off-diagonal entry is the correlation between distr1 and distr2
    pearson_r = correlation_matrix[0][1]
    return 1 - pearson_r
def cosine_distance(distr1, distr2):
    """Return 1 minus the cosine similarity of the two arrays."""
    dot_product = np.sum(distr1 * distr2)
    product_of_squared_norms = np.sum(distr1 * distr1) * np.sum(distr2 * distr2)
    cosine_similarity = dot_product / np.sqrt(product_of_squared_norms)
    return (1 - cosine_similarity)

# Distance used to compare the label distribution of the current window with
# the previous adapted estimate: a changepoint is declared whenever the
# distance exceeds distance_thr. Alternative settings that were tried:
# distance_measure_function = L1_distance
# distance_thr = 0.38
# distance_measure_function = corr_coeff_distance
# distance_thr = 0.85
distance_measure_function = cosine_distance
distance_thr = 0.05  # a cosine distance above this value marks a changepoint

# Row k of both arrays is an estimate of the label distribution at position k,
# computed only from the actual labels seen before position k:
#  - plain version: the last WINDOW_LENGTH labels
#  - adapted version: all labels since the last detected changepoint
sequence_of_distributions_based_on_actual_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
adapted_sequence_of_distributions_based_on_actual_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])

distances_between_distrs = np.empty(sequence_of_images_length)
start_point_of_current_extended_window = 0

for k in range(len(sequence_of_images_indices)):
    if k == 0:
        # nothing observed yet: assume the uniform distribution
        sequence_of_distributions_based_on_actual_labels[k, :] = uniform_distr
        adapted_sequence_of_distributions_based_on_actual_labels[k, :] = uniform_distr
    elif k < WINDOW_LENGTH:
        # window not full yet: use all the labels seen so far
        sequence_of_distributions_based_on_actual_labels[k, :] = np.bincount(sequence_of_actual_labels[0:k], minlength=10) / k
        adapted_sequence_of_distributions_based_on_actual_labels[k, :] = sequence_of_distributions_based_on_actual_labels[k, :]
    else:
        # the window distribution is computed once and used for both estimates
        current_window_distr = np.bincount(sequence_of_actual_labels[k - WINDOW_LENGTH:k], minlength=10) / WINDOW_LENGTH
        sequence_of_distributions_based_on_actual_labels[k, :] = current_window_distr
        current_imposed_distr = adapted_sequence_of_distributions_based_on_actual_labels[k-1]
        dist = distance_measure_function(current_imposed_distr, current_window_distr)

        if dist > distance_thr:
            # changepoint: restart the extended window from the current window
            start_point_of_current_extended_window = k - WINDOW_LENGTH
        adapted_sequence_of_distributions_based_on_actual_labels[k, :] = np.bincount(sequence_of_actual_labels[start_point_of_current_extended_window:k], minlength=10) / (k - start_point_of_current_extended_window)
    # distance between the imposed distribution and the plain windowed estimate at position k
    distances_between_distrs[k] = distance_measure_function(sequence_of_imposed_distributions[k], sequence_of_distributions_based_on_actual_labels[k])

# elapsed time of this cell, in milliseconds
print((time.time() - ts) * 1000)

In [None]:
# Plot three sequences of distributions with the same plotting method, for
# visual comparison: the imposed one, the one estimated from a sliding window
# of actual labels, and the adapted (changepoint-aware) one.
figtitle = 'Sequence of imposed distributions (seq_length = {})'.format(sequence_of_images_length)
utils.plot_sequence_of_distr(list_no_examples_from_each_distr, distr_sequence=sequence_of_imposed_distributions, 
                             method='my_distr_img_plot', fig_title=figtitle, color_list=T10_palette_rgb, window_length=None, save_to_file=False)

figtitle = 'Sequence of distributions based on actual labels (window_length = {}, seq_length = {})'.format(WINDOW_LENGTH, sequence_of_images_length)
utils.plot_sequence_of_distr(list_no_examples_from_each_distr, distr_sequence=sequence_of_distributions_based_on_actual_labels, 
                             method='my_distr_img_plot', fig_title=figtitle, color_list=T10_palette_rgb, window_length=WINDOW_LENGTH, save_to_file=False)


figtitle = 'Adapted sequence of distributions based on actual labels (window_length = {}, seq_length = {})'.format(WINDOW_LENGTH, sequence_of_images_length)
utils.plot_sequence_of_distr(list_no_examples_from_each_distr, distr_sequence=adapted_sequence_of_distributions_based_on_actual_labels, 
                             method='my_distr_img_plot', fig_title=figtitle, color_list=T10_palette_rgb, window_length=WINDOW_LENGTH, save_to_file=False)

In [None]:
# Pearson correlation between the flattened imposed sequence and each of the
# two estimated sequences (plain sliding window, then adapted window)
corr_coeff = np.corrcoef(sequence_of_imposed_distributions.flat, sequence_of_distributions_based_on_actual_labels.flat)[0][1]
print('corrcoef(sequence_of_imposed_distributions, sequence_of_distributions_based_on_actual_labels)                                                = {:.5f}'.format(corr_coeff))
corr_coeff = np.corrcoef(sequence_of_imposed_distributions.flat, adapted_sequence_of_distributions_based_on_actual_labels.flat)[0][1]
print('corrcoef(sequence_of_imposed_distributions, adapted_sequence_of_distributions_based_on_actual_labels)                                        = {:.5f}'.format(corr_coeff))

In [None]:
# Adjust the model's output probabilities with the Bayes rule, replacing the
# priors the model was trained with by three different estimates of the local
# priors, and compare the resulting accuracies with the unadjusted one.

def _adjust_output_probs(new_priors, old_priors, output_probs):
    """Re-weight every row of `output_probs` by new_priors / old_priors and renormalize it to sum 1.

    `new_priors` and `output_probs` hold one row per example and one column per class.
    """
    reweighted = (new_priors / old_priors) * output_probs
    # per-example normalization; a single global sum would not turn the rows into distributions
    row_totals = np.sum(reweighted, axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1  # keep an all-zero row at zero instead of producing NaN
    return reweighted / row_totals


old_priors = current_model_train_distr
uniform_distr = np.full(shape = (10), fill_value=0.1, dtype=np.float32)

# when True, the first WINDOW_LENGTH positions (for which no full window of
# past labels exists) use the fixed distribution below instead of an estimate
DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES = True
distr_to_assume_for_the_first_window_length_images=uniform_distr

initial_output_probs = sequence_of_output_probs

# 1. adjusting using real priors: use what the model should predict, from the future; this case should give the best accuracy
real_priors = sequence_of_real_distributions
adj_output_probs = _adjust_output_probs(real_priors, old_priors, initial_output_probs)
preds_with_adj_using_real_priors = np.argmax(adj_output_probs, axis=1)

# 2. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past)
if DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES:
    real_priors = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    real_priors[:WINDOW_LENGTH] = distr_to_assume_for_the_first_window_length_images
    real_priors[WINDOW_LENGTH:] = sequence_of_distributions_based_on_actual_labels[WINDOW_LENGTH:]
else:
    real_priors = sequence_of_distributions_based_on_actual_labels
adj_output_probs = _adjust_output_probs(real_priors, old_priors, initial_output_probs)
preds_with_adj_using_estimated_priors_from_recent_actual_labels = np.argmax(adj_output_probs, axis=1)

# 3. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past), with adaptation for stability
if DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES:
    # a fresh array, so that this step does not depend on what step 2 left behind
    real_priors = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    real_priors[:WINDOW_LENGTH] = distr_to_assume_for_the_first_window_length_images
    real_priors[WINDOW_LENGTH:] = adapted_sequence_of_distributions_based_on_actual_labels[WINDOW_LENGTH:]
else:
    # the adapted sequence is the one this step is about (the name used here before was never defined)
    real_priors = adapted_sequence_of_distributions_based_on_actual_labels
adj_output_probs = _adjust_output_probs(real_priors, old_priors, initial_output_probs)
adapted_preds_with_adj_using_estimated_priors_from_recent_actual_labels = np.argmax(adj_output_probs, axis=1)


# report each accuracy and its difference from the unadjusted accuracy
print('Accuracy with no ajusting:                                                                  {:.3f}% ({}/{})'.format(acc_with_no_adj * 100, num_correct_predictions_with_no_adj, sequence_of_images_length))

num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == preds_with_adj_using_real_priors)
acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
print('Accuracy with ajusting using real priors:                                                   {:.3f}% ({}/{})'.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))

num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == preds_with_adj_using_estimated_priors_from_recent_actual_labels)
acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
print('Accuracy with ajusting using estimated priors from recent actual labels:                    {:.3f}% ({}/{})'.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))

num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == adapted_preds_with_adj_using_estimated_priors_from_recent_actual_labels)
acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
print('Accuracy with ajusting using estimated priors from recent actual labels and adapting:       {:.3f}% ({}/{})'.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))

In [None]:
WINDOW_LENGTH=100

corr_coeff_between_real_distr_and_adapted_estimates_list = []
acc_list = []

# distance_measure_function = L1_distance
# distance_thr = 0.38 # L1 distance should be greater than distance_thr to decide that there is a changepoint

# distance_measure_function = corr_coeff_distance
# distance_thr = 0.85 # correlation coefficient should be less than distance_thr to decide that there is a changepoint

distance_measure_function = cosine_distance
distance_thr = 0.05 # correlation coefficient should be less than distance_thr to decide that there is a changepoint

for distance_thr in np.arange(0, 2.01, 0.05):
    distance_thr = 0.05
    print(distance_thr)
    sequence_of_distributions_based_on_actual_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    adapted_sequence_of_distributions_based_on_actual_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    # adapted_sequence_of_distributions_based_on_predicted_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    # adapted_sequence_of_distributions_based_on_predicted_labels_and_cm = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])

    distances_between_distrs = np.empty(sequence_of_images_length)
    start_point_of_current_extended_window = 0

    for k in range(len(sequence_of_images_indices)):
        if k == 0:
            sequence_of_distributions_based_on_actual_labels[k, :] = uniform_distr
            adapted_sequence_of_distributions_based_on_actual_labels[k, :] = uniform_distr
        elif k < WINDOW_LENGTH:
            sequence_of_distributions_based_on_actual_labels[k, :] = np.bincount(sequence_of_actual_labels[0:k], minlength=10) / k
            adapted_sequence_of_distributions_based_on_actual_labels[k, :] = sequence_of_distributions_based_on_actual_labels[k, :]
        else:
            sequence_of_distributions_based_on_actual_labels[k, :] = np.bincount(sequence_of_actual_labels[k - WINDOW_LENGTH:k], minlength=10) / WINDOW_LENGTH
            current_window_distr = np.bincount(sequence_of_actual_labels[k - WINDOW_LENGTH:k], minlength=10) / WINDOW_LENGTH
            current_imposed_distr = adapted_sequence_of_distributions_based_on_actual_labels[k-1]
            dist = distance_measure_function(current_imposed_distr, current_window_distr)
            if dist > distance_thr:
                start_point_of_current_extended_window = k - WINDOW_LENGTH
            adapted_sequence_of_distributions_based_on_actual_labels[k, :] = np.bincount(sequence_of_actual_labels[start_point_of_current_extended_window:k], minlength=10) / (k - start_point_of_current_extended_window)
    #     distances_between_distrs[k] = np.corrcoef(sequence_of_imposed_distributions[k], sequence_of_distributions_based_on_actual_labels[k])[0][1]
    #     corr_coeffs2[k] = np.corrcoef(sequence_of_imposed_distributions[k], adapted_sequence_of_distributions_based_on_actual_labels[k])[0][1]
        distances_between_distrs[k] = distance_measure_function(sequence_of_imposed_distributions[k], sequence_of_distributions_based_on_actual_labels[k])
        
        
    corr_coeff = np.corrcoef(sequence_of_imposed_distributions.flat, sequence_of_distributions_based_on_actual_labels.flat)[0][1]
    print('corrcoef(sequence_of_imposed_distributions, sequence_of_distributions_based_on_actual_labels)                                                = {:.5f}'.format(corr_coeff))
    corr_coeff = np.corrcoef(sequence_of_imposed_distributions.flat, adapted_sequence_of_distributions_based_on_actual_labels.flat)[0][1]
    print('corrcoef(sequence_of_imposed_distributions, adapted_sequence_of_distributions_based_on_actual_labels)                                        = {:.5f}'.format(corr_coeff))
    
    corr_coeff_between_real_distr_and_adapted_estimates_list.append(corr_coeff)
    
    old_priors = current_model_train_distr
    uniform_distr = np.full(shape = (10), fill_value=0.1, dtype=np.float32)

    DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES = True
    distr_to_assume_for_the_first_window_length_images=uniform_distr

    # 1. adjusting using real priors: use what the model should predict, from the future; this case should give the best accuracy 
    real_priors = sequence_of_real_distributions
    initial_output_probs = sequence_of_output_probs
    adj_output_probs = (real_priors / old_priors) *  initial_output_probs/ (np.sum((real_priors / old_priors) * initial_output_probs))
    preds_with_adj_using_real_priors = np.argmax(adj_output_probs, axis=1)

    # 2. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past)
    real_priors = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    if DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES:
        real_priors[:WINDOW_LENGTH] = distr_to_assume_for_the_first_window_length_images
        real_priors[WINDOW_LENGTH:] = sequence_of_distributions_based_on_actual_labels[WINDOW_LENGTH:]
    else:
        real_priors = sequence_of_distributions_based_on_actual_labels
    initial_output_probs = sequence_of_output_probs
    adj_output_probs = (real_priors / old_priors) *  initial_output_probs/ (np.sum((real_priors / old_priors) * initial_output_probs))
    preds_with_adj_using_estimated_priors_from_recent_actual_labels = np.argmax(adj_output_probs, axis=1)

    # 3. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past), with adaptatation for stability
    if DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES:
        real_priors[:WINDOW_LENGTH] = distr_to_assume_for_the_first_window_length_images
        real_priors[WINDOW_LENGTH:] = adapted_sequence_of_distributions_based_on_actual_labels[WINDOW_LENGTH:]
    else:
        real_priors = sequence_of_distributions_based_on_predicted_labels
    initial_output_probs = sequence_of_output_probs
    adj_output_probs = (real_priors / old_priors) *  initial_output_probs/ (np.sum((real_priors / old_priors) * initial_output_probs))
    adapted_preds_with_adj_using_estimated_priors_from_recent_actual_labels = np.argmax(adj_output_probs, axis=1)


    print('Accuracy with no ajusting:                                                                  {:.3f}% ({}/{})'.format(acc_with_no_adj * 100, num_correct_predictions_with_no_adj, sequence_of_images_length))

    num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == preds_with_adj_using_real_priors)
    acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
    print('Accuracy with ajusting using real priors:                                                   {:.3f}% ({}/{})'.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
    print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))

    num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == preds_with_adj_using_estimated_priors_from_recent_actual_labels)
    acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
    print('Accuracy with ajusting using estimated priors from recent actual labels:                    {:.3f}% ({}/{})'.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
    print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))

    num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == adapted_preds_with_adj_using_estimated_priors_from_recent_actual_labels)
    acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
    print('Accuracy with ajusting using estimated priors from recent actual labels and adapting:       {:.3f}% ({}/{})'.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
    print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))
    
    acc_list.append(acc_with_adj)
    
    print('\n\n\n')
    break

In [None]:
# Plot the two per-run result lists side by side in separate figures.
# NOTE(review): the tick labels assume the lists hold one entry per swept threshold, starting at 0
# and advancing in steps of 0.05 (this mirrors the original hard-coded np.arange(0, 2, 0.05)) -
# confirm against the loop that fills the lists.
THRESHOLD_STEP = 0.05

for series, series_label in (
        (acc_list, 'accuracy'),
        (corr_coeff_between_real_distr_and_adapted_estimates_list, 'corr(imposed, adapted actual)')):
    # Build exactly one label per plotted point: matplotlib rejects a label list whose length
    # differs from the number of tick positions, which the fixed 40-entry label array triggered
    # whenever the lists did not hold exactly 40 values.
    tick_positions = np.arange(len(series))
    tick_labels = np.round(tick_positions * THRESHOLD_STEP, decimals=2)

    plt.figure(figsize=(30,5))
    plt.plot(series, linewidth=3, label=series_label)
    plt.xticks(tick_positions, tick_labels)
    plt.grid()
    plt.legend()
    plt.show()

In [None]:
# Plot every sequence of distributions with the same plotting helper.
# Each entry is (figure title, sequence to plot, window length passed to the helper);
# the imposed sequence is not window-based, hence window_length=None for it.
sequences_to_plot = [
    ('Sequence of imposed distributions (seq_length = {})'.format(sequence_of_images_length),
     sequence_of_imposed_distributions,
     None),
    ('Sequence of distributions based on actual labels (window_length = {}, seq_length = {})'.format(WINDOW_LENGTH, sequence_of_images_length),
     sequence_of_distributions_based_on_actual_labels,
     WINDOW_LENGTH),
    ('Sequence of distributions based on predicted labels (window_length = {}, seq_length = {})'.format(WINDOW_LENGTH, sequence_of_images_length),
     sequence_of_distributions_based_on_predicted_labels,
     WINDOW_LENGTH),
    ('Sequence of distributions based on predicted labels and confusion matrix (window_length = {}, seq_length = {})'.format(WINDOW_LENGTH, sequence_of_images_length),
     sequence_of_distributions_based_on_predicted_labels_and_cm,
     WINDOW_LENGTH),
]

for figtitle, sequence_to_plot, plot_window_length in sequences_to_plot:
    utils.plot_sequence_of_distr(list_no_examples_from_each_distr,
                                 distr_sequence=sequence_to_plot,
                                 method='my_distr_img_plot',
                                 fig_title=figtitle,
                                 color_list=T10_palette_rgb,
                                 window_length=plot_window_length,
                                 save_to_file=True)

In [None]:
def _adjust_probs_to_priors(new_priors, old_priors, output_probs):
    """Re-weight classifier outputs for a change in class priors.

    Every output probability is multiplied by new_prior / old_prior of its class and each
    sample (row) is then renormalised so that it sums to 1. The previous inline code divided
    by the sum over the whole sequence instead, which left the predicted labels correct but
    did not yield per-sample probabilities.

    new_priors may be a single distribution or one distribution per sample (it is broadcast
    against output_probs). Returns (adjusted_probs, predicted_labels), where predicted_labels
    is the argmax over axis 1.
    """
    weighted = (new_priors / old_priors) * output_probs
    row_sums = np.sum(weighted, axis=1, keepdims=True)
    # Rows without any weighted mass stay all-zero (argmax -> class 0, as before) instead of 0/0 = NaN.
    adjusted = np.divide(weighted, row_sums, out=np.zeros_like(weighted), where=row_sums > 0)
    return adjusted, np.argmax(adjusted, axis=1)


def _priors_with_warm_up(estimated_priors, warm_up_length):
    """Return the priors to apply per sample, optionally overriding the first warm_up_length rows.

    When DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES is set, the first warm_up_length samples use
    distr_to_assume_for_the_first_window_length_images and the rest use estimated_priors;
    otherwise estimated_priors is returned unchanged.
    """
    if not DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES:
        return estimated_priors
    priors = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    priors[:warm_up_length] = distr_to_assume_for_the_first_window_length_images
    priors[warm_up_length:] = estimated_priors[warm_up_length:]
    return priors


old_priors = current_model_train_distr
uniform_distr = np.full(shape = (10), fill_value=0.1, dtype=np.float32)

DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES = True
distr_to_assume_for_the_first_window_length_images = uniform_distr

initial_output_probs = sequence_of_output_probs

# 1. adjusting using real priors: use what the model should predict, from the future; this case should give the best accuracy
real_priors = sequence_of_real_distributions
adj_output_probs, preds_with_adj_using_real_priors = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

# 1'. adjusting using overall distribution as priors
real_priors = sequence_test_ds.label_distr
adj_output_probs, preds_with_adj_using_overall_distr = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

# 2. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past)
real_priors = _priors_with_warm_up(sequence_of_distributions_based_on_actual_labels, WINDOW_LENGTH)
adj_output_probs, preds_with_adj_using_estimated_priors_from_recent_actual_labels = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

# 3. adjusting using estimated priors by the recent predicted labels (what the model predicted in the recent past)
real_priors = _priors_with_warm_up(sequence_of_distributions_based_on_predicted_labels, WINDOW_LENGTH)
adj_output_probs, preds_with_adj_using_estimated_priors_from_recent_predicted_labels = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

# 4. adjusting using estimated priors by the recent predicted labels (what the model predicted in the recent past) and confusion matrix method
real_priors = _priors_with_warm_up(sequence_of_distributions_based_on_predicted_labels_and_cm, WINDOW_LENGTH)
adj_output_probs, preds_with_adj_using_estimated_priors_from_recent_predicted_labels_and_cm = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)


# Report: baseline accuracy first, then every adjustment method with its gain over the baseline.
print('Accuracy with no ajusting:                                                                  {:.3f}% ({}/{})'.format(acc_with_no_adj * 100, num_correct_predictions_with_no_adj, sequence_of_images_length))

accuracy_report = [
    ('Accuracy with ajusting using real priors:                                                   {:.3f}% ({}/{})',
     preds_with_adj_using_real_priors),
    ('Accuracy with ajusting using global distribution:                                           {:.3f}% ({}/{})',
     preds_with_adj_using_overall_distr),
    ('Accuracy with ajusting using estimated priors from recent actual labels:                    {:.3f}% ({}/{})',
     preds_with_adj_using_estimated_priors_from_recent_actual_labels),
    ('Accuracy with ajusting using estimated priors from recent predicted labels:                 {:.3f}% ({}/{})',
     preds_with_adj_using_estimated_priors_from_recent_predicted_labels),
    ('Accuracy with ajusting using estimated priors from recent predictions and confusion matrix: {:.3f}% ({}/{})',
     preds_with_adj_using_estimated_priors_from_recent_predicted_labels_and_cm),
]
for report_line, adjusted_preds in accuracy_report:
    num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == adjusted_preds)
    acc_with_adj = num_correct_predictions_with_adj / sequence_of_images_length
    print(report_line.format(acc_with_adj * 100, num_correct_predictions_with_adj, sequence_of_images_length))
    print('Diff = {:.3f}%'.format((acc_with_adj - acc_with_no_adj) * 100))

---
### Study the correlation between estimated priors with different window lengths and the accuracy
---

In [None]:
# Pearson correlation between pairs of (flattened) distribution sequences.
# Each entry is (report line, first sequence, second sequence).
correlation_report = [
    ('corrcoef(sequence_of_distributions_based_on_actual_labels, sequence_of_distributions_based_on_predicted_labels)                      = {:.5f}',
     sequence_of_distributions_based_on_actual_labels, sequence_of_distributions_based_on_predicted_labels),
    ('corrcoef(sequence_of_distributions_based_on_actual_labels, sequence_of_distributions_based_on_predicted_labels_and_confusion_matrix) = {:.5f}',
     sequence_of_distributions_based_on_actual_labels, sequence_of_distributions_based_on_predicted_labels_and_cm),
    ('corrcoef(sequence_of_imposed_distributions, sequence_of_distributions_based_on_actual_labels)                                        = {:.5f}',
     sequence_of_imposed_distributions, sequence_of_distributions_based_on_actual_labels),
    ('corrcoef(sequence_of_imposed_distributions, sequence_of_distributions_based_on_predicted_labels)                                     = {:.5f}',
     sequence_of_imposed_distributions, sequence_of_distributions_based_on_predicted_labels),
    ('corrcoef(sequence_of_imposed_distributions, sequence_of_distributions_based_on_predicted_labels_and_confusion_matrix)                = {:.5f}',
     sequence_of_imposed_distributions, sequence_of_distributions_based_on_predicted_labels_and_cm),
]

for report_line, first_sequence, second_sequence in correlation_report:
    # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the coefficient.
    corr_coeff = np.corrcoef(first_sequence.flat, second_sequence.flat)[0, 1]
    print(report_line.format(corr_coeff))


In [None]:
# Candidate window lengths: a fixed set of short windows, then 500, 1000, 2000, ... (doubling)
# while still shorter than the test sequence, and finally the full sequence length itself.
WINDOW_LENGTHS = [10, 15, 25, 50, 75, 100, 125, 150, 200, 250]
temp = 500
while temp < sequence_of_images_length:
    WINDOW_LENGTHS.append(temp)
    temp *= 2
if WINDOW_LENGTHS[-1] != sequence_of_images_length:
    WINDOW_LENGTHS.append(sequence_of_images_length)

# NOTE(review): this override discards the list built above, so only a single window length is
# evaluated; remove it to run the full sweep.
WINDOW_LENGTHS = [100]

print(WINDOW_LENGTHS)

old_priors = current_model_train_distr
# Defined before its first use below, so the cell no longer depends on a value of uniform_distr
# left behind by a previously executed cell.
uniform_distr = np.full(shape = (10), fill_value=0.1, dtype=np.float32)

DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES = False
distr_to_assume_for_the_first_window_length_images = uniform_distr

# Results are stored per checkpoint file; 'test_results' is column-oriented (one list per column)
# so it can be turned into a DataFrame directly.
full_results_dict = {}
idx_model = 0
full_results_dict[ckpt_file] = {'idx_model':idx_model, 'train_distr': current_model_train_distr, 'test_distr': current_model_test_distr, 'train_num_examples': train_num_examples, 'test_seq_length':sequence_of_images_length,
                                   'test_results': {'id_test':[], 'method':[], 'window_length':[], 'test_acc':[], 'num_correct_preds':[], 'corr_coeff': []}}
id_test = 0

def save_test_results_to_dict(test_dict, id_test, method, window_length, num_correct_preds, sequence_of_images_length, corr_coeff):
    """Append one test result to a column-oriented results dictionary.

    Args:
        test_dict: dict holding one list per column under the keys 'id_test', 'method',
            'window_length', 'test_acc', 'num_correct_preds' and 'corr_coeff'; it is
            modified in place.
        id_test: identifier of the test run.
        method: name of the adjustment method that produced the predictions.
        window_length: window length used by the method (None when not applicable).
        num_correct_preds: number of correctly predicted samples.
        sequence_of_images_length: total number of samples in the test sequence.
        corr_coeff: correlation coefficient to record for this result (None when not applicable).

    Raises:
        ZeroDivisionError: if sequence_of_images_length is 0 (for plain numeric inputs);
            nothing is appended in that case.
    """
    # The accuracy is derived from the arguments only (the previous version also read a
    # module-level variable for an unused intermediate value) and is computed before any
    # append so that a failure cannot leave the columns with different lengths.
    test_acc = num_correct_preds / sequence_of_images_length

    test_dict['id_test'].append(id_test)
    test_dict['method'].append(method)
    test_dict['window_length'].append(window_length)
    test_dict['test_acc'].append(test_acc)
    test_dict['num_correct_preds'].append(num_correct_preds)
    test_dict['corr_coeff'].append(corr_coeff)

def _adjust_probs_to_priors(new_priors, old_priors, output_probs):
    """Re-weight classifier outputs for a change in class priors.

    Every output probability is multiplied by new_prior / old_prior of its class and each
    sample (row) is then renormalised so that it sums to 1. The previous inline code divided
    by the sum over the whole sequence instead, which left the predicted labels correct but
    did not yield per-sample probabilities.

    new_priors may be a single distribution or one distribution per sample (it is broadcast
    against output_probs). Returns (adjusted_probs, predicted_labels), where predicted_labels
    is the argmax over axis 1.
    """
    weighted = (new_priors / old_priors) * output_probs
    row_sums = np.sum(weighted, axis=1, keepdims=True)
    # Rows without any weighted mass stay all-zero (argmax -> class 0, as before) instead of 0/0 = NaN.
    adjusted = np.divide(weighted, row_sums, out=np.zeros_like(weighted), where=row_sums > 0)
    return adjusted, np.argmax(adjusted, axis=1)


def _priors_with_warm_up(estimated_priors, warm_up_length):
    """Return the priors to apply per sample, optionally overriding the first warm_up_length rows.

    When DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES is set, the first warm_up_length samples use
    distr_to_assume_for_the_first_window_length_images and the rest use estimated_priors;
    otherwise estimated_priors is returned unchanged.
    """
    if not DONT_ADJUST_FIRST_WINDOW_LENGTH_IMAGES:
        return estimated_priors
    priors = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    priors[:warm_up_length] = distr_to_assume_for_the_first_window_length_images
    priors[warm_up_length:] = estimated_priors[warm_up_length:]
    return priors


initial_output_probs = sequence_of_output_probs

# Baseline: predictions without any adjustment.
save_test_results_to_dict(test_dict=full_results_dict[ckpt_file]['test_results'], id_test=id_test, method='no_adj',
                          window_length=None, num_correct_preds=num_correct_predictions_with_no_adj, sequence_of_images_length=sequence_of_images_length, corr_coeff=None)

# 1. adjusting using real priors: use what the model should predict, from the future; this case should give the best accuracy
real_priors = sequence_of_real_distributions
adj_output_probs, preds_with_adj_using_real_priors = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == preds_with_adj_using_real_priors)
corr_coeff = np.corrcoef(sequence_of_imposed_distributions.flat, sequence_of_real_distributions.flat)[0][1]
save_test_results_to_dict(test_dict=full_results_dict[ckpt_file]['test_results'], id_test=id_test, method='adj_using_real_priors',
                          window_length=None, num_correct_preds=num_correct_predictions_with_adj, sequence_of_images_length=sequence_of_images_length, corr_coeff=corr_coeff)

# 1'. adjusting using overall distribution as priors
real_priors = sequence_test_ds.label_distr
adj_output_probs, preds_with_adj_using_overall_distr = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == preds_with_adj_using_overall_distr)
save_test_results_to_dict(test_dict=full_results_dict[ckpt_file]['test_results'], id_test=id_test, method='adj_using_overall_distr',
                          window_length=None, num_correct_preds=num_correct_predictions_with_adj, sequence_of_images_length=sequence_of_images_length, corr_coeff=None)

for id_window, window_length in enumerate(WINDOW_LENGTHS):
    print('\rProgress = {:.0f}%'.format((id_window + 1)/len(WINDOW_LENGTHS) * 100), end='')
    sequence_of_distributions_based_on_actual_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    sequence_of_distributions_based_on_predicted_labels = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])
    sequence_of_distributions_based_on_predicted_labels_and_cm = np.empty(shape=[sequence_of_images_length, mnist_ds.num_classes])

    # Estimate, for every position k, the label distribution over the labels seen just before k.
    for k in range(sequence_of_images_length):
        if k == 0:
            # No history yet: fall back to the uniform distribution.
            sequence_of_distributions_based_on_actual_labels[k] = uniform_distr
            sequence_of_distributions_based_on_predicted_labels[k] = uniform_distr
        else:
            # Use the last window_length labels, or everything seen so far while fewer are available.
            window_start = max(0, k - window_length)
            num_labels_in_window = k - window_start
            sequence_of_distributions_based_on_actual_labels[k] = np.bincount(sequence_of_actual_labels[window_start:k], minlength=10) / num_labels_in_window
            sequence_of_distributions_based_on_predicted_labels[k] = np.bincount(sequence_of_predicted_labels[window_start:k], minlength=10) / num_labels_in_window

        # Confusion-matrix correction: solve cm_normalized.T @ x = (distribution of predicted labels).
        # NOTE(review): presumably cm_normalized is the row-normalised confusion matrix of the model - confirm.
        cm_corrected_distr = np.linalg.solve(cm_normalized.T, sequence_of_distributions_based_on_predicted_labels[k])
        cm_corrected_distr[cm_corrected_distr < 0] = 0  # if there are negative weights (probably due to numerical precision), make them zero
        sequence_of_distributions_based_on_predicted_labels_and_cm[k] = cm_corrected_distr / np.sum(cm_corrected_distr)  # make it sum to 1

    # 2. adjusting using estimated priors by the recent actual labels (what the model should have predicted in the recent past)
    real_priors = _priors_with_warm_up(sequence_of_distributions_based_on_actual_labels, window_length)
    adj_output_probs, preds_with_adj_using_estimated_priors_from_recent_actual_labels = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

    # 3. adjusting using estimated priors by the recent predicted labels (what the model predicted in the recent past)
    real_priors = _priors_with_warm_up(sequence_of_distributions_based_on_predicted_labels, window_length)
    adj_output_probs, preds_with_adj_using_estimated_priors_from_recent_predicted_labels = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

    # 4. adjusting using estimated priors by the recent predicted labels (what the model predicted in the recent past) and confusion matrix method
    real_priors = _priors_with_warm_up(sequence_of_distributions_based_on_predicted_labels_and_cm, window_length)
    adj_output_probs, preds_with_adj_using_estimated_priors_from_recent_predicted_labels_and_cm = _adjust_probs_to_priors(real_priors, old_priors, initial_output_probs)

    # Record accuracy and the correlation of each estimate with the imposed distributions.
    results_for_this_window = [
        ('adj_using_estimated_priors_from_recent_actual_labels',
         preds_with_adj_using_estimated_priors_from_recent_actual_labels,
         sequence_of_distributions_based_on_actual_labels),
        ('adj_using_estimated_priors_from_recent_predicted_labels',
         preds_with_adj_using_estimated_priors_from_recent_predicted_labels,
         sequence_of_distributions_based_on_predicted_labels),
        ('adj_using_estimated_priors_from_recent_predicted_labels_and_cm',
         preds_with_adj_using_estimated_priors_from_recent_predicted_labels_and_cm,
         sequence_of_distributions_based_on_predicted_labels_and_cm),
    ]
    for method_name, adjusted_preds, estimated_distributions in results_for_this_window:
        num_correct_predictions_with_adj = np.sum(sequence_of_actual_labels == adjusted_preds)
        corr_coeff = np.corrcoef(sequence_of_imposed_distributions.flat, estimated_distributions.flat)[0][1]
        save_test_results_to_dict(test_dict=full_results_dict[ckpt_file]['test_results'], id_test=id_test, method=method_name,
                                  window_length=window_length, num_correct_preds=num_correct_predictions_with_adj, sequence_of_images_length=sequence_of_images_length, corr_coeff=corr_coeff)


In [None]:
# Turn the column-oriented test results of the current checkpoint into a DataFrame.
restored_perf_dict = full_results_dict
test_results_columns = restored_perf_dict[ckpt_file]['test_results']
perf_df = pd.DataFrame(test_results_columns, columns=list(test_results_columns))
display(perf_df.head())

# Rows of one window-based method; later used for the number of window lengths and the tick labels.
uses_recent_actual_labels = perf_df['method'] == 'adj_using_estimated_priors_from_recent_actual_labels'
temp_df = perf_df[uses_recent_actual_labels]
display(temp_df)

In [None]:
# Two side-by-side panels over the evaluated window lengths:
#   left  - accuracy of every method (window-independent methods drawn as horizontal baselines)
#   right - correlation coefficient recorded for each window-based method

bar_width = 0.2  # not used by the line plots below; kept so the notebook namespace is unchanged
line_width = 5
window_tick_positions = np.arange(len(temp_df.window_length))

# (method name in perf_df, line style, legend label, colour)
baseline_methods = [
    ('no_adj', '--', 'no adjusting', T10_palette_rgb_normed[7]),
    ('adj_using_real_priors', '-', 'adjusting using real priors', T10_palette_rgb_normed[2]),
    ('adj_using_overall_distr', ':', 'adjusting using global distribution', T10_palette_rgb_normed[3]),
]
windowed_methods = [
    ('adj_using_estimated_priors_from_recent_actual_labels', '--',
     'adjusting using estimated priors from recent actual labels', T10_palette_rgb_normed[0]),
    ('adj_using_estimated_priors_from_recent_predicted_labels', ':',
     'adjusting using estimated priors from recent predicted labels', T10_palette_rgb_normed[5]),
    ('adj_using_estimated_priors_from_recent_predicted_labels_and_cm', '-',
     'adjusting using estimated priors from recent predicted labels and confusion matrix', T10_palette_rgb_normed[6]),
]


def _plot_windowed_methods(column):
    """Plot the given perf_df column of every window-based method against the window index."""
    for method_name, line_style, legend_label, color in windowed_methods:
        y = np.array(perf_df.loc[perf_df.method == method_name, column], dtype=float)
        # Markers keep the result visible when only one window length was evaluated
        # (a one-point line without a marker draws nothing).
        plt.plot(y, line_style, marker='o', linewidth=3, label=legend_label, color=color)


def _decorate_axes(y_label):
    """Apply the grid, legend, axis labels and window-length ticks shared by both panels."""
    plt.grid()
    plt.legend(loc='lower right', fontsize=20)
    plt.xlabel('Window length', fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.tick_params(labelsize=20)
    plt.xticks(window_tick_positions, temp_df.window_length.astype(int))


plt.figure(figsize=(40,10))

plt.subplot(1,2,1)
plt.title('Accuracy comparison', fontsize=30)
for method_name, line_style, legend_label, color in baseline_methods:
    # .iloc[0] extracts the scalar explicitly; calling float() on a Series is deprecated in pandas.
    y = float(perf_df.loc[perf_df.method == method_name, 'test_acc'].iloc[0])
    # axhline spans the whole panel, so the baseline does not collapse to a zero-length segment
    # when there is a single window length on the x axis.
    plt.axhline(y, linestyle=line_style, linewidth=line_width, alpha=1, label=legend_label, color=color)
_plot_windowed_methods('test_acc')
_decorate_axes('Accuracy')

plt.subplot(1,2,2)
plt.title('Correlation coefficient between sequence of imposed priors and real priors', fontsize=30)
_plot_windowed_methods('corr_coeff')
# This panel shows correlation coefficients, so it must not reuse the 'Accuracy' axis label.
_decorate_axes('Correlation coefficient')

plt.show()
