In this notebook we will compare the MBR scores calculated with the Monte Carlo estimates and those from the predictive model.
We will try to analyse failure points and think about how to tackle them.



In [None]:
# Some setup code for imports
import os
import sys
# Put the directory two levels above the working directory on the import
# path so that the project packages imported below can be resolved.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from custom_datasets.BayesRiskDatasetLoader import BayesRiskDatasetLoader


In [None]:

# Load the validation split together with its calculated scores
# as a pandas-backed dataset.
dataset_loader = BayesRiskDatasetLoader(
    "validation_predictive",
    n_hypotheses=100,
    n_references=1000,
    sampling_method='ancestral',
)
validation_dataset = dataset_loader.load(type="pandas")


In [None]:
import numpy as np
def calculate_mbr_scores(entry):
    """Return the count-weighted mean utility of each hypothesis in *entry*.

    ``entry["utilities"]`` supplies one utility vector per hypothesis and
    ``entry["utilities_count"]`` the weight applied to each position of
    those vectors. Every score is ``sum(utility * weight) / sum(weight)``.

    Returns:
        A list with one score per (hypothesis, utility vector) pair.
    """
    weights = entry["utilities_count"]
    total_weight = np.sum(weights)
    # Pairing with the hypotheses keeps the result no longer than the
    # hypothesis list, even though the hypothesis text itself is not needed.
    return [
        np.sum(utility_row * weights) / total_weight
        for _, utility_row in zip(entry["hypotheses"], entry["utilities"])
    ]

In [None]:
### Next up we will read the trained model and calculate the score given by the heads
from models.pl_predictive.PLPredictiveModelFactory import PLPredictiveModelFactory

# Name of the trained predictive model that is analysed in this notebook.
model_name = "student-t-3-repeated"
# Notebook-relative output directory for this model; the image folders
# created further down are placed below it.
path = './{}/'.format(model_name)
# NOTE(review): machine-specific absolute path; it has to be adapted when the
# notebook is run on another machine.
model_path = "C:/Users/gerso/FBR/predictive/tatoeba-de-en/models/"+ model_name + '/'
model, factory = PLPredictiveModelFactory.load(model_path)
# presumably switches the model to inference mode (torch convention) - confirm
model = model.eval()

In [None]:
from models.MBR_model.GaussianMixtureMBRModel import GaussianMixtureMBRModel
from models.MBR_model.StudentTMixtureMBRModel import StudentTMixtureMBRModel
# Wrap the loaded model; the wrapper's get_samples / get_mean methods are what
# the plotting and error cells below call.
wrapped_model = StudentTMixtureMBRModel(model)




In [None]:
import seaborn as sns


def get_repeated_utils(utilities, count):
    """Expand count-compressed utility rows into explicit per-sample lists.

    Args:
        utilities: Iterable of rows; each row is a sequence of utility values.
        count: Repetition count for each position within a row.

    Returns:
        One list per row in which the value at position ``j`` appears
        ``count[j]`` times, in the original order.
    """
    def _expand(row):
        # Positions beyond the shorter of (row, count) are dropped by zip.
        expanded = []
        for value, times in zip(row, count):
            expanded.extend([value] * times)
        return expanded

    return [_expand(row) for row in utilities]

In [None]:
import matplotlib.pyplot as plt
def clean(s):
    """Return *s* with characters that are unsafe in file names removed.

    The result is used as the file name of a saved figure. Besides the
    question mark, the path separators and the other characters that
    Windows reserves in file names (``\\ / : * " < > |``) are dropped so
    that any source sentence yields a usable name.

    Args:
        s: Text to sanitise.

    Returns:
        The text without the reserved characters.
    """
    # A single translate pass deletes every reserved character.
    return s.translate(str.maketrans('', '', '\\/:*?"<>|'))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


def plot_distributions(source, target, hypotheses, utilities, model, max_show=5, path='./validtion_imgs/'):
    """Plot data and model utility distributions per hypothesis and save them.

    For each of the first ``max_show`` hypotheses two histograms are drawn:
    one from the given ``utilities`` (row "Data") and one from values
    returned by ``model.get_samples`` (row "Model"). A dashed red line marks
    the mean of every histogram. The figure is saved as
    ``<path>/<cleaned source>.png``.

    Args:
        source: Source sentence; shown in the title and, after ``clean``,
            used as the file name.
        target: Target sentence; only shown in the title.
        hypotheses: Sliceable sequence of hypothesis strings.
        utilities: One list of utility values per hypothesis.
        model: Object providing
            ``get_samples(sources, hypotheses, n_samples=...)``.
        max_show: Number of hypotheses (figure columns) to plot.
        path: Output directory (``str`` or ``Path``); created when missing.
            NOTE(review): the default looks like a typo of
            'validation_imgs'; it is kept so existing callers are unaffected.
    """
    import os  # local import so the function does not rely on other cells

    sns.set_style("dark")

    # Samples are drawn for every hypothesis and truncated afterwards.
    # assumes get_samples returns a tensor shaped
    # (n_samples, n_hypotheses, 1) - TODO confirm
    samples = model.get_samples([source] * len(hypotheses), hypotheses, n_samples=1000)
    samples = samples.permute([1, 0, 2]).squeeze(-1).cpu().numpy().tolist()

    utilities = utilities[:max_show]
    samples = samples[:max_show]
    hypotheses = hypotheses[:max_show]

    # Long-format table with one record per utility value; the "Samples"
    # column tells the two origins apart.
    data = {"Utility": [], "Hypothesis": [], "Samples": []}
    for origin, rows in (("Data", utilities), ("Model", samples)):
        for values, hypothesis in zip(rows, hypotheses):
            data["Utility"] += values
            data["Hypothesis"] += [hypothesis] * len(values)
            data["Samples"] += [origin] * len(values)
    df = pd.DataFrame.from_dict(data)

    g = sns.displot(df, x='Utility', col="Hypothesis", row='Samples', facet_kws=dict(margin_titles=True))
    title = 'Source: {} \ntarget: {}'.format(source, target)

    g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
    g.fig.suptitle(title)

    def specs(x, **kwargs):
        # Mark the mean of the plotted utilities.
        plt.axvline(x.mean(), c="red", alpha=0.5, linestyle='--')

    g.map(specs, 'Utility')

    # savefig does not create missing directories, so make sure the target
    # directory exists before writing.
    out_dir = str(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    save_file = out_dir + "/{}.png".format(clean(source))
    g.fig.savefig(save_file)


In [None]:
from pathlib import Path

# We want to create some plots for random samples
def compare_random_samples(data, model, n_examples=5, seed=1, max_show=5, save=True, path='./'):
    """Plot data-vs-model utility distributions for randomly chosen rows.

    Args:
        data: DataFrame whose rows provide "source", "target", "hypotheses",
            "utilities" and "utilities_count".
        model: Model handed to ``plot_distributions`` for sampling.
        n_examples: Number of rows to draw. Rows are drawn with replacement
            (the default of ``np.random.choice``), so a row can repeat.
        seed: Seed applied to NumPy's global random state.
        max_show: Number of hypotheses plotted per row.
        save: Currently unused; ``plot_distributions`` always saves.
        path: Output directory forwarded to ``plot_distributions``.
    """
    np.random.seed(seed)
    indices = np.random.choice(len(data), size=n_examples)

    for i in indices:
        entry = data.iloc[i]
        utilities = get_repeated_utils(entry["utilities"], entry["utilities_count"])
        # Use the model and max_show given by the caller rather than a
        # notebook-level global / the callee's default.
        plot_distributions(
            entry["source"],
            entry["target"],
            entry["hypotheses"].tolist(),
            utilities,
            model,
            max_show=max_show,
            path=path,
        )
        
# Create the 'validation_imgs' folder below the model's output directory and
# save plots for randomly chosen validation rows into it.
img_path_str =  path + 'validation_imgs/'
img_path = Path(img_path_str)
img_path.mkdir(parents=True, exist_ok=True)
compare_random_samples(validation_dataset.data, wrapped_model, path=img_path)
        
    
    
    





In [None]:

### Load the training split with the same loader settings as the validation split.
train_dataset_loader = BayesRiskDatasetLoader(
    "train_predictive",
    n_hypotheses=100,
    n_references=1000,
    sampling_method='ancestral',
)
dataset_train = train_dataset_loader.load(type="pandas")

In [None]:
        
# Create the 'training_imgs' folder below the model's output directory and
# save plots for randomly chosen training rows into it.
img_path_str =  path + 'training_imgs/'
img_path = Path(img_path_str)
img_path.mkdir(parents=True, exist_ok=True)
compare_random_samples(dataset_train.data, wrapped_model, path=img_path)
        
    


In [None]:
### Next up we want to compute the MSE

# First we get the data mean of each hypothesis


# Then we get the predicted mean
from tqdm import tqdm
# Per source sentence: the Monte Carlo MBR scores computed from the data and
# the mean utilities returned by the model, in matching hypothesis order.
all_mbr_scores = []
all_predicted_scores = []

# The progress-bar total is taken from the data so it stays correct for any
# dataset size.
for _, entry in tqdm(validation_dataset.data.iterrows(), total=len(validation_dataset.data)):
    mbr_scores = calculate_mbr_scores(entry)

    hypotheses = entry["hypotheses"].tolist()
    # get_mean is called with one source per hypothesis, so the source is
    # repeated to match the hypothesis list.
    srcs = [entry["source"]] * len(hypotheses)
    predicted_scores = wrapped_model.get_mean(srcs, hypotheses)

    all_mbr_scores.append(mbr_scores)
    all_predicted_scores.append(predicted_scores)







In [None]:
# Per source sentence, the element-wise difference between the Monte Carlo
# scores and the (flattened) predicted scores; squared in the next cell.
differences = [
    (np.array(target) - np.array(predicted).flatten()).tolist()
    for target, predicted in zip(all_mbr_scores, all_predicted_scores)
]
# Number of source sentences that were compared.
c = len(differences)
    
    



In [None]:
# Mean squared error over all hypotheses of all source sentences.
# squared_errors keeps one array per source sentence for the cells below.
squared_errors = [np.array(diff_list) ** 2 for diff_list in differences]

count = sum(len(per_source) for per_source in squared_errors)
total_squared_error = sum(np.sum(per_source) for per_source in squared_errors)

print(count)
print(total_squared_error)
print(total_squared_error/count)


In [None]:
# Flatten the per-sentence squared errors into one list so they can be
# plotted as a single histogram in the next cell.
flattened_squarred_errors = [
    error for per_source in squared_errors for error in per_source
]

print(flattened_squarred_errors)


In [None]:
# Histogram of the squared errors of all hypotheses.
sns.histplot(flattened_squarred_errors)
    

In [None]:

# For every source sentence, look up the squared error at two positions:
# the hypothesis ranked best by the Monte Carlo scores and the hypothesis
# ranked best by the predicted scores.
target_best_errors, predicted_best_errors = [], []

for mc_scores, model_scores, sentence_errors in zip(all_mbr_scores, all_predicted_scores, squared_errors):
    target_best_errors.append(sentence_errors[np.argmax(mc_scores)])
    predicted_best_errors.append(sentence_errors[np.argmax(model_scores)])

In [None]:
# Distribution and mean of the squared error at the hypothesis with the
# highest Monte Carlo score.
sns.histplot(target_best_errors)
print(np.mean(target_best_errors))

In [None]:
# Distribution and mean of the squared error at the hypothesis with the
# highest predicted score.
sns.histplot(predicted_best_errors)
print(np.mean(predicted_best_errors))