## Importy

In [None]:
import os
from PIL import Image
import json
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import class_weight
import matplotlib.pyplot as plt

from Trail import Trail
from peak_computations import finished_cipher, get_team_time
from ImageTextEmbeddingModelFusion import ImageTextEmbeddingModelFusion

from HintModel import HintModel
from RobeczechModel import RobeczechModel

import tensorflow as tf
from tensorflow.keras import layers, Input, optimizers
import tensorflow_hub as hub
import tensorflow_text as TF_text

from keras.models import Model

from transformers import AutoTokenizer, TFAutoModelForMaskedLM



# Relative data locations used by the project.
# NOTE(review): the four directories, LABELS and IMG_SIZE are not referenced in
# the visible part of this notebook; confirm whether they are still needed.
LEGENDS_DIR='data/legends'
DATA_DIR='data/resized_images'
TRAIN_TEST_DIR = 'data/output_resized'
TRAIN_TEST_ORIGINALS_DIR = 'data/output'
LABELS=['easy', 'hard']
IMG_SIZE = 512

# Directory with the saved .npy arrays read by load_datasets(). File names are
# appended by plain string concatenation, so the trailing slash is required.
SAVED_DATA='data/saved/'
# Root directory for model checkpoints; get_checkpoint_path() appends
# '<checkpoint name>/cp.ckpt' to it, so the trailing slash is required here too.
CHECKPOINT_PATH='data/checkpoints/'


'''
pokud bude nastavena na True, bude ponecháno rozdělení na trénovací a testovací data
jako bylo v modelu predikce obtížnosti, jinak se přerozdělí znovu
'''
# When True, the train/validation split stored in the saved arrays is used as
# loaded. When False, both parts are concatenated and split again with
# train_test_split (test_size=0.33, random_state=42).
KEEP_TRAIN_TEST=False

## Predikce nápověd

In [None]:
def load_datasets():
    """Load the saved training/validation arrays from SAVED_DATA.

    For every kind of array two files are read, ``tr_<kind>.npy`` and
    ``val_<kind>.npy``.

    Returns
    -------
    tuple of 14 arrays, as (train, validation) pairs in this order:
    stats, trail, task, layer, legends, hints, names.
    Note that legends come before hints in the result, although the hint
    files are read first.
    """
    splits = ("tr", "val")

    # Read the files in the same sequence as before: hints before legends.
    loaded = {}
    for kind in ("stats", "trail", "task", "layer", "hints", "legends", "names"):
        for split in splits:
            loaded[split, kind] = np.load(SAVED_DATA + split + "_" + kind + ".npy")

    # The returned order swaps the last-but-one pairs: legends, then hints.
    result_order = ("stats", "trail", "task", "layer", "legends", "hints", "names")
    return tuple(loaded[split, kind] for kind in result_order for split in splits)

## Matice záměn

In [None]:
from sklearn.metrics import f1_score
def make_confusion_matrix(target_y, predicted_y):
    """Print the F1 score and plot the confusion matrix of two-column outputs.

    Both arguments are iterated row by row and only the element at index 1 of
    each row is used. Predictions are rounded with ``np.around`` first, so the
    compared values are the rounded second column against the second column
    of the targets.

    Nothing is returned; the score is printed and the matrix is drawn through
    ``plot_confusion_matrix`` with the tick labels 'nevyužili' / 'využili'.
    """
    rounded = np.around(predicted_y)
    true_labels = [row[1] for row in target_y]
    pred_labels = [row[1] for row in rounded]

    score = f1_score(true_labels, pred_labels)
    print("F1 score: " + str(score))

    matrix = confusion_matrix(true_labels, pred_labels)
    plot_confusion_matrix(matrix, ['nevyužili', 'využili'], title="")

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=False):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (2-D, one row and one column per class)

    target_names: class names used as tick labels, for example
                  ['high', 'medium', 'low']; None keeps the default ticks

    title:        the text to display at the top of the matrix

    cmap:         matplotlib colormap, e.g. plt.get_cmap('jet') or
                  plt.cm.Blues; defaults to 'Blues' when None
                  see http://matplotlib.org/examples/color/colormaps_reference.html

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (every row divided by its
                  sum; a row that sums to zero is shown as zeros)

    Side effects
    ------------
    Sets the global matplotlib font size to 18, opens a new figure and calls
    plt.show(). Returns None.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    cm = np.asarray(cm)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise BEFORE drawing so that the colours, the printed numbers and
    # the text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at zero instead of turning into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.rcParams.update({'font.size': 18})
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text, the others black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Reálné využití')
    plt.xlabel('Predikované využití')
    plt.show()

## Načtení dat do proměnných

In [None]:
# Read the saved arrays; each kind comes as a (train, validation) pair.
(tr_st, val_st,
 tr_trl, val_trl,
 tr_task, val_task,
 tr_lay, val_lay,
 tr_leg, val_leg,
 tr_h, val_h,
 tr_names, val_names) = load_datasets()

if not KEEP_TRAIN_TEST:
    # Pool the stored train and validation parts back together ...
    stats = np.concatenate((tr_st, val_st))
    trl = np.concatenate((tr_trl, val_trl))
    task = np.concatenate((tr_task, val_task))
    lay = np.concatenate((tr_lay, val_lay))
    leg = np.concatenate((tr_leg, val_leg))
    h = np.concatenate((tr_h, val_h))
    names = np.concatenate((tr_names, val_names))

    # ... and split all seven arrays again with one call, so the rows stay
    # aligned across them. The fixed random_state makes the split repeatable.
    (tr_st, val_st,
     tr_trl, val_trl,
     tr_task, val_task,
     tr_lay, val_lay,
     tr_leg, val_leg,
     tr_h, val_h,
     tr_names, val_names) = train_test_split(
        stats, trl, task, lay, leg, h, names,
        test_size=0.33, random_state=42)

### Vyváženost dat

In [None]:
# Label 1 is the positive class; the two bincount entries are unpacked as
# (negative count, positive count).
tr_neg, tr_pos = np.bincount(tr_h)
tr_total = tr_pos + tr_neg
tr_pos_percent = 100 * tr_pos / tr_total
print('Training examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    tr_total, tr_pos, tr_pos_percent))

# Log-odds of the positive class in the training data, kept as a 1-element array.
initial_bias = np.log(np.array([tr_pos / tr_neg]))

val_neg, val_pos = np.bincount(val_h)
val_total = val_pos + val_neg
val_pos_percent = 100 * val_pos / val_total
print('Validation examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    val_total, val_pos, val_pos_percent))

### Vytvoření trénovací a testovací sady

Legendy jsou zvlášť, protože každý jazykový model je zpracovává jinak.

In [None]:
# One-hot encode the hint labels into two columns (class 0 / class 1).
tr_h_tf = tf.one_hot(tr_h, depth=2)
val_h_tf = tf.one_hot(val_h, depth=2)

# Model inputs in the order: stats, task, trail, layer.
# The legends are not part of these lists; train_model/eval_model append them
# in the form the chosen language model needs.
x_train, y_train = [tr_st, tr_task, tr_trl, tr_lay], tr_h_tf
x_val, y_val = [val_st, val_task, val_trl, val_lay], val_h_tf

In [None]:
# 'balanced' class weights computed from the integer training labels.
# The class list is taken from the labels themselves (tr_h); deriving it from
# the one-hot encoded targets only worked because a one-hot tensor happens to
# contain the values 0 and 1.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(tr_h), y=tr_h)


In [None]:
# Show the computed class weights as the cell output.
class_weights

In [None]:
# Class index -> weight mapping; passed as `class_weights=` when training.
CLASS_WEIGHTS = {label: class_weights[label] for label in (0, 1)}

### Model

In [None]:
def get_checkpoint_path(chckpt):
    """Return the checkpoint file path for the checkpoint name `chckpt`.

    The result is CHECKPOINT_PATH followed by the name and '/cp.ckpt'.
    """
    return "".join((CHECKPOINT_PATH, chckpt, '/cp.ckpt'))

In [None]:
def train_model(nn_model, lang_model, chckpt, x_train, y_train, x_val, y_val, tr_leg, val_leg, optimizer, batch_size, epochs, continue_previous=False, prev_chckpt='a'):
    """Build one of the two hint models and train it.

    Parameters
    ----------
    nn_model : 0 selects HintModel, 1 selects RobeczechModel.
    lang_model : language model name handed to HintModel; ignored when
        nn_model is 1.
    chckpt : checkpoint name for this run (see get_checkpoint_path).
    x_train, x_val : lists of input arrays; the legend inputs are appended
        here (new lists are built, the arguments are not modified).
    y_train, y_val : training / validation targets.
    tr_leg, val_leg : legends; passed as-is to HintModel, tokenized with
        RobeczechModel.tokenize for RobeczechModel (only the first two of the
        three returned values are used as inputs).
    optimizer : optimizer given to the model constructor.
    batch_size, epochs : forwarded to the model's train_model.
    continue_previous : when True, weights are loaded from the checkpoint
        named `prev_chckpt` before training.
    prev_chckpt : name of the checkpoint to continue from.

    Returns
    -------
    Whatever the model's ``train_model`` returns (the notebook reads
    ``.history`` on it).

    Raises
    ------
    ValueError
        If `nn_model` is neither 0 nor 1.
    """
    # Fail early with a clear message; previously an unknown value left
    # `my_model` unbound and crashed later with UnboundLocalError.
    if nn_model not in (0, 1):
        raise ValueError(
            "nn_model must be 0 (HintModel) or 1 (RobeczechModel), got {!r}".format(nn_model))

    chckpt_path = get_checkpoint_path(chckpt)

    if nn_model == 0:
        # NOTE(review): meaning of the second positional argument (False) is
        # defined in HintModel - confirm there.
        my_model = HintModel(lang_model, False, optimizer=optimizer)
        my_x_train = x_train + [tr_leg]
        my_x_val = x_val + [val_leg]
    else:
        my_model = RobeczechModel(optimizer)
        tr_ids, tr_mask, _ = my_model.tokenize(tr_leg)
        val_ids, val_mask, _ = my_model.tokenize(val_leg)

        my_x_train = x_train + [tr_ids, tr_mask]
        my_x_val = x_val + [val_ids, val_mask]

    if continue_previous:
        # Start from the weights of an earlier run.
        my_model.set_checkpoint_path(get_checkpoint_path(prev_chckpt))
        my_model.load_weights()

    my_history = my_model.train_model(
        my_x_train, y_train, my_x_val, y_val, chckpt_path,
        batch_size=batch_size, epochs=epochs, class_weights=CLASS_WEIGHTS)

    return my_history

In [None]:
'''
vzhledem k velikosti modelu a časové náročnosti výpočtu je vyhodnocen výsledek jen na validační sadě
stejně tak lze přidat trénovací
'''

def eval_model(nn_model, lang_model, chckpt, x_train, y_train, x_val, y_val, tr_leg, val_leg):
    """Load a trained model from a checkpoint and evaluate it on the validation set.

    Only the validation data are evaluated (because of model size and run
    time); `x_train`, `y_train` and `tr_leg` are accepted but not used.

    Parameters
    ----------
    nn_model : 0 selects HintModel, 1 selects RobeczechModel.
    lang_model : language model name handed to HintModel; ignored when
        nn_model is 1.
    chckpt : name of the checkpoint whose weights are loaded.
    x_val : list of validation input arrays; the legend inputs are appended
        here (a new list is built, the argument is not modified).
    y_val : validation targets.
    val_leg : validation legends; passed as-is to HintModel, tokenized for
        RobeczechModel.

    Prints the F1 score and confusion matrix (via make_confusion_matrix) and
    the second value returned by ``model.evaluate``.

    Returns
    -------
    (my_model, val_pred) : the model wrapper and its validation predictions.

    Raises
    ------
    ValueError
        If `nn_model` is neither 0 nor 1.
    """
    # Fail early with a clear message; previously an unknown value left
    # `my_model` unbound and crashed later with UnboundLocalError.
    if nn_model not in (0, 1):
        raise ValueError(
            "nn_model must be 0 (HintModel) or 1 (RobeczechModel), got {!r}".format(nn_model))

    chckpt_path = get_checkpoint_path(chckpt)

    # The optimizer is only needed to construct the model; no training happens here.
    if nn_model == 0:
        my_model = HintModel(lang_model, False, optimizer='adam')
        my_x_val = x_val + [val_leg]
    else:
        my_model = RobeczechModel('adam')
        val_ids, val_mask, _ = my_model.tokenize(val_leg)
        my_x_val = x_val + [val_ids, val_mask]

    my_model.set_checkpoint_path(chckpt_path)
    my_model.load_weights()
    val_pred = my_model.model.predict(my_x_val)

    print("--------- Val cf ----------")
    make_confusion_matrix(y_val, val_pred)

    # Index 1 is presumably the accuracy metric (index 0 being the loss) -
    # TODO confirm against the metrics the model is compiled with.
    val_acc = my_model.model.evaluate(my_x_val, y_val, verbose=0)[1]
    print(val_acc)

    return my_model, val_pred

In [None]:
'''
učení modelu s jazykovým modelem RoBERTa, tu lze vyměnit za BERT, s parametrem lang_model='bert'
'''
# Train HintModel (nn_model=0) with the RoBERTa language model. Training
# continues from the 'roberta_resampled' checkpoint, which therefore has to
# exist, and is saved under 'roberta_resampled_2'.
opt = tf.keras.optimizers.Adam(learning_rate=1.5e-4)
hist2 = train_model(
    0, 'roberta', 'roberta_resampled_2',
    x_train, y_train, x_val, y_val, tr_leg, val_leg,
    optimizer=opt, batch_size=1536, epochs=30,
    continue_previous=True, prev_chckpt='roberta_resampled')

In [None]:
'''
učení modelu s jazykovým modelem RobeCzech
'''
# Train RobeczechModel (nn_model=1) from scratch with batch size 1.
# The 'roberta' argument is ignored by train_model for this model type.
opt = tf.keras.optimizers.Adam(learning_rate=5e-6)
hist = train_model(
    1, 'roberta', 'hint_robeczech_resampled',
    x_train, y_train, x_val, y_val, tr_leg, val_leg,
    optimizer=opt, batch_size=1, epochs=10,
    continue_previous=False, prev_chckpt='')

### Vyhodnocení průběhu přesnosti během učení modelu

In [None]:
# Learning curves of the run stored in `hist`.
history = hist

# Accuracy per epoch: training curve first, validation second
# (no title is set on this figure).
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric])
plt.ylabel('přesnost')
plt.xlabel('epocha')
plt.legend(['trénovací', 'validační'], loc='upper left')
plt.show()

# Loss per epoch, same curve order.
for metric in ('loss', 'val_loss'):
    plt.plot(history.history[metric])
plt.title('loss modelu')
plt.ylabel('loss')
plt.xlabel('epocha')
plt.legend(['trénovací', 'validační'], loc='upper left')
plt.show()

### Vyhodnocení modelu s nejvyšší přesností

In [None]:
# Evaluate HintModel with RoBERTa using the weights stored in the
# 'roberta_resampled' checkpoint.
my_mod, val_pred = eval_model(
    0, 'roberta', 'roberta_resampled',
    x_train, y_train, x_val, y_val, tr_leg, val_leg)