In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import json
from myGenerator import DataGenerator

### Carga de datos

In [None]:
# Locations of the pickled train / validation partitions.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
train_pickle_path = "pan22-authorship-verification-training-dataset/particionesXid/PanTrain.plk"
val_pickle_path = "pan22-authorship-verification-training-dataset/particionesXid/PanVal.plk"

train = pd.read_pickle(train_pickle_path)
val = pd.read_pickle(val_pickle_path)

In [None]:
def spectraLoader(inputPath: str) -> pd.DataFrame:
    """Load a JSON-lines spectra file into a DataFrame.

    Every line of the file is parsed as a JSON object that must contain an
    ``id`` entry and a ``spectra`` entry.

    Args:
        inputPath: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns: ``id`` (the
        value read from each line) and ``spectra`` (one ``np.ndarray`` per
        row, built from the line's ``spectra`` entry). An empty file yields
        an empty frame that still has both columns.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
        KeyError: If a line lacks ``id`` or ``spectra``.
    """
    ids = []
    spectra = []
    with open(inputPath, encoding='utf-8') as f:
        for line in f:
            jsonline = json.loads(line)
            ids.append(jsonline['id'])
            spectra.append(np.array(jsonline['spectra']))

    # Build the frame once: concatenating inside the loop copies all previous
    # rows on every iteration (quadratic in the number of lines).
    # dtype=object keeps each array as a single cell of the column.
    return pd.DataFrame({'id': ids, 'spectra': pd.Series(spectra, dtype=object)})

In [None]:
# Read the spectra of every text from the JSON-lines file.
spectra_path = "pan22-authorship-verification-training-dataset/particionesXid/FullSpectra.jsonl"
full_spectra = spectraLoader(spectra_path)

In [None]:
def loadLabels(inputPath:str, labelDict:dict) -> dict:
    """Add the labels stored in a JSON-lines truth file to ``labelDict``.

    Each line is parsed as a JSON object with an ``id`` and a ``value``
    entry; the id is mapped to 1 when the value is truthy and to 0 otherwise.

    Args:
        inputPath: Path of the UTF-8 encoded JSON-lines file.
        labelDict: Mapping that is updated in place; entries with an id
            already present are overwritten.

    Returns:
        The same ``labelDict`` object that was passed in.
    """
    with open(inputPath, encoding='utf-8') as truth_file:
        # map() is lazy, so lines are still parsed one at a time, in order.
        for record in map(json.loads, truth_file):
            labelDict[record['id']] = int(bool(record['value']))
    return labelDict

In [None]:
# Collect the ground-truth labels of both partitions into a single dict.
labels = {}
for truth_path in (
    'pan22-authorship-verification-training-dataset/particiones/train_truth.jsonl',
    'pan22-authorship-verification-training-dataset/particiones/val_truth.jsonl',
):
    labels = loadLabels(truth_path, labels)

In [None]:
# Sanity check: compare the number of instances with the number of labels.
# NOTE(review): the row total is halved, presumably because every instance
# spans two rows of the partition frames - confirm.
total_rows = len(train) + len(val)
print("Instances: {}".format(total_rows / 2))
print("Labels: ", len(labels))

In [None]:
# Unique instance ids of each split, in order of first appearance.
partition = {
    split_name: frame['id'].unique().tolist()
    for split_name, frame in (('train', train), ('validation', val))
}

In [None]:
# Report how many unique ids ended up in each split.
n_train_ids = len(partition['train'])
n_validation_ids = len(partition['validation'])
print("Train: ", n_train_ids)
print("Validation: ", n_validation_ids)
print("Total: ", n_train_ids + n_validation_ids)

### Definición de la arquitectura del modelo

In [None]:
# Siamese network model using a residual network and a multilayer perceptron, WITHOUT CUSTOM LAYERS

def setModel():
    """Build and compile the two-input siamese classifier.

    Both inputs are vectors of length 1200 and go through the same three
    600-unit ReLU dense layers (each layer object is applied to both inputs,
    so the two branches share weights). The third dense layer receives the
    difference between the outputs of the first and second layers. The
    element-wise absolute difference of the two branch outputs feeds a
    300-unit ReLU hidden layer followed by a single sigmoid output unit.

    Returns:
        A ``tf.keras.Model`` taking ``[input_1, input_2]``, compiled with
        binary cross-entropy loss, the Adam optimizer and the
        ``binary_accuracy`` metric.
    """
    x1 = tf.keras.Input(shape=(1200,), name="input_1")
    x2 = tf.keras.Input(shape=(1200,), name="input_2")

    # Disabled experiment: batch normalization of the inputs
    # bn_layer = tf.keras.layers.BatchNormalization(name="Batch_normalization")
    # x1 = bn_layer(x1)
    # x2 = bn_layer(x2)

    # # Disabled experiment: Gaussian noise
    # gaussian_noise_layer = tf.keras.layers.GaussianNoise(1.0, name="Gaussian_noise")
    # x1 = gaussian_noise_layer(x1)
    # x2 = gaussian_noise_layer(x2)

    # # Disabled experiment: dropout
    # dropout_layer = tf.keras.layers.Dropout(0.4, name="Dropout_1")
    # x1 = dropout_layer(x1)
    # x2 = dropout_layer(x2)

    # Residual network: every dense layer below is shared by both branches
    dense_layer_1 = tf.keras.layers.Dense(600, activation='relu', name="resdense_1")
    o1_1 = dense_layer_1(x1)
    o1_2 = dense_layer_1(x2)

    dense_layer_2 = tf.keras.layers.Dense(600, activation='relu', name="resdense_2")
    o2_1 = dense_layer_2(o1_1)
    o2_2 = dense_layer_2(o1_2)

    # Residual layer: its input is (first dense output - second dense output)
    dense_layer_3 = tf.keras.layers.Dense(600, activation='relu', name="resdense_3")
    o3_1 = dense_layer_3(tf.subtract(o1_1, o2_1))
    o3_2 = dense_layer_3(tf.subtract(o1_2, o2_2))

    # Disabled experiment: deeper stack of 256-unit dense / residual layers
    # dense_layer_4 = tf.keras.layers.Dense(256, activation='relu', name="resdense_4")
    # o4_1 = dense_layer_4(o3_1)
    # o4_2 = dense_layer_4(o3_2)

    # # Residual layer
    # dense_layer_5 = tf.keras.layers.Dense(256, activation='relu', name="resdense_5")
    # o5_1 = dense_layer_5(tf.subtract(o3_1, o4_1))
    # o5_2 = dense_layer_5(tf.subtract(o3_2, o4_2))

    # dense_layer_6 = tf.keras.layers.Dense(256, activation='relu', name="resdense_6")
    # o6_1 = dense_layer_6(o5_1)
    # o6_2 = dense_layer_6(o5_2)

    # # Residual layer
    # dense_layer_7 = tf.keras.layers.Dense(256, activation='relu', name="resdense_7")
    # o7_1 = dense_layer_7(tf.subtract(o5_1, o6_1))
    # o7_2 = dense_layer_7(tf.subtract(o5_2, o6_2))

    # dense_layer_8 = tf.keras.layers.Dense(256, activation='relu', name="resdense_8")
    # o8_1 = dense_layer_8(o7_1)
    # o8_2 = dense_layer_8(o7_2)

    # # Residual layer
    # dense_layer_9 = tf.keras.layers.Dense(256, activation='relu', name="resdense_9")
    # o9_1 = dense_layer_9(tf.subtract(o7_1, o8_1))
    # o9_2 = dense_layer_9(tf.subtract(o7_2, o8_2))

    # Merging: element-wise absolute difference between the two branch outputs
    # merge = tf.math.abs(tf.subtract(o9_1,o9_2))
    merge = tf.math.abs(tf.subtract(o3_1,o3_2))

    # MLPNN Classifier: one hidden layer, then a single sigmoid unit
    hidden = tf.keras.layers.Dense(300, activation='relu', name="hidden_layer")(merge)
    # dropout_layer2 = tf.keras.layers.Dropout(0.4, name="Dropout_2")(hidden)
    output = tf.keras.layers.Dense(1, activation="sigmoid", name="output_layer")(hidden)#(dropout_layer2)

    model = tf.keras.Model([x1, x2], output)
    model.compile(loss = 'binary_crossentropy', optimizer = "adam", metrics=["binary_accuracy"])

    return model

In [None]:
# Build one model instance and print its layer summary.
# The training cell further down calls setModel() again, so this instance is
# only used for inspection here.
model = setModel()
model.summary()

### Entrenamiento del modelo

In [None]:
# Keyword arguments passed to both data generators
params = dict(
    dim=(1200,),
    batch_size=32,
    shuffle=True,
)

In [None]:
# One generator per split; both share the label dict, the spectra table and
# the generator settings in `params`.
training_generator, validation_generator = (
    DataGenerator(partition[split_name], frame, labels, full_spectra, **params)
    for split_name, frame in (('train', train), ('validation', val))
)

In [None]:
# Display the name of the GPU device TensorFlow can see (an empty string means
# none was found), as a check before the training cell pins work to GPU:0.
tf.test.gpu_device_name()

In [None]:
# Train model on dataset
with tf.device('/device:GPU:0'):
    # Start from a freshly initialised model so re-running this cell does not
    # continue training a previously fitted one.
    model = setModel()
    # Stop on the validation loss rather than the training loss: the training
    # loss keeps improving while the model overfits, so monitoring it does not
    # protect generalisation. restore_best_weights rolls the model back to the
    # best validation epoch; `history` still records every epoch that ran.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                patience=5,
                                                restore_best_weights=True)
    history = model.fit(x=training_generator,
                        validation_data=validation_generator,
                        callbacks=[callback],
                        epochs=100,
                        verbose=1)

In [None]:
# Persist the trained model under models/ClassifierModel-v7.
# NOTE(review): the path has no file extension; which on-disk format this
# produces (or whether it is accepted) depends on the installed Keras version - confirm.
model.save('models/ClassifierModel-v7')

### Resultados del entrenamiento

In [None]:
# Plotting libraries used by the result cells below.
# NOTE(review): consider moving these imports to the first cell with the others.
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" theme for the figures that follow.
sns.set_theme(style="darkgrid")

In [None]:
# Show which metric series were recorded during training.
history.history.keys()

In [None]:
def _metric_frame(column: str, train_key: str, val_key: str) -> pd.DataFrame:
    """Stack the train and validation curves of one metric into a long frame.

    The result has the columns ``epoch`` (position within the curve),
    ``column`` (the metric value) and ``type`` ('train' or 'val').
    """
    curves = []
    for split_name, key in (('train', train_key), ('val', val_key)):
        curve = pd.DataFrame({column: history.history[key], 'type': split_name})
        # reset_index() turns the 0..n-1 row index into the epoch column.
        curves.append(curve.reset_index().rename(columns={'index': 'epoch'}))
    return pd.concat(curves).reset_index(drop=True)


acc = _metric_frame('accuracy', 'binary_accuracy', 'val_binary_accuracy')
loss = _metric_frame('loss', 'loss', 'val_loss')

In [None]:
# Accuracy (left) and loss (right) per epoch, one line per split.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for axis, (metric, frame) in zip(ax, (('accuracy', acc), ('loss', loss))):
    sns.lineplot(data=frame, x='epoch', y=metric, hue='type', ax=axis)
plt.show()

In [None]:
# Accuracy rows (train and val) of the last recorded epoch.
last_acc_epoch = acc['epoch'].max()
acc[acc['epoch'] == last_acc_epoch]

In [None]:
# Loss rows (train and val) of the last recorded epoch.
last_loss_epoch = loss['epoch'].max()
loss[loss['epoch'] == last_loss_epoch]

### Evaluación de datos de prueba

In [None]:
# Run PredictionsTest.py in a shell with `testinput` as input (-i) and
# `testoutput/v7` as output (-o).
# NOTE(review): presumably this writes the model's predictions for the test
# pairs; the script is not shown here - confirm.
!python PredictionsTest.py -i testinput -o testoutput/v7

In [None]:
# Run the pan22_verif_evaluator.py script in a shell; -a and -o both point at
# testoutput/v7, the output directory of the previous cell.
# NOTE(review): presumably -i holds the ground truth and -a the answers to
# score; the script is not shown here - confirm.
!python pan22_verif_evaluator.py -i testinput -a testoutput/v7 -o testoutput/v7