In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import json
from myGenerator import DataGenerator

### Carga de datos

In [3]:
train = pd.read_pickle("pan22-authorship-verification-training-dataset/particionesXid/PanTrain.plk")
val = pd.read_pickle("pan22-authorship-verification-training-dataset/particionesXid/PanVal.plk")
test = pd.read_pickle("pan22-authorship-verification-training-dataset/particionesXid/PanTest.plk")
docDict = pd.read_pickle("pan22-authorship-verification-training-dataset/particionesXid/UniquePanDocs.plk")

In [4]:
def spectraLoader(inputPath: str) -> pd.DataFrame:
    df = pd.DataFrame()
    with open(inputPath, encoding='utf-8') as f:
        for line in f:
            jsonline = json.loads(line)
            lineDf = pd.DataFrame({'id':jsonline['id'], 'spectra':[np.array(jsonline['spectra'])]})
            df = pd.concat([df, lineDf])
    df = df.reset_index(drop=True)

    return df

In [5]:
full_spectra = spectraLoader("pan22-authorship-verification-training-dataset/particionesXid/FullSpectra.jsonl")

In [6]:
def loadLabels(inputPath:str, labelDict:dict) -> dict:
    with open(inputPath, encoding='utf-8') as f:
        for line in f:
            jsonline = json.loads(line)
            labelDict[jsonline['id']] = 1 if jsonline['value'] else 0
        
    return labelDict

In [7]:
labels = dict()
labels = loadLabels('pan22-authorship-verification-training-dataset/particiones/train_truth.jsonl',labels)
labels = loadLabels('pan22-authorship-verification-training-dataset/particiones/val_truth.jsonl',labels)

In [8]:
print("Instances: {}".format((train.shape[0] + val.shape[0])/2))
print("Labels: ",len(labels))

Instances: 16486.0
Labels:  16486


In [9]:
labels["a09fdc6b-ed15-48c5-9d2e-572f989b9b4500000"]

0

In [10]:
partition = {'train': train.id.unique().tolist(), 'validation': val.id.unique().tolist()}

In [11]:
print("Train: ", len(partition['train']))
print("Validation: ", len(partition['validation']))
print("Total: ", len(partition['train'])+len(partition['validation']))

Train:  15732
Validation:  754
Total:  16486


In [12]:
train.loc[train.id == '3bceab9e-aebe-4ccd-8fa5-63f6a741b33404268'].merge(full_spectra, left_on='idtext', right_on='id').spectra


0    [[[24.29242515563965, 16.52275276184082, 15.91...
1    [[[33.50175476074219, 19.5318546295166, 18.490...
Name: spectra, dtype: object

### Definición de la arquitectura de red siamesa

In [13]:
# Modelo de red siamesa utilizando una red residual y un perceptron multicapa

x1 = tf.keras.Input(shape=(1200,), name="Input_1")
x2 = tf.keras.Input(shape=(1200,), name="Input_2")

# Batch normalization
bn_layer = tf.keras.layers.BatchNormalization(name="Batch_normalization")
x1 = bn_layer(x1)
x2 = bn_layer(x2)

# Gaussian noise
gaussian_noise_layer = tf.keras.layers.GaussianNoise(1.0, name="Gaussian_noise")
x1 = gaussian_noise_layer(x1)
x2 = gaussian_noise_layer(x2)

# Dropout
dropout_layer = tf.keras.layers.Dropout(0.4, name="Dropout_1")
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

# Residual network

substract = lambda x: x[0] - x[1]
residual = tf.keras.layers.Lambda(function=substract, output_shape=lambda x: x[0], name='residual')

dense_layer_1 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_1")
o1_1 = dense_layer_1(x1)
o1_2 = dense_layer_1(x2)

dense_layer_2 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_2")
o2_1 = dense_layer_2(o1_1)
o2_2 = dense_layer_2(o1_2)

# Residual layer
dense_layer_3 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_3")
o3_1 = dense_layer_3(residual([o1_1, o2_1]))
o3_2 = dense_layer_3(residual([o1_2, o2_2]))

dense_layer_4 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_4")
o4_1 = dense_layer_4(o3_1)
o4_2 = dense_layer_4(o3_2)

# Residual layer
dense_layer_5 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_5")
o5_1 = dense_layer_5(residual([o3_1, o4_1]))
o5_2 = dense_layer_5(residual([o3_2, o4_2]))

dense_layer_6 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_6")
o6_1 = dense_layer_6(o5_1)
o6_2 = dense_layer_6(o5_2)

# Residual layer
dense_layer_7 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_7")
o7_1 = dense_layer_7(residual([o5_1, o6_1]))
o7_2 = dense_layer_7(residual([o5_2, o6_2]))

dense_layer_8 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_8")
o8_1 = dense_layer_8(o7_1)
o8_2 = dense_layer_8(o7_2)

# Residual layer
dense_layer_9 = tf.keras.layers.Dense(256, activation='relu', name="ResDense_9")
o9_1 = dense_layer_9(residual([o7_1, o8_1]))
o9_2 = dense_layer_9(residual([o7_2, o8_2]))

# Merging
abs_substract = lambda x: tf.keras.backend.abs(x[0] - x[1])
merge = tf.keras.layers.Lambda(function=abs_substract, output_shape=lambda x: x[0], name='merge')([o9_1,o9_2])

# MLPNN Classifier
hidden = tf.keras.layers.Dense(128, activation='relu', name="hidden_layer")(merge)
dropout_layer2 = tf.keras.layers.Dropout(0.4, name="Dropout_2")(hidden)
output = tf.keras.layers.Dense(1, activation="sigmoid", name="output_layer")(dropout_layer2)

model = tf.keras.Model([x1, x2], output)
model.compile(loss = 'binary_crossentropy', optimizer = "adam", metrics=["accuracy"])

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1200)]       0           []                               
                                                                                                  
 ResDense_1 (Dense)             (None, 256)          307456      ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 ResDense_2 (Dense)             (None, 256)          65792       ['ResDense_1[2][0]',             
                                                                  'ResDense_1[3][0]']             
                                                                                              

### Entrenamiento de red siamesa

In [15]:
print("train: ", train.shape[0])
print("test: ", val.shape[0])

train:  31464
test:  1508


In [16]:
print("train: ", train.id.unique().size)
print("test: ", val.id.unique().size)

train:  15732
test:  754


In [17]:
# Parameters
params = {'dim': (1200,),
            'batch_size': 31,
            'shuffle': False}

In [18]:
# Generators
training_generator = DataGenerator(partition['train'], train, labels, full_spectra, **params)
validation_generator = DataGenerator(partition['train'], val, labels, full_spectra, **params)

In [19]:
# Train model on dataset
history = model.fit(x=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    workers=4,
                    epochs=10,
                    verbose=1)

Epoch 1/10

IndexError: index 0 is out of bounds for axis 0 with size 0