### Initial code setup

In [None]:

# variables reset
%reset -f

# various imports
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
from keras import layers, models
import keras 
import os
import tensorflow
import numpy
import json
import pickle
import torch
from matplotlib import pyplot as plt
numpy.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Hugging Face checkpoints to load; one (tokenizer, model) pair is kept per entry
models = [
    "cardiffnlp/twitter-roberta-large-emotion-latest",
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    "sileod/deberta-v3-small-tasksource-nli",
    "dslim/distilbert-NER",
    #"answerdotai/ModernBERT-large",
]

# list of (tokenizer, model) tuples, in the same order as `models`
md = []

for model in models:

    # to load entirely offline once downloaded, uncomment the lines below
    # (edit the USER string) together with the local_files_only arguments
    #model = ("models/"+model).replace("/","--")
    #cached = "C:\\Users\\USER\\.cache\\huggingface\\hub\\"
    #model = cached+model+"\\snapshots\\"
    #model = model +"\\"+os.listdir(model)[0] + "\\"

    # fetch the tokenizer first: `model` still holds the checkpoint name here
    tokenizer = AutoTokenizer.from_pretrained(model)#,local_files_only=True)

    # from here on `model` is the loaded network; hidden states are requested
    # so that embeddings can be read from the model output later
    model = AutoModelForSequenceClassification.from_pretrained(
        model,
        output_hidden_states=True,
    )#,local_files_only=True)

    md += [(tokenizer, model)]

# "__file__" is a string literal here, so abspath() resolves it against the
# current working directory and the parent of that path is the cwd itself
base_path = os.path.split(os.path.abspath("__file__"))[0]

### Setting dataset base paths
For internal usage:
* test -> dataset used for various tests (thresholding, etc.); it acts like a validation set
* dev -> dataset on which the final predictions are made (the one sent for evaluation)

In [None]:
# training dataset: labelled file list plus the folder with the raw documents
trainpath = f"{base_path}\\dataset"
train_docs = f"{trainpath}\\raw-documents"
traintxt = f"{trainpath}\\train.txt"
trainjson = "train.json"

# final dataset on which the submitted predictions are made
devpath = f"{base_path}\\dataset"
dev_docs = f"{devpath}\\raw-documents"
devtxt = f"{devpath}\\dev.txt"
devjson = "dev.json"

# dataset on which internal tests (e.g. thresholding) are done
testpath = f"{base_path}\\dataset"
test_docs = f"{testpath}\\raw-documents"
testtxt = f"{testpath}\\test.txt"
testjson = "test.json"

## Pre-processing
Here, we separate each sentence of a file into a single sample. We also process the narrative, subnarrative and argument labels for later use. Beware that errors may occur if the dataset isn't correctly formatted. To mimic the 2024 dataset code, everything is converted to a JSON file.

In [None]:
import re 

# one labelled line: three tab-separated fields (file, narratives, subnarratives)
pattern = r"(.+)\t(.+)\t(.+)"
# group 1 is the argument prefix (URW or CC), group 4 the label text after the colon
classes_pattern = r"((URW)|(CC)):((\w|\s|[:,-\/]|)+)"
# maximum number of preceding lines kept in the context window
win_max_len = 3

# mode 0 -> dataset with classes
# mode 1 -> dataset without classes(the one to send)


def _make_sample(sample_id, text, file, classes, subclasses, superclass_list,
                 precedent_txt, window_txt, fulltext):
    """Build one sample dictionary; key order matches the JSON layout used so far."""
    superclass = superclass_list[0]
    return {
        "id": sample_id,
        "text": text,
        "file": file,
        "classes": [superclass + ":" + x + ";" for x in classes],
        "precedent_txt": precedent_txt,
        "window_txt": window_txt,
        "labels": [superclass + ":" + x + ";" for x in subclasses],
        "superclass": list(superclass_list),
        "fulltext": fulltext,
    }


def extractionTool(path,outfile,mode=0,docs_dir=None):
    """Split every listed document into per-line samples and dump them as JSON.

    For each document one sample is emitted per non-empty line (``fulltext``
    "0"), followed by one closing sample holding the whole text (``fulltext``
    "1"). The result is written to ``outfile + ".json"``.

    Args:
        path: text file with one document per line. In mode 0 each line is
            ``file<TAB>narratives<TAB>subnarratives``; in mode 1 the whole
            line is used as the file name.
        outfile: output path without the ``.json`` extension.
        mode: 0 for a labelled dataset, 1 for an unlabelled one.
        docs_dir: folder containing the raw documents. Defaults to the
            module-level ``train_docs``.

    Raises:
        ValueError: in mode 0, if a line does not have three tab-separated
            fields or the numbers of narratives and subnarratives differ.
    """
    if docs_dir is None:
        docs_dir = train_docs
    baselist = []
    next_id = 0
    with open(path,"r") as f:
        # taking each line of the labeled file
        for line_no, line in enumerate(f, start=1):
            # a blank line names no document, skip it instead of failing
            if not line.strip():
                continue
            file = line.replace("\n","")
            classes = ""
            subclasses = ""
            superclass = ""
            superclass_list = [superclass]
            if mode == 0:
                # Narrative and subnarrative extraction
                parsed = re.match(pattern,line)
                if parsed is None:
                    raise ValueError(
                        f"{path}:{line_no}: expected 3 tab-separated fields, got {line!r}")
                file, raw_classes, raw_subclasses = parsed.groups()
                class_matches = re.findall(classes_pattern, raw_classes)
                classes = [match[3] for match in class_matches]
                subclasses = [match[3] for match in re.findall(classes_pattern, raw_subclasses)]
                # obtaining argument
                if class_matches:
                    superclass = class_matches[0][0]
                    superclass_list = [superclass]
                else:
                    # no URW/CC narrative: the sample is "Other"; the second
                    # entry records which argument name appears in the line
                    superclass = "Other"
                    superclass_list = [superclass, "URW" if "URW" in line else "CC"]
                if len(classes)!=len(subclasses):
                    raise ValueError(
                        f"{path}:{line_no}: {len(classes)} narratives but "
                        f"{len(subclasses)} subnarratives")
                if len(classes) == 0:
                    classes.append(" Other")
                    subclasses.append(" Other")

            print(f"file:{file},{classes},{subclasses},{superclass}")
            # sentence split of all the sentences of a file
            with open(os.path.join(docs_dir, file),"r",encoding="raw_unicode_escape") as sub_file:
                val = ""      # all the text seen before the current line
                window = []   # the last win_max_len lines before the current one
                for sentence in sub_file:
                    if sentence == "\n":
                        continue
                    baselist.append(_make_sample(
                        next_id, sentence, file, classes, subclasses, superclass_list,
                        precedent_txt=val, window_txt="".join(window), fulltext="0"))
                    next_id += 1
                    window.append(sentence)
                    if len(window) > win_max_len:
                        window.pop(0)
                    val = val + sentence

            # separator containing the full text, it marks the end of a file
            baselist.append(_make_sample(
                next_id, val, file, classes, subclasses, superclass_list,
                precedent_txt=val, window_txt="", fulltext="1"))
            next_id += 1

    with open(outfile+".json","w") as f:
        json.dump(baselist, f)

# train and test are labelled (mode 0); dev has no labels (mode 1)
for _labels_file, _out_name, _mode in (
    (traintxt, "train", 0),
    (testtxt, "test", 0),
    (devtxt, "dev", 1),
):
    extractionTool(_labels_file, _out_name, _mode)

Extraction of generic information from the dataset (single sentences, not files), including:

* Number of classes
* Various probabilities (some discarded)
* Class labels
* Statistics on the number of labels per instance

In [None]:

# getting the classes
with open(trainjson,"r",encoding="utf8") as f:
    trainreadjson = json.load(f)

# putting all subnarratives in the set of their argument
# (the first entry of "superclass" selects the argument)
superclassmap = {"URW":set(),"CC":set(),"Other":set()}
var = set()
meancalc = []
for elem in trainreadjson:
    for j in elem["labels"]:
        var.add(j)
        superclassmap[elem["superclass"][0]].add(j)

print("aguments map:")
print(superclassmap)
print()
# from set to sorted list: the set kept a single copy of each subnarrative,
# the sorted order fixes the column order of the one hot encoding
var = sorted(var)
# prob: relative frequency of each subnarrative over all label occurrences
prob = dict.fromkeys(var, 0)
# prob_2: how many samples carry exactly k labels (k in 0..19)
prob_2 = dict.fromkeys(range(20),0)
total = 0
for elem in trainreadjson:
    prob_2[len(elem["labels"])] += 1
    for j in elem["labels"]:
        prob[j] += 1
        total += 1
for key in prob:
    # guard against an empty dataset instead of dividing by zero
    prob[key] = prob[key]/total if total else 0.0
final_prob_vec = []

print("probability of subnarratives in the dataset")
print(prob)
print()

# probability mass of each argument = sum over its subnarratives
superclassprob = {"URW":0,"CC":0,"Other":0}
for elem in superclassprob:
    for pr in prob:
        if pr in superclassmap[elem]:
            superclassprob[elem] += prob[pr]

print("Argoument probabilities:")
print(superclassprob)
print()

for elem in var:
    final_prob_vec.append(prob[elem])

print("Number of classes for element:\n")
print(prob_2)

# the histogram counts samples, so it is normalised by the number of samples
# (dividing by the number of label occurrences does not give a distribution)
n_elements = len(trainreadjson)
for elem in prob_2:
    prob_2[elem] = prob_2[elem]/n_elements if n_elements else 0.0
print("Number of classes for element normalized:\n")
print(prob_2)
#var


### Embeddings and one hot encoding
This part extracts the dataset embeddings

In [None]:
def _cls_embedding(tokenizer, model, text, **tokenizer_kwargs):
    """Return the last-hidden-layer vector of the first token of *text*.

    If tokenization or the forward pass fails, "cut" is printed and the second
    half of the characters of *text* is embedded instead.
    """
    # inference only: no autograd graph is needed for the forward pass
    with torch.no_grad():
        try:
            tokenized = tokenizer(text,return_tensors="pt",**tokenizer_kwargs)
            output = model(**tokenized)
        except Exception:
            print("cut")
            half = len(text)//2
            tokenized = tokenizer(text[half:2*half],return_tensors="pt",**tokenizer_kwargs)
            output = model(**tokenized)
    last_layer = output.hidden_states[-1][0].detach().numpy()
    # copy so that the stored vector does not keep the whole
    # hidden-state array of the sequence alive
    return last_layer[0].copy()


def preprocess_embeddings(json):
    """Add embeddings and one hot encodings to every sample, in place.

    For each sample and each (tokenizer, model) pair in the module-level
    ``md``, the embedding of ``text`` is appended to
    ``context_CLS_embedding`` and the embedding of ``window_txt`` to
    ``context_window_single_embedding``. When label information is usable,
    ``superclass_embedding``, ``y_single_class``, ``y`` and ``y_total`` are
    added as well, using the module-level ``superclassmap``, ``var`` and
    ``prob_2``. The completed fraction is printed after every sample.

    Args:
        json: list of sample dictionaries (the parameter name shadows the
            ``json`` module inside this function).
    """
    leghjson = len(json)
    extracted_unit = 0

    for value in json:

        # various containers for an element of the dataset, some got unused
        value["tokenized"] = []
        value["context_CLS_embedding"] = []
        value["context_window_multiple_embedding"] = []
        value["context_previous_full_multiple_embedding"] = []
        value["context_window_single_embedding"] = []
        value["context_previous_full_single_embedding"] = []
        value["context_CLS_as_much_models_can_take"] = []

        for tokenizer, model in md:
            # main sentence embedding
            value["context_CLS_embedding"].append(
                _cls_embedding(tokenizer, model, value["text"], max_length=512))
            # embedding of the preceding-sentences window
            value["context_window_single_embedding"].append(
                _cls_embedding(tokenizer, model, value["window_txt"], truncation=True))

        # one hot encoding of argument, narrative and subnarrative
        try:
            # check if classes are present (its needed on submission dataset)
            if value["labels"] is not None and value["labels"] != "":

                # argument encoding, this is unused in final model
                value["superclass_embedding"] = [1 if x in value["superclass"] else 0 for x in superclassmap.keys()]
                if value["superclass_embedding"][2] == 1:

                    # "Other" samples also carry the argument they belong to:
                    # that argument gets half weight (these two lines used to
                    # be `==` comparisons, which had no effect)
                    if value["superclass_embedding"][1] == 1:
                        value["superclass_embedding"][1] = 0.5
                    else:
                        value["superclass_embedding"][0] = 0.5

                # one hot encoding restricted to the labels of the sample's own argument
                # unused
                value["y_single_class"] = [1 if x in value["labels"] else 0 for x in superclassmap[value["superclass"][0]]]

                # this is a global one hot encoding
                # in final model this was used
                value["y"] = [1 if(x in value["labels"]) else 0 for x in var]

                # number of labels encoding, this is unused
                n_labels = numpy.sum(value["y"])
                value["y_total"] = [1 if(x+1 == n_labels) else 0 for x in range(len(prob_2.keys()))]

        except (KeyError, IndexError):
            # samples without usable label information (e.g. an unknown
            # argument name) keep only the keys set before the failure
            pass
        extracted_unit += 1
        # extraction progress as a fraction of the dataset
        print(extracted_unit/leghjson)
    

# concatenation module

# this function concatenates the embeddings of a single sample, giving a dataset ready for keras
# (single sentences, or single sentences + the preceding sentences window, etc.)
def convert_to_model(dataset,mode=0):
    """Concatenate the stored embeddings of every sample into numpy arrays.

    Args:
        dataset: iterable of sample dictionaries holding the embedding lists.
        mode: which embeddings make up a feature vector:
            0 -> main sentence only,
            1 -> "context_CLS_as_much_models_can_take" (unused),
            2 -> main sentence + preceding-sentences window,
            3 -> main sentence + "context_previous_full_single_embedding" (unused).

    Returns:
        Tuple ``(input_x, y, y_total)`` of numpy arrays. Samples lacking
        "y" / "y_total" contribute to ``input_x`` only.

    Raises:
        ValueError: if *mode* is not one of 0, 1, 2, 3.
    """
    # mode -> (main embedding key, optional context embedding key)
    keys_by_mode = {
        0: ("context_CLS_embedding", None),
        1: ("context_CLS_as_much_models_can_take", None),
        2: ("context_CLS_embedding", "context_window_single_embedding"),
        3: ("context_CLS_embedding", "context_previous_full_single_embedding"),
    }
    if mode not in keys_by_mode:
        raise ValueError(f"unknown mode {mode!r}, expected one of {sorted(keys_by_mode)}")
    val, val2 = keys_by_mode[mode]

    input_x = []
    y = []
    y_total = []

    for v in dataset:
        # one hot encodings are missing on unlabelled samples: skip them there
        try:
            y.append(v["y"])
            y_total.append(v["y_total"])
        except KeyError:
            pass

        # embeddings of all models, followed by the context ones if requested
        parts = list(v[val])
        if val2 is not None:
            parts.extend(v[val2])

        # a single flat feature vector per sample
        input_x.append(numpy.concatenate(parts).flatten())

    return (numpy.array(input_x), numpy.array(y), numpy.array(y_total))

# top-down approach version
# it is still called in later code, but it was not used for the final model
def convert_to_model_special(dataset,mode=0):
    """Build per-argument numpy datasets for the top-down approach.

    Args:
        dataset: iterable of sample dictionaries holding the embedding lists.
        mode: same meaning as in ``convert_to_model`` (0 main sentence,
            2 main sentence + window; 1 and 3 are unused variants).

    Returns:
        Tuple ``(input_x, y, y_total, y_superclass, y_single_class)`` of
        dictionaries of numpy arrays. ``input_x`` has the keys "URW", "CC",
        "Other" and "full"; ``y_superclass`` only "full"; the others "URW",
        "CC" and "Other". A sample is filed under the first entry of its
        "superclass" list; samples without usable label information appear
        in ``input_x["full"]`` only.

    Raises:
        ValueError: if *mode* is not one of 0, 1, 2, 3.
    """
    # mode -> (main embedding key, optional context embedding key)
    keys_by_mode = {
        0: ("context_CLS_embedding", None),
        1: ("context_CLS_as_much_models_can_take", None),
        2: ("context_CLS_embedding", "context_window_single_embedding"),
        3: ("context_CLS_embedding", "context_previous_full_single_embedding"),
    }
    if mode not in keys_by_mode:
        raise ValueError(f"unknown mode {mode!r}, expected one of {sorted(keys_by_mode)}")
    val, val2 = keys_by_mode[mode]

    groups = ("URW", "CC", "Other")
    input_x = {"URW":[],"CC":[],"Other":[],"full":[]}
    y = {group: [] for group in groups}
    y_total = {group: [] for group in groups}
    y_superclass = {"full":[]}
    y_single_class = {group: [] for group in groups}

    for v in dataset:
        # encodings are missing (or the argument is unknown) on unlabelled samples
        try:
            group = v["superclass"][0]
            y[group].append(v["y"])
            y_total[group].append(v["y_total"])
            y_superclass["full"].append(v["superclass_embedding"])
            y_single_class[group].append(v["y_single_class"])
        except (KeyError, IndexError):
            pass

        parts = list(v[val])
        if val2 is not None:
            parts.extend(v[val2])
        features = numpy.concatenate(parts).flatten()

        input_x["full"].append(features)
        # index with the first argument name: the whole "superclass" list is
        # unhashable, so using it as key never stored anything
        try:
            input_x[v["superclass"][0]].append(features)
        except (KeyError, IndexError):
            pass

    for container in (input_x, y, y_total, y_superclass, y_single_class):
        for key, value in container.items():
            container[key] = numpy.array(value)

    return(input_x,y,y_total,y_superclass,y_single_class)

### Dataset extraction
The feature extraction process is very time-consuming, so the script saves the extracted embeddings, allowing the extraction to be performed only once.
#### Important
The saved .pkl embedding files must correspond to the files in the configured dataset folder; delete them if the dataset changes.

In [None]:
from pathlib import Path


def _ensure_embeddings(json_path, pickle_path):
    """Extract and pickle the embeddings of *json_path* unless *pickle_path* exists.

    The pickle is written to a temporary file and renamed at the end, so an
    interrupted extraction never leaves a truncated file that a later run
    would mistake for a complete one.
    """
    if Path(pickle_path).exists():
        return
    with open(json_path,"r",encoding="utf8") as f:
        samples = json.load(f)
    preprocess_embeddings(samples)
    tmp_path = pickle_path + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(samples, f)
    os.replace(tmp_path, pickle_path)


# the extraction is slow: it runs only for the datasets that have no pickle yet
for _json_path, _pickle_path in (
    (trainjson, "train.pkl"),
    (testjson, "test.pkl"),
    (devjson, "dev.pkl"),
):
    _ensure_embeddings(_json_path, _pickle_path)


In [None]:
#import pickle
#with open('objs25.pkl', 'rb') as f:
#        train,test,dev = pickle.load(f)

In [None]:
#with open('train.pkl', 'wb') as f:
#        pickle.dump(train,f)
#with open('test.pkl', 'wb') as f:
#        pickle.dump(test,f)
#with open('dev.pkl', 'wb') as f:
#        pickle.dump(dev,f)

## Loading of the processed dataset

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: pickle must only be used on files produced by this notebook
train = _load_pickle('train.pkl')
test = _load_pickle('test.pkl')
dev = _load_pickle('dev.pkl')

In [None]:
# mode 0 -> independent sentences
# mode 2 -> main sentence + window of preceding sentences
trainmode = 2

# per-argument datasets for a second network predicting argument and narrative;
# that network is not used by the final model
(input_x, y, y_total,
 y_superclass, y_single_class) = convert_to_model_special(train, trainmode)
(input_dev, y3, y3_total,
 y3_superclass, y3_single_class) = convert_to_model_special(dev, trainmode)
(input_validation, y2, y_total2,
 y2_superclass, y2_single_class) = convert_to_model_special(test, trainmode)


# placeholder kept from a discarded normalization step: returns its input as is
def normalizer(x):
    return x


# input width and argument-output width for the neural networks
anothervar = input_x["full"].shape[1]
shape_sup = len(y_superclass["full"][0])

## Neural networks creation and training

In [None]:
# this early stopper was unreliable
# it is effectively disabled: see the condition on the validation loss

class stopper(keras.callbacks.Callback):
    """Callback that stops training once the validation loss drops below 0.

    The threshold of 0.000 keeps the callback disabled for non-negative losses.
    """

    def on_epoch_end(self, epoch, logs=None):
        # logs can be None, and "val_loss" is absent without validation data
        val_loss = (logs or {}).get("val_loss")
        if val_loss is not None and val_loss < 0.000:
            self.model.stop_training = True

# input layer shape
inpt = layers.Input(shape=(anothervar,))        

In [None]:
# argument classifier for a top-down approach;
# it is trained below, but its predictions are not used by the final model
def build_CL():
    """Return the argument classifier built on the shared input layer ``inpt``.

    Layers: Dense(anothervar//30, relu) -> Dropout(0.3) ->
    Dense(shape_sup, linear) -> Dense(shape_sup, sigmoid).
    """
    dense = keras.layers.Dense
    hidden = dense(anothervar//30, activation='relu')(inpt)
    hidden = keras.layers.Dropout(0.3)(hidden)
    projected = dense(shape_sup, activation='linear')(hidden)
    simg_predictions = dense(shape_sup, activation='sigmoid')(projected)
    return keras.models.Model(inpt, simg_predictions)

# Train the argument classifier CL on the full training set.
# `retrain` starts as False, so the model list is reset and a new CL is built;
# it is set to True after the first fit so that CL is reused afterwards.
retrain = False
if not retrain:
    listt_CC = []
# single iteration: leftover of a planned ensemble loop
for j in range(1):
    # NOTE: `iter` shadows the builtin of the same name for the rest of the notebook
    iter = input_x["full"]
    y_iter = y_superclass["full"]
    if not retrain:
        CL = build_CL()
        CL.compile(loss=['binary_crossentropy'], optimizer='adam', metrics=['accuracy'])
        # the test split is used as validation data; normalizer returns its input unchanged
        history = CL.fit(x=iter,y=y_iter,validation_data=(normalizer(input_validation["full"]),y2_superclass["full"]),batch_size=128, epochs=8,callbacks=[])
        retrain=True
    else:
        # no-op: keep the already trained model
        CL = CL
    # disabled second stage: tried to reduce the validation loss as much as possible
    #CL.compile(loss=['binary_crossentropy'], optimizer='sgd', metrics=['accuracy'])                                        #1000
    #history = CL.fit(x=iter,y=y_iter,validation_data=(normalizer(input_validation["full"]),y2_superclass["full"]),batch_size=8500, epochs=1000,callbacks=[])#1*(1+j))
    #CL.acc = history.history['val_accuracy']
    listt_CC.append(CL)

# training vs validation loss of the last fit
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

### Real neural network training

In [None]:
# dataset conversion to numpy arrays for keras
# (this rebinds the names set by the top-down conversion above)
input_x, y, y_total = convert_to_model(dataset=train, mode=trainmode)
input_dev, y3, y3_total = convert_to_model(dataset=dev, mode=trainmode)
input_validation, y2, y_total2 = convert_to_model(dataset=test, mode=trainmode)

# width of the subnarrative one hot encoding = number of network outputs
first_target = y[0]
shapey = len(first_target)

In [None]:
# neural network construction

def build_NN1(relu_layer=15,dropout=0.4):
    """Return the subnarrative network built on the shared input layer ``inpt``.

    Layers: Dense(anothervar//relu_layer, relu) -> Dropout(dropout) ->
    Dense(shapey, linear) -> Dense(shapey, softmax).

    Args:
        relu_layer: divisor of the input width giving the hidden layer size.
        dropout: dropout rate applied after the hidden layer.
    """
    dense = keras.layers.Dense
    hidden = dense(anothervar//relu_layer, activation='relu')(inpt)
    hidden = keras.layers.Dropout(dropout)(hidden)
    projected = dense(shapey, activation='linear')(hidden)
    simg_predictions = dense(shapey, activation='softmax')(projected)
    return keras.models.Model(inpt, simg_predictions)


In [None]:
# an ensemble method was planned, but discarded due to no performance benefits
# div is the number of parts the training set is divided into;
# with div = 1 each list below holds the whole dataset as its only element
div = 1
# minimum part size, kept for reference
leng_input = len(input_x)//div
# array_split also distributes the remainder when div does not divide the
# dataset size, so no sample is dropped
input_x = numpy.array_split(input_x, div)
y = numpy.array_split(y, div)
y_tot = numpy.array_split(y_total, div)


## Training
You can stop this any time without repercussions

In [None]:
# With retrain = False (as set below) a new network is built and trained from
# scratch every time this cell runs.

# hyper-parameters forwarded to build_NN1
relu_layer = 30
dropout = 0.4

retrain = False
if not retrain:
    listt_NN1 = []
# ensemble loop, but it runs only once since div is 1
for j in range(div):

    # j-th part of the training set for an independent ensemble member;
    # since div is 1, this is the entire dataset

    iter = input_x[j]
    y_iter = y[j]
    if not retrain:
        # neural network building
        NN1 = build_NN1(relu_layer=relu_layer,dropout=dropout)
        
        # first stage: adam optimizer with binary cross entropy
        NN1.compile(loss=['binary_crossentropy'], optimizer='adam', metrics=['accuracy'])

        # short training for a few epochs;
        # the normalizer returns its input unchanged,
        # the validation data is the dataset used for tests
        history = NN1.fit(shuffle = True,x=normalizer(iter),y=y_iter,validation_data=(normalizer(input_validation),y2),batch_size=128, epochs=10,callbacks=[stopper()])
        # later iterations reuse NN1 instead of building a new one
        retrain=True
    else:
        # no-op: keep the already trained model
        NN1 = NN1
        
    # second stage: slow training with sgd
    # early stopping was done manually, once the loss could not decrease anymore;
    # use fewer or more epochs as long as the validation loss keeps decreasing
    NN1.compile(loss=['binary_crossentropy'], optimizer='sgd', metrics=['accuracy'])                                        
    history = NN1.fit(shuffle = True,x=normalizer(iter),y=y_iter,validation_data=(normalizer(input_validation),y2),batch_size=8500, epochs=4000,callbacks=[stopper()])#1*(1+j))
    # validation accuracy history stored on the model object
    NN1.acc = history.history['val_accuracy']
    listt_NN1.append(NN1)

# loss plots, only reached if training runs to the last epoch (improbable)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

Code not used for the final model, but on which tests for predicting the number of labels were run

In [None]:
# optional re-import of previously saved networks;
# change the file names to load the networks of the wanted language

load_from_files = False
if load_from_files:
    # model re-import
    _load_model = keras.saving.load_model
    NN1 = _load_model("NN1_BG1.keras")
    CL = _load_model("CL_BG1.keras")

In [None]:
# leftover of the old ensemble approach: each list holds a single model
# main subnarrative model
listt_NN1 = list((NN1,))
# argument classifier, not used for the predictions
listt_CC = list((CL,))

### Sum of probabilities and post-processing

In [None]:
# copy of the subnarrative list, used by the post-processing below
l = [*var]
print(l)

def _sum_predictions(model_list, inp):
    """Return the element-wise sum of ``predict(inp)`` over all models of the list."""
    total = model_list[0].predict(inp,verbose=0)
    for extra_model in model_list[1:]:
        total = total + extra_model.predict(inp,verbose=0)
    return total


def pred_to_json_ensemble(list_NN1,list_CL,input_validation,validat_x,filename
                          ,treeshold1,treeshold2):
    """Post-process sentence predictions into per-file labels and write them.

    The subnarrative probabilities of all the sentences of a file are averaged
    (a sample with ``fulltext`` equal to "1" or 1 closes a file). The argument
    with the highest summed probability is chosen, after zeroing the arguments
    ruled out by the file name or by the sample's "superclass". Subnarratives
    of the chosen argument at or above its threshold are kept; if none passes,
    the threshold is lowered in steps of 0.001 until at least one does. One
    tab-separated line per file is written to *filename*.

    Args:
        list_NN1: subnarrative models; their predictions are summed.
        list_CL: argument models; their predictions are computed but do not
            influence the output.
        input_validation: features of the dataset to predict.
        validat_x: sample dictionaries aligned with *input_validation*
            (keys "file", "fulltext" and optionally "superclass").
        filename: path of the predictions file to write.
        treeshold1: minimum probability for URW subnarratives.
        treeshold2: minimum probability for CC subnarratives.
    """
    # test parameters for manipulating probabilities; with these values they
    # have no effect
    min_treeshold = 0.0
    reductionthreshold = 0
    # the normalizer returns its input unchanged
    inp = normalizer(input_validation)

    res = _sum_predictions(list_NN1, inp)
    # argument predictions, accumulated below but otherwise unused
    res_CC = _sum_predictions(list_CL, inp)

    another_map = ["URW","CC","Other"]
    urw_labels = superclassmap['URW']
    cc_labels = superclassmap['CC']
    summprob = numpy.zeros(len(l))
    summclass = numpy.zeros(3)
    newdict = []
    n_samples = 0

    # iteration over each sentence of the dataset to predict
    for v,k,file in zip(res,res_CC,validat_x):
        n_samples += 1
        summprob += v
        summclass += k

        # keep accumulating until the sample that closes the file
        if not (file["fulltext"] == "1" or file["fulltext"] == 1):
            continue

        # mean over the sentences of the file
        summprob /= n_samples
        summclass /= n_samples

        # map the probabilities to the corresponding subnarrative
        probdict = dict(zip(l, summprob))

        # summed probability of each argument
        UWR = 0
        Other = 0
        CC = 0
        for key,value in probdict.items():
            if key in urw_labels:
                if value < min_treeshold:
                    Other -= reductionthreshold*value
                else:
                    UWR += value
            elif key in cc_labels:
                if value < min_treeshold:
                    Other -= reductionthreshold*value
                else:
                    CC += value
            else:
                Other += value

        # the argument is given: it is read from the file name or, failing
        # that, from the sample's "superclass" entry when present
        known_superclass = file.get("superclass") or ()
        if "CC" in file["file"] or "CC" in known_superclass:
            UWR = 0
        # multiple file name markers for URW
        urw_markers = ("URW", "UA", "RU_")
        if (any(marker in file["file"] for marker in urw_markers)
                or any(marker in known_superclass for marker in urw_markers)):
            CC = 0

        # choosing the argument with the highest score (first one on ties)
        a_list = [UWR,CC,Other]
        superclass = a_list.index(max(a_list))

        # subnarratives that may be emitted for the chosen argument
        if superclass == 0:
            candidates = [(key,value) for key,value in probdict.items()
                          if key in urw_labels]
            threshold = treeshold1
        elif superclass == 1:
            candidates = [(key,value) for key,value in probdict.items()
                          if key not in urw_labels and key in cc_labels]
            threshold = treeshold2
        else:
            candidates = [(key,value) for key,value in probdict.items()
                          if key not in urw_labels and key not in cc_labels]
            threshold = None

        labelcontainer = []
        if superclass == 2:
            labelcontainer = [key for key,_ in candidates]
        elif candidates:
            # cutting out low probabilities, relaxing the threshold until at
            # least one subnarrative passes
            while not labelcontainer:
                labelcontainer = [key for key,value in candidates if value >= threshold]
                threshold -= 0.001
        else:
            # no subnarrative exists for the chosen argument: relaxing the
            # threshold could never succeed, so fall back to Other
            superclass = 2

        # predictions of the file, collected for the final writing
        newdict.append({
            "labels": labelcontainer,
            "class": another_map[superclass],
            "file": file["file"],
        })

        # reset for the next file
        summprob = numpy.zeros(len(l))
        summclass = numpy.zeros(3)
        n_samples = 0

    # writing everything to the prediction file in semeval format
    patt = r"(.*):([^:]*)"
    lines = []
    for elem in newdict:
        if "Other" in elem["class"]:
            lines.append(elem["file"] +"\tOther\tOther\n")
        else:
            # the narrative is the label text before its last colon
            narratives = ";".join(re.match(patt,x).group(1) for x in elem["labels"])
            lines.append(elem["file"] +"\t"+narratives+"\t"+"".join(elem["labels"])+"\n")

    with open(filename, "w") as outfile:
        outfile.write("".join(lines))
    

### Threshold automatized finder
This uses the SemEval evaluation script.
Avoid running this on the sample dataset, since it is useless there.

In [None]:
import subprocess
import sys

tr1 = 0
tr2 = 0
# best score found so far and its standard deviation
# (named best_score so that the builtin `min` is not shadowed)
best_score = 0
sdv2 = 0
save = None
command = [
    # run the scorer with the interpreter that runs this notebook
    sys.executable,
    'subtask2_scorer.py', 
    '-p', 'validation25.txt', 
    '-g', './dataset/test.txt', 
    '-f', 'subtask2_subnarratives.txt', 
    '-c', 'subtask2_narratives.txt'
]
# thresholds 0.00, 0.01, ..., 0.39, derived from integers so that no
# floating point error accumulates along the grid
grid = [round(step * 0.01, 2) for step in range(40)]
for tr1 in grid:
    for tr2 in grid:
        # prediction
        pred_to_json_ensemble(listt_NN1,listt_CC,input_validation,test,"validation25.txt",tr1,tr2)
        # running evaluation file
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # taking the score from the third line of the scorer output:
        # second field is the score, third one the deviation in parentheses
        result = result.stdout.split("\n")[2].split(" ")
        result1 = float(result[1])
        sdv = float(result[2][1:5])

        # getting the relatively better threshold combination
        if result1 > best_score:
            # an improvement below 0.01 is ignored when it comes with a higher deviation
            if not (result1 - best_score < 0.01 and sdv2 < sdv):
                best_score = result1
                sdv2 = sdv
                save = (tr1,tr2)
                print(result1)


In [None]:
# (tr1, tr2) pair selected by the threshold search; None if no pair was selected
print(save)

In [None]:

# thresholds selected by the automatic search
tr1, tr2 = save
# for manual threshold just de-comment
#tr1 = 0.2
#tr2 = 0.15

# one predictions file per split: training, test (validation) and dev (submission)
for _features, _samples, _out_file in (
    (input_x, train, "training25.txt"),
    (input_validation, test, "validation25.txt"),
    (input_dev, dev, "predictions25.txt"),
):
    pred_to_json_ensemble(listt_NN1, listt_CC, _features, _samples, _out_file, tr1, tr2)


Manual save when models are optimal

In [None]:
#NN1.save("NN1_PT2.keras")
#CL.save("CL_PT2.keras")

### Manually registered configurations

In [None]:



"""
RU
//5
softmax
treeshold1 = 0.1
trainmode = 0

Evaluation Results:
F1@coarse: 0.713 (0.276)
F1@fine: 0.535 (0.328)

PT

//15
sigmoid
trainmode = 0
min_treeshold = 0.075
treeshold1 = 0.15
treeshold2 = 0.2

Evaluation Results:
F1@coarse: 0.723 (0.277)
F1@fine: 0.512 (0.281)


BG
//10
sigmoid
treeshold1 = 0.14
treeshold2 = 0.1

F1@coarse: 0.683 (0.294)
F1@fine: 0.533 (0.322)

HI
//15
dropout 0.4
softmax
0.04
0.06
Evaluation Results:
F1@coarse: 0.601 (0.301)
F1@fine: 0.451 (0.329)

//EN
//30
dropout 0.4
softmax
0.515
F1@coarse: 0.650 (0.332)
F1@fine: 0.520 (0.353)


"""





