In [None]:
from data_representation import InputExample, InputFeatures, truncate_pairs
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch
from torch import nn
import transformers as t
from sklearn import metrics 
import metrics
from tqdm import tqdm


import sys
sys.path.insert(1, "../lib")
from text_processer import *


import time
import random
import datetime
import re
from argparse import ArgumentParser

### Parametre:

In [None]:
# True -> load_labeled_data() reads the legacy label file instead of the newest one.
OLD_LABELS = False

# Locations of the base model, the fine-tuned encoder, the tokenizer and the model config.
MODEL_PATH = "../hjemmesnekk/multilingual"
MODEL_PATH_TRAINED = "../hjemmesnekk/multilingual-trained/24.08_16_labels_8_epochs"
TOKENIZER_PATH = "../hjemmesnekk/multilingual"
CONFIG_PATH = "../hjemmesnekk/multilingual/config.json"

# Tokenization / training hyper-parameters.
DO_LOWER = False
TRAINING_SIZE = 0.8   # fraction of the examples used for training
MAX_LEN = 512         # max sequence length passed to the tokenizer (also used to cut raw text)
EPOCHS = 8
BATCHES = 16          # used as the DataLoader batch size
LR = 5e-5 
EPS = 1e-8
RANDOM_SEED = 42
DEVICE = 'cpu'

# Apply the seed: the train/test split uses `random.sample` and the training DataLoader shuffles
# with torch's RNG, so without this every kernel restart produces a different split and order.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Probability threshold applied to the sigmoid outputs during validation.
PRED_TRESH = 0.5



# Column names:

INTENTS = ["Sjekke","Kontakte","Registrere/lage","Endre/Oppdatere","Trekke/Fjerne", "Klage","Finne","Sende","Søke","Spørsmål","Ikke oppgitt", "@@@@@@@"]

CAUSES = ["Sykepenger","Sykemelding","Dagpenger","Permittert","Pensjon","Skatt/Årsoppgave","Meldekort", "Barn/Foreldre", "Ufør/AAP", "Informasjon",  
            # Documentation labels:
            "Vedtak/Sak", "Dokumentasjon", "Skjema/søknad", "Melding/brev", # DOCUMENTS 
            # "MinSide" (my page) labels:
            "Personopplysning", # Account number, changing one's own personal details, etc. 
            "CV","Arbeidssøker", # Remove job seeker = back at work, register job seeker = laid off or similar, find job seeker = employers looking for staff 
            "Utbetaling","Aktivitetsplan", 
            "Veileder", "Saksbehandler/NAV",
            "Tekniske problemer", "Arbeidsgiver", "Annet språk","Annet", "SPAM"]

# Argparse does not work here: inside a Jupyter kernel sys.argv holds the kernel's own launch arguments, so parse_args() rejects them.
#parser = ArgumentParser()
#parser.add_argument(" ")
#parser.add_argument("--MODEL_PATH", type=str, help = "The path to the folder where the model is located")
#parser.add_argument("--TOKENIZER_PATH", type = str, help = "The path to the folder where the tokenizer is located")
#parser.add_argument("--EPOCHS", type = int, default = 2, help = "The number of epochs the model should perform (2 is often enough")
#parser.add_argument("--BATCHES", type = int, default = 8, help = "The number of batches for each epoch")
#parser.add_argument("--LR", type = float, default = 5e-5, help = "Learning rate")
#parser.add_argument("--MAX_LEN", type = int, default= 512, help = "The max. length of each text the model supports (in config file)")
#args = parser.parse_args()


# Tokenizer

In [None]:
# Load the tokenizer from the local folder; accents are kept and casing follows DO_LOWER.
tokenizer = t.AutoTokenizer.from_pretrained(
    TOKENIZER_PATH,
    do_lower_case = DO_LOWER,
    strip_accents = False,
)

### Metoder for henting av data:

In [None]:
"""
def load_labeled_data(cols_to_use, old = True)-> pd.DataFrame:
    if old: 
        l_df = pd.read_csv("../hjemmesnekk/labeled_data/toppoppgaver.csv").set_index("index")
    else: 
        l_df = pd.read_csv("../hjemmesnekk/labeled_data/toppoppgaver_NYESTE.csv")
        l_df = metrics.convert_to_matrix(l_df).set_index("index")
        print(l_df)
        return
    l_df = l_df[cols_to_use]
    l_df["sum"] = l_df.sum(axis = 1)
    l_df = l_df[l_df["sum"] == 1.0].drop("sum", axis = 1)
    return l_df
"""

"""
    Returns the labeled_data in its entirety
"""
def load_labeled_data(with_intents = True) -> pd.DataFrame:
    if OLD_LABELS:
        l_df = pd.read_csv("../hjemmesnekk/labeled_data/toppoppgaver.csv").set_index("index")
        return l_df
    else:
        l_df = pd.read_csv("../hjemmesnekk/labeled_data/toppoppgaver_NYESTE.csv")
        l_df = metrics.convert_to_matrix(l_df).set_index("index")
        if not with_intents:
            l_df.drop(INTENTS, axis = 1)
        return l_df
    
"""
    Filters the labeled_data on prespecified columns
"""
def filter_labeled_data(l_df: pd.DataFrame, cols: list = None) -> pd.DataFrame:
    cols_to_use = cols if cols != None else l_df.columns.values
    l_df = l_df[cols_to_use]
    l_df["sum"] = l_df.sum(axis = 1)
    return l_df[l_df["sum"] >= 1.0].drop("sum", axis = 1)


def load_data(preprocessed = True) -> pd.DataFrame:
    """
    Build a text_processer for TOPPOPPGAVER and return its data.

    Parameters:
        preprocessed: truthy -> get_preprocessed_data(), otherwise get_cleaned_data().
    """
    processor = text_processer(TOPPOPPGAVER)
    fetch = processor.get_preprocessed_data if preprocessed else processor.get_cleaned_data
    return fetch()
    
def filter_on_labeled_data(df, labels) -> tuple:
    """
    Keep only the rows of `df` that have labels and align both frames by sorted index.

    Parameters:
        df: text DataFrame whose index contains every index value of `labels`.
        labels: label DataFrame.

    Returns:
        (df, labels): new frames, both sorted by index. The inputs are left untouched.
    """
    # Use the `labels` argument; the previous version silently read (and sorted in place)
    # the notebook-global `l_df`, so the parameter was ignored.
    df = df.loc[labels.index.values, :].sort_index(axis = 0)
    labels = labels.sort_index(axis = 0)
    return df, labels

def drop_cols(df, cols:list) -> pd.DataFrame:
    """
    Return `df` without the columns in `cols`.

    Raises:
        TypeError: if any name in `cols` is not a column of `df`.
    """
    # Validate the requested columns against the frame. The previous loop did the opposite
    # (it raised for every frame column that was NOT being dropped).
    missing = [col for col in cols if col not in df.columns]
    if missing:
        raise TypeError(f'Cols: {missing} not in DataFrame columns {list(df.columns)}')
    return df.drop(cols, axis = 1)

### Metoder for parsing av data:

In [None]:
# This should eventually be moved into a JSON file - and preferably be run during data preprocessing.
#
# Ordered (replacement, pattern) rules; order matters because later rules see the output of
# earlier ones. Abbreviation rules end in `\b[.]?` so that they match the abbreviation with or
# without a literal dot, but never the prefix of a longer word and never the following space
# (an unescaped `.` previously turned e.g. "arbeidsgiver" into "arbeidsidsgiver").
_LEMMA_RULES = tuple((replacement, re.compile(pattern)) for replacement, pattern in (
    ("vedrørende", r'\bvedr\b[.]?'),
    ("angående", r'\bang\b[.]?'),
    ("telefon", r'(\btlf[.]?)'),
    ("offentlig", r'\boff\b[.]?'),
    ("konto", r'\bkto\b[.]?'),
    ("kontonummer", r'(\bkontonummer[et]{0,2})|(\bkontonr[.]?)'),
    # The replacement carries a trailing space, so one following whitespace is consumed too.
    ("på grunn av ", r'\bpga\b[.]?\s?'),
    ("nummer", r'(\bnr[.]?)'),
    ("registrere", r'\breg\b[.]?'),
    ("tidligere", r'\btidl\b[.]?'),
    ("med vennlig hilsen", r'(\bmvh[.]?)'),
    ("utbetaling", r'\butbet+al[ingenrt]*|(\butbet[.]?)'),
    ("dagpenger", r'\bdagpeng[aenr]{0,3}'),
    ("finne", r'(\bfinne[r]?)'),
    ("ferie", r'\bferie[ern]*'),
    ("arbeide", r'\barbeide[ret]{1,3}'),
    # Word boundary instead of \Z, which only matched when the word ended the whole text.
    ("korona", r'(\b[ck]orona(?:virus(?:et)?)?\b)|(\bcovid(?:-?19)?\b)'),
    ("lege", r'(\blegen?)'),
    ("fastlege", r'(\bfastlegen)'),
    ("melding", r'(\bmld)|(\bmelding[enr]{0,3})'),
    ("lønn", r'(\blønn[nea]{0,3})'),
    ("pensjon", r'(\bpensjon[en]{0,2})|(\bpension[en]{0,2})'),
    ("informasjon", r'(\binformasjon[en]{0,2})|(\binfo\b[.]?)'),
    ("oppdatere", r'(\boppdatert)|(\boppdatering)'),
    # "dok"/"doc" only as stand-alone abbreviations (previously "doktor" -> "dokumenttor").
    ("dokument", r'(\bdokument[ern]{0,3})|(\bdo[kc]\b[.]?)'),
    ("aktivitetsplan", r'(\bak?tivitets?plan[en]{0,3})'),
    ("endre", r'(\bendre)|(\bendring)'),
    # Longer alternative first; the old `{\bettersending}` only matched literal braces.
    ("ettersende", r'(\bettersending)|(\bettersende?)'),
    ("arbeidsledig", r'\barb[.]? ?ledig'),
    ("arbeids", r'\barb\b[.]?'),
    ("for eksempel", r'\bf[.]? ?eks\b[.]?'),
    (". ", r'(\b\n)'),
))


def lemmatize(x, do_lower = False):
    """
    Expand common abbreviations and collapse inflected forms in a Norwegian text.

    Parameters:
        x: the text to normalise.
        do_lower: lower-case the text before the rules are applied.

    Returns:
        The text after every rule in _LEMMA_RULES has been applied in order.
    """
    x = x.lower() if do_lower else x
    for replacement, pattern in _LEMMA_RULES:
        x = pattern.sub(replacement, x)
    return x

def lemmatize_data(df: pd.DataFrame, preprocessed = True) -> pd.DataFrame:
    """
    Add a "sentences" column holding the lemmatized text and return the same frame.

    The text is read from `raw_text` when `preprocessed` is truthy, otherwise from
    "Hva kom du hit for å gjøre". The frame is modified in place.
    """
    if preprocessed:
        source_texts = df.raw_text
    else:
        source_texts = df["Hva kom du hit for å gjøre"]
    df["sentences"] = source_texts.apply(lambda text: lemmatize(text))
    return df

"""
    Method to create inputExamples 
    Just instances with lemmatized texts and corresponding labels
"""
def createInputExamples(df, l_df, preprocessed = True):
    inputExamples = []   
    for i, val in enumerate(df["sentences"].values):
        labels = l_df.columns.values[np.where(l_df.values[i] == 1)]
        inputExamples.append(InputExample(i, val[0:MAX_LEN], None, labels))
    return inputExamples

### Metoder for oppbygging av dataen

In [None]:
class Custom_Data():
    """
        Takes in finished parsed data
        Creates instances to be used when training the model.

        Exposes __len__ and __getitem__, which is what DataLoader uses to fetch items.
    """
    def __init__(self, inputExamples, labels: list, tokenizer, MAX_LEN: int):
        """
        Parameters:
            inputExamples: sequence of objects with `text_a` and `labels` attributes.
            labels: ordered label names; position i becomes index i of the target vector.
            tokenizer: object with an `encode_plus` method (no annotation on purpose: the old
                annotation evaluated a transformers submodule at class-definition time).
            MAX_LEN: length every encoded sequence is padded / truncated to.
        """
        self.inputExamples = inputExamples
        self.labels = labels
        self.tokenizer = tokenizer
        self.MAX_LEN = MAX_LEN
        self.label_map = {label : i for i,label in enumerate(labels)}
        
    def __len__(self):
        """Number of examples."""
        return len(self.inputExamples)
        
    def __getitem__(self, ind):
        """
        Tokenize one instance in inputExamples.

        Returns:
            dict with the raw "text", "input_ids", "masks" (attention mask), "tokens"
            (token type ids) as long tensors, and "labels" as a float multi-hot tensor.

        Raises:
            KeyError: if the example carries a label that is not in `labels`.
        """
        example = self.inputExamples[ind]
        encoded_dict = self.tokenizer.encode_plus(example.text_a,
                                            add_special_tokens = True,
                                            max_length = self.MAX_LEN,
                                            # replaces the deprecated pad_to_max_length = True
                                            padding = "max_length",
                                            return_attention_mask = True,
                                            truncation = True
                                            #return_tensors = 'pt',
        )
        input_ids = encoded_dict['input_ids']
        masks = encoded_dict['attention_mask']
        token_type_ids = encoded_dict['token_type_ids']
        
        labels = [0]*len(self.label_map)
        for label in example.labels:
            # Direct lookup: an unknown label raises KeyError naming the label instead of
            # the opaque "list indices must be integers" from indexing with None.
            labels[self.label_map[label]] = 1
            
        return {
            "text":example.text_a,
            "input_ids":torch.tensor(input_ids, dtype=torch.long),
            "masks":torch.tensor(masks, dtype = torch.long),
            "tokens":torch.tensor(token_type_ids, dtype=torch.long),
            "labels":torch.tensor(labels, dtype = torch.float)
        }  

### Oppdeling av dataen og dataloaders for trening

In [None]:
def split_data(dataset: TensorDataset):
    """
    Randomly split `dataset` into a training and a validation part.

    The training part gets int(TRAINING_SIZE * len(dataset)) items, the validation part the
    rest. Both sizes are printed.
    """
    n_train = int(TRAINING_SIZE * len(dataset))
    n_val = len(dataset) - n_train

    train_dataset, val_dataset = random_split(dataset, [n_train, n_val])

    print(f'{n_train:>5,} training samples')
    print(f'{n_val:>5,} validation samples')

    return train_dataset, val_dataset

def build_loaders(train_dataset, val_dataset):
    """Wrap both datasets in DataLoaders of BATCHES items: shuffled for training, sequential for validation."""
    val_sampler = SequentialSampler(val_dataset)
    return (
        DataLoader(train_dataset, shuffle = True, batch_size = BATCHES),
        DataLoader(val_dataset, sampler = val_sampler, batch_size = BATCHES),
    )

### Oppdeling av dataen, og dataloaders for trening 

In [None]:
"""
    Takes in inputExamples
    Creates trainingset and testset by sampling from the inputExamples
    Returns trainingset and labelset
"""
def split_data2(inputExamples: list):
    train_dataset = random.sample(inputExamples, int(TRAINING_SIZE*len(inputExamples)))
    test_dataset = [inputExample for inputExample in inputExamples if inputExample not in train_dataset]
    return train_dataset, test_dataset

def createDataLoaders(train_dataset, test_dataset, labels, tokenizer):
    """
    Wrap both example lists in Custom_Data (with MAX_LEN) and return their DataLoaders.

    Returns:
        (train_loader, test_loader): the first shuffles, the second reads sequentially; both
        use BATCHES as batch size.
    """
    def _as_dataset(examples):
        return Custom_Data(examples, labels, tokenizer, MAX_LEN)

    training_set = _as_dataset(train_dataset)
    test_set = _as_dataset(test_dataset)
    sequential = SequentialSampler(test_set)

    return (
        DataLoader(training_set, shuffle = True, batch_size = BATCHES),
        DataLoader(test_set, sampler = sequential, batch_size = BATCHES),
    )

def texts_to_dataloader(texts, tokenizer, batch_size = 1): 
    """
    Tokenize raw texts and return a sequential DataLoader over them (for prediction).

    Parameters:
        texts: iterable of strings.
        tokenizer: object with an `encode_plus` method.
        batch_size: DataLoader batch size.

    Returns:
        DataLoader yielding dicts with "id", "text", "input_ids", "masks" and "tokens".
    """
    data = []
    for i, text in enumerate(texts):
        tokenized_dict = tokenizer.encode_plus(text,
                                               add_special_tokens = True,
                                               max_length = MAX_LEN,
                                               # replaces the deprecated pad_to_max_length = True
                                               padding = "max_length",
                                               return_attention_mask = True,
                                               truncation = True
        )
        
        data.append({
            "id":i,
            "text":text,
            "input_ids":torch.tensor(tokenized_dict['input_ids'], dtype=torch.long),
            "masks":torch.tensor(tokenized_dict['attention_mask'], dtype = torch.long),
            "tokens":torch.tensor(tokenized_dict['token_type_ids'], dtype = torch.long)            
        })
    
    return DataLoader(data, sampler = SequentialSampler(data), batch_size = batch_size)

### Skall til multilabler BERT:

In [None]:
class BertMultiLabler(nn.Module):
    """
        Class for multilabler BERT.
        Uses normal BertModel in bottom, a dropout layer and a linear classifier on top.

    """
    def __init__(self, config, isTrained):
        """
        Parameters:
            config: model config; `hidden_dropout_prob` and `hidden_size` are read from it.
            isTrained: load the encoder from MODEL_PATH_TRAINED instead of MODEL_PATH.

        The classifier is a new nn.Linear with NUM_LABELS outputs in both cases.
        """
        super().__init__()
        encoder_path = MODEL_PATH_TRAINED if isTrained else MODEL_PATH
        self.base_model = t.BertModel.from_pretrained(encoder_path)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, NUM_LABELS)

    def forward(self, input_ids, masks, tokens):
        """Return the classifier output (logits, no sigmoid) for a batch."""
        encoder_output = self.base_model(input_ids, attention_mask = masks, token_type_ids = tokens)
        # Take the second element by position: this works for a plain tuple and for dict-style
        # model outputs, whereas tuple-unpacking a dict-style output yields its keys.
        pooled_output = encoder_output[1]
        dropout_output = self.dropout(pooled_output)
        output = self.classifier(dropout_output)
        return output


### Tapsfunksjoner og optimizer

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits (nn.BCEWithLogitsLoss with its default settings)."""
    criterion = nn.BCEWithLogitsLoss()
    loss = criterion(outputs, targets)
    return loss

def get_optimizer(model, o1 = False):
    """
    Build the optimizer for `model`.

    Parameters:
        model: module whose parameters are optimized.
        o1: True -> torch.optim.Adam with learning rate LR;
            False -> AdamW with learning rate LR and epsilon EPS.
    """
    if o1:
        return torch.optim.Adam(
            params = model.parameters(),
            lr = LR
        )
    # `AdamW` is never imported in this notebook, so the bare name only resolved if the
    # star-import happened to provide it. torch ships its own AdamW; weight_decay = 0.0 is
    # passed explicitly because torch's default (0.01) would otherwise add weight decay.
    return torch.optim.AdamW(
        model.parameters(),
        lr = LR,
        eps = EPS,
        weight_decay = 0.0
    )

# Kjører alt:

In [None]:
# Load the texts and the label matrix, restrict the labels to `cols_to_use`, and align both frames.
df = load_data()
filter_cols = None  # not used below
l_df = load_labeled_data()
# Column-wise sums of the label matrix before filtering.
print(l_df.sum(axis=0))
# Alternative column selections; only `cols_to_use` is passed on below.
cols_to_use_old = ["Finne info/oversikt", "Ettersende dokumentasjon", "Permittert", "Sjekke/endre kontonr", "Finne skjema/dokumenter","Oppdatere/lage cv", "Pensjon", "Ukjent","Dagpenger"]
intent_cols = ["Trekke/Fjerne","Klage","Spørsmål","Finne","Registrere/lage","Endre/Oppdatere","Søke","Sjekke","Sende","@@@@@@@","Ikke oppgitt"]
cols_to_use = ["Annet","Pensjon","Dagpenger","Skjema/søknad","Utbetaling","Dokumentasjon","Tekniske problemer","Permittert","Personopplysning","SPAM","Informasjon","Skatt/Årsoppgave", "Arbeidssøker","Sykepenger","Sykemelding","CV"]
l_df = filter_labeled_data(l_df, cols_to_use)
# NUM_LABELS is read by BertMultiLabler.__init__ to size the classifier layer.
NUM_LABELS = len(l_df.columns.values)
df, l_df = filter_on_labeled_data(df, l_df)
l_df

In [None]:
#df = load_data()
#cols_to_use = ["Finne info/oversikt", "Ettersende dokumentasjon", "Permittert", "Sjekke/endre kontonr", "Finne skjema/dokumenter","Oppdatere/lage cv"]
#cols_to_use = ["Finne info/oversikt", "Ettersende dokumentasjon", "Sjekke/endre kontonr"]
#NUM_LABELS = len(cols_to_use)
#l_df = load_labeled_data(cols_to_use, old = False)
#df, l_df = filter_on_labeled_data(df,l_df)

#### Størrelse data: 

In [None]:
# Column-wise sum of the label matrix (one value per label column).
l_df.sum(axis=0)

In [None]:
# Add the lemmatized "sentences" column, then pair each sentence with its labels.
df = lemmatize_data(df, preprocessed = True)
# `preprocessed` is accepted by createInputExamples but not used there.
inputExamples = createInputExamples(df, l_df, preprocessed = False)

In [None]:
# Random train/test split of the examples, wrapped in DataLoaders; label order follows l_df's columns.
train_data, test_data = split_data2(inputExamples)
train_loader, test_loader = createDataLoaders(train_data, test_data, l_df.columns.values, tokenizer)

### Modell

In [None]:
# Build the classifier from the JSON config; isTrained = True loads the encoder from MODEL_PATH_TRAINED.
model = BertMultiLabler(t.BertConfig.from_json_file(CONFIG_PATH), isTrained = True)
model.to(DEVICE)

### Trener opp modellen

In [None]:
# o1 = True selects the torch.optim.Adam branch of get_optimizer.
optimizer = get_optimizer(model, o1 = True)

In [None]:
def train(epoch):
    """
    Run one pass over `train_loader`, updating the global `model` with the global `optimizer`.

    Parameters:
        epoch: only used in the progress printout (every 10th batch prints the current loss).
    """
    model.train()
    # A named counter instead of `_`, which by convention marks a value that is never read.
    for step, data in enumerate(train_loader, 0):
        ids = data['input_ids'].to(DEVICE, dtype = torch.long)
        mask = data['masks'].to(DEVICE, dtype = torch.long)
        token_type_ids = data['tokens'].to(DEVICE, dtype = torch.long)
        targets = data['labels'].to(DEVICE, dtype = torch.float)

        # Gradients are cleared once per step (the second, redundant zero_grad() was removed).
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 10 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

#for epoch in range(EPOCHS):
#    train(epoch)
# Training a bit more - though the final classification layer has to be trained from scratch again.. :(
# The epoch numbers (4 and 5) are only used in train()'s progress printout.
for epoch in range(4,6):
    train(epoch)

In [None]:
# Saves only model.base_model (the BERT encoder); the dropout/classifier layers are not written here.
model.base_model.save_pretrained("../hjemmesnekk/multilingual-trained/26.08_16_labels_8+2_epochs_kun_BERT")

### Validerer modellen

In [None]:
"""
   Since we have a multiclass, multilabling problem, metrics such as f1 is not straight forward. 
   Thus using both f1 and hamming distance:
"""
def conf_matrix(outputs, targets):
    targets = np.array(targets) == 1 # Converting to true false matrix
    assert outputs.shape == targets.shape
    
    tps, tns, fps, fns = 0,0,0,0
    for i, doc in enumerate(outputs):
        for j, guessed_topic in enumerate(doc):
            if guessed_topic == targets[i][j] == 1:
                tps += 1
            elif guessed_topic == targets[i][j] == 0:
                tns += 1
            elif (guessed_topic == 1) and (targets[i][j] == 0):
                fps += 1
            elif (guessed_topic == 0) and (targets[i][j] == 1):
                fns += 1
            else:
                raise TypeError("WTF")
    
    
    acc = (tps+tns)/(tps+tns+fps+fns)
    prec = (tps)/(tps+fps)
    rec = (tps)/(tps+fns)
    f1 = 2*(prec*rec)/(prec+rec)
    return {
        "accuracy":acc,
        "precision":prec,
        "recall":rec,
        "f1":f1
    }
    

In [None]:
def validate(epoch):
    """
    Run the global `model` over `test_loader` without gradients.

    Parameters:
        epoch: accepted for symmetry with train(); not used.

    Returns:
        (fin_outputs, fin_targets, texts): sigmoid probabilities and targets as nested lists,
        one row per example, plus `texts`, which is always an empty list because collecting
        the raw texts is disabled.
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    texts = []

    with torch.no_grad():
        for data in test_loader:
            ids = data['input_ids'].to(DEVICE, dtype = torch.long)
            mask = data['masks'].to(DEVICE, dtype = torch.long)
            token_type_ids = data['tokens'].to(DEVICE, dtype = torch.long)
            targets = data['labels'].to(DEVICE, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

            #texts.extend(data['text'])

    return fin_outputs, fin_targets, texts

In [None]:
# Single validation pass: threshold the probabilities at PRED_TRESH and report pooled metrics.
for epoch in range(1):
    outputs, targets, texts = validate(epoch)
    outputs = np.array(outputs) >= PRED_TRESH
    res = conf_matrix(outputs, targets)
    
    print(f"Accuracy: {np.round(res['accuracy'],4)} for epoch {epoch}")
    print(f'F1-score:{np.round(res["f1"],4)} for epoch: {epoch}')

### Predicte helt nye instanser:

In [None]:
def predict(pred_texts: list, labels, tokenizer, threshold = 0.2):
    """
    Print the predicted labels for each text in `pred_texts`, one text at a time.

    Parameters:
        pred_texts: list of raw texts.
        labels: ordered label names; position i names output i of the model.
        tokenizer: passed on to texts_to_dataloader.
        threshold: minimum sigmoid probability for a label to be reported.

    Returns:
        None; the text, the boolean prediction vector, the selected indices and the label
        names are printed.
    """
    model.eval()

    label_map = {i : label for i, label in enumerate(labels)}
    # texts_to_dataloader defaults to batch_size 1, so each batch holds exactly one text.
    pred_data = texts_to_dataloader(pred_texts, tokenizer)

    with torch.no_grad():
        for data in pred_data:
            ids = data['input_ids'].to(DEVICE, dtype = torch.long)
            mask = data['masks'].to(DEVICE, dtype = torch.long)
            tokens = data['tokens'].to(DEVICE, dtype = torch.long)

            outputs = model(ids, mask, tokens)
            outputs = torch.sigmoid(outputs).cpu().detach()
            preds = (np.array(outputs) >= threshold)[0]
            inds = np.where(preds == True)[0].tolist()

            print(data['text'])
            print(preds)
            print(inds)
            for ind in inds:
                print(ind, label_map.get(ind))
        

In [None]:
# Hand-written sample texts to sanity-check the predictions (printed by predict()).
texts = ["Hei jeg ønsker å endre kontonr mitt.","Hei jeg vil lage cv etterpå", "Jeg har blit permitert", "Ønsker å laste opp vedlegg", "Ønsker å lage CV og ettersende dokumentasjon"]
predict(texts, l_df.columns.values, tokenizer)

# Predikerer alle tekstene i dataen, og lagrer til pickle fil:

- Denne metoden er ganske treig per nå, bruker ish 3s / tekst grunnet mye i minnet og med batch-size 1
- Prøver å øke batch size for å se hvordan den presterer da, ev. med 1 batch (trenger ikke padding da)
- Bør også gjøres med flere prosesser

In [None]:
def predict_all_texts(df, labels, tokenizer, batch_size = 1, limit = 200, threshold = 0.2) -> pd.DataFrame:
    """
    Predict labels for the texts in df["raw_text"] and return them in a "Prediction" column.

    Parameters:
        df: DataFrame with a "raw_text" column.
        labels: ordered label names; position i names output i of the model.
        tokenizer: passed on to texts_to_dataloader.
        batch_size: number of texts per forward pass.
        limit: only the first `limit` rows are predicted; None predicts every row.
        threshold: minimum sigmoid probability for a label to be assigned.

    Returns:
        A copy of the (possibly truncated) frame with a "Prediction" column holding a list of
        label names per row. The input frame is not modified.
    """
    model.eval()

    # Copy so that adding a column below does not write into a slice of the caller's frame.
    df = (df if limit is None else df.iloc[0:limit,:]).copy()

    pred_texts = df["raw_text"].apply(lambda x: lemmatize(x)).values
    label_map = {i : label for i, label in enumerate(labels)}
    pred_data = texts_to_dataloader(pred_texts, tokenizer, batch_size = batch_size)

    predictions = []
    with torch.no_grad():
        for data in tqdm(pred_data):
            ids = data['input_ids'].to(DEVICE, dtype = torch.long)
            mask = data['masks'].to(DEVICE, dtype = torch.long)
            tokens = data['tokens'].to(DEVICE, dtype = torch.long)

            outputs = model(ids, mask, tokens)
            probabilities = torch.sigmoid(outputs).cpu().detach().numpy()
            # One prediction per row of the batch. Previously only row 0 was kept, so any
            # batch_size > 1 produced too few predictions and tripped the assert below.
            for row in probabilities >= threshold:
                inds = np.where(row)[0].tolist()
                predictions.append([str(label_map.get(ind)) for ind in inds])

    assert len(pred_texts) == len(predictions)

    df["Prediction"] = predictions

    return df
    
def save_predicted_df(df):
    """Placeholder: saving the predictions is not implemented; does nothing and returns None."""
    return None

In [None]:
# Predict with 16 texts per batch; label names come from the columns of l_df.
predicted_df = predict_all_texts(df,l_df.columns.values, tokenizer, batch_size = 16)

In [None]:
# Display the frame returned by predict_all_texts.
predicted_df

In [None]:
# Save the predicted DataFrame (without the "sentences" column) as CSV and as pickle.
# NOTE(review): the file names say 1833 instances, while predict_all_texts cuts the frame to its first 200 rows - confirm.
predicted_df_fixed = predicted_df.copy().drop("sentences", axis = 1)
predicted_df_fixed.to_csv("../hjemmesnekk/predicted_data/predicted_1833_instances_26.08.csv")
predicted_df_fixed.to_pickle("../hjemmesnekk/predicted_data/predicted_1833_instances_26_08.pkl")

# Visualiserer resultatene

In [None]:
from datapakke_custom import classifier_datapakke, convert_to_matrix

In [None]:
# Wrap the saved predictions in the project's classifier_datapakke helper.
cd = classifier_datapakke(predicted_df_fixed)

In [None]:
# Build and show the pie chart of co-occurring labels.
# NOTE(review): min_Antall presumably is a minimum count for a label combination to be shown - confirm.
fig_monthly_piechart = cd.pie_chart_co_occuring_labels(min_Antall = 1)
fig_monthly_piechart.show()

In [None]:
# Display the helper's private `_df_labels` attribute for inspection.
succ = cd._df_labels
succ  