# Document analysis with SecBERT trained classifier
Mount your own Google Drive as the working space by running the following three cells.

In [None]:
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cd '/content/drive/MyDrive/secBert'

In [None]:
import os
os.chdir('/content/drive/MyDrive/secBert')

In [None]:
!pip install pandas
!pip3 install torch torchvision
!pip install transformers
!pip install sklearn 

In [None]:
import torch
import pandas as pd 

from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig, AutoModel
from sklearn.preprocessing import LabelEncoder

from time import sleep

In [None]:
# Tokenizer of the SecBERT checkpoint; it is passed to the Triage datasets below.
tokenizer = AutoTokenizer.from_pretrained("jackaduma/SecBERT")

# NOTE(review): `pretrained_model` and `config` are not referenced by any later
# cell visible in this notebook (SecBERTClass loads its own config and weights)
# — confirm whether they are still needed.
pretrained_model = AutoModelForMaskedLM.from_pretrained("jackaduma/SecBERT")
config = BertConfig.from_pretrained("jackaduma/SecBERT", output_hidden_states=True)

In [None]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [None]:
# Full labelled dataset; its `label_tec` column defines the label space.
df = pd.read_csv('./dataset_new.csv').reset_index()
df['sentence'] = df['sentence'].astype(str)

# Number of distinct technique labels (passed to the classifier as num_classes).
LABELS = df['label_tec'].nunique()

# Fit the label <-> integer mapping on every label found in the full dataset.
encoder = LabelEncoder()
encoder.fit(df['label_tec'])

LabelEncoder()

In [None]:
# Training split (the positional index is materialised as an `index` column).
train_dataset = pd.read_csv('./train_dataset_tram.csv').reset_index()

# Test split, with the technique labels encoded by the encoder fitted above.
test_dataset = pd.read_csv('./testset_tram_x_ours.csv').assign(
    enc_label=lambda frame: encoder.transform(frame['label_tec'])
)
test_dataset

In [None]:
# Hyper-parameters shared by the datasets, data loaders and optimizer below
MAX_LEN = 512            # every sentence is padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 16    # batch size of the training loader
VALID_BATCH_SIZE = 32    # batch size of the evaluation/inference loaders
EPOCHS = 10
LEARNING_RATE = 1e-5

In [None]:
class Triage(Dataset):
    """Torch dataset that tokenizes the ``sentence`` column of a DataFrame.

    Args:
        dataframe: frame with a ``sentence`` column and, optionally, an
            ``enc_label`` column holding integer-encoded class labels.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: every sentence is padded/truncated to this many tokens.

    Each item is a dict with ``sentence`` (whitespace-normalised text), ``ids``
    and ``mask`` (long tensors) and, when the frame has an ``enc_label``
    column, ``targets`` (a long scalar tensor).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # Positional lookup (.iloc): data loaders ask for positions 0..len-1,
        # which only coincide with index labels on a default RangeIndex.
        sentence = str(self.data['sentence'].iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        sentence = " ".join(sentence.split())
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        item = {
            'sentence': sentence,
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)
        }
        # Unlabelled frames (e.g. raw report sentences) yield no 'targets' key.
        if 'enc_label' in self.data:
            item['targets'] = torch.tensor(self.data['enc_label'].iloc[index], dtype=torch.long)
        return item

    def __len__(self):
        return self.len

In [None]:
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both frames so that sentences are tokenized lazily, batch by batch.
training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

# Training batches are reshuffled on every pass.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation batches keep the dataset order: shuffling does not change any
# metric, it only makes the order of the collected outputs differ between runs.
# `test_params` is also reused further down for the report-sentence loader.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (12945, 6)
TRAIN Dataset: (1185, 5)
TEST Dataset: (294, 4)


In [None]:
# Customized model: a dense layer, ReLU, dropout and a linear classification
# head on top of the SecBERT encoder.

class SecBERTClass(torch.nn.Module):
    """Sentence classifier built on a pretrained BERT-style encoder.

    Args:
        pretrained_model_name: Hugging Face model id or path of the encoder.
        num_classes: number of output classes (required; the ``None`` default
            is kept only for signature compatibility).
        dropout: dropout probability applied before the classification head.

    Raises:
        TypeError: if ``num_classes`` is ``None``.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.3):
        super().__init__()
        # Fail before the encoder is loaded instead of inside torch.nn.Linear.
        if num_classes is None:
            raise TypeError("num_classes must be an integer number of output classes")
        config = BertConfig.from_pretrained(pretrained_model_name, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config).base_model #pick only the main body of the model
        # The encoder is fine-tuned together with the head; to freeze it, set
        # requires_grad = False on every parameter of self.model.
        # Size the head from the encoder's config rather than a literal 768.
        hidden_size = config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores for a batch of token ids."""
        output_1 = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden_state = output_1[0]
        # Hidden state of the first token of every sequence.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

Load the previously trained model

In [None]:
# Rebuild the classifier architecture, then restore the fine-tuned weights.
# NOTE(review): torch.load unpickles the checkpoint file — only load
# checkpoints that come from a trusted source.
model = SecBERTClass("jackaduma/SecBERT", LABELS)
trained_weights = torch.load('trained_secbert.pt', map_location=torch.device('cpu'))
model.load_state_dict(trained_weights)

Some weights of the model checkpoint at jackaduma/SecBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
model.to(device)

In [None]:
len(train_dataset)

1185

In [None]:
# Cross-entropy loss and Adam optimizer over all parameters of the classifier
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Helper that counts how many predictions agree with the targets
def calcuate_accu(big_idx, targets):
    """Return the number of positions where `big_idx` equals `targets`."""
    matches = big_idx == targets
    return matches.sum().item()

In [None]:
def check_accuracy(loader, model):
    """Evaluate `model` on every batch of `loader` and print the accuracy.

    Args:
        loader: iterable of batches, each a dict with 'ids', 'mask',
            'targets' (tensors) and 'sentence' (list of str).
        model: module called as ``model(ids, mask)`` that returns one row of
            class scores per sample.

    Returns:
        ``(predicted, targets, sentences, predictions_arr)``: four lists with
        one entry per sample — the score row, the target, the sentence text
        and the predicted class. Tensors are returned on the CPU.
    """
    # Feed the inputs to whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    num_correct = 0
    num_samples = 0

    sentences = []
    predicted = []
    targets = []
    predictions_arr = []
    model.eval()

    with torch.no_grad():
        for data in loader:
            x = data['ids'].to(run_device, dtype = torch.long)
            mask = data['mask'].to(run_device, dtype = torch.long)
            y = data['targets'].to(run_device, dtype = torch.long)

            scores = model(x, mask)
            _, predictions = scores.max(1)
            # .item() keeps the running count a plain int instead of a tensor.
            num_correct += (predictions == y).sum().item()
            num_samples += predictions.size(0)

            sentences += data['sentence']
            # Extending with a tensor appends one entry per sample (row).
            predicted.extend(scores.cpu())
            predictions_arr.extend(predictions.cpu())
            targets.extend(y.cpu())

    # An empty loader reports 0.00 instead of dividing by zero.
    accuracy = float(num_correct)/float(num_samples)*100 if num_samples else 0.0
    print(
        f"Got {num_correct} / {num_samples} with accuracy {accuracy:.2f}"
    )

    return predicted, targets, sentences, predictions_arr

In [None]:
predicted, targets, sentences, predictions = check_accuracy(testing_loader, model)

Got 137 / 294 with accuracy 46.60


In [None]:
targets

In [None]:
# Show the first evaluated sentence followed by its predicted and true labels.
# inverse_transform returns an array, so take its single element explicitly
# instead of concatenating a str with an array (which prints an array repr).
predicted_label = encoder.inverse_transform([predictions[0].item()])[0]
true_label = encoder.inverse_transform([targets[0].item()])[0]
print(f"{sentences[0]} {predicted_label} {true_label}")

['Additionally, a small number of campaigns over this same period also made use of various file-sharing platforms like Dropbox for hosting the malicious documents rather than directly attaching them to the messages themselves.Figure 2: Example malicious Excel documentSimilar to the technique described in our previous blog about Remcos, the contents of the documents have been intentionally made to appear as if they are blurry, with the user being prompted to enable editing to have a clearer view of the contents T1566 T1204']


In [None]:
check_accuracy(testing_loader, model)

Got 1878 / 2589 with accuracy 72.54


In [None]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy 

In [None]:
import torch
from torchmetrics import F1Score, Precision, Recall, Accuracy
# Micro-averaged F1 over all classes. torchmetrics >= 0.11 requires the `task`
# argument; micro averaging is spelled out because, on single-label multiclass
# data, it makes F1, precision and recall all equal to plain accuracy.
f1 = F1Score(task="multiclass", num_classes=LABELS, average="micro")
preds = torch.stack(predicted)  # one row of class scores per evaluated sentence
tags = torch.tensor(targets)
f1(preds, tags)

tensor(0.4660)

In [None]:
# Micro-averaged precision (torchmetrics >= 0.11 requires the `task` argument).
precision = Precision(task="multiclass", num_classes=LABELS, average="micro")
precision(preds,tags)

tensor(0.4660)

In [None]:
# Micro-averaged recall (torchmetrics >= 0.11 requires the `task` argument).
recall = Recall(task="multiclass", num_classes=LABELS, average="micro")
recall(preds, tags)

tensor(0.4660)

In [None]:
# Top-3 accuracy: a sample counts as correct when its target is among the three
# highest-scoring classes (torchmetrics >= 0.11 requires the `task` argument).
top_k = Accuracy(task="multiclass", num_classes=LABELS, top_k=3)
top_k(preds, tags)

tensor(0.6531)

In [None]:
predicted

In [None]:
from nltk.tokenize import sent_tokenize

def remove_empty_lines(text):
    """Drop blank and whitespace-only lines from `text`.

    Every kept line is returned unchanged and terminated by a newline, so a
    non-empty result always ends with a newline character.
    """
    kept = (line for line in text.split("\n") if line.strip() != "")
    return "".join(line + "\n" for line in kept)

def combine_text(list_of_text):
    """Join the given strings into one string, separated by single spaces."""
    return ' '.join(list_of_text)


In [None]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import yaml
import re

def repl(matchobj):
    """Substitution callback for re.sub: wrap the first capture group in commas."""
    return f",{matchobj.group(1)},"

def load_regex(filename):
    """Parse the YAML file `filename` and return its content.

    The document is returned exactly as yaml.safe_load produces it.
    NOTE(review): apply_regex_to_string reads 'regex' and 'code' from each
    entry, so the file presumably holds a list of such mappings — confirm
    against regex.yml.
    """
    with open(filename, 'r') as val:
        return yaml.safe_load(val)

def apply_regex_to_string(regex_list, string):
    new_string = string
    for rex in regex_list:
        reg = rex.get('regex').strip()
        raw_s = r'{}'.format(reg)
        if re.search(raw_s, string):
            new_string = re.sub(raw_s, rex.get('code') + " ", string)
            break
    return new_string


In [None]:
fin6_tec_2 = ['T1134', 'T1059', 'T1562', 'T1036', 'T1588', 'T1003', 'T1021', 'T1569', 'T1078', 'T1102',
        'T1087', 'T1482', 'T1069', 'T1018', 'T1016', 'T1548', 'T1071', 'T1185', 'T1059', 
        'T1543', 'T1132', 'T1005', 'T1001', 'T1140', 'T1573', 'T1068', 'T1083',
        'T1564', 'T1562', 'T1070', 'T1105', 'T1056', 'T1112', 'T1046', 'T1095', 'T1027', 'T1137', 
        'T1003', 'T1069', 'T1057', 'T1055', 'T1572', 'T1572', 'T1090', 'T1012', 'T1620', 'T1021',
        'T1018', 'T1029', 'T1113', 'T1518', 'T1553', 'T1218', 'T1049', 'T1007', 'T1569', 'T1550', 
        'T1078', 'T1047']

fin6_tec_1 = ['T1087', 'T1560', 'T1119', 'T1547', 'T1110', 'T1059', 'T1074', 'T1573', 'T1068', 'T1070', 
        'T1046', 'T1003', 'T1572', 'T1021', 'T1018', 'T1053', 'T1078', 'T1003']

# MenuPass [8]

menuPass_tec_8 = ['T1560', 'T1119', 'T1059', 'T1005', 'T1074', 'T1210', 'T1083', 'T1574', 'T1106', 'T1027', 'T1003', 'T1199', 'T1078', 'T1047']

adFind = ['T1087', 'T1482', 'T1069', 'T1018', 'T1016']

certutil = ['T1140', 'T1105', 'T1553']

quasarRAT = ['T1059', 'T1555', 'T1573', 'T1105', 'T1056', 'T1112', 'T1090', 'T1021', 'T1053', 'T1553', 'T1082', 'T1552', 'T1125']

menuPass_tec_8.extend(adFind)
menuPass_tec_8.extend(certutil)
menuPass_tec_8.extend(quasarRAT)

# MenuPass [2]

menuPass_tec_2 = ['T1583', 'T1560', 'T1568', 'T1070', 'T1056', 'T1036', 'T1105', 'T1566', 'T1021', 'T1199', 'T1204', 'T1078']

poisonIvy = ['T1010', 'T1547', 'T1059', 'T1543', 'T1005', 'T1074', 'T1573', 'T1105', 'T1056', 'T1112', 'T1027', 
'T1055', 'T1014']

menuPass_tec_2.extend(poisonIvy)

# WizardSpider [2]

wizardSpider_tec_2 = ['T1547', 'T1059', 'T1562', 'T1135', 'T1566', 'T1055', 'T1021', 'T1053', 'T1558', 'T1204', 'T1047']

bloodHound = ['T1087', 'T1560', 'T1059', 'T1482', 'T1615', 'T1106', 'T1201', 'T1069', 'T1018', 'T1033']

cobaltStrike = ['T1548', 'T1134', 'T1087', 'T1071', 'T1197', 'T1185', 'T1059', 'T1043', 'T1543', 'T1132', 'T1005', 'T1001', 'T1030', 'T1140', 'T1573', 'T1203', 'T1068', 'T1083', 'T1564', 'T1562', 'T1070', 'T1105', 'T1056', 'T1112', 'T1026', 'T1106', 'T1046', 'T1135', 'T1095', 'T1027', 'T1137', 'T1003', 'T1069', 'T1057', 'T1055', 'T1572', 'T1090', 'T1012', 'T1620', 'T1021', 'T1018', 'T1029', 'T1113', 'T1518', 'T1553', 'T1218', 'T1016', 'T1049', 'T1007', 'T1569', 'T1550', 'T1078', 'T1047']

empire =  ['T1548', 'T1134', 'T1087', 'T1557', 'T1071', 'T1560', 'T1547', 'T1217', 'T1115', 'T1059', 'T1043', 'T1136', 'T1543', 'T1555', 'T1484', 'T1482', 'T1114', 'T1573', 'T1546', 'T1068', 'T1083', 'T1574', 'T1210', 'T1615', 'T1567', 'T1070',  'T1056', 'T1105', 'T1056', 'T1106', 'T1046', 'T1135', 'T1040', 'T1027', 'T1003', 'T1057', 'T1055', 'T1021', 'T1053', 'T1113', 'T1518', 'T1558', 'T1082', 'T1016', 'T1049', 'T1569', 'T1127', 'T1552', 'T1550', 'T1125', 'T1102', 'T1047']

mimikatz = ['T1134', 'T1098', 'T1547', 'T1555', 'T1003', 'T1207', 'T1558', 'T1552', 'T1550']

ping = ['T1018']

ryuk = ['T1134', 'T1547', 'T1059', 'T1486', 'T1083', 'T1222', 'T1562', 'T1490', 'T0828', 'T1036', 'T1106', 'T1027', 'T1057', 'T1055', 'T1021', 'T1053', 'T1489', 'T1082', 'T1614', 'T1016', 'T1205', 'T1078']

trickBot = ['T1087', 'T1087', 'T1071', 'T1547', 'T1185', 'T1110', 'T1059', 'T1059', 'T1043', 'T1543', 'T1555', 'T1555', 'T1132', 'T1005', 'T1140', 'T1482', 'T1573', 'T1041', 'T1210', 'T1008', 'T1083', 'T1495', 'T1562', 'T1105', 'T1056', 'T1559', 'T1036', 'T1112', 'T1106', 'T1135', 'T1571', 'T1027', 'T1027', 'T1069', 'T1566', 'T1566', 'T1542', 'T1057', 'T1055', 'T1055', 'T1090', 'T1219', 'T1021', 'T1018', 'T1053', 'T1553', 'T1082', 'T1016', 'T1033', 'T1007', 'T1552', 'T1552', 'T1204', 'T1497']

wizardSpider_tec_2.extend(bloodHound)
wizardSpider_tec_2.extend(cobaltStrike)
wizardSpider_tec_2.extend(empire)
wizardSpider_tec_2.extend(mimikatz)
wizardSpider_tec_2.extend(ping)
wizardSpider_tec_2.extend(ryuk)
wizardSpider_tec_2.extend(trickBot)

#WizardSpider [7]

wizardSpider_tec_7 = ['T1087', 'T1059', 'T1048', 'T1210', 'T1562', 'T1027', 'T1021', 'T1018', 'T1489', 'T1518', 'T1558', 'T1082', 'T1569']

# AdFind techniques. IDs are upper-case like every other entry in these lists;
# the former lower-case 't1069' added a phantom extra technique to the set.
adFind = ['T1087', 'T1482', 'T1069', 'T1018', 'T1016']

#CobaltStrike

net = ['T1087', 'T1087', 'T1136', 'T1136', 'T1070', 'T1135', 'T1201', 'T1069', 'T1069', 'T1021', 'T1018', 'T1049', 'T1007', 'T1569', 'T1124']

nltest = ['T1482', 'T1018', 'T1016']

#Ping

#Ryuk

wizardSpider_tec_7.extend(adFind)
wizardSpider_tec_7.extend(cobaltStrike)
wizardSpider_tec_7.extend(net)
wizardSpider_tec_7.extend(nltest)
wizardSpider_tec_7.extend(ping)
wizardSpider_tec_7.extend(ryuk)

In [None]:
fin6_files = ['documents/FIN6/Follow The Money-Dissecting the Operations of the Cyber Crime Group FIN6[1].txt', 
                'documents/FIN6/Pick-Six-Intercepting a FIN6 Intrusion, an Actor Recently Tied to Ryuk and LockerGoga Ransomware[2].txt',
                'documents/FIN6/intelligence_summary.txt']

menuPass_files = ['documents/MenuPass/2018_12_20_united_states_v_zhu_hua_indictment[2].txt', 
                'documents/MenuPass/Japan-Linked Organizations Targeted in Long-Running and Sophisticated Attack Campaign[8].txt']

wizardSpider_files = ['documents/WizardSpider/Ryuk’s Return[7].txt',
                     'documents/WizardSpider/Ransomware Activity Targeting the Healthcare and Public Health Sector. Retrieved October 28, 2020[2].txt']

In [None]:
file_name = wizardSpider_files[1]
techniques = wizardSpider_tec_2

In [None]:
# Read the selected report from its text file(s)
lines = []
file_paths = [file_name]
for file_path in file_paths:
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the report files are UTF-8 — confirm.
    with open(file_path, encoding='utf-8') as f:
        lines += f.readlines()
import re
## Apply regex
regex_list = load_regex("regex.yml")

text = combine_text(lines)
# Wrap `%NAME%/path` tokens in commas (see repl). Raw strings keep the patterns
# identical while avoiding invalid escape sequences such as '\w' and '\('.
text = re.sub(r'(%(\w+)%(\/[^\s]+))', repl, text)
text = apply_regex_to_string(regex_list, text)
# Drop parenthesised spans that open and close on the same line.
text = re.sub(r'\(.*?\)', '', text)
text = remove_empty_lines(text)
text = text.strip()
sentences = sent_tokenize(text)

# Concatenation of each pair of neighbouring sentences.
# NOTE(review): `double_sentences` is not used by any cell visible in this notebook.
double_sentences = []

for i in range(1, len(sentences)):
    new_sen = sentences[i-1] + sentences[i]
    double_sentences.append(new_sen)

# Unlabelled dataset: one row per report sentence (no `enc_label` column).
# NOTE: this rebinds `df`, `sentences` and `testing_loader` from earlier cells.
data = {'sentence': sentences}
df = pd.DataFrame(data, columns=['sentence'])
sentence_set = Triage(df, tokenizer, MAX_LEN)
testing_loader = DataLoader(sentence_set, **test_params)

In [None]:
len(sentences)

275

In [None]:
# Classify every report sentence: keep the predicted class of each sentence and
# the softmax probabilities over all classes.
# Inference mode is set here so that dropout is disabled even when the
# evaluation cell above (which also calls model.eval()) has not been run.
model.eval()
predicted = []
predict_proba_scores = []
with torch.no_grad():
    for data in testing_loader:
        x = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        scores = model(x, mask)
        _, predictions = scores.max(1)
        proba_scores = torch.nn.functional.softmax(scores, dim=1)

        # Extending with a tensor appends one entry per sentence; results are
        # moved to the CPU so later cells do not depend on the device.
        predicted.extend(predictions.cpu())
        predict_proba_scores.extend(proba_scores.cpu())


In [None]:
# Turn the per-sentence prediction tensors into plain Python numbers.
# NOTE: not idempotent — a second run fails because ints have no .item().
predicted = list(map(lambda pred: pred.item(), predicted))
predicted

In [None]:
# Keep only the highest class probability of every sentence.
# NOTE: not idempotent — a second run fails because floats have no .max().
predict_proba_scores = list(map(lambda pred: pred.max().item(), predict_proba_scores))
predict_proba_scores

In [None]:
predict_proba_scores

In [None]:
# Map the encoded class ids back to their technique labels.
technique_labels = encoder.inverse_transform(predicted)
predicted = technique_labels
predicted

In [None]:
def f_measure(recall, precision):
    """Return the harmonic mean of `precision` and `recall`.

    When either value is zero the function returns 0.01 rather than 0.
    NOTE(review): the reason for the 0.01 floor is not visible here — confirm.
    """
    if recall == 0 or precision == 0:
        return 0.01
    return (2*precision*recall)/(precision+recall)

In [None]:
# Confidence cut-offs to sweep, plus one result list per reported metric
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
precisions, recalls, f1s = [], [], []
corrected_pred, accepted_pred, correct_on_uniques = [], [], []

print(len(predicted))
print(predicted)

# Number of classified sentences (this rebinds `lines`, previously the raw file lines)
lines = len(predicted)

275
['T1102' 'T1102' 'T1071' 'T1135' 'T1189' 'T1057' 'T1070' 'T1102' 'T1036'
 'T1027' 'T1562' 'T1102' 'T1486' 'T1591' 'T1059' 'T1102' 'T1055' 'T1204'
 'T1486' 'T1218' 'T1102' 'T1574' 'T1140' 'T1218' 'T1102' 'T1588' 'T1550'
 'T1102' 'T1102' 'T1562' 'T1542' 'T1102' 'T1046' 'T1102' 'T1592' 'T1497'
 'T1102' 'T1553' 'T1562' 'T1102' 'T1562' 'T1547' 'T1189' 'T1070' 'T1074'
 'T1102' 'T1102' 'T1018' 'T1587' 'T1195' 'T1566' 'T1505' 'T1036' 'T1036'
 'T1562' 'T1102' 'T1001' 'T1046' 'T1102' 'T1102' 'T1046' 'T1048' 'T1583'
 'T1102' 'T1486' 'T1218' 'T1120' 'T1071' 'T1562' 'T1218' 'T1218' 'T1102'
 'T1608' 'T1587' 'T1480' 'T1486' 'T1102' 'T1592' 'T1027' 'T1102' 'T1102'
 'T1140' 'T1102' 'T1496' 'T1102' 'T1012' 'T1036' 'T1102' 'T1566' 'T1566'
 'T1137' 'T1029' 'T1059' 'T1027' 'T1486' 'T1021' 'T1102' 'T1102' 'T1102'
 'T1496' 'T1102' 'T1016' 'T1102' 'T1542' 'T1102' 'T1102' 'T1070' 'T1001'
 'T1003' 'T1102' 'T1137' 'T1486' 'T1490' 'T1480' 'T1016' 'T1592' 'T1106'
 'T1486' 'T1484' 'T1018' 'T1486' 'T1102' 'T1542

In [None]:
# Sweep the confidence thresholds: for each one keep only the predictions whose
# top-class probability exceeds it, then score them against the reference
# technique list of the selected document.

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every result to the lists.
precisions, recalls, f1s = [], [], []
corrected_pred, accepted_pred, correct_on_uniques = [], [], []

reference_tecs = set(techniques)
len_tecs = len(reference_tecs)

for threshold in thresholds:
    accepted = [
        top_class
        for top_class, proba in zip(predicted, predict_proba_scores)
        if proba > threshold
    ]
    unique_accepted = set(accepted)

    # Accepted sentence-level predictions that name a reference technique.
    correct = sum(1 for pred in accepted if pred in reference_tecs)
    print(len(accepted))
    print(correct)

    # Sentence-level precision, as a percentage.
    if len(accepted) != 0:
        precision = correct/len(accepted)*100
    else:
        precision = 0

    precision = round(precision,2)
    print(precision) #accuracy or precision?

    precisions.append(precision)

    # Reference techniques that no accepted prediction covered.
    tecs = reference_tecs - unique_accepted
    found = len_tecs - len(tecs)

    recall = str(found) + '/' + str(len_tecs)

    print(recall) #Recall

    recalls.append(recall)
    # An empty reference list gives recall 0 instead of a ZeroDivisionError.
    recall = found/len_tecs if len_tecs != 0 else 0

    corrected_pred.append(correct)
    accepted_pred.append(len(accepted))

    # Share of the distinct accepted labels that are reference techniques;
    # this is the precision that enters the F1 score.
    cou = str(found) + '/' + str(len(unique_accepted))
    correct_on_uniques.append(cou)
    cou = 0 if len(unique_accepted) == 0 else found/len(unique_accepted)

    f1 = f_measure(recall=recall, precision=cou)
    f1 = round(f1,2)
    f1s.append(f1)

    print("Threshold: " + str(threshold) + ": " + str(cou) + " correct on uniques")

240
211
87.92
47/99
Threshold: 0.1: 0.7580645161290323 correct on uniques
139
127
91.37
40/99
Threshold: 0.2: 0.8333333333333334 correct on uniques
111
105
94.59
37/99
Threshold: 0.3: 0.9024390243902439 correct on uniques
93
88
94.62
32/99
Threshold: 0.4: 0.8888888888888888 correct on uniques
80
78
97.5
30/99
Threshold: 0.5: 0.967741935483871 correct on uniques
64
62
96.88
29/99
Threshold: 0.6: 0.9666666666666667 correct on uniques
57
56
98.25
28/99
Threshold: 0.7: 0.9655172413793104 correct on uniques
44
43
97.73
24/99
Threshold: 0.8: 0.96 correct on uniques


In [None]:
class Classifier_results:
    """Plain container for the per-threshold results of one classifier.

    `title` names the classifier and `lines` is the number of classified
    sentences; every other argument is a sequence with one entry per threshold.
    """

    def __init__(self, title, lines, accepted_preds, correct_preds, precisions, recalls, correct_uniques, f1s):
        # Identification
        self.title, self.lines = title, lines
        # Raw counts
        self.accepted_preds, self.correct_preds = accepted_preds, correct_preds
        # Derived metrics
        self.precisions, self.recalls = precisions, recalls
        self.correct_uniques, self.f1s = correct_uniques, f1s

class CSVOutput:
    """Writes classifier results to `<path>/<document_title>.csv` and the F1
    scores to `<path>/<document_title>_f1.txt`, using ';' as separator.

    Args:
        document_title: base name (may contain sub-directories) of the output files.
        classifiers: iterable of result objects exposing title, lines,
            accepted_preds, correct_preds, precisions, recalls,
            correct_uniques and f1s.
    """

    def __init__(self, document_title, classifiers):
        self.classifiers = classifiers
        self.document_title = document_title

    def printify_array(self, array, sep = ';'):
        """Return the items of `array` as text, each one preceded by `sep`."""
        return sep + sep.join(str(x) for x in array)

    def _save_classifier_outputs(self, f):
        """Write one block of metric rows per classifier to the open file `f`."""
        for classifier in self.classifiers:
            f.write(classifier.title + '\n')
            f.write(str(classifier.lines) + ' sentences\n')
            f.write('Accepted Predictions: {}\n'.format(self.printify_array(classifier.accepted_preds)))
            f.write('Corrected Predictions: {}\n'.format(self.printify_array(classifier.correct_preds)))
            f.write('Precision%: {}\n'.format(self.printify_array(classifier.precisions)))
            f.write('Recall%: {}\n'.format(self.printify_array(classifier.recalls)))
            f.write('Correct predictions on uniques: {}\n\n'.format(self.printify_array(classifier.correct_uniques)))

    def _save_classifier_f1(self, path, mode='w'):
        """Write one F1 line per classifier; `mode` is the file open mode."""
        with open(path+'/'+self.document_title+'_f1.txt', mode) as f:
            for classifier in self.classifiers:
                f.write(classifier.title)
                f.write(self.printify_array(classifier.f1s)+ '\n')

    def write_to_file(self, path):
        """Create (or overwrite) both output files, including the CSV header."""
        self._save_classifier_f1(path)
        with open(path+'/'+self.document_title+'.csv', 'w') as f:
            # First threshold is 0.1 (it used to be written as '0,1').
            f.write('Tresholds; 0.1; 0.2; 0.3; 0.4; 0.5; 0.6; 0.7; 0.8;\n')
            self._save_classifier_outputs(f)

    def append_to_file(self, path):
        """Append the results to both output files without rewriting them."""
        # Open the F1 file in append mode as well; truncating it here would
        # discard the scores written by earlier calls.
        self._save_classifier_f1(path, mode='a')
        with open(path+'/'+self.document_title+'.csv', 'a') as f:
            self._save_classifier_outputs(f)

In [None]:
# Bundle the metrics collected by the threshold sweep for the CSV writer.
result = Classifier_results(
    title='secBert',
    lines=lines,
    accepted_preds=accepted_pred,
    correct_preds=corrected_pred,
    precisions=precisions,
    recalls=recalls,
    correct_uniques=correct_on_uniques,
    f1s=f1s,
)

In [None]:
# Write the secBert results to ./documents/wizardSpider_ref_2.csv and the F1
# scores to ./documents/wizardSpider_ref_2_f1.txt.
# NOTE(review): the variable is named fin6_ref_1_output although the title says
# wizardSpider_ref_2, and the title is hard-coded rather than derived from
# `file_name` — confirm it matches the document selected above.
fin6_ref_1_output = CSVOutput('documents/wizardSpider_ref_2', [result])
fin6_ref_1_output.write_to_file('.')