In [84]:
import matplotlib.pyplot as plt
import json
import jsonlines
import random

In [85]:
def filter_objects_by_text(arr1, arr2):
    text_set2 = set(obj2["text"] for obj2 in arr2)
    text_set1 = set(obj1["text"] for obj1 in arr1)

    text_filter = text_set1 & text_set2

    unique_texts1 = set()
    arr1_filtered = []
    for obj1 in arr1:
        if obj1["text"] not in unique_texts1 and obj1["text"] in text_filter:
            arr1_filtered.append(obj1)
            unique_texts1.add(obj1["text"])
    return arr1_filtered

def count_labels(datasets, labels):
    count = {label: 0 for label in labels}
    label_counts = {label: label for label in labels}
    for dataset in datasets:
        if "spans" in dataset:
            for span in dataset["spans"]:
                label = span.get("label")
                if label in count.keys():
                    count[label_counts[label]] += 1
    return count

def delete_objects_with_label(input_array, desired_delete_numbers, label_to_delete):
    # Use a list comprehension to filter the objects with the specified label
    # Find all elements with the specified label
    matching_elements = [element for element in input_array if element["accept"][0] == label_to_delete]
    
    # Determine how many elements to delete
    delete_count = min(len(matching_elements), desired_delete_numbers)
    
    # Delete random elements from the list
    deleted_elements = random.sample(matching_elements, delete_count)
    new_array = [element for element in input_array if element not in deleted_elements]
    
    # Return the new array and the deleted elements
    return (new_array, deleted_elements)

def save_to_jsonl(datasets, filepath):
    with open(f'{filepath}', 'w') as f1:
        for obj in datasets:
            json_str = json.dumps(obj)
            f1.write(json_str + '\n')

In [87]:

filepath = "./assets/prodigy"


dataset1 = []
dataset2 = []
# Load the two datasets
with jsonlines.open(f'{filepath}/annotator-1/spacy-format/final.jsonl', 'r') as f1, jsonlines.open(f'{filepath}/annotator-2/spacy-format/final.jsonl', 'r') as f2:


    for data in f1:
        if(len(data['accept'])>0):
            dataset1.append(data)

    for data in f2:
        if(len(data['accept'])>0):
            dataset2.append(data)

dataset1_normalize = filter_objects_by_text(dataset1, dataset2)
dataset2_normalize = filter_objects_by_text(dataset2, dataset1)

In [122]:
dataset1_balanced, filter_delete = delete_objects_with_label(dataset1_normalize, 1600, "POSTCONDITION")
delete_ids = [obj["text"] for obj in filter_delete]
dataset2_balanced = [obj for obj in dataset2_normalize if obj["text"] not in delete_ids]

In [123]:
labels = ["PRECONDITION", "POSTCONDITION", "ACTOR", "QUALITY"]
data = {
    "annotator_1": count_labels(dataset1_balanced, labels),
    "annotator_2": count_labels(dataset2_balanced, labels)
}


In [124]:
data

{'annotator_1': {'PRECONDITION': 475,
  'POSTCONDITION': 536,
  'ACTOR': 853,
  'QUALITY': 783},
 'annotator_2': {'PRECONDITION': 410,
  'POSTCONDITION': 583,
  'ACTOR': 479,
  'QUALITY': 3}}

In [93]:
import random
# Randomly shuffle the data
random.shuffle(dataset1_normalize)

# Split the data into training and testing sets
split_idx = int(len(dataset1_normalize) * 0.8)
train_data = dataset1_normalize[:split_idx]
test_data = dataset1_normalize[split_idx:]



In [125]:
save_to_jsonl(dataset1_balanced, "./assets/prodigy/annotator-1/spacy-format/normalize-balance.jsonl")
save_to_jsonl(dataset2_balanced, "./assets/prodigy/annotator-2/spacy-format/normalize-balance.jsonl")

In [None]:
# with open(f'{filepath}/annotator-1/normalize.jsonl', 'w') as f1, open(f'{filepath}/annotator-2/normalize.jsonl', 'w') as f2:
#     for obj in dataset1_balanced:
#         json_str = json.dumps(obj)
#         f1.write(json_str + '\n')

#     for obj in dataset2_balanced:
#         json_str = json.dumps(obj)
#         f2.write(json_str + '\n')

In [None]:
for dataset in dataset1:
    if('spans' in dataset.keys()):
        dataset['spans'] = [span for span in dataset['spans'] if span['label'] in ["POSTCONDITION", "PRECONDITION"]]


In [90]:
save_to_jsonl(dataset2_normalize, "./assets/prodigy/annotator-2/spacy-format/normalize.jsonl")

In [102]:
import spacy
from spacy import displacy
from spacy.tokens import Span

In [109]:
data_test = [
    "The nurse shall be able to login to the store",
]



In [110]:
data = "./corpus/annotator-1/pure-normalize/spacy-spancat/overlapping/dev.spacy"

In [111]:
model = spacy.load("./training/test/model-best")
for item in range(len(data_test)):
    doc = model(data_test[item])
    spans = [(span.text, span.label_) for span in doc.spans["sc"]]
    displacy.render(doc, style="span")

In [112]:
from prodigy.components.db import connect
db = connect()

db.datasets

['annotator_1_normalize_test',
 'annotator_1_normalize_train',
 'annotator_2_normalize']

In [113]:
for data in db.datasets:
    db.drop_dataset(data)
db.datasets

[]

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")
data = nlp("Within the shunting group, it shall be possible for only one member of the group to transmit the link assurance signal at any time.")

token = [token for token in data]
len(token)

26

In [None]:
len("Within the shunting group, it shall be possible for only one member of the group to transmit the link assurance signal at any time.".split())

In [64]:
#  Convert to new dataset

from prodigy.components.db import connect
import spacy
from spacy.tokenizer import Tokenizer

# Flow: Final -> normalize -> balance

import spacy

nlp = spacy.load("en_core_web_sm")


db = connect()
prodigy_annotations = db.get_dataset("annotator_1_final")
examples = ((eg["text"], eg) for eg in prodigy_annotations)
nlp = spacy.blank("en")
index = 1
IOB_format2 = []

for doc, eg in nlp.pipe(examples, as_tuples=True):
    try:
        other_doc = doc
        if "spans" in eg.keys():
            # Set entities for the "ACTOR" and "QUALITY" labels in the "other" doc
            other_doc.ents = [other_doc.char_span(s["start"], s["end"], s["label"]) for s in eg["spans"] if s["label"] in ["ACTOR", "QUALITY"]]
            # Generate IOB tags for the "condition" and "other" docs separately
            other_iob_tags = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_iob_ else "O" for t in other_doc]

        data = nlp(doc.text)

        for item in range(len(data)):
                IOB_format2.append((other_iob_tags[item]))


    except:
        index = index + 1
    IOB_format2.append((""))


prodigy_annotations = db.get_dataset("annotator_1_final")
examples = ((eg["text"], eg) for eg in prodigy_annotations)
nlp = spacy.blank("en")
index = 1
IOB_format1 = []

for doc, eg in nlp.pipe(examples, as_tuples=True):
    try:
        condition_doc = doc
        other_doc = doc
        if "spans" in eg.keys():
            # Set entities for the "POSTCONDITION" and "PRECONDITION" labels in the "condition" doc
            condition_doc.ents = [condition_doc.char_span(s["start"], s["end"], s["label"]) for s in eg["spans"] if s["label"] in ["POSTCONDITION", "PRECONDITION"]]
            # Generate IOB tags for the "condition" and "other" docs separately
            condition_iob_tags = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_iob_ else "O" for t in condition_doc]

            data = nlp(doc.text)

            for item in range(len(data)):
                IOB_format1.append((data[item], condition_iob_tags[item]))
    except:
        index = index + 1

    IOB_format1.append(("","","",))


In [79]:
IOB_format1

[(Within, 'B-PRECONDITION'),
 (the, 'I-PRECONDITION'),
 (shunting, 'I-PRECONDITION'),
 (group, 'I-PRECONDITION'),
 (,, 'O-'),
 (it, 'B-POSTCONDITION'),
 (shall, 'I-POSTCONDITION'),
 (be, 'I-POSTCONDITION'),
 (possible, 'I-POSTCONDITION'),
 (for, 'I-POSTCONDITION'),
 (only, 'I-POSTCONDITION'),
 (one, 'I-POSTCONDITION'),
 (member, 'I-POSTCONDITION'),
 (of, 'I-POSTCONDITION'),
 (the, 'I-POSTCONDITION'),
 (group, 'I-POSTCONDITION'),
 (to, 'I-POSTCONDITION'),
 (transmit, 'I-POSTCONDITION'),
 (the, 'I-POSTCONDITION'),
 (link, 'I-POSTCONDITION'),
 (assurance, 'I-POSTCONDITION'),
 (signal, 'I-POSTCONDITION'),
 (at, 'I-POSTCONDITION'),
 (any, 'I-POSTCONDITION'),
 (time, 'I-POSTCONDITION'),
 (., 'I-POSTCONDITION'),
 ('', '', ''),
 (When, 'B-PRECONDITION'),
 (applicable, 'I-PRECONDITION'),
 (a, 'I-PRECONDITION'),
 (Security, 'I-PRECONDITION'),
 (Domain, 'I-PRECONDITION'),
 (shall, 'B-POSTCONDITION'),
 (Request, 'I-POSTCONDITION'),
 (the, 'I-POSTCONDITION'),
 (OPEN, 'I-POSTCONDITION'),
 (to, 'I-PO

In [65]:
merge = [(x[0], x[1], y) for x, y in zip(IOB_format1, IOB_format2)]

In [83]:
import csv

filename = 'data.csv'

# Open the CSV file for writing
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')

    

    # Write the header row
    writer.writerow(['Word', 'Label1', 'Label2'])

    # Write the data rows
    for row in IOB_format1:
        row = [str(item).replace('O-', 'O') for item in row]
        # Remove the parentheses from the tuple
        row = [str(item).replace('(', '').replace(')', '') for item in row]
        writer.writerow(row)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer

def preprocess_text(data, remove_stopwords=False, stem=False, lemma=False):
    # Download stopwords and initialize stemmer and lemmatizer
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()

    # Define a helper function to preprocess a single word
    def preprocess_word(word):
        if remove_stopwords and word in stop_words:
            return None
        if stem:
            word = stemmer.stem(word)
        if lemma:
            word = lemmatizer.lemmatize(word)
        return word

    # Preprocess each sentence in the data
    preprocessed_data = []
    for sentence in data:
        tokens = word_tokenize(sentence[0])
        preprocessed_tokens = []
        for token in tokens:
            preprocessed_word = preprocess_word(token.lower())
            if preprocessed_word is not None:
                preprocessed_tokens.append(preprocessed_word)
        preprocessed_data.append((preprocessed_tokens, sentence[1]))

    return preprocessed_data


In [None]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer, PorterStemmer

# # download necessary resources
# nltk.download('stopwords')
# nltk.download('wordnet')

# # define a function to preprocess text
# def preprocess(text):
#     # split the text into tokens
#     tokens = text.split()
    
#     # remove stopwords
#     stop_words = set(stopwords.words('english'))
#     filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
#     # lemmatize the tokens
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
#     # stem the tokens
#     stemmer = PorterStemmer()
#     stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    
#     # join the tokens back into text
#     preprocessed_text = ' '.join(stemmed_tokens)
    
#     return preprocessed_text

# # example IOB-formatted text
# iob_tags = ['B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC']

# # convert IOB-formatted text to plain text
# text = ' '.join([tag[2:] if tag.startswith('B-') or tag.startswith('S-') else '' for tag in iob_tags])

# # preprocess the text
# preprocessed_text = preprocess(text)

# # convert the preprocessed text back to IOB-formatted text
# preprocessed_iob_tags = []
# for tag in iob_tags:
#     if tag == 'O':
#         preprocessed_iob_tags.append('O')
#     else:
#         label = 'B-' + tag[2:] if tag.startswith('B-') else 'I-' + tag[2:]
#         preprocessed_iob_tags.append(label)

# print(preprocessed_iob_tags)

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [None]:
!python -m prodigy train --spancat final_ver_condition,eval:final_ver_condition --verbose

In [None]:
db.datasets