In [1]:
import pandas as pd
import spacy
import re
import nltk
import os

# Load the spaCy pipeline named "en_core_web_trf"; the labelling loop further
# down calls it on template texts and reads each token's part-of-speech (pos_).
nlp = spacy.load("en_core_web_trf")
# A max_colwidth of None makes pandas show full cell contents instead of
# truncating long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
def remove_mask_braces(text):
    """
    Strip the curly braces from the four template mask placeholders.

    Every occurrence of ``{pos_adj}``, ``{neg_adj}``, ``{pos_verb}`` and
    ``{neg_verb}`` in ``text`` is replaced by its bare name (``pos_adj``, ...).
    Any other text, including other brace expressions, is left untouched.

    Returns the resulting string.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    substitutions = (
        ("{pos_adj}", "pos_adj"),
        ("{neg_adj}", "neg_adj"),
        ("{pos_verb}", "pos_verb"),
        ("{neg_verb}", "neg_verb"),
    )
    for pattern, bare_name in substitutions:
        text = re.sub(pattern, bare_name, text)
    return text

In [16]:
def get_replaced_words(template: str, instance: str):
    """
    Compare both template and instance words to find the words that were replaced.

    The template (with its mask braces removed) and the instance are tokenized
    with NLTK's ``word_tokenize``. If the two token counts differ, both token
    lists are printed and a plain whitespace split is tried instead.

    Args:
        template: template text containing mask placeholders such as ``{pos_adj}``.
        instance: a sentence generated from that template.

    Returns:
        The instance tokens found at the positions where the template token and
        the instance token differ, in order of appearance.

    Raises:
        AssertionError: if the token counts still differ after the whitespace
            fallback, so the two texts cannot be compared position by position.
    """
    # it's necessary to use the same tokenizer as TemplateGenerator
    unbraced_template = remove_mask_braces(template)
    temp_tokens = nltk.tokenize.word_tokenize(unbraced_template)
    inst_tokens = nltk.tokenize.word_tokenize(instance)

    if len(temp_tokens) != len(inst_tokens):
        # Show the mismatching tokenization, then fall back to whitespace splitting.
        print(f"{temp_tokens=}")
        print(f"{inst_tokens=}")
        temp_tokens = unbraced_template.split()
        inst_tokens = instance.split()

    if len(temp_tokens) != len(inst_tokens):
        # Raised explicitly (rather than `assert False`) so the guard is not
        # stripped when Python runs with -O, and so the error carries the details.
        raise AssertionError(
            f"{len(temp_tokens)=}, {len(inst_tokens)=}\n{template}\n{instance}\n"
            f"{temp_tokens=}\n{inst_tokens=}"
        )

    # Lengths are equal here, so zip pairs every position exactly once.
    return [inst for temp, inst in zip(temp_tokens, inst_tokens) if temp != inst]

In [4]:
def TE_message(original_word: str, original_classification: str, correct_classification: str):
    """
    Build the observation text for a word whose expected tag was wrong.

    Args:
        original_word: the word that was tagged.
        original_classification: the tag the word was given.
        correct_classification: the tag it should have had; must differ from
            ``original_classification``.

    Returns:
        A Portuguese sentence naming the word, the given tag and the correct tag.

    Raises:
        AssertionError: if both classifications are equal.
    """
    assert original_classification != correct_classification, f"{original_classification=}\t {correct_classification=}"
    # In English: "<word> was classified as <given>. The correct one would be <correct>"
    message = f"{original_word} foi classificado como {original_classification}. O correto seria {correct_classification}"
    return message

def LE_message(wrong_classified_words: list[str]):
    """
    Build the observation text listing words that were wrongly classified elsewhere.

    Args:
        wrong_classified_words: non-empty collection of words. A list (or other
            ordered sequence) keeps its order; a ``set``/``frozenset`` is sorted
            first, because set iteration order for strings can change between
            runs and would make the generated text non-reproducible.

    Returns:
        A Portuguese sentence followed by the words joined with ``", "``.

    Raises:
        AssertionError: if ``wrong_classified_words`` is empty.
    """
    assert len(wrong_classified_words)
    if isinstance(wrong_classified_words, (set, frozenset)):
        ordered_words = sorted(wrong_classified_words)
    else:
        ordered_words = list(wrong_classified_words)
    # In English: "The following words were classified wrongly in other templates: ..."
    return f"As seguintes palavras foram classificadas erradas em outros templates: {', '.join(ordered_words)}"

In [17]:
# POS tag (spaCy `pos_`) that the original word behind each mask token is expected to have.
MASK_EXPECTED_POS = {
    "neg_verb": "VERB",
    "pos_verb": "VERB",
    "neg_adj": "ADJ",
    "pos_adj": "ADJ",
}

# For every approach workbook: label each template sheet, flag template errors
# (TE) and lexicon errors (LE), then write the labelled workbook to the output folder.
for approach in ["approach1", "approach2", "approach3", "approach4", "approach5", "random"]:
    print(approach)
    APPROACH_FILEPATH = f"../notebooks/test_cases_imdb/{approach}.xlsx"
    OUTPUT_FILEPATH = f"../notebooks/test_cases_imdb_rotulated/{approach}.xlsx"

    # sheet_name=None loads every sheet into a dict keyed by sheet name.
    templates = pd.read_excel(APPROACH_FILEPATH, sheet_name=None)

    wrong_lex = list()  # original words whose POS did not match the mask that replaced them

    # Pass 1: initial labels per template sheet, plus template-error (TE) detection.
    for t_index in templates["templates"]["template_index"]:
        t = "template" + str(t_index)  # sheet name of this template
        sheet = templates[t]

        # classify every row as VP (succeed == 0) or VN (anything else)
        sheet["classification"] = sheet["succeed"].map(lambda x: "VP" if x == 0 else "VN")
        sheet["subclassification"] = ""
        sheet["obs"] = ""

        # Run a deep-learning POS tagger on original_text and check that it
        # agrees with the tags implied by the masks in template_text.
        template_info = templates["templates"].query("template_index == @t_index").iloc[0]

        # get tokens from spaCy
        doc_original = nlp(template_info["original_text"])
        doc_template = nlp(remove_mask_braces(template_info["template_text"]))

        i_orig = 0
        i_temp = 0
        # A while loop with two cursors is used because doc_original and
        # doc_template do not necessarily have the same length.
        while i_orig < len(doc_original) and i_temp < len(doc_template):
            original_token = doc_original[i_orig]
            template_token = str(doc_template[i_temp])

            if str(original_token) == template_token:
                # same word on both sides: nothing to check
                i_temp += 1
            elif template_token in MASK_EXPECTED_POS:
                expected_pos = MASK_EXPECTED_POS[template_token]
                if original_token.pos_ != expected_pos:
                    # Mark every row of the sheet as false ("F" + P/N) with
                    # subclassification TE. If several masks mismatch, "obs"
                    # keeps only the message of the last one.
                    sheet["classification"] = sheet["classification"].map(lambda x: "F" + x[-1])
                    sheet["subclassification"] = "TE"
                    sheet["obs"] = TE_message(
                        original_word=original_token.text,
                        original_classification=expected_pos,
                        correct_classification=original_token.pos_,
                    )
                    wrong_lex.append(original_token.text)
                i_temp += 1
            # Otherwise only the original cursor advances, skipping original
            # tokens until the template token is matched again.
            i_orig += 1

    # Pass 2: iterate over the template sheets again, this time classifying as LE
    # the sentences that contain some word of wrong_lex.
    # The subclassification has to be different from TE (it's open for discussion).
    wrong_lex_set = set(wrong_lex)  # loop-invariant, built once per approach
    # Select the per-template sheets by name instead of assuming that the
    # "templates" index sheet is the first one in the workbook.
    template_sheet_names = [name for name in templates if name != "templates"]
    for t in template_sheet_names:
        t_index = int(t[8:])  # numeric suffix of "template<N>"
        template: str = templates["templates"].query("template_index == @t_index").iloc[0]["template_text"]
        sheet = templates[t]
        for i, row in sheet.iterrows():
            replaced_words = get_replaced_words(template, row["text"])
            wrong_lex_in_text = wrong_lex_set.intersection(replaced_words)
            if wrong_lex_in_text and row["subclassification"] != "TE":
                sheet.at[i, "classification"] = "F" + row["classification"][-1]
                sheet.at[i, "subclassification"] = "LE"
                # sorted() gives a list in a stable order, so the text is reproducible
                sheet.at[i, "obs"] = LE_message(sorted(wrong_lex_in_text))

    # Make sure the output folder exists before writing the labelled workbook.
    os.makedirs(os.path.dirname(OUTPUT_FILEPATH), exist_ok=True)
    with pd.ExcelWriter(OUTPUT_FILEPATH) as writer:
        for sheet_name, df in templates.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

approach1
temp_tokens=['Do', "n't", 'mind', 'what', 'this', 'socially', 'retarded', 'person', 'above', 'neg_verb', ',', 'this', 'show', 'is', 'pos_adj', '.']
inst_tokens=['Do', "n't", 'mind', 'what', 'this', 'socially', 'retarded', 'person', 'above', 'was', ',', 'this', 'show', 'is', 'dead', '.', '.']
temp_tokens=['Do', "n't", 'mind', 'what', 'this', 'socially', 'retarded', 'person', 'above', 'neg_verb', ',', 'this', 'show', 'is', 'pos_adj', '.']
inst_tokens=['Do', "n't", 'mind', 'what', 'this', 'socially', 'retarded', 'person', 'above', 'is', ',', 'this', 'show', 'is', 'dead', '.', '.']
temp_tokens=['Do', "n't", 'mind', 'what', 'this', 'socially', 'retarded', 'person', 'above', 'neg_verb', ',', 'this', 'show', 'is', 'pos_adj', '.']
inst_tokens=['Do', "n't", 'mind', 'what', 'this', 'socially', 'retarded', 'person', 'above', 'getting', ',', 'this', 'show', 'is', 'dead', '.', '.']
temp_tokens=['The', 'shorts', 'neg_verb', 'still', 'pos_adj', "'10", "'", '.']
inst_tokens=['The', 'shorts',