In [1]:
import pandas as pd
import os

In [10]:
# Directory of generated template CSVs. Every file is read with pandas and must
# provide the "original_text" and "template_text" columns used below.
dirpath = "../notebooks_rotten_tomatoes_spacy/generated_templates/"

# (original_text, template_text) pairs, grouped by the mask tag that appears in
# the template text. A template containing several tags lands in several lists.
neg_verb = []
neg_adj = []
pos_verb = []
pos_adj = []
pairs_by_tag = {
    "neg_adj": neg_adj,
    "neg_verb": neg_verb,
    "pos_verb": pos_verb,
    "pos_adj": pos_adj,
}

for filename in sorted(os.listdir(dirpath)):
    filepath = os.path.join(dirpath, filename)
    # os.listdir also returns sub-directories (e.g. Jupyter's
    # .ipynb_checkpoints), which pd.read_csv cannot open.
    if not os.path.isfile(filepath):
        continue
    df = pd.read_csv(filepath)
    # A row with a missing template has no mask tag to look for, and the
    # substring test below would raise TypeError on NaN.
    rows = df.dropna(subset=["template_text"])[["original_text", "template_text"]]
    for original_text, template_text in rows.itertuples(index=False, name=None):
        for tag, pairs in pairs_by_tag.items():
            if tag in template_text:
                pairs.append((original_text, template_text))
    

In [3]:
import spacy
import re
# Load the spaCy "en_core_web_trf" pipeline once for the whole notebook; the
# visible cells use `nlp` only to split templates and instances into tokens.
nlp = spacy.load("en_core_web_trf")

In [11]:
def remove_mask_braces(text):
    """
    Strip the curly braces around the four mask tags in a template.

    "{pos_adj}", "{neg_adj}", "{pos_verb}" and "{neg_verb}" become the bare tag
    names; any other text (including other braces) is returned unchanged.
    """
    # (pattern, replacement) pairs, applied in this order.
    brace_rules = (
        ("{pos_adj}", "pos_adj"),
        ("{neg_adj}", "neg_adj"),
        ("{pos_verb}", "pos_verb"),
        ("{neg_verb}", "neg_verb"),
    )
    for pattern, bare_tag in brace_rules:
        text = re.sub(pattern, bare_tag, text)
    return text

def get_replaced_words(template: str, instance: str, tag: str=None):
    """
    Compare template and instance token by token to find the replaced words.

    Parameters
    ----------
    template : str
        Template text; mask tags may be wrapped in braces (e.g. "{pos_adj}"),
        which are stripped before tokenizing.
    instance : str
        Text the template is compared against.
    tag : str, optional
        When given (and non-empty), only instance tokens whose template token
        equals `tag` are returned. Otherwise every differing token is returned.

    Returns
    -------
    list of str
        Instance tokens at the positions where the two token sequences differ.
        An empty list is returned, after printing the offending pair, when the
        two texts cannot be split into the same number of tokens.
    """
    masked_template = remove_mask_braces(template)

    # It's necessary to use the same tokenizer as TemplateGenerator. Only the
    # token texts are needed, so make_doc (tokenizer only) is used instead of
    # running every pipeline component on each text.
    temp_tokens = [token.text for token in nlp.make_doc(masked_template)]
    inst_tokens = [token.text for token in nlp.make_doc(instance)]

    if len(temp_tokens) != len(inst_tokens):
        # The spaCy tokens do not line up; retry with plain whitespace splitting.
        temp_tokens = masked_template.split()
        inst_tokens = instance.split()

    if len(temp_tokens) != len(inst_tokens):
        # Still not alignable: report the pair so the word can be added by hand.
        print(f"ADD TO {tag} the word:")
        print(f"{template=}")
        print(f"{instance=}")
        return []

    return [
        inst_token
        for temp_token, inst_token in zip(temp_tokens, inst_tokens)
        if temp_token != inst_token and (not tag or temp_token == tag)
    ]

In [12]:
def _collect_replaced_words(pairs, tag):
    """
    Return the distinct words that `tag` replaced across all `pairs`.

    `pairs` is an iterable of (original_text, template_text) tuples. The result
    is sorted so the exported lexicon is identical from one kernel session to
    the next (iterating a set of strings follows the randomized string hash).
    """
    words = set()
    for original_text, template_text in pairs:
        words.update(get_replaced_words(template_text, original_text, tag))
    return sorted(words)


neg_verbs = _collect_replaced_words(neg_verb, "neg_verb")
pos_verbs = _collect_replaced_words(pos_verb, "pos_verb")
neg_adjs = _collect_replaced_words(neg_adj, "neg_adj")
pos_adjs = _collect_replaced_words(pos_adj, "pos_adj")

    

In [6]:
# # IMDB_SPACY
# neg_verbs += ["reading", "do", "seen"]
# pos_verbs += ["felt", "enjoy"]
# neg_adjs += ["worst"]
# pos_adjs += ["precious", "great"]



In [13]:
# Lexicon name -> list of words; the key order is the order written to the JSON file.
lex_to_dict = dict(
    neg_verbs=neg_verbs,
    pos_verbs=pos_verbs,
    pos_adjs=pos_adjs,
    neg_adjs=neg_adjs,
)

In [14]:
import json

In [15]:
# Write the four lexicons as 4-space-indented JSON in the working directory.
with open("rotten_tomatoes_all_lexicons.json", "wt") as lexicon_file:
    lexicon_file.write(json.dumps(lex_to_dict, indent=4))