In [24]:
"""
Copyright (c) Facebook, Inc. and its affiliates.
"""

"""This file has functions to preprocess the chat from user before
querying the dialogue manager"""
import string

from spacy.lang.en import English
from typing import List

# Shared tokenizer built from spaCy's English language defaults.
# ``English().tokenizer`` is used instead of ``English().Defaults.create_tokenizer()``
# because that factory only exists in spaCy v2 (it was removed in v3), while the
# ``tokenizer`` attribute of a language object is available in both.
tokenizer = English().tokenizer


def word_tokenize(st) -> str:
    """Return ``st`` as one string of space-separated tokens.

    The text is first passed through ``insert_spaces`` and then through the
    module-level ``tokenizer``; the string form of every resulting token is
    joined with single spaces.
    """
    spaced = insert_spaces(st)
    tokens = tokenizer(spaced)
    return " ".join(map(str, tokens))


def sentence_split(st):
    """Split a space-tokenized string into lowercased sentences.

    The terminators `` ?``, `` !`` and `` ...`` (each preceded by a space) are
    first rewritten to `` .``, and the text is then cut at every `` .``.
    Each piece is lowercased and split on whitespace; words that occur inside
    ``string.punctuation`` (a substring test, so every single punctuation
    character is dropped) are removed before the words are re-joined with
    single spaces.

    Returns:
        The list of non-empty sentences, in their original order.
    """
    # Normalize every supported terminator to " ." (same order as listed).
    for terminator in (" ?", " !", " ..."):
        st = st.replace(terminator, " .")

    sentences = []
    for chunk in st.split(" ."):
        words = [w for w in chunk.lower().split() if w not in string.punctuation]
        # A chunk with no surviving words would join to "", which is dropped.
        if words:
            sentences.append(" ".join(words))
    return sentences


def insert_spaces(chat):
    """Insert a space at the boundary between a number and adjacent punctuation.

    A space is appended after a character when either:

    * the character is one of ``[ ( { , : x`` and the next character is a
      digit (e.g. ``[1`` -> ``[ 1``, ``x4`` -> ``x 4``), or
    * the character is a digit and the next character is one of
      ``, ] ) } : x`` (e.g. ``1]`` -> ``1 ]``, ``4x`` -> ``4 x``).

    The final character of ``chat`` is always copied unchanged.

    Returns:
        The updated string.
    """
    before_digit = ("[", "(", "{", ",", ":", "x")
    after_digit = (",", "]", ")", "}", ":", "x")

    last = len(chat) - 1
    pieces = []
    for pos, ch in enumerate(chat):
        # "" stands in for "no next character": it is neither a digit nor a
        # member of either tuple, so the last character never gets a space.
        nxt = chat[pos + 1] if pos < last else ""
        opens_number = ch in before_digit and nxt.isdigit()
        closes_number = ch.isdigit() and nxt in after_digit
        pieces.append(ch + " " if opens_number or closes_number else ch)

    return "".join(pieces)


def preprocess_chat(chat: str) -> List[str]:
    """Tokenize a chat and split it into sentences.

    Debug inputs (exactly ``"_debug_"``, or anything starting with
    ``"_ttad_"``) are returned untouched as a one-element list. Every other
    chat is run through ``word_tokenize`` and then ``sentence_split``.

    Returns:
        A list of sentence strings; it may be empty when ``sentence_split``
        finds no sentence in the tokenized chat.
    """
    # Debug mode: hand the chat back as is.
    is_passthrough = chat == "_debug_" or chat.startswith("_ttad_")
    if is_passthrough:
        return [chat]

    return list(sentence_split(word_tokenize(chat)))


In [5]:
# Preprocess every raw chat and write its first sentence to the output file,
# one per line.
with open('real_data/humanbot/raw_data/all_appen_humanbot_data.txt') as f, open('real_data/humanbot/humanbot_all_data_preprocessed.txt', 'w') as f2:
    # Iterate over the file directly so it is streamed rather than fully
    # loaded into memory by readlines().
    for line in f:
        line = line.strip()
        sentences = preprocess_chat(line)
        # preprocess_chat returns [] when no sentence is left (e.g. a blank or
        # punctuation-only line); indexing [0] would raise IndexError, so such
        # lines are skipped.
        if not sentences:
            continue
        # Only the first sentence of each chat is kept.
        f2.write(sentences[0] + "\n")

In [33]:
# Collect the distinct preprocessed chats from the first (tab-separated)
# column of the previously combined data.
prev_text = set()
with open('../../minecraft/python/craftassist/text_to_tree_tool/turk_data/new_dance_form_data/first_65/all_combined.txt') as f:
    # Iterate over the file directly instead of materializing readlines().
    for line in f:
        parts = line.strip().split("\t")
        sentences = preprocess_chat(parts[0].strip().lower())
        # preprocess_chat returns [] when no sentence is left (e.g. a blank
        # line); indexing [0] would raise IndexError, so skip those rows.
        if not sentences:
            continue
        # Only the first sentence of each chat is kept.
        prev_text.add(sentences[0])
        

In [34]:
# Number of distinct entries collected in the prev_text set.
print(len(prev_text))

63


In [35]:
# Collect the distinct preprocessed chats from the full data file and report
# how many there are.
all_text = set()
with open('../../minecraft/python/craftassist/text_to_tree_tool/turk_data/new_dance_form_data/all_data.txt') as f:
    # Iterate over the file directly instead of materializing readlines().
    for line in f:
        line = line.strip()
        sentences = preprocess_chat(line.lower())
        # preprocess_chat returns [] when no sentence is left (e.g. a blank
        # line); indexing [0] would raise IndexError, so skip those lines.
        if not sentences:
            continue
        # Only the first sentence of each chat is kept.
        all_text.add(sentences[0])
print(len(all_text))

83


In [36]:
# Chats present in all_text but absent from prev_text; each one is printed in
# the order all_text yields it, followed by the total count.
unseen_chats = [chat for chat in all_text if chat not in prev_text]
new_text = set(unseen_chats)
for chat in unseen_chats:
    print(chat)
print(len(new_text))

bob your head
rotate your camera up
stare in my eyes for a minute
tilt 10 degrees
look to the left of the chair
do n’t stare at me
look in the distance
look who is calling on my phone
look at my right hand
stare in the distance
face abhinav
aim your camera up a little
stick your arm out
pan 90 degrees
put your head down in shame
reach towards the ceiling
stare down
do n’t look at the sun directly
look at yourself in the mirror
look at this venn diagram i made
20


In [37]:
# Save every chat in new_text to the tool input file, one per line, echoing
# each chat to the cell output as it is written.
with open('../../minecraft/python/craftassist/text_to_tree_tool/turk_data/new_dance_form_data/next_20/tool1/input.txt', 'w') as out_file:
    for chat in new_text:
        out_file.write(f"{chat}\n")
        print(chat)

bob your head
stare in my eyes for a minute
look at yourself in the mirror
rotate your camera up
stare down
do n’t look at the sun directly
look in the distance
look who is calling on my phone
tilt 10 degrees
look at my right hand
stare in the distance
pan 90 degrees
look to the left of the chair
put your head down in shame
look at this venn diagram i made
face abhinav
aim your camera up a little
do n’t stare at me
stick your arm out
reach towards the ceiling
