Unexpected result while creating token using "from_json()" function of Nugget class #60

jaidevd · 2022-09-19T08:57:52Z

Raised by @dikshagupta14

    @classmethod
    def from_json(cls, obj):
        if isinstance(obj, str):
            obj = json.loads(obj)

        text = obj.pop('text')
        obj['text'] = nlp(text)

        tokenlist = obj.pop('tokenmap')
        tokenmap = {}
        for tk in tokenlist:
            index = tk.pop('index')
            if isinstance(index, int):
                token = obj['text'][index]
            elif isinstance(index, (list, tuple)):
                start, end = index
                token = obj['text'][start:end]
            tk.pop('idx')
            tk.pop('text')
            tokenmap[token] = Variable(token, **tk)
        obj['tokenmap'] = tokenmap

        return cls(**obj)

In above function, from_json() which is method of nlg.narrative.Nugget class,
the line - "token = obj['text'][index]" isn't behaving as expected for all input text.

Steps to reproduce.

Code to be used:

from gramex import data
from nlg.utils import load_spacy_model
nlp = lo
```ad_spacy_model()
from nlg.narrative import Nugget
from nlg.narrative import Variable


def from_json(cls, obj):
        if isinstance(obj, str):
            obj = json.loads(obj)
        text = obj.pop('text')
        obj['text'] = nlp(text)

        tokenlist = obj.pop('tokenmap')
        tokenmap = {}
        for tk in tokenlist:
            index = tk.pop('index')
            if isinstance(index, int):
                print('\nPrinting the object text and the type of it: ')
                print(obj['text'])
                print(type(obj['text']))
                print('\nIndex to be used for splitting the text, ', str(index))
                token = obj['text'][index]
                print('\nPrinting the token after split through index value: ')
                print(token)
            elif isinstance(index, (list, tuple)):
                start, end = index
                token = obj['text'][start:end]
            tk.pop('idx')
            tk.pop('text')
            tokenmap[token] = Variable(token, **tk)
            
        obj['tokenmap'] = tokenmap
        return cls(**obj)

#WORKING SCENARIO INPUT:

#defining input nugget dictionary data
nugget_dict = {
    "text": "CP-899-J",
    "tokenmap": [
        {
            "text": "CP-899-J",
            "index": 0,
            "idx": 0,
            "sources": [
                {
                    "location": "cell",
                    "tmpl": 'df["ABC"].iloc[0]',
                    "type": "doc",
                    "enabled": True,
                }
            ],
            "varname": "",
            "inflections": [],
        }
    ]
}
#calling the from_json function
output_nugget = from_json(Nugget, nugget_dict)
print('\nPrinting the final output: ')
print(output_nugget)

OUTPUT obtained for WORKING scenario:
Printing the object text and the type of it:
CP-899-J
<class 'spacy.tokens.doc.Doc'>

Index to be used for splitting the text, 0

Printing the token after split through index value:
CP-899-J

Printing the final output:
{{ df["ABC"].iloc[0] }}

#NON-WORKING SCENARIO input:

#defining input nugget dictionary data
nugget_dict = {
    "text": "CP-K-J",
    "tokenmap": [
        {
            "text": "CP-K-J",
            "index": 0,
            "idx": 0,
            "sources": [
                {
                    "location": "cell",
                    "tmpl": 'df["ABC"].iloc[0]',
                    "type": "doc",
                    "enabled": True,
                }
            ],
            "varname": "",
            "inflections": [],
        }
    ]
}
#calling the from_json function
output_nugget = from_json(Nugget, nugget_dict)
print('\nPrinting the final output: ')
print(output_nugget)

OUTPUT obtained for NON-WORKING scenario:
Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>

Index to be used for splitting the text, 0

Printing the token after split through index value:
CP

Printing the final output:
{{ df["ABC"].iloc[0] }}-K-J

Expected behavior:
The output that came in non-working scenario should have been instead like this,

Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>

Index to be used for splitting the text, 0

Printing the token after split through index value:
CP-K-J

Printing the final output:
{{ df["ABC"].iloc[0] }}

Workaround solution

def from_json(cls, obj):
        if isinstance(obj, str):
            obj = json.loads(obj)
        text = obj.pop('text')
        obj['text'] = nlp(text)

        tokenlist = obj.pop('tokenmap')
        tokenmap = {}
        for tk in tokenlist:
            index = tk.pop('index')
            if isinstance(index, int):
                print('\nPrinting the object text and the type of it: ')
                print(obj['text'])
                print(type(obj['text']))
                print('\nIndex to be used for splitting the text, ', str(index))
                len_text = len(tk['text'])
                token = obj['text'][index]
                if not len(token) == len_text:
                    token = obj['text'][index:len_text]
                print('\nPrinting the token after split through index value: ')
                print(token)
            elif isinstance(index, (list, tuple)):
                start, end = index
                token = obj['text'][start:end]
            tk.pop('idx')
            tk.pop('text')
            tokenmap[token] = Variable(token, **tk)
            
        obj['tokenmap'] = tokenmap
        return cls(**obj)

I have added a check, to make sure the length of token taken is equal to the text length for that token element.

                len_text = len(tk['text'])
                token = obj['text'][index]
                if not len(token) == len_text:
                    token = obj['text'][index:len_text]

After adding this, the output comes as expected for all inputs.

The text was updated successfully, but these errors were encountered:

jaidevd mentioned this issue Sep 19, 2022

Unexpected result while creating token using "from_json()" function of Nugget class gramener/gramex#602

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Unexpected result while creating token using "from_json()" function of Nugget class #60

Unexpected result while creating token using "from_json()" function of Nugget class #60

jaidevd commented Sep 19, 2022

Unexpected result while creating token using "from_json()" function of Nugget class #60

Unexpected result while creating token using "from_json()" function of Nugget class #60

Comments

jaidevd commented Sep 19, 2022