Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unexpected result while creating token using "from_json()" function of Nugget class #60

Open
jaidevd opened this issue Sep 19, 2022 · 0 comments

Comments

@jaidevd
Copy link
Member

jaidevd commented Sep 19, 2022

Raised by @dikshagupta14

    @classmethod
    def from_json(cls, obj):
        """Construct a Nugget from its JSON representation.

        Parameters
        ----------
        obj : str or dict
            A JSON string or an already-parsed dict containing at least
            the keys 'text' and 'tokenmap'.

        Returns
        -------
        Nugget
        """
        if isinstance(obj, str):
            obj = json.loads(obj)

        text = obj.pop('text')
        obj['text'] = nlp(text)

        tokenlist = obj.pop('tokenmap')
        tokenmap = {}
        for tk in tokenlist:
            index = tk.pop('index')
            idx = tk.pop('idx')        # character offset of the token in `text`
            tk_text = tk.pop('text')   # the original token string
            if isinstance(index, int):
                token = obj['text'][index]
                # spaCy may tokenize the stored text into more than one
                # token (e.g. "CP-K-J" -> "CP", "-", "K", "-", "J"), in
                # which case a single token index recovers only a prefix
                # of the original token. Fall back to a character-offset
                # span covering the full stored token text.
                if token.text != tk_text:
                    token = obj['text'].char_span(idx, idx + len(tk_text))
            elif isinstance(index, (list, tuple)):
                start, end = index
                token = obj['text'][start:end]
            tokenmap[token] = Variable(token, **tk)
        obj['tokenmap'] = tokenmap

        return cls(**obj)

In the above function, `from_json()`, which is a method of the `nlg.narrative.Nugget` class,
the line `token = obj['text'][index]` does not behave as expected for all input text. The root cause is that spaCy may split the stored text into several tokens (e.g. "CP-K-J" becomes "CP", "-", "K", "-", "J"), so a single token index recovers only part of the original token.

Steps to reproduce.

Code to be used:

from gramex import data
from nlg.utils import load_spacy_model
nlp = load_spacy_model()
from nlg.narrative import Nugget
from nlg.narrative import Variable


def from_json(cls, obj):
        """Reproduction of Nugget.from_json with debug prints added.

        NOTE(review): this snippet intentionally preserves the reported
        bug so the issue can be reproduced; see the workaround below
        for the corrected version.
        """
        if isinstance(obj, str):
            obj = json.loads(obj)
        text = obj.pop('text')
        # Re-tokenize the raw text into a spaCy Doc.
        obj['text'] = nlp(text)

        tokenlist = obj.pop('tokenmap')
        tokenmap = {}
        for tk in tokenlist:
            index = tk.pop('index')
            if isinstance(index, int):
                print('\nPrinting the object text and the type of it: ')
                print(obj['text'])
                print(type(obj['text']))
                print('\nIndex to be used for splitting the text, ', str(index))
                # BUG under report: if spaCy splits the stored text into
                # several tokens, this recovers only the token at `index`
                # (a prefix of the original token), not the whole token.
                token = obj['text'][index]
                print('\nPrinting the token after split through index value: ')
                print(token)
            elif isinstance(index, (list, tuple)):
                start, end = index
                token = obj['text'][start:end]
            tk.pop('idx')
            tk.pop('text')
            tokenmap[token] = Variable(token, **tk)
            
        obj['tokenmap'] = tokenmap
        return cls(**obj)

#WORKING SCENARIO INPUT:

#defining input nugget dictionary data
nugget_dict = {
    # "CP-899-J" is kept as a single token by spaCy, so index 0 works.
    "text": "CP-899-J",
    "tokenmap": [
        {
            "text": "CP-899-J",
            "index": 0,   # token index within the spaCy Doc
            "idx": 0,     # character offset of the token in "text"
            "sources": [
                {
                    "location": "cell",
                    "tmpl": 'df["ABC"].iloc[0]',
                    "type": "doc",
                    "enabled": True,
                }
            ],
            "varname": "",
            "inflections": [],
        }
    ]
}
#calling the from_json function
output_nugget = from_json(Nugget, nugget_dict)
print('\nPrinting the final output: ')
print(output_nugget)

OUTPUT obtained for WORKING scenario:
Printing the object text and the type of it:
CP-899-J
<class 'spacy.tokens.doc.Doc'>

Index to be used for splitting the text, 0

Printing the token after split through index value:
CP-899-J

Printing the final output:
{{ df["ABC"].iloc[0] }}


#NON-WORKING SCENARIO input:

#defining input nugget dictionary data
nugget_dict = {
    # "CP-K-J" gets split by spaCy into several tokens ("CP", "-", "K",
    # "-", "J"), so token index 0 recovers only "CP" — the failing case.
    "text": "CP-K-J",
    "tokenmap": [
        {
            "text": "CP-K-J",
            "index": 0,   # token index within the spaCy Doc
            "idx": 0,     # character offset of the token in "text"
            "sources": [
                {
                    "location": "cell",
                    "tmpl": 'df["ABC"].iloc[0]',
                    "type": "doc",
                    "enabled": True,
                }
            ],
            "varname": "",
            "inflections": [],
        }
    ]
}
#calling the from_json function
output_nugget = from_json(Nugget, nugget_dict)
print('\nPrinting the final output: ')
print(output_nugget)

OUTPUT obtained for NON-WORKING scenario:
Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>

Index to be used for splitting the text, 0

Printing the token after split through index value:
CP

Printing the final output:
{{ df["ABC"].iloc[0] }}-K-J


Expected behavior:
The output in the non-working scenario should instead have been the following:


Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>

Index to be used for splitting the text, 0

Printing the token after split through index value:
CP-K-J

Printing the final output:
{{ df["ABC"].iloc[0] }}


Workaround solution

def from_json(cls, obj):
    """Workaround version of Nugget.from_json with debug prints.

    When spaCy splits the stored token into several tokens, recover the
    full token as a Span from its character offset ('idx') and the
    length of the stored token text, instead of relying on a single
    token index.
    """
    if isinstance(obj, str):
        obj = json.loads(obj)
    text = obj.pop('text')
    obj['text'] = nlp(text)

    tokenlist = obj.pop('tokenmap')
    tokenmap = {}
    for tk in tokenlist:
        index = tk.pop('index')
        if isinstance(index, int):
            print('\nPrinting the object text and the type of it: ')
            print(obj['text'])
            print(type(obj['text']))
            print('\nIndex to be used for splitting the text, ', str(index))
            token = obj['text'][index]
            if token.text != tk['text']:
                # The stored token spans several spaCy tokens; rebuild
                # it from its character offset and character length.
                # (Slicing the Doc as obj['text'][index:len_text] mixes
                # token indices with character counts and only works by
                # accident.)
                token = obj['text'].char_span(
                    tk['idx'], tk['idx'] + len(tk['text'])
                )
            print('\nPrinting the token after split through index value: ')
            print(token)
        elif isinstance(index, (list, tuple)):
            start, end = index
            token = obj['text'][start:end]
        tk.pop('idx')
        tk.pop('text')
        tokenmap[token] = Variable(token, **tk)

    obj['tokenmap'] = tokenmap
    return cls(**obj)

I have added a check to make sure the length of the extracted token equals the length of the stored text for that token element; if it does not, the full token is rebuilt from the text.

                len_text = len(tk['text'])
                token = obj['text'][index]
                if not len(token) == len_text:
                    token = obj['text'][index:len_text]

After adding this, the output comes as expected for all inputs.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant