You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Alternate constructor: build a `cls` instance from its JSON form.
# `obj` may be a JSON string or an already-decoded dict; a dict is modified
# in place (keys are popped and replaced below).
@classmethod
def from_json(cls, obj):
if isinstance(obj, str):
obj = json.loads(obj)
# Swap the raw text for the result of nlp(text); the debug output later in
# this report shows that result to be a spacy.tokens.doc.Doc.
text = obj.pop('text')
obj['text'] = nlp(text)
# 'tokenmap' arrives as a list of dicts and is rebuilt as a dict keyed by the
# token (or token range) that each entry points at.
tokenlist = obj.pop('tokenmap')
tokenmap = {}
for tk in tokenlist:
index = tk.pop('index')
if isinstance(index, int):
# Single position. NOTE(review): this is the line the report is about -- for
# the text "CP-K-J" and index 0 it yields only "CP".
token = obj['text'][index]
elif isinstance(index, (list, tuple)):
# Two-element (start, end) pair: take the slice start:end.
start, end = index
token = obj['text'][start:end]
# NOTE(review): there is no else branch, so an index of any other type leaves
# `token` unbound (first entry) or carried over from the previous entry.
# 'idx' and 'text' are dropped so they are not forwarded to Variable as
# keyword arguments; whatever remains in tk is.
tk.pop('idx')
tk.pop('text')
tokenmap[token] = Variable(token, **tk)
obj['tokenmap'] = tokenmap
return cls(**obj)
In the function above, from_json(), which is a method of the nlg.narrative.Nugget class,
the line "token = obj['text'][index]" does not behave as expected for all input text.
Steps to reproduce.
Code to be used:
# Reproduction setup: bring in the names that from_json() below relies on.
import json  # from_json() below calls json.loads on string input

from gramex import data
from nlg.utils import load_spacy_model
# `nlp` is the callable from_json() applies to the raw text.
nlp = load_spacy_model()
from nlg.narrative import Nugget
from nlg.narrative import Variable
# Instrumented copy of from_json used to reproduce the problem: same statements
# as the version above, plus print calls in the integer-index branch. It is
# defined here as a plain function (no @classmethod decorator).
def from_json(cls, obj):
if isinstance(obj, str):
obj = json.loads(obj)
# Replace the raw text with the result of nlp(text).
text = obj.pop('text')
obj['text'] = nlp(text)
# Rebuild 'tokenmap' from a list of dicts into a dict keyed by token.
tokenlist = obj.pop('tokenmap')
tokenmap = {}
for tk in tokenlist:
index = tk.pop('index')
if isinstance(index, int):
# Debug output: the processed text, its type, and the index about to be used.
print('\nPrinting the object text and the type of it: ')
print(obj['text'])
print(type(obj['text']))
print('\nIndex to be used for splitting the text, ', str(index))
# The lookup under investigation.
token = obj['text'][index]
# Debug output: what that lookup returned.
print('\nPrinting the token after split through index value: ')
print(token)
elif isinstance(index, (list, tuple)):
# Two-element (start, end) pair: take the slice start:end.
start, end = index
token = obj['text'][start:end]
# 'idx' and 'text' are removed so they are not passed to Variable as keyword
# arguments.
tk.pop('idx')
tk.pop('text')
tokenmap[token] = Variable(token, **tk)
obj['tokenmap'] = tokenmap
return cls(**obj)
OUTPUT obtained for NON-WORKING scenario:
Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>
Index to be used for splitting the text, 0
Printing the token after split through index value:
CP
Printing the final output:
{{ df["ABC"].iloc[0] }}-K-J
Expected behavior:
In the non-working scenario, the output should instead have been:
Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>
Index to be used for splitting the text, 0
Printing the token after split through index value:
CP-K-J
Printing the final output:
{{ df["ABC"].iloc[0] }}
Workaround solution
# Workaround version of from_json: identical to the instrumented copy except
# that, for an integer index, the looked-up token is replaced by a slice when
# its length differs from the length of the entry's recorded text.
def from_json(cls, obj):
if isinstance(obj, str):
obj = json.loads(obj)
# Replace the raw text with the result of nlp(text).
text = obj.pop('text')
obj['text'] = nlp(text)
# Rebuild 'tokenmap' from a list of dicts into a dict keyed by token.
tokenlist = obj.pop('tokenmap')
tokenmap = {}
for tk in tokenlist:
index = tk.pop('index')
if isinstance(index, int):
print('\nPrinting the object text and the type of it: ')
print(obj['text'])
print(type(obj['text']))
print('\nIndex to be used for splitting the text, ', str(index))
# Character length of the text recorded for this entry; read before
# tk.pop('text') below removes it.
len_text = len(tk['text'])
token = obj['text'][index]
# presumably len(token) is the token's character count -- TODO confirm
if not len(token) == len_text:
# NOTE(review): len_text is a character count, but the output in this report
# (index 0 of "CP-K-J" gives "CP") shows obj['text'] is indexed per token, not
# per character. Using len_text as the slice end therefore looks correct only
# by coincidence for index 0; for a larger index the slice may come out short
# or empty. Confirm before adopting.
token = obj['text'][index:len_text]
print('\nPrinting the token after split through index value: ')
print(token)
elif isinstance(index, (list, tuple)):
# Two-element (start, end) pair: take the slice start:end.
start, end = index
token = obj['text'][start:end]
# 'idx' and 'text' are removed so they are not passed to Variable as keyword
# arguments.
tk.pop('idx')
tk.pop('text')
tokenmap[token] = Variable(token, **tk)
obj['tokenmap'] = tokenmap
return cls(**obj)
I added a check to make sure the length of the selected token equals the length of the text recorded for that tokenmap entry:
len_text = len(tk['text'])
token = obj['text'][index]
if not len(token) == len_text:
token = obj['text'][index:len_text]
After adding this, the output comes as expected for all inputs.
The text was updated successfully, but these errors were encountered:
Raised by @dikshagupta14
In the function above, from_json(), which is a method of the nlg.narrative.Nugget class,
the line "token = obj['text'][index]" does not behave as expected for all input text.
Steps to reproduce.
Code to be used:
#WORKING SCENARIO INPUT:
OUTPUT obtained for WORKING scenario:
Printing the object text and the type of it:
CP-899-J
<class 'spacy.tokens.doc.Doc'>
Index to be used for splitting the text, 0
Printing the token after split through index value:
CP-899-J
Printing the final output:
{{ df["ABC"].iloc[0] }}
#NON-WORKING SCENARIO input:
OUTPUT obtained for NON-WORKING scenario:
Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>
Index to be used for splitting the text, 0
Printing the token after split through index value:
CP
Printing the final output:
{{ df["ABC"].iloc[0] }}-K-J
Expected behavior:
In the non-working scenario, the output should instead have been:
Printing the object text and the type of it:
CP-K-J
<class 'spacy.tokens.doc.Doc'>
Index to be used for splitting the text, 0
Printing the token after split through index value:
CP-K-J
Printing the final output:
{{ df["ABC"].iloc[0] }}
Workaround solution
I added a check to make sure the length of the selected token equals the length of the text recorded for that tokenmap entry.
After adding this, the output comes as expected for all inputs.
The text was updated successfully, but these errors were encountered: