In [1]:
import pandas as pd
import json
from nltk.tokenize import word_tokenize

In [361]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
with open("NER_TRAIN/NER_TRAIN_JUDGEMENT.json") as json_file_train:
    json_object_train = json.load(json_file_train)

In [4]:
def get_start_and_end_and_labels(tree):
    start_and_end_and_labels = []
    for label in tree["annotations"][0]["result"]:
        labels = label["value"]["labels"][0]
        start = label["value"]["start"]
        end = label["value"]["end"]
        start_and_end_and_labels.append([labels, start, end])
    return start_and_end_and_labels

In [7]:
get_start_and_end_and_labels(json_object_train[0])

[['ORG', 90, 103], ['ORG', 267, 278]]

In [14]:
def print_try_text_and_label(n):
    try_text = json_object_train[n]["data"]["text"]
    try_label = get_start_and_end_and_labels(json_object_train[n])
    print(try_text)
    print(try_label)
    for label in try_label:
        print(try_text[label[1]: label[2]])
print_try_text_and_label(0)



(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.
[['ORG', 90, 103], ['ORG', 267, 278]]
Hongkong Bank
Rahul & Co.


In [141]:
def return_text_and_label(tree):
    text = tree["data"]["text"]
    labels = get_start_and_end_and_labels(tree)
    return text, labels
print(return_text_and_label(json_object_train[0]))

("\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.", [['ORG', 90, 103], ['ORG', 267, 278]])


In [142]:
from nltk.tokenize import TreebankWordTokenizer, TreebankWordDetokenizer
twt = TreebankWordTokenizer()
twd = TreebankWordDetokenizer()
try_text, try_label = return_text_and_label(json_object_train[0])
#token_number_list = list(twt().span_tokenize(try_text))
tokens = twt.tokenize(try_text)
print(tokens)
print(twd.detokenize(tokens))

['(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank', 'account', 'of', 'which', 'a', 'photo', 'copy', 'is', 'appearing', 'at', 'p.', '40', 'of', 'assessee', "'s", 'paper', 'book', ',', 'learned', 'authorised', 'representative', 'submitted', 'that', 'it', 'was', 'related', 'to', 'loan', 'from', 'broker', ',', 'Rahul', '&', 'Co.', 'on', 'the', 'basis', 'of', 'his', 'submission', 'a', 'necessary', 'mark', 'is', 'put', 'by', 'us', 'on', 'that', 'photo', 'copy', '.']
(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.


In [143]:
try_tokens = list(TreebankWordTokenizer().span_tokenize(try_text))
print(try_tokens)

[(2, 3), (3, 4), (4, 5), (6, 8), (9, 17), (18, 23), (24, 26), (27, 30), (31, 36), (37, 42), (43, 45), (46, 51), (52, 54), (55, 58), (59, 70), (71, 73), (74, 81), (82, 86), (87, 89), (90, 98), (99, 103), (104, 111), (112, 114), (115, 120), (121, 122), (123, 128), (129, 133), (134, 136), (137, 146), (147, 149), (150, 152), (153, 155), (156, 158), (159, 167), (167, 169), (170, 175), (176, 180), (180, 181), (182, 189), (190, 200), (201, 215), (216, 225), (226, 230), (231, 233), (234, 237), (238, 245), (246, 248), (249, 253), (254, 258), (259, 265), (265, 266), (267, 272), (273, 274), (275, 278), (279, 281), (282, 285), (286, 291), (292, 294), (295, 298), (299, 309), (310, 311), (312, 321), (322, 326), (327, 329), (330, 333), (334, 336), (337, 339), (340, 342), (343, 347), (348, 353), (354, 358), (358, 359)]


In [323]:
"""
correct_span = [list(token) for token in try_tokens]
for i in range(len(correct_span) - 1):
    if correct_span[i+1][0] == correct_span[i][1]:
        correct_span[i+1][0] += 1
"""

'\ncorrect_span = [list(token) for token in try_tokens]\nfor i in range(len(correct_span) - 1):\n    if correct_span[i+1][0] == correct_span[i][1]:\n        correct_span[i+1][0] += 1\n'

In [346]:
def add_label_to_tokens(tokens, labels):
    token_labels = ["o" for token in tokens]
    for label in labels:
        label_start = label[1]
        label_end = label[2]
        if label_start <= label_end:
            for i in range(0, len(tokens)):
                token_start, token_end = tokens[i]
                
                # the first token in the label ("Beginning")
                if token_start <= label_start < token_end:
                    token_labels[i] = "B-" + label[0]
                
                # the last token in a label, if the label span does not correspond to the end of the token
                #elif token_start <= label_end <= token_end:
                elif token_start < label_end <= token_end:
                    token_labels[i] = "I-" + label[0]
                
                # the following tokens after the first label ("Insider")
                if label_start < token_start <=  token_end <= label_end:
                    token_labels[i] = "I-" + label[0]
    return token_labels

In [351]:
def get_tokens_with_label(tree):
    text, labels = return_text_and_label(tree)
    twt = TreebankWordTokenizer()
    tokens_span = list(TreebankWordTokenizer().span_tokenize(text))
    tokenized_text = twt.tokenize(text)
    
    #d = {}
    #for i in range(len(tokens_span)):
    #    d[ tokens_span[i] ] = tokenized_text[i]
    #for key, value in d.items():
    #    print(f"{key}: {value}")
    return add_label_to_tokens(tokens_span, labels), tokenized_text

In [352]:
print(get_tokens_with_label(json_object_train[2236]))

(['o', 'o', 'o', 'o', 'o', 'B-OTHER_PERSON', 'I-OTHER_PERSON', 'o', 'o', 'o', 'o', 'o', 'B-GPE', 'o', 'o', 'o', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'B-DATE', 'o'], ['(', '3', ')', 'that', 'M/s', 'Sureshchand', 'Isarchand', 'got', 'this', 'vehicle', 'replaced', 'on', 'Bavana-Dholpur', 'route', 'and', 'the', 'Regional', 'Transport', 'Authority', ',', 'Jaipur', 'had', 'allowed', 'the', 'said', 'application', 'for', 'replacement', 'by', 'circular', 'note', 'dated', '17-12-66', '.'])


In [353]:
def compare_label_with_labelled_tokens(tree):
    text, labels = return_text_and_label(tree)
    labels_with_text = []
    for label in labels:
        labels_with_text.append(text[label[1]:label[2]])
    
    labelled_tokens, tokenized_text = get_tokens_with_label(tree)
    all_labelled_tokens = []
    l = len(labelled_tokens)
    for i in range(l):
        single_label = []
        if labelled_tokens[i].startswith("B"):
            single_label.append(tokenized_text[i])
            while i+ 1 < l and labelled_tokens[i+1].startswith("I"):
                single_label.append(tokenized_text[i+1])
                i += 1
        if len(single_label) > 0:
            all_labelled_tokens.append(" ".join(single_label))
    
    # compare
    if len(labels_with_text) != len(all_labelled_tokens):
        print("different number of labels!")
        print(f"labels: {labels}")
        print(f"labels_with_text: {labels_with_text}")
        print(f"all_labelled_tokens: {all_labelled_tokens} \n")
    else:
        for i in range(len(labels_with_text)):
            gold = labels_with_text[i].replace(" ", "")
            tokenized = labels_with_text[i].replace(" ", "")
            if gold != tokenized:
                print("potential tokenizing problem: ")
                print(f"gold: {gold} -- tokenized: {tokenized}")

In [354]:
for i in range(len(json_object_train)):
    compare_label_with_labelled_tokens(json_object_train[i])

different number of labels!
labels: [['OTHER_PERSON', 3, 22], ['CASE_NUMBER', 28, 41], ['CASE_NUMBER', 61, 75], ['COURT', 83, 140], ['ORG', 148, 166], ['CASE_NUMBER', 166, 167], ['CASE_NUMBER', 167, 176]]
labels_with_text: ['Jeevan Bheemmanagar', 'Cr..No.179/05', 'CC No.22109/06', '10th Addl. Chief Metropolitan Magistrate Court, Bangalore', 'Koramangala P.S.Cr', '.', 'No.430/05']
all_labelled_tokens: ['Jeevan Bheemmanagar', 'Cr..No.179/05', 'CC No.22109/06', '10th Addl. Chief Metropolitan Magistrate Court , Bangalore.', 'Koramangala', 'P.S.Cr.No.430/05'] 

different number of labels!
labels: [['OTHER_PERSON', 13, 34], ['GPE', 64, 70], ['GPE', 71, 78], ['ORG', 93, 129], ['DATE', 202, 210]]
labels_with_text: ['Sureshchand Isarchand', 'Bavana', 'Dholpur', 'Regional Transport Authority, Jaipur', '17-12-66']
all_labelled_tokens: ['Sureshchand Isarchand', 'Bavana-Dholpur', 'Regional Transport Authority , Jaipur', '17-12-66'] 

different number of labels!
labels: [['GPE', 41, 47], ['GPE', 48,

In [340]:
compare_label_with_labelled_tokens(json_object_train[7417])

different number of labels!
labels: [['OTHER_PERSON', 114, 121], ['DATE', 204, 212], ['GPE', 217, 226], ['GPE', 227, 233]]
labels_with_text: ['Sampath', '2.2.1988', 'Bangalore', 'Madras']
all_labelled_tokens: ['Sampath', '2.2.1988 ,', 'Bangalore-Madras'] 



In [369]:
token_and_labels_train = []
for tree in json_object_train:
    labels, tokens = get_tokens_with_label(tree)
    if len(labels) != len(tokens):
        print("BUG!")
    else:
        for i in range(len(labels)):
            token_and_labels_train.append([ tokens[i], labels[i] ])
token_and_labels_train[10]

['an', 'o']

In [370]:
import pandas as pd
df_train = pd.DataFrame(token_and_labels_train)
df_train

Unnamed: 0,0,1
0,(,o
1,7,o
2,),o
3,On,o
4,specific,o
...,...,...
349072,accused,o
349073,No.1,o
349074,as,o
349075,aforementioned,o


In [371]:
with open("NER_DEV/NER_DEV/NER_DEV_JUDGEMENT.json") as json_file_dev:
    json_object_dev = json.load(json_file_dev)

In [372]:
for i in range(len(json_object_dev)):
    compare_label_with_labelled_tokens(json_object_dev[i])

different number of labels!
labels: [['GPE', 138, 144], ['GPE', 145, 149], ['OTHER_PERSON', 239, 248], ['OTHER_PERSON', 260, 276]]
labels_with_text: ['Bombay', 'Agra', 'Amaraveni', 'Venkateswara Rao']
all_labelled_tokens: ['Bombay-Agra', 'Amaraveni', 'Venkateswara Rao'] 



In [373]:
token_and_labels_dev = []
for tree in json_object_dev:
    labels, tokens = get_tokens_with_label(tree)
    if len(labels) != len(tokens):
        print("BUG!")
    else:
        for i in range(len(labels)):
            token_and_labels_dev.append([ tokens[i], labels[i] ])
token_and_labels_dev[:10]

[['True', 'o'],
 [',', 'o'],
 ['our', 'o'],
 ['Constitution', 'B-STATUTE'],
 ['has', 'o'],
 ['no', 'o'],
 ["'due", 'o'],
 ['process', 'o'],
 ["'", 'o'],
 ['clause', 'o']]

In [374]:
df_dev = pd.DataFrame(token_and_labels_dev)
df_dev

Unnamed: 0,0,1
0,True,o
1,",",o
2,our,o
3,Constitution,B-STATUTE
4,has,o
...,...,...
37450,of,o
37451,right,o
37452,ear,o
37453,lobule,o


In [378]:
X = df_train.drop(1, axis=1)
v = DictVectorizer(sparse=True)
X = v.fit_transform(X.to_dict('records'))
y = df_train[1]

#classes = np.unique(y)
#classes = classes.tolist()
classes = df_train[1].unique().tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(233881, 27286) (233881,)
(115196, 27286) (115196,)


In [379]:
print(classes)

['o', 'B-ORG', 'I-ORG', 'B-OTHER_PERSON', 'I-OTHER_PERSON', 'B-WITNESS', 'I-WITNESS', 'B-GPE', 'B-STATUTE', 'B-DATE', 'I-DATE', 'B-PROVISION', 'I-PROVISION', 'I-STATUTE', 'B-COURT', 'I-COURT', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'I-GPE', 'B-PETITIONER', 'I-PETITIONER', 'B-JUDGE', 'I-JUDGE', 'B-RESPONDENT', 'I-RESPONDENT']


In [380]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 8.49, NNZs: 72, Bias: -0.020000, T: 233881, Avg. loss: 0.001383
Total training time: 0.05 seconds.
Norm: 18.95, NNZs: 359, Bias: -0.030000, T: 233881, Avg. loss: 0.001402
Total training time: 0.06 seconds.
Norm: 26.85, NNZs: 721, Bias: -0.010000, T: 233881, Avg. loss: 0.001574
Total training time: 0.05 seconds.
-- Epoch 1
-- Epoch 1
Norm: 41.77, NNZs: 1745, Bias: -0.010000, T: 233881, Avg. loss: 0.000733
Total training time: 0.07 seconds.
-- Epoch 1
-- Epoch 1
Norm: 19.77, NNZs: 391, Bias: -0.010000, T: 233881, Avg. loss: 0.000540
Total training time: 0.05 seconds.
-- Epoch 1
Norm: 43.78, NNZs: 1917, Bias: -0.010000, T: 233881, Avg. loss: 0.002377
Total training time: 0.06 seconds.
Norm: 18.30, NNZs: 335, Bias: -0.030000, T: 233881, Avg. loss: 0.000560
Total training time: 0.06 seconds.
Norm: 26.63, NNZs: 709, Bias: -0.010000, T: 233881, Avg. loss: 0.001777
Total training time: 0.07 seconds.
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 27.78,

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.4s


Norm: 28.67, NNZs: 822, Bias: -0.060000, T: 233881, Avg. loss: 0.011405
Total training time: 0.06 seconds.
Norm: 55.82, NNZs: 3116, Bias: -0.040000, T: 233881, Avg. loss: 0.021732
Total training time: 0.07 seconds.
-- Epoch 1
-- Epoch 1
Norm: 13.00, NNZs: 169, Bias: -0.030000, T: 233881, Avg. loss: 0.000941
Total training time: 0.07 seconds.
Norm: 21.56, NNZs: 465, Bias: -0.050000, T: 233881, Avg. loss: 0.006478
Total training time: 0.09 seconds.
Norm: 17.46, NNZs: 305, Bias: -0.030000, T: 233881, Avg. loss: 0.001246
Total training time: 0.04 seconds.
Norm: 113.72, NNZs: 12932, Bias: -0.000000, T: 233881, Avg. loss: 0.041306
Total training time: 0.04 seconds.


[Parallel(n_jobs=-1)]: Done  23 out of  27 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    0.6s finished


In [381]:
classes.remove("o")
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=classes))

                precision    recall  f1-score   support

         B-ORG       0.47      0.27      0.34       472
         I-ORG       0.36      0.24      0.28       937
B-OTHER_PERSON       0.40      0.22      0.28       853
I-OTHER_PERSON       0.35      0.32      0.33       695
     B-WITNESS       0.19      0.13      0.16       293
     I-WITNESS       0.13      0.04      0.06       260
         B-GPE       0.16      0.23      0.19       464
     B-STATUTE       0.74      0.61      0.67       600
        B-DATE       0.10      0.35      0.16       583
        I-DATE       0.21      0.10      0.14       638
   B-PROVISION       0.83      0.88      0.85       786
   I-PROVISION       0.61      0.21      0.31      2185
     I-STATUTE       0.57      0.41      0.48      1263
       B-COURT       0.74      0.59      0.66       429
       I-COURT       0.21      0.05      0.07       970
   B-PRECEDENT       0.10      0.03      0.05       437
   I-PRECEDENT       0.52      0.26      0.34  

In [177]:
def number_of_tokens_before(text, label_start):
    tokens_before = word_tokenize(text[:label_start])
    #if len(tokens_before) > 0:
    #    if tokens_before[-1] == ".":
    #        abbr_combine = "".join(tokens_before[-2:])
    #        tokens_before = tokens_before[:-2]
    #        tokens_before.append(abbr_combine)
    return len(tokens_before)

In [6]:
def slice_id_to_token_id(text, start, end):
    token_number_start = number_of_tokens_before(text, start)
    token_number_end = number_of_tokens_before(text, end)
    return token_number_start, token_number_end

In [28]:
def check_labels(text, labels, check_list):
    all_NE = [text[label[1]:label[2]] for label in labels]
    for i in range(len(all_NE)):
        origin = all_NE[i].replace(" ", "")
        check = "".join(check_list[i])
        if origin != check:
            print("Tokenizing Error!")
            print("Origin: " , all_NE[i])
            print("Error: " , check)

In [29]:
import re
def text_to_text_plus_label(text, labels):
    token_numbers_of_labels = []
    for label in labels:
        start, end = slice_id_to_token_id(text, label[1], label[2])
        token_numbers_of_labels.append([label[0], start, end])
    tokens = word_tokenize(text)
    number_of_tokens = len(tokens)
    
    check_list = []
    return_list = []
    token_labels = [ "o" for i in range(number_of_tokens)]
    for label in token_numbers_of_labels:
        check_list.append(tokens[label[1]:label[2]])
        # check the label
        # print(tokens[label[1]:label[2]])
        # print(label[1],label[2])
        
        #token_labels[label[1]] = named_entities_to_list[label[0]] * 2 - 1
        token_labels[label[1]] = "B-" + label[0]
        for i in range(label[1] + 1, label[2]):
            #token_labels[i] = named_entities_to_list[label[0]]  * 2
            token_labels[i] = "I-" + label[0]
    
    for i in range (len(token_labels)):
        return_list.append([tokens[i], token_labels[i]])
    
    check_labels(text, labels, check_list)
    
    return return_list

In [32]:
for n in range(0, 1):
    try_text = json_object_train[n]["data"]["text"]
    try_label = get_start_and_end_and_labels(json_object_train[n])
    try_convert = text_to_text_plus_label(try_text, try_label)
    print(try_convert)

Tokenizing Error!
Origin:  Rahul & Co.
Error:  Rahul&Co.on
[['(', 'o'], ['7', 'o'], [')', 'o'], ['On', 'o'], ['specific', 'o'], ['query', 'o'], ['by', 'o'], ['the', 'o'], ['Bench', 'o'], ['about', 'o'], ['an', 'o'], ['entry', 'o'], ['of', 'o'], ['Rs', 'o'], ['.', 'o'], ['1,31,37,500', 'o'], ['on', 'o'], ['deposit', 'o'], ['side', 'o'], ['of', 'o'], ['Hongkong', 'B-ORG'], ['Bank', 'I-ORG'], ['account', 'o'], ['of', 'o'], ['which', 'o'], ['a', 'o'], ['photo', 'o'], ['copy', 'o'], ['is', 'o'], ['appearing', 'o'], ['at', 'o'], ['p.', 'o'], ['40', 'o'], ['of', 'o'], ['assessee', 'o'], ["'s", 'o'], ['paper', 'o'], ['book', 'o'], [',', 'o'], ['learned', 'o'], ['authorised', 'o'], ['representative', 'o'], ['submitted', 'o'], ['that', 'o'], ['it', 'o'], ['was', 'o'], ['related', 'o'], ['to', 'o'], ['loan', 'o'], ['from', 'o'], ['broker', 'o'], [',', 'o'], ['Rahul', 'B-ORG'], ['&', 'I-ORG'], ['Co.', 'I-ORG'], ['on', 'I-ORG'], ['the', 'o'], ['basis', 'o'], ['of', 'o'], ['his', 'o'], ['submission'

In [106]:
from nltk.tokenize import word_tokenize
print(word_tokenize(try_text))

['He', 'had', 'prepared', 'G.D.', 'No', '.', '7', 'on', '19.8.1998', 'at', '3.05', 'A.M', '.']


In [37]:
try_text = json_object_train[58]["data"]["text"]
try_label = get_start_and_end_and_labels(json_object_train[58])
print(try_label)
try_convert = text_to_text_plus_label(try_text, try_label)
print(try_convert)

[['OTHER_PERSON', 17, 23], ['PROVISION', 171, 195], ['STATUTE', 203, 218]]
[['Therefore', 'o'], [',', 'o'], ['Shri', 'o'], ['.', 'o'], ['Dharap', 'B-OTHER_PERSON'], [',', 'o'], ['learned', 'o'], ['Counsel', 'o'], ['for', 'o'], ['the', 'o'], ['respondent', 'o'], ['submitted', 'o'], ['that', 'o'], ['the', 'o'], ['learned', 'o'], ['Single', 'o'], ['Judge', 'o'], ['was', 'o'], ['justified', 'o'], ['in', 'o'], ['holding', 'o'], ['that', 'o'], ['there', 'o'], ['is', 'o'], ['an', 'o'], ['unfair', 'o'], ['labour', 'o'], ['practice', 'o'], ['under', 'o'], ['item', 'B-PROVISION'], ['4', 'I-PROVISION'], ['(', 'I-PROVISION'], ['c', 'I-PROVISION'], [')', 'I-PROVISION'], ['of', 'I-PROVISION'], ['Schedule-II', 'I-PROVISION'], ['of', 'o'], ['the', 'o'], ['MRTU', 'B-STATUTE'], ['&', 'I-STATUTE'], ['PULP', 'I-STATUTE'], ['Act', 'I-STATUTE'], ['.', 'o']]


In [108]:
try_text[b:e]

'Hongkong Bank'

In [109]:
b = try_label[0][1]
e = try_label[0][2]

In [110]:
try_text

"\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy."

In [48]:
print(word_tokenize(try_text))

['Therefore', ',', 'Shri', '.', 'Dharap', ',', 'learned', 'Counsel', 'for', 'the', 'respondent', 'submitted', 'that', 'the', 'learned', 'Single', 'Judge', 'was', 'justified', 'in', 'holding', 'that', 'there', 'is', 'an', 'unfair', 'labour', 'practice', 'under', 'item', '4', '(', 'c', ')', 'of', 'Schedule-II', 'of', 'the', 'MRTU', '&', 'PULP', 'Act', '.']


In [129]:
start = try_label[1][1]
end = try_label[1][2]
slice_id_to_token_id(try_text, start, end)

(21, 22)

In [133]:
print(word_tokenize(try_text[:try_label[1][1]]))

['We', 'affirm', 'the', 'death', 'sentence', 'passed', 'by', 'the', 'trial', 'court', 'as', 'also', 'the', 'other', 'sentences', 'passed', 'under', 'Sections', '364', 'and', '376']


In [134]:
print(word_tokenize(try_text[:try_label[1][2]]))

['We', 'affirm', 'the', 'death', 'sentence', 'passed', 'by', 'the', 'trial', 'court', 'as', 'also', 'the', 'other', 'sentences', 'passed', 'under', 'Sections', '364', 'and', '376', 'I.P.C', '.']


In [79]:
try_text = json_object_train[0]["data"]["text"]
try_label = get_start_and_end_and_labels(json_object_train[0])
for n in range(len(try_label)):
    print(tokens_before(try_text,try_label[n][1]))
    print(tokens_before(try_text,try_label[n][2]))
    print("\n")
print(try_text)

['(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs', '.', '1,31,37,500', 'on', 'deposit', 'side', 'of']
['(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs', '.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank']


['(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs', '.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank', 'account', 'of', 'which', 'a', 'photo', 'copy', 'is', 'appearing', 'at', 'p.', '40', 'of', 'assessee', "'s", 'paper', 'book', ',', 'learned', 'authorised', 'representative', 'submitted', 'that', 'it', 'was', 'related', 'to', 'loan', 'from', 'broker', ',']
!
['(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs', '.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank', 'account', 'of', 'which', 'a', 'photo', 'copy', 'is', 'appearing', 'at',

In [80]:
word_tokenize

<function nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)>

In [81]:
from nltk.tokenize import TreebankWordTokenizer as twt
print(try_text)
print(try_label)
for label in try_label:
    print(try_text[label[1]: label[2]])

token_number_list = list(twt().span_tokenize(try_text))
print(token_number_list)



(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.
[['ORG', 90, 103], ['ORG', 267, 278]]
Hongkong Bank
Rahul & Co.
[(2, 3), (3, 4), (4, 5), (6, 8), (9, 17), (18, 23), (24, 26), (27, 30), (31, 36), (37, 42), (43, 45), (46, 51), (52, 54), (55, 58), (59, 70), (71, 73), (74, 81), (82, 86), (87, 89), (90, 98), (99, 103), (104, 111), (112, 114), (115, 120), (121, 122), (123, 128), (129, 133), (134, 136), (137, 146), (147, 149), (150, 152), (153, 155), (156, 158), (159, 167), (167, 169), (170, 175), (176, 180), (180, 181), (182, 189), (190, 200), (201, 215), (216, 225), (226, 230), (231, 233), (234, 237), (238, 245), (246, 248), (249, 253), (254, 258), (259, 265), (265, 266), (267, 272), (

In [82]:
for token_number in token_number_list:
    print(try_text[token_number[0]: token_number[1]])

(
7
)
On
specific
query
by
the
Bench
about
an
entry
of
Rs.
1,31,37,500
on
deposit
side
of
Hongkong
Bank
account
of
which
a
photo
copy
is
appearing
at
p.
40
of
assessee
's
paper
book
,
learned
authorised
representative
submitted
that
it
was
related
to
loan
from
broker
,
Rahul
&
Co.
on
the
basis
of
his
submission
a
necessary
mark
is
put
by
us
on
that
photo
copy
.


In [74]:
from nltk.tokenize import TreebankWordTokenizer as twt
list(twt().span_tokenize(try_text))

[(0, 2),
 (3, 6),
 (7, 12),
 (13, 17),
 (17, 18),
 (19, 28),
 (29, 33),
 (34, 42),
 (43, 50),
 (51, 53),
 (54, 58),
 (58, 59),
 (60, 66),
 (67, 70),
 (71, 81),
 (82, 88),
 (89, 91),
 (92, 96),
 (96, 97),
 (98, 105),
 (106, 109),
 (110, 113),
 (113, 114)]