In [113]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [114]:
import pandas as pd
import json

In [115]:
# Parse the NER_TRAIN judgement file into Python objects.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so decoding does
# not depend on the platform's default locale encoding.
with open("NER_TRAIN/NER_TRAIN_JUDGEMENT.json", encoding="utf-8") as json_file_train:
    json_object_train = json.load(json_file_train)

In [116]:
# Parse the NER_DEV judgement file into Python objects.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so decoding does
# not depend on the platform's default locale encoding.
with open("NER_DEV/NER_DEV/NER_DEV_JUDGEMENT.json", encoding="utf-8") as json_file_dev:
    json_object_dev = json.load(json_file_dev)

In [117]:
named_entities = ["COURT", "PETITIONER", "RESPONDENT", "JUDGE", "LAWYER", "DATE", "ORG", "GPE", "STATUTE", "PROVISION", "PRECEDENT", "CASE_NUMBER", "WITNESS", "OTHER_PERSON"]


def make_tag_lookup_table():
    """Build the integer-id -> IOB tag lookup table.

    Id 0 is the padding tag ``[PAD]`` and id 1 is the outside tag ``O``.
    Every entry of the module-level ``named_entities`` list then contributes
    a ``B-<TYPE>`` id immediately followed by its ``I-<TYPE>`` id, in list
    order.

    Returns:
        dict: consecutive integer ids starting at 0 mapped to tag strings.
    """
    iob_prefixes = ("B", "I")
    entity_tags = [
        "-".join((prefix, entity))
        for entity in named_entities
        for prefix in iob_prefixes
    ]
    all_labels = ["[PAD]", "O"] + entity_tags
    # enumerate() yields exactly one id per tag. The previous
    # range(0, len(all_labels) + 1) overshot by one and only produced the right
    # table because zip() stops at the shorter input.
    return dict(enumerate(all_labels))


mapping = make_tag_lookup_table()
print(mapping)


{0: '[PAD]', 1: 'O', 2: 'B-COURT', 3: 'I-COURT', 4: 'B-PETITIONER', 5: 'I-PETITIONER', 6: 'B-RESPONDENT', 7: 'I-RESPONDENT', 8: 'B-JUDGE', 9: 'I-JUDGE', 10: 'B-LAWYER', 11: 'I-LAWYER', 12: 'B-DATE', 13: 'I-DATE', 14: 'B-ORG', 15: 'I-ORG', 16: 'B-GPE', 17: 'I-GPE', 18: 'B-STATUTE', 19: 'I-STATUTE', 20: 'B-PROVISION', 21: 'I-PROVISION', 22: 'B-PRECEDENT', 23: 'I-PRECEDENT', 24: 'B-CASE_NUMBER', 25: 'I-CASE_NUMBER', 26: 'B-WITNESS', 27: 'I-WITNESS', 28: 'B-OTHER_PERSON', 29: 'I-OTHER_PERSON'}


In [118]:
named_entities = ["COURT", "PETITIONER", "RESPONDENT", "JUDGE", "LAWYER", "DATE", "ORG", "GPE", "STATUTE", "PROVISION", "PRECEDENT", "CASE_NUMBER", "WITNESS", "OTHER_PERSON"]
# Entity type -> 1-based position in named_entities.
# enumerate(start=1) builds the final values directly, replacing the earlier
# two-step version (0-based dict, then +1 on every value while iterating over
# the same dict) and its no-op "".join(key) on an already-string key.
named_entities_to_list = {entity: idx for idx, entity in enumerate(named_entities, start=1)}
named_entities_to_list

{'COURT': 1,
 'PETITIONER': 2,
 'RESPONDENT': 3,
 'JUDGE': 4,
 'LAWYER': 5,
 'DATE': 6,
 'ORG': 7,
 'GPE': 8,
 'STATUTE': 9,
 'PROVISION': 10,
 'PRECEDENT': 11,
 'CASE_NUMBER': 12,
 'WITNESS': 13,
 'OTHER_PERSON': 14}

In [119]:
def get_start_and_end_and_labels(tree):
    """Collect the entity annotations of one record.

    Reads ``tree["annotations"][0]["result"]`` and, for every item, takes the
    first entry of ``value["labels"]`` together with ``value["start"]`` and
    ``value["end"]``.

    Returns:
        list: one ``[label, start, end]`` list per annotation, in file order.
    """
    results = tree["annotations"][0]["result"]
    values = (item["value"] for item in results)
    return [[value["labels"][0], value["start"], value["end"]] for value in values]

In [120]:
def slice_id_to_token_id(text, start, end):
    """Translate a character span into a token span for space-split text.

    The start token index is the number of space characters before ``start``;
    the (exclusive) end token index is one plus the number of space
    characters before ``end``.

    Returns:
        tuple: ``(token_number_start, token_number_end)``.
    """
    spaces_before_start = sum(1 for ch in text[:start] if ch == " ")
    spaces_before_end = sum(1 for ch in text[:end] if ch == " ")
    return spaces_before_start, 1 + spaces_before_end

In [121]:
import re
def text_to_text_plus_label(text, labels):
    """Split ``text`` on single spaces and attach an IOB tag to every token.

    Newlines are replaced by spaces first (one character for one character,
    so character offsets stay valid). Each entry of ``labels`` is read as
    ``[entity_type, char_start, char_end]`` and its character span is turned
    into a token span with ``slice_id_to_token_id``. The first token of a span
    is tagged ``B-<entity_type>``, the following ones ``I-<entity_type>``;
    every other token keeps the lowercase outside tag ``"o"``. Later labels
    overwrite earlier ones where spans overlap.

    NOTE(review): the outside tag here is lowercase ``"o"`` while
    ``make_tag_lookup_table`` uses uppercase ``"O"``.

    Returns:
        list: one ``[token, tag]`` pair per token; consecutive spaces yield
        empty-string tokens.
    """
    flattened = re.sub(r"\n", " ", text)
    tokens = flattened.split(" ")
    tags = ["o"] * len(tokens)

    for label in labels:
        entity = label[0]
        first, stop = slice_id_to_token_id(flattened, label[1], label[2])
        tags[first] = "B-" + entity
        for position in range(first + 1, stop):
            tags[position] = "I-" + entity

    return [[token, tag] for token, tag in zip(tokens, tags)]

In [122]:
# Raw text of the first training record, used as a sample input for the
# token/tag conversion.
try_text = json_object_train[0]["data"]["text"]

In [123]:
# [label, start, end] annotations of the same first training record.
try_label = get_start_and_end_and_labels(json_object_train[0])

In [124]:
# Convert the sample record into [token, tag] pairs and print them for a
# visual check of the tagging.
try_convert = text_to_text_plus_label(try_text, try_label)
print(try_convert)

[['', 'o'], ['', 'o'], ['(7)', 'o'], ['On', 'o'], ['specific', 'o'], ['query', 'o'], ['by', 'o'], ['the', 'o'], ['Bench', 'o'], ['about', 'o'], ['an', 'o'], ['entry', 'o'], ['of', 'o'], ['Rs.', 'o'], ['1,31,37,500', 'o'], ['on', 'o'], ['deposit', 'o'], ['side', 'o'], ['of', 'o'], ['Hongkong', 'B-ORG'], ['Bank', 'I-ORG'], ['account', 'o'], ['of', 'o'], ['which', 'o'], ['a', 'o'], ['photo', 'o'], ['copy', 'o'], ['is', 'o'], ['appearing', 'o'], ['at', 'o'], ['p.', 'o'], ['40', 'o'], ['of', 'o'], ["assessee's", 'o'], ['paper', 'o'], ['book,', 'o'], ['learned', 'o'], ['authorised', 'o'], ['representative', 'o'], ['submitted', 'o'], ['that', 'o'], ['it', 'o'], ['was', 'o'], ['related', 'o'], ['to', 'o'], ['loan', 'o'], ['from', 'o'], ['broker,', 'o'], ['Rahul', 'B-ORG'], ['&', 'I-ORG'], ['Co.', 'I-ORG'], ['on', 'o'], ['the', 'o'], ['basis', 'o'], ['of', 'o'], ['his', 'o'], ['submission', 'o'], ['a', 'o'], ['necessary', 'o'], ['mark', 'o'], ['is', 'o'], ['put', 'o'], ['by', 'o'], ['us', 'o'],

In [125]:
# Tabulate the sample pairs: column 0 holds the token, column 1 its tag.
df = pd.DataFrame(try_convert)

In [126]:
# Number of sample tokens per tag (column 1).
df.groupby(1).size()

1
B-ORG     2
I-ORG     3
o        63
dtype: int64

In [2]:
# Flatten every training record into one long list of [token, tag] pairs.
token_and_labels_train = []
for tree in json_object_train:
    text = tree["data"]["text"]
    labels = get_start_and_end_and_labels(tree)
    token_and_labels_train.extend(text_to_text_plus_label(text, labels))

NameError: name 'json_object_train' is not defined

In [128]:
# Flatten every dev record into one long list of [token, tag] pairs.
token_and_labels_dev = []
for tree in json_object_dev:
    text = tree["data"]["text"]
    labels = get_start_and_end_and_labels(tree)
    token_and_labels_dev.extend(text_to_text_plus_label(text, labels))

In [129]:
# One row per token for each split: column 0 is the token, column 1 its tag.
df_train = pd.DataFrame(token_and_labels_train)
df_dev = pd.DataFrame(token_and_labels_dev)
# Display the training frame.
df_train

Unnamed: 0,0,1
0,,o
1,,o
2,(7),o
3,On,o
4,specific,o
...,...,...
320342,with,o
320343,accused,o
320344,No.1,o
320345,as,o


In [107]:
# Distinct tags that occur in the training tokens (column 1).
df_train[1].unique()

array(['o', 'B-ORG', 'I-ORG', 'B-OTHER_PERSON', 'I-OTHER_PERSON',
       'B-WITNESS', 'I-WITNESS', 'B-GPE', 'B-STATUTE', 'B-DATE', 'I-DATE',
       'B-PROVISION', 'I-PROVISION', 'I-STATUTE', 'B-COURT', 'I-COURT',
       'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER',
       'I-GPE', 'B-PETITIONER', 'I-PETITIONER', 'B-JUDGE', 'I-JUDGE',
       'B-RESPONDENT', 'I-RESPONDENT'], dtype=object)

In [139]:
# Features: every row's token column is turned into a dict record and encoded
# by DictVectorizer into a sparse matrix. Targets: the tag column.
v = DictVectorizer(sparse=True)
X = v.fit_transform(df_train.drop(1, axis=1).to_dict('records'))
y = df_train[1]

# Tag vocabulary as a plain list, taken from the training tags.
classes = y.unique().tolist()

# Hold out a third of the training tokens for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train.shape, y_train.shape

((214632, 36532), (214632,))

In [109]:
# Perceptron baseline over the vectorized token features.
# random_state pins the estimator's shuffling so reruns reproduce the same
# model; 0 matches the seed used for train_test_split.
# NOTE(review): partial_fit() runs a single epoch per call, so max_iter=5 does
# not control the number of passes here. Use fit() or repeated partial_fit()
# calls if several epochs are intended.
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5, random_state=0)
# classes is passed by keyword so the full tag list is explicit at the call site.
per.partial_fit(X_train, y_train, classes=classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 21.10, NNZs: 445, Bias: -0.010000, T: 214632, Avg. loss: 0.001380
Total training time: 0.05 seconds.
Norm: 41.98, NNZs: 1762, Bias: -0.020000, T: 214632, Avg. loss: 0.000610
Total training time: 0.05 seconds.
Norm: 10.34, NNZs: 107, Bias: -0.030000, T: 214632, Avg. loss: 0.001353
Total training time: 0.06 seconds.
-- Epoch 1
Norm: 29.55, NNZs: 873, Bias: -0.010000, T: 214632, Avg. loss: 0.001521
Total training time: 0.06 seconds.
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 21.89, NNZs: 479, Bias: -0.010000, T: 214632, Avg. loss: 0.000491
Total training time: 0.05 seconds.
-- Epoch 1
Norm: 27.80, NNZs: 773, Bias: -0.030000, T: 214632, Avg. loss: 0.001815
Total training time: 0.06 seconds.
-- Epoch 1
Norm: 45.49, NNZs: 2069, Bias: -0.010000, T: 214632, Avg. loss: 0.002279
Total training time: 0.07 seconds.
Norm: 19.77, NNZs: 391, Bias: -0.010000, T: 214632, Avg. loss: 0.000508
Total training time: 0.08 seconds.
-- Epoch 1
-- Epoch 1
Norm: 30.0

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.4s


Norm: 12.12, NNZs: 147, Bias: -0.010000, T: 214632, Avg. loss: 0.000737
Total training time: 0.07 seconds.
Norm: 33.36, NNZs: 1113, Bias: -0.030000, T: 214632, Avg. loss: 0.002600
Total training time: 0.08 seconds.
-- Epoch 1
Norm: 64.37, NNZs: 4143, Bias: -0.030000, T: 214632, Avg. loss: 0.017004
Total training time: 0.07 seconds.
-- Epoch 1
-- Epoch 1
Norm: 39.09, NNZs: 1528, Bias: -0.020000, T: 214632, Avg. loss: 0.005099
Total training time: 0.06 seconds.
-- Epoch 1
Norm: 14.83, NNZs: 220, Bias: -0.020000, T: 214632, Avg. loss: 0.000890
Total training time: 0.05 seconds.
Norm: 26.93, NNZs: 725, Bias: -0.010000, T: 214632, Avg. loss: 0.004840
Total training time: 0.05 seconds.
Norm: 19.70, NNZs: 388, Bias: -0.040000, T: 214632, Avg. loss: 0.001289
Total training time: 0.06 seconds.
Norm: 126.28, NNZs: 15947, Bias: 0.010000, T: 214632, Avg. loss: 0.030902
Total training time: 0.04 seconds.


[Parallel(n_jobs=-1)]: Done  23 out of  27 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    0.6s finished


In [134]:
# Tag names from the lookup table, skipping its first entry ("[PAD]").
new_classes = list(mapping.values())[1:]
print(new_classes)
# Drop both JUDGE tags, then show the reduced list and its length.
# NOTE(review): the reason for excluding JUDGE is not stated in this notebook.
new_classes.remove('B-JUDGE')
new_classes.remove('I-JUDGE')
print(new_classes)
len(new_classes)

['O', 'B-COURT', 'I-COURT', 'B-PETITIONER', 'I-PETITIONER', 'B-RESPONDENT', 'I-RESPONDENT', 'B-JUDGE', 'I-JUDGE', 'B-LAWYER', 'I-LAWYER', 'B-DATE', 'I-DATE', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-STATUTE', 'I-STATUTE', 'B-PROVISION', 'I-PROVISION', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'B-WITNESS', 'I-WITNESS', 'B-OTHER_PERSON', 'I-OTHER_PERSON']
['O', 'B-COURT', 'I-COURT', 'B-PETITIONER', 'I-PETITIONER', 'B-RESPONDENT', 'I-RESPONDENT', 'B-LAWYER', 'I-LAWYER', 'B-DATE', 'I-DATE', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-STATUTE', 'I-STATUTE', 'B-PROVISION', 'I-PROVISION', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'B-WITNESS', 'I-WITNESS', 'B-OTHER_PERSON', 'I-OTHER_PERSON']


27

In [131]:
# Removal of the outside tag is disabled here, so the printed list still
# contains "o":
#classes.remove("o")
print(classes)

['o', 'B-ORG', 'I-ORG', 'B-OTHER_PERSON', 'I-OTHER_PERSON', 'B-WITNESS', 'I-WITNESS', 'B-GPE', 'B-STATUTE', 'B-DATE', 'I-DATE', 'B-PROVISION', 'I-PROVISION', 'I-STATUTE', 'B-COURT', 'I-COURT', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'I-GPE', 'B-PETITIONER', 'I-PETITIONER', 'B-JUDGE', 'I-JUDGE', 'B-RESPONDENT', 'I-RESPONDENT']


In [132]:
# Illusion: this report includes the outside tag "o", which has by far the
# largest support and high scores, so results look better than the entity tags
# alone justify.
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=classes))

                precision    recall  f1-score   support

             o       0.92      0.98      0.95     88722
         B-ORG       0.26      0.28      0.27       481
         I-ORG       0.34      0.17      0.22       858
B-OTHER_PERSON       0.40      0.18      0.25       883
I-OTHER_PERSON       0.39      0.18      0.24       698
     B-WITNESS       0.17      0.10      0.13       281
     I-WITNESS       0.20      0.18      0.19       229
         B-GPE       0.32      0.28      0.30       483
     B-STATUTE       0.70      0.51      0.59       607
        B-DATE       0.71      0.25      0.37       602
        I-DATE       0.50      0.32      0.39       472
   B-PROVISION       0.87      0.74      0.80       803
   I-PROVISION       0.52      0.33      0.41      1330
     I-STATUTE       0.62      0.40      0.48      1118
       B-COURT       0.74      0.58      0.65       445
       I-COURT       0.30      0.06      0.10       789
   B-PRECEDENT       0.19      0.10      0.13  

In [140]:
# Report on entity tags only, leaving out the dominant outside tag "o".
# A filtered copy is used instead of classes.remove("o"): the in-place removal
# raised ValueError when this cell was run a second time, and it left `classes`
# without "o" for any later re-run of the training cell that passes `classes`
# to partial_fit.
entity_classes = [tag for tag in classes if tag != "o"]
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=entity_classes))

                precision    recall  f1-score   support

         B-ORG       0.26      0.28      0.27       481
         I-ORG       0.34      0.17      0.22       858
B-OTHER_PERSON       0.40      0.18      0.25       883
I-OTHER_PERSON       0.39      0.18      0.24       698
     B-WITNESS       0.17      0.10      0.13       281
     I-WITNESS       0.20      0.18      0.19       229
         B-GPE       0.32      0.28      0.30       483
     B-STATUTE       0.70      0.51      0.59       607
        B-DATE       0.71      0.25      0.37       602
        I-DATE       0.50      0.32      0.39       472
   B-PROVISION       0.87      0.74      0.80       803
   I-PROVISION       0.52      0.33      0.41      1330
     I-STATUTE       0.62      0.40      0.48      1118
       B-COURT       0.74      0.58      0.65       445
       I-COURT       0.30      0.06      0.10       789
   B-PRECEDENT       0.19      0.10      0.13       448
   I-PRECEDENT       0.61      0.27      0.37  

In [3]:
# NOTE(review): hard-coded counts. 16993 appears to be the number of held-out
# tokens whose tag is not "o" (105715 total minus the 88722 "o" support shown
# in the report) - confirm, and compute it from y_test instead.
16993 / 105715

0.1607435084898075