In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
import pandas as pd
import json

In [3]:
with open("NER_TRAIN/NER_TRAIN_JUDGEMENT.json") as json_file_train:
    json_object_train = json.load(json_file_train)

In [4]:
with open("NER_DEV/NER_DEV/NER_DEV_JUDGEMENT.json") as json_file_dev:
    json_object_dev = json.load(json_file_dev)

In [5]:
named_entities = ["COURT", "PETITIONER", "RESPONDENT", "JUDGE", "LAWYER", "DATE", "ORG", "GPE", "STATUTE", "PROVISION", "PRECEDENT", "CASE_NUMBER", "WITNESS", "OTHER_PERSON"]
def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    ner_labels = named_entities
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))


mapping = make_tag_lookup_table()
print(mapping)


{0: '[PAD]', 1: 'O', 2: 'B-COURT', 3: 'I-COURT', 4: 'B-PETITIONER', 5: 'I-PETITIONER', 6: 'B-RESPONDENT', 7: 'I-RESPONDENT', 8: 'B-JUDGE', 9: 'I-JUDGE', 10: 'B-LAWYER', 11: 'I-LAWYER', 12: 'B-DATE', 13: 'I-DATE', 14: 'B-ORG', 15: 'I-ORG', 16: 'B-GPE', 17: 'I-GPE', 18: 'B-STATUTE', 19: 'I-STATUTE', 20: 'B-PROVISION', 21: 'I-PROVISION', 22: 'B-PRECEDENT', 23: 'I-PRECEDENT', 24: 'B-CASE_NUMBER', 25: 'I-CASE_NUMBER', 26: 'B-WITNESS', 27: 'I-WITNESS', 28: 'B-OTHER_PERSON', 29: 'I-OTHER_PERSON'}


In [6]:
named_entities = ["COURT", "PETITIONER", "RESPONDENT", "JUDGE", "LAWYER", "DATE", "ORG", "GPE", "STATUTE", "PROVISION", "PRECEDENT", "CASE_NUMBER", "WITNESS", "OTHER_PERSON"]
named_entities_to_list = {"".join(key): idx for idx, key in enumerate(named_entities)}
for key, value in named_entities_to_list.items():
    named_entities_to_list[key] = named_entities_to_list[key] + 1
named_entities_to_list  

{'COURT': 1,
 'PETITIONER': 2,
 'RESPONDENT': 3,
 'JUDGE': 4,
 'LAWYER': 5,
 'DATE': 6,
 'ORG': 7,
 'GPE': 8,
 'STATUTE': 9,
 'PROVISION': 10,
 'PRECEDENT': 11,
 'CASE_NUMBER': 12,
 'WITNESS': 13,
 'OTHER_PERSON': 14}

In [7]:
def get_start_and_end_and_labels(tree):
    start_and_end_and_labels = []
    for label in tree["annotations"][0]["result"]:
        labels = label["value"]["labels"][0]
        start = label["value"]["start"]
        end = label["value"]["end"]
        start_and_end_and_labels.append([labels, start, end])
    return start_and_end_and_labels

In [8]:
def slice_id_to_token_id(text, start, end):
    token_number_start = 0
    token_number_end = 1
    for char in text[:start]:
        if char == " ":        
            token_number_start += 1
    for char in text[:end]:
        if char == " ":
            token_number_end += 1
    return token_number_start, token_number_end

In [9]:
import re
def text_to_text_plus_label(text, labels):
    text = re.sub(r"\n", " ", text)
    token_numbers_of_labels = []
    for label in labels:
        start, end = slice_id_to_token_id(text, label[1], label[2])
        token_numbers_of_labels.append([label[0], start, end])
    tokens = text.split(" ")
    number_of_tokens = len(tokens)
    
    return_list = []
    token_labels = [ i - i for i in range(number_of_tokens)]
    for label in token_numbers_of_labels:
        # check the label
        # print(tokens[label[1]:label[2]])
        
        token_labels[label[1]] = named_entities_to_list[label[0]] * 2 - 1
        for i in range(label[1] + 1, label[2]):
            token_labels[i] = named_entities_to_list[label[0]]  * 2
    
    for i in range (len(token_labels)):
        return_list.append([tokens[i], token_labels[i]])
    return return_list

In [10]:
try_text = json_object_train[0]["data"]["text"]

In [11]:
try_label = get_start_and_end_and_labels(json_object_train[0])

In [12]:
try_convert = text_to_text_plus_label(try_text, try_label)
print(try_convert)

[['', 0], ['', 0], ['(7)', 0], ['On', 0], ['specific', 0], ['query', 0], ['by', 0], ['the', 0], ['Bench', 0], ['about', 0], ['an', 0], ['entry', 0], ['of', 0], ['Rs.', 0], ['1,31,37,500', 0], ['on', 0], ['deposit', 0], ['side', 0], ['of', 0], ['Hongkong', 13], ['Bank', 14], ['account', 0], ['of', 0], ['which', 0], ['a', 0], ['photo', 0], ['copy', 0], ['is', 0], ['appearing', 0], ['at', 0], ['p.', 0], ['40', 0], ['of', 0], ["assessee's", 0], ['paper', 0], ['book,', 0], ['learned', 0], ['authorised', 0], ['representative', 0], ['submitted', 0], ['that', 0], ['it', 0], ['was', 0], ['related', 0], ['to', 0], ['loan', 0], ['from', 0], ['broker,', 0], ['Rahul', 13], ['&', 14], ['Co.', 14], ['on', 0], ['the', 0], ['basis', 0], ['of', 0], ['his', 0], ['submission', 0], ['a', 0], ['necessary', 0], ['mark', 0], ['is', 0], ['put', 0], ['by', 0], ['us', 0], ['on', 0], ['that', 0], ['photo', 0], ['copy.', 0]]


In [13]:
df = pd.DataFrame(try_convert)

In [14]:
df.groupby(1).size()

1
0     63
13     2
14     3
dtype: int64

In [15]:
token_and_labels_train = []
for tree in json_object_train:
    labels = get_start_and_end_and_labels(tree)
    text = tree["data"]["text"]
    token_and_labels_train += text_to_text_plus_label(text, labels) 

In [16]:
token_and_labels_dev = []
for tree in json_object_dev:
    labels = get_start_and_end_and_labels(tree)
    text = tree["data"]["text"]
    token_and_labels_dev += text_to_text_plus_label(text, labels) 

In [17]:
df_train = pd.DataFrame(token_and_labels_train)
df_dev = pd.DataFrame(token_and_labels_dev)
df_train

Unnamed: 0,0,1
0,,0
1,,0
2,(7),0
3,On,0
4,specific,0
...,...,...
320342,with,0
320343,accused,0
320344,No.1,0
320345,as,0


In [18]:
df_train.groupby(1).size()

1
0     268939
1       1293
2       2534
3        464
4        386
5        324
6        456
7        567
8        391
11      1883
12      1383
13      1441
14      2648
15      1395
16       278
17      1803
18      3301
19      2384
20      3908
21      1351
22     12190
23      1039
24      3637
25       881
26       756
27      2653
28      2062
dtype: int64

In [19]:
X = df_train.drop(1, axis=1)[:20000]
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df_train[1][:20000]

classes = np.unique(y)
classes = classes.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train.shape, y_train.shape

((13400, 5835), (13400,))

In [20]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 8.43, NNZs: 60, Bias: -1.000000, T: 13400, Avg. loss: 0.007836
Total training time: 0.12 seconds.
-- Epoch 1
Norm: 7.07, NNZs: 24, Bias: -2.000000, T: 13400, Avg. loss: 0.002015
Total training time: 0.12 seconds.
Norm: 6.86, NNZs: 47, Bias: -1.000000, T: 13400, Avg. loss: 0.001716
Total training time: 0.12 seconds.
-- Epoch 1
-- Epoch 1
Norm: 57.63, NNZs: 2479, Bias: 1.000000, T: 13400, Avg. loss: 0.126716
Total training time: 0.16 seconds.
-- Epoch 1
Norm: 5.10, NNZs: 26, Bias: -2.000000, T: 13400, Avg. loss: 0.002313
Total training time: 0.12 seconds.
-- Epoch 1
Norm: 6.78, NNZs: 40, Bias: -2.000000, T: 13400, Avg. loss: 0.002612
Total training time: 0.14 seconds.
-- Epoch 1
Norm: 4.58, NNZs: 21, Bias: -1.000000, T: 13400, Avg. loss: 0.001194
Total training time: 0.17 seconds.
-- Epoch 1
Norm: 6.08, NNZs: 34, Bias: -1.000000, T: 13400, Avg. loss: 0.001493
Total training time: 0.18 seconds.
-- Epoch 1
Norm: 4.58, NNZs: 21, Bias: -1.000

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.4s


Norm: 10.44, NNZs: 80, Bias: -1.000000, T: 13400, Avg. loss: 0.004030
Total training time: 0.19 seconds.
-- Epoch 1
Norm: 14.59, NNZs: 192, Bias: -1.000000, T: 13400, Avg. loss: 0.012836
Total training time: 0.13 seconds.
-- Epoch 1
Norm: 11.22, NNZs: 117, Bias: -2.000000, T: 13400, Avg. loss: 0.006642
Total training time: 0.17 seconds.
-- Epoch 1
Norm: 11.18, NNZs: 108, Bias: -1.000000, T: 13400, Avg. loss: 0.005075
Total training time: 0.13 seconds.
-- Epoch 1
Norm: 5.00, NNZs: 25, Bias: -1.000000, T: 13400, Avg. loss: 0.001791
Total training time: 0.13 seconds.
-- Epoch 1
Norm: 10.63, NNZs: 89, Bias: -1.000000, T: 13400, Avg. loss: 0.004925
Total training time: 0.12 seconds.
-- Epoch 1
Norm: 10.05, NNZs: 63, Bias: -1.000000, T: 13400, Avg. loss: 0.002985
Total training time: 0.12 seconds.
-- Epoch 1
Norm: 13.71, NNZs: 134, Bias: -2.000000, T: 13400, Avg. loss: 0.010299
Total training time: 0.13 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s


Norm: 8.89, NNZs: 79, Bias: -1.000000, T: 13400, Avg. loss: 0.004104
Total training time: 0.12 seconds.
-- Epoch 1
Norm: 15.46, NNZs: 207, Bias: -1.000000, T: 13400, Avg. loss: 0.012164
Total training time: 0.14 seconds.
-- Epoch 1
Norm: 24.62, NNZs: 504, Bias: -2.000000, T: 13400, Avg. loss: 0.036493
Total training time: 0.15 seconds.
-- Epoch 1
Norm: 7.81, NNZs: 55, Bias: -1.000000, T: 13400, Avg. loss: 0.003134
Total training time: 0.17 seconds.
-- Epoch 1
Norm: 13.56, NNZs: 167, Bias: -2.000000, T: 13400, Avg. loss: 0.013582
Total training time: 0.15 seconds.
-- Epoch 1
Norm: 9.00, NNZs: 81, Bias: -1.000000, T: 13400, Avg. loss: 0.003284
Total training time: 0.18 seconds.
Norm: 8.43, NNZs: 65, Bias: -1.000000, T: 13400, Avg. loss: 0.003955
Total training time: 0.14 seconds.
Norm: 15.65, NNZs: 221, Bias: -1.000000, T: 13400, Avg. loss: 0.008657
Total training time: 0.14 seconds.
Norm: 12.85, NNZs: 141, Bias: -1.000000, T: 13400, Avg. loss: 0.007164
Total training time: 0.12 seconds.

[Parallel(n_jobs=-1)]: Done  23 out of  27 | elapsed:    0.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.0s finished


In [21]:
new_classes = []
for text in mapping.values():
    new_classes.append(text)
new_classes = new_classes[1:]
len(new_classes)

29

In [22]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      5501
           1       0.83      0.58      0.68        26
           2       0.00      0.00      0.00        44
           3       0.00      0.00      0.00        11
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00        15
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         6
          11       0.50      0.05      0.09        41
          12       0.17      0.03      0.05        33
          13       0.50      0.03      0.05        35
          14       0.45      0.11      0.18        87
          15       0.53      0.23      0.32        35
          16       0.00      0.00      0.00         8
          17       0.75      0.20      0.31        46
          18       0.75      0.30      0.43        69
          19       0.82    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
