In [43]:
import pandas as pd
import re
import spacy
from pandarallel import pandarallel
import numpy as np
import pickle
import json
import glob

# Initialise pandarallel (with progress bars enabled) and load spaCy's
# "en_core_web_sm" pipeline into `nlp`.
# NOTE(review): neither pandarallel's parallel_apply nor `nlp` is referenced in
# the cells visible here — confirm both are still needed.
pandarallel.initialize(progress_bar=True)
nlp = spacy.load("en_core_web_sm")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [44]:
# Load the pickled object stored at ./get-case-type/cases-criminal; later cells
# index it like a DataFrame with "file-name" and "text" columns.
# NOTE: pickle.load can execute arbitrary code while unpickling — only load
# files that come from a trusted source.
with open("./get-case-type/cases-criminal", "rb") as pic_r:
    df_criminal = pickle.load(pic_r)

In [45]:
# File names of the cases to keep; used below to filter df_criminal on its
# "file-name" column.
# NOTE(review): presumably these are the manually annotated cases — confirm.
cases_title = [
    "37482.txt",
    "39583.txt",
    "32672.txt",
    "43181.txt",
    "36965.txt",
    "38633.txt",
    "40816.txt",
    "33555.txt",
    "33314.txt",
    "38645.txt",
    "32569.txt",
    "37525.txt",
    "34940.txt",
    "32973.txt",
    "36737.txt",
    "37469.txt",
    "40596.txt",
    "43496.txt",
    "11363_2017_Judgement_10-Oct-2018.txt",
    "38302.txt",
    "36797.txt",
    "31636.txt",
    "38100.txt",
    "34569.txt",
    "38860.txt",
    "33044.txt",
    "33628.txt",
    "34744.txt",
    "38830.txt",
    "19878_2017_Judgement_23-Jul-2018.txt",
    "36793.txt",
    "34949.txt",
    "40313.txt",
    "38574.txt",
    "36805.txt",
    "36660.txt",
    "38621.txt",
    "21569_2019_4_24_14619_Judgement_21-Jun-2019.txt",
    "39015.txt",
    "38113.txt",
    "41158.txt",
    "35785.txt",
    "40004.txt",
    "39749.txt",
    "37329.txt",
    "40060.txt",
    "33746.txt",
    "34082.txt",
    "37545.txt",
    "34287.txt",
    "17606_2018_Judgement_19-Nov-2018.txt",
    "37901.txt",
    "31688.txt",
    "37134.txt",
    "30537.txt",
    "34343.txt",
    "34881.txt",
    "38581.txt",
    "37708.txt",
    "30572.txt",
    "38547.txt",
]

In [46]:
# Keep only the rows whose "file-name" is listed in cases_title, renumber the
# rows from 0, and collapse every whitespace run in "text" to a single space
# (with leading/trailing whitespace trimmed).
df_tagged_man = (
    df_criminal.loc[df_criminal["file-name"].isin(cases_title)]
    .reset_index(drop=True)
    .assign(
        text=lambda frame: frame["text"].map(
            lambda raw: re.sub(r"\s+", " ", raw).strip()
        )
    )
)

In [47]:
# Maps annotation class ids to readable label names. get_tags looks up each
# entity's "classId" here; the "m_…" keys share the id scheme of the "metas"
# section (change_json_names reads "m_20" from it).
entity_types = {
    "e_16": "LAW",
    "e_3": "PETITIONER_OR_APPELLANT",
    "e_27": "VICTIM_ALLIES",
    "e_2": "JUDGE",
    "e_30": "WEAPONS",
    "e_12": "COURT",
    "e_23": "VICTIM",
    "e_31": "VEHICLE",
    "e_18": "LOC_OF_JUDGMENT",
    "e_29": "WITNESS",
    "e_7": "ADVOCATE",
    "m_14": "CRIMINAL",
    "e_5": "DATE_OF_JUDGMENT",
    "e_22": "PETITION-DETAILS",
    "m_15": "CIVIL",
    "m_20": "FILE-NAME",
    "e_26": "LOC_CRIME",
    "e_6": "CASE_NO",
    "e_21": "POLICE",
    "e_28": "POSTMORTEM_DOC",
    "e_17": "JUDGMENT",
    "e_9": "CITATION",
    "e_4": "RESPONDENT",
    "e_25": "DATE_AND_TIME_CRIME",
    "e_24": "ACCUSED",
    "e_13": "CRIME",
}

### Download tags

In [2]:
!python download-tags.py download 'folder:sb' -u 'dev-sb' -w 'De4Pp@6QBjKMUGe' -p 'cases-annotation' -o 'dev-sb' --output_folder './tags-download-tsv/' -t 'entitiestsv'

downloaded: ./tags-download-tsv/a78oVORrJ1QqUulbmR7iYVTPwAVq-38581.tsv
downloaded: ./tags-download-tsv/aUWC86pQeT3vLRge5KmONgWvKIJa-34343.tsv
downloaded: ./tags-download-tsv/aVWIZ0f3LE8Mq1cxVOnye2e1PeS4-30572.tsv
downloaded: ./tags-download-tsv/aTXYGtrzAAlHHVDWfHTI2KBJWEPC-38547.tsv
downloaded: ./tags-download-tsv/aD8Ic6BpHLRpuglGgDKg4cU1hI8a-34881.tsv
downloaded: ./tags-download-tsv/aP6eQ.DItz.8x85Rrfh.lcnxAymy-37708.tsv
downloaded: ./tags-download-tsv/ay7ag15iBIxUv9AqzJpQ8RIlgV9S-31688.tsv
downloaded: ./tags-download-tsv/aiXoAZs5XsKzhwt6NY5L4MfCLaj8-37134.tsv
downloaded: ./tags-download-tsv/azJYqd9_NA5iW1aaUPma2N4szjzS-37901.tsv
downloaded: ./tags-download-tsv/ajA3a1C9hFaueFKmdsyJBs.fxGDe-17606_2018_Judgement_19_Nov_2018.tsv
downloaded: ./tags-download-tsv/a57jfAG1bL.DotgB5GXTDJCtwxwa-30537.tsv
downloaded: ./tags-download-tsv/axNfmqEbge8QY9LlbzMo3NNF14xO-34287.tsv
downloaded: ./tags-download-tsv/aD5R7Yu9jcjTuX7a4t2av4_7l9bW-37545.tsv
downloaded: ./tags-download-tsv/a2kv6LHmrsad9wyihx

In [28]:
def get_tags(tags):
    """Flatten an annotation document into ``(start, end, label)`` spans.

    For each item in ``tags["entities"]`` the item's ``classId`` is translated
    through the module-level ``entity_types`` mapping, and one tuple is
    produced per entry of the item's ``offsets`` list:

    * ``start`` is the offset's ``start`` minus one,
    * ``end`` is ``start`` plus the length of the offset's ``text``,
    * ``label`` is the translated class name.

    Returns the tuples as a list, in document order.
    Raises ``KeyError`` when a ``classId`` is not present in ``entity_types``.
    """
    spans = []
    for entity in tags["entities"]:
        # Resolve the label once per entity, before its offsets are touched.
        label = entity_types[entity["classId"]]
        spans.extend(
            (
                offset["start"] - 1,
                offset["start"] - 1 + len(offset["text"]),
                label,
            )
            for offset in entity["offsets"]
        )
    return spans

In [29]:
def change_json_names(
    source_dir="./tags-download", output_dir="./tags-download-processed"
):
    """Convert downloaded annotation files into per-case span files.

    Every entry in ``source_dir`` is parsed as JSON. The output file is named
    after the part of ``metas.m_20.value`` that precedes its first ``"."``
    (the same rule ``update_tags`` uses to find the file again) and contains
    ``{"entities": get_tags(<parsed document>)}``.

    Parameters
    ----------
    source_dir : str
        Directory holding the downloaded annotation JSON files.
    output_dir : str
        Directory that receives the converted files; created when missing.

    Raises
    ------
    KeyError
        If a document lacks ``metas.m_20.value`` (or ``get_tags`` meets an
        unknown ``classId``).
    json.JSONDecodeError
        If an entry in ``source_dir`` is not valid JSON.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    # Writing used to fail with FileNotFoundError on a fresh checkout where the
    # output directory did not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that the processing order (and therefore which file wins if two
    # documents share a name) does not depend on directory listing order.
    for name in sorted(glob.glob(os.path.join(source_dir, "*"))):
        with open(name, encoding="utf-8") as f:
            document = json.load(f)
        stem = document["metas"]["m_20"]["value"].split(".")[0]
        file_name = os.path.join(output_dir, stem + ".json")
        with open(file_name, "w") as f:
            json.dump({"entities": get_tags(document)}, f)

In [30]:
# Convert every file under ./tags-download/ into ./tags-download-processed/.
change_json_names()

In [31]:
def update_tags(row):
    """Load the processed tag file that belongs to one row.

    The file read is ``./tags-download-processed/<stem>.json``, where
    ``<stem>`` is everything in ``row["file-name"]`` before its first ``"."``.

    Returns the parsed JSON content of that file.
    """
    stem = row["file-name"].split(".")[0]
    tag_path = "./tags-download-processed/" + stem + ".json"
    with open(tag_path) as handle:
        return json.load(handle)

In [32]:
# Row by row, load the processed tag file matching "file-name" and store the
# parsed result in a "tag" column.
df_tagged_man["tag"] = df_tagged_man.apply(update_tags, axis=1)

In [33]:
# Spot check: first row, column position 1 (the output shown is the
# whitespace-normalised case text).
df_tagged_man.iloc[0, 1]

"CRL.A. No..2325of 2009 @ SLP(CRL.) No. 7359 of 2009 1 IN THE SUPREME COURT OF INDIA CRIMINAL APPELLATE JURISDICTION CRIMINAL APPEAL NO. 2325 OF 2009 [arising out of SLP(CRL.) No. 7359 of 2009] SAIYED SALIM SAIYED HUSSAIN SAIYED ..... APPELLANT VERSUS STATE OF GUJARAT ..... RESPONDENT O R D E R 1. Application seeking exemption from filing official translation is allowed. 2. Leave granted. 3. The appellant stands convicted for offences punishable under Section 8C read with Sections 21 and 29 of the Narcotic Drugs and Psychotropic Substances Act, 1985, for being in possession of 252 grams of charas. The trial court on a consideration of the evidence convicted and sentenced the appellant to undergo 10 years rigorous imprisonment and Rs. 1 lakh as fine and in default of payment of fine to undergo a CRL.A. No..2325of 2009 @ SLP(CRL.) No. 7359 of 2009 2 further period of one year's imprisonment. The High Court has confirmed the conviction and sentence. 4. In this matter, we notice that the a

In [39]:
# Spot check: first row, column position 2 (the output shown is the loaded
# {"entities": [[start, end, label], ...]} structure).
df_tagged_man.iloc[0, 2]

{'entities': [[0, 51, 'CASE_NO'],
  [57, 83, 'COURT'],
  [116, 192, 'CASE_NO'],
  [193, 227, 'PETITIONER_OR_APPELLANT'],
  [251, 267, 'RESPONDENT'],
  [455, 555, 'LAW'],
  [569, 602, 'CRIME'],
  [704, 757, 'JUDGMENT'],
  [805, 856, 'CASE_NO'],
  [902, 916, 'COURT'],
  [1134, 1151, 'COURT'],
  [1264, 1280, 'RESPONDENT'],
  [1650, 1667, 'JUDGE'],
  [1690, 1702, 'JUDGE'],
  [1704, 1713, 'LOC_OF_JUDGMENT'],
  [1714, 1731, 'DATE_OF_JUDGMENT'],
  [1733, 1784, 'CASE_NO']]}

In [13]:
# Persist the tagged frame as a pickle.
# NOTE(review): the target path contains ".ipynb_checkpoints" in the middle of
# what looks like a directory name ("proc…ssed") — verify this is the intended
# location before relying on the file.
with open(
    "../data/proc.ipynb_checkpoints/ssed/cases-tagged-man", "wb"
) as pic_f:
    pickle.dump(df_tagged_man, pic_f)