In [1]:
import os
import pickle
import random
import re

import numpy as np
import pandas as pd
import spacy
from pandarallel import pandarallel
from spacy.matcher import Matcher

# Initialise pandarallel with progress bars enabled; the log printed below
# reports how many workers it will use.
# NOTE(review): no parallel_apply call is visible in this part of the
# notebook — confirm this initialisation is still needed.
pandarallel.initialize(progress_bar=True)
# Load the "en_core_web_sm" spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the raw case-tagging table. Later cells rely on its "civil",
# "criminal", "not-defined", "file-name" and "text" columns.
df_all = pd.read_csv(r"../data/cases-tagging-raw.csv")

In [3]:
# Civil cases: keep the rows whose "civil" flag is set, drop the three
# category-flag columns and renumber the remaining rows from 0.
df_civil = df_all[df_all["civil"]].drop(
    columns=["civil", "criminal", "not-defined"]
).reset_index(drop=True)

In [4]:
# Criminal cases: keep the rows whose "criminal" flag is set, drop the three
# category-flag columns and renumber the remaining rows from 0.
df_criminal = df_all[df_all["criminal"]].drop(
    columns=["civil", "criminal", "not-defined"]
).reset_index(drop=True)

In [5]:
# Display the civil subset (the rendering below is truncated to the first and
# last rows).
df_civil

Unnamed: 0,file-name,text,tag
0,35177_2015_Judgement_08-Dec-2017.txt,...,
1,40216.txt,...,
2,30829.txt,http://JUDIS.NIC.IN SUPREME COU...,
3,33421.txt,...,
4,43039.txt,...,
...,...,...,...
11295,37715.txt,1\n\n ...,
11296,31352.txt,http://JUDIS.NIC.IN SUPREME COU...,
11297,35009.txt,1\n\n ...,
11298,33757.txt,1\n\n ...,


In [6]:
# Display the criminal subset (the rendering below is truncated to the first
# and last rows).
df_criminal

Unnamed: 0,file-name,text,tag
0,37482.txt,CRL.A. No..2325of 2009 @ SLP(CRL.) No. 7359 of...,
1,39583.txt,...,
2,37865.txt,...,
3,30932.txt,http://JUDIS.NIC.IN SUPREME COU...,
4,43935.txt,1\n\n ...,
...,...,...,...
4521,34974.txt,...,
4522,40710.txt,1\n\n ...,
4523,29948.txt,http://JUDIS.NIC.IN SUPREME ...,
4524,41916.txt,REPORTAB...,


In [7]:
# Inspect the raw text of the first criminal case (column position 1 is "text").
df_criminal.iloc[0, 1]

"CRL.A. No..2325of 2009 @ SLP(CRL.) No. 7359 of 2009\n                                                          1\n\n\n\n\n                                   IN THE SUPREME COURT OF INDIA\n\n                                  CRIMINAL APPELLATE JURISDICTION\n\n                             CRIMINAL APPEAL NO. 2325 OF 2009\n                       [arising out of SLP(CRL.) No. 7359 of 2009]\n\n\n\nSAIYED SALIM SAIYED HUSSAIN SAIYED                                .....             APPELLANT\n\n\n\n                                                       VERSUS\n\nSTATE OF GUJARAT                                                  .....             RESPONDENT\n\n\n\n\n                                                      O R D E R\n\n\n        1.         Application             seeking        exemption       from    filing\n\n        official translation is allowed.\n\n        2.         Leave granted.\n\n        3.         The      appellant          stands     convicted       for    offences\n

In [20]:
# The same document with every whitespace run collapsed to a single space.
# NOTE(review): the saved output below (1786) is an integer, which this
# expression cannot produce — presumably the cell was run as len(...) and
# edited afterwards; re-run to refresh the output.
re.sub(r"\s+", " ", df_criminal.iloc[0, 1]).strip()

1786

In [8]:
# "lth" = number of characters in each case text once whitespace runs are
# collapsed to single spaces and the ends are trimmed.
df_criminal["lth"] = [
    len(re.sub(r"\s+", " ", raw_text).strip())
    for raw_text in df_criminal["text"]
]

In [9]:
# Shape of the criminal cases whose normalised text is at most 5000 characters.
df_criminal[df_criminal["lth"] <= 5000].shape

(923, 4)

In [10]:
# Short-case subset: criminal cases whose normalised text is at most 5000
# characters, renumbered from 0.
df_criminal_five_k = (
    df_criminal.loc[df_criminal["lth"].le(5000)]
    .reset_index(drop=True)
)

In [11]:
# Peek at rows 71-79 of the short-case subset.
df_criminal_five_k.iloc[71:80]

Unnamed: 0,file-name,text,tag,lth
71,37121.txt,IN THE SUPREME COUR...,,1098
72,37124.txt,IN THE SUPREME COUR...,,1310
73,34967.txt,IN THE SUPREME COURT...,,3107
74,38107.txt,IN THE SUPREME COURT OF IND...,,1052
75,37155.txt,...,,4966
76,32569.txt,IN THE SUPREME COURT OF ...,,2709
77,44556.txt,1\n\n ...,,1319
78,34843.txt,IN THE SUPREME COURT OF INDIA\...,,4713
79,4288_2018_Judgement_16-Nov-2018.txt,...,,2357


In [12]:
# Whitespace-normalised text of row 76 of the short-case subset.
re.sub(r"\s+", " ", df_criminal_five_k.iloc[76, 1]).strip()

'IN THE SUPREME COURT OF INDIA CRIMINAL APPELLATE JURISDICTION CRIMINAL APPEAL NO. 1497 OF 2008 [Arising out of SLP(Crl) 4802 of 2007] PRITAM SINGH SIDHU .......APPELLANT(S) Versus STATE OF PUNJAB & ANR. .....RESPONDENT(S) ORDER Leave granted. Heard learned counsel for the parties. 2. The second respondent is the wife of one Gurjant Singh. The appellant is the brother-in-law of the said Gurjant Singh. The second respondent filed a complaint under Section 406 and 498A of IPC in the Court of Sub Divisional Judicial Magistrate, Abohar against her husband (A1), father-in-law (A2), mother-in-law (A3), sister-in-law (A4) and the husband of the sister-in-law (A5) who is the appellant herein. The only reference to accused No.5 (appellant) in the said complaint reads thus: “One T.V., one fridge, one washing machine were handed over to the accused No.5 who is the brother-in-law of the complainant as a trust property.” In the pre-summons statement recorded by the learned Magistrate, there is no r

In [22]:
# Index label(s) of the row(s) in the short-case subset whose file name is 38547.txt.
df_criminal_five_k[df_criminal_five_k['file-name']=='38547.txt'].index

Int64Index([578], dtype='int64')

In [34]:
# Collects the file names chosen by upload_files(); re-running this cell
# empties it.
add_names=[]
# requests is used by upload_files() for the HTTP upload.
import requests

In [37]:
def upload_files(n_files=50, start=81, block_size=10, seed=None):
    """Write a random sample of short criminal cases to disk and upload them to tagtog.

    One row of ``df_criminal_five_k`` is drawn from each of ``n_files``
    consecutive blocks of ``block_size`` rows, beginning at row position
    ``start`` (with the defaults: rows 81-90, 91-100, ..., 571-580). For every
    drawn row the whitespace-collapsed text is written, wrapped in double
    quotes, to ``temp-files/<file-name>`` and then posted to the ``sb`` folder
    of the ``cases-annotation`` tagtog project. Each chosen file name is
    appended to the module-level ``add_names`` list and each response body is
    printed.

    The tagtog login is read from the ``TAGTOG_USERNAME`` (default
    ``"dev-sb"``) and ``TAGTOG_PASSWORD`` environment variables so that no
    secret lives in the notebook.

    Args:
        n_files: Number of documents to sample and upload.
        start: Row position of the first sampling block.
        block_size: Number of rows in each sampling block.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the global ``random`` state is used.

    Raises:
        RuntimeError: If ``TAGTOG_PASSWORD`` is not set.
        IndexError: If a drawn position lies beyond the last row of
            ``df_criminal_five_k``.
    """
    password = os.environ.get("TAGTOG_PASSWORD")
    if not password:
        raise RuntimeError(
            "Set the TAGTOG_PASSWORD environment variable before calling upload_files()."
        )
    # NOTE(review): an earlier revision embedded the account password in this
    # function; treat that password as leaked and rotate it.
    auth = requests.auth.HTTPBasicAuth(
        username=os.environ.get("TAGTOG_USERNAME", "dev-sb"),
        password=password,
    )

    # Request settings are identical for every document, so build them once.
    tagtog_api_url = "https://www.tagtog.net/-api/documents/v1"
    params = {'project':'cases-annotation', 'owner': 'dev-sb', 'output':'ann.json','folder':'sb'}

    out_dir = 'temp-files'
    os.makedirs(out_dir, exist_ok=True)

    # Fall back to the module-level generator when no seed is requested.
    rng = random if seed is None else random.Random(seed)

    for i in range(n_files):
        row = rng.randint(0, block_size - 1) + start + (i * block_size)
        file_name = df_criminal_five_k.iloc[row, 0]
        add_names.append(file_name)

        path = os.path.join(out_dir, file_name)
        # Explicit UTF-8: the judgments contain non-ASCII characters such as
        # typographic quotes.
        with open(path, 'w', encoding='utf-8') as f:
            f.write('"')
            f.write(re.sub(r"\s+", " ", df_criminal_five_k.iloc[row, 1]).strip())
            f.write('"')

        # Binary mode plus a context manager: the payload bytes are exactly
        # what was written above, and the handle is closed after the request.
        with open(path, 'rb') as payload:
            response = requests.post(
                tagtog_api_url,
                params=params,
                auth=auth,
                files=[(file_name, payload)],
            )
        print(response.text)
        if not response.ok:
            # Surface failures without aborting the remaining uploads.
            print(f"Upload of {file_name} failed with HTTP {response.status_code}")

In [36]:
# Run the upload; one response body is printed per file.
upload_files()

{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1p1"]},"anncomplete":false,"sources":[],"metas":{},"entities":[],"relations":[]}
{"annotatable":{"parts":["s1

In [16]:
# File names recorded by upload_files().
# NOTE(review): this cell's execution count (16) is lower than the upload
# cell's (36), so the listing below may come from an earlier run — re-run to
# confirm.
add_names

['37525.txt',
 '34940.txt',
 '32973.txt',
 '36737.txt',
 '37469.txt',
 '40596.txt',
 '43496.txt',
 '11363_2017_Judgement_10-Oct-2018.txt',
 '38302.txt',
 '36797.txt',
 '31636.txt',
 '38100.txt',
 '34569.txt',
 '38860.txt',
 '33044.txt',
 '33628.txt',
 '34744.txt',
 '38830.txt',
 '19878_2017_Judgement_23-Jul-2018.txt',
 '36793.txt',
 '34949.txt',
 '40313.txt',
 '38574.txt',
 '36805.txt',
 '36660.txt',
 '38621.txt',
 '21569_2019_4_24_14619_Judgement_21-Jun-2019.txt',
 '39015.txt',
 '38113.txt',
 '41158.txt',
 '35785.txt',
 '40004.txt',
 '39749.txt',
 '37329.txt',
 '40060.txt',
 '33746.txt',
 '34082.txt',
 '37545.txt',
 '34287.txt',
 '17606_2018_Judgement_19-Nov-2018.txt',
 '37901.txt',
 '31688.txt',
 '37134.txt',
 '30537.txt',
 '34343.txt',
 '34881.txt',
 '38581.txt',
 '37708.txt',
 '30572.txt',
 '38547.txt']