### Imports

In [1]:
import pandas as pd
import re
import spacy
from pandarallel import pandarallel
from spacy.matcher import Matcher

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
df_all = pd.read_csv(r"../data/cases-tagging-raw.csv")

In [3]:
df = (
    df_all[df_all["civil"]]
    .drop(["civil", "criminal", "not-defined"], axis=1)
    .reset_index(drop=True)
)

In [4]:
df = df[df.iloc[:, 1].str.contains("PETITIONER:")].reset_index(drop=True)

In [17]:
df = df[df.iloc[:, 1].str.contains("CASE NO.:")].reset_index(drop=True)

In [5]:
nlp = spacy.load("en_core_web_sm")

In [5]:
matcher = Matcher(nlp.vocab)

In [None]:
pattern = []
matcher.add("petitioner_pattern", None, pattern)

### main-spacy

In [9]:
def clean_text(row):
    text = row["text"]
    #     text = re.sub(r"\.", "", row["text"])
    return re.sub(r"\s+", " ", text).strip()

In [19]:
t=df.parallel_apply(clean_text,axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=177), Label(value='0 / 177'))), HB…

In [21]:
t.iloc[0]

'http://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 1 of 2 CASE NO.: Appeal (civil) 1815 of 2008 PETITIONER: M/S TEXMACO LTD. CEMENT DIVISION RESPONDENT: A.S. NARASIMHAM DATE OF JUDGMENT: 07/03/2008 BENCH: ALTAMAS KABIR & J.M. PANCHAL JUDGMENT: JUDGMENT O R D E R CIVIL APPEAL NO. 1815 OF 2008 [Arising out of SLP(C)No. 24836 of 2005] Leave granted. We have heard learned counsel for the parties. It appears that the respondent herein was appointed to look after sales promotion work by the appellant herein with effect from 19th May, 1986. His services were thereafter terminated on 21st September, 1993, but on a representation made by him, he was re-appointed by the appellant and was put on probation for six months. Thereafter, on 20th October, 1994 his probation was extended. Subsequently, by letter dated 20th October, 1994, he was informed that his period of probation would be over on 16th November, 1994, and that thereafter, he would be relieved from service after working hours on the same 

In [22]:
doc = nlp(t.iloc[10])

In [23]:
spacy.displacy.render(doc, style="ent")

In [24]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text, token.tag_)

http://JUDIS.NIC.IN PUNCT compound COURT ``
SUPREME ADJ compound COURT JJ
COURT PROPN ROOT COURT NNP
OF ADP prep COURT IN
INDIA PROPN pobj OF NNP
Page PROPN ROOT Page NNP
1 NUM nummod Page CD
of ADP prep Page IN
4 NUM nummod CASE CD
CASE NOUN pobj of NN
NO INTJ nummod Page UH
. PUNCT punct Page .
: PUNCT punct PETITIONER :
Appeal NOUN nsubj PETITIONER NN
( PUNCT punct civil -LRB-
civil ADJ appos Appeal JJ
) PUNCT punct civil -RRB-
5373 NUM appos Appeal CD
of ADP prep 5373 IN
2007 NUM pobj of CD
PETITIONER PROPN ROOT PETITIONER NNP
: PUNCT punct PETITIONER :
Delhi PROPN compound Authority NNP
Development PROPN compound Authority NNP
Authority PROPN compound RESPONDENT NNP
RESPONDENT NOUN ROOT RESPONDENT NN
: PUNCT punct RESPONDENT :
Arjun PROPN compound Satija NNP
Lal PROPN compound Satija NNP
Satija PROPN ROOT Satija NNP
and CCONJ cc Satija CC
Ors PROPN compound DATE NNP
DATE PROPN conj Satija NNP
OF ADP prep DATE IN
JUDGMENT PROPN pobj OF NNP
: PUNCT punct Satija :
23/11/2007 NUM appo

In [26]:
for ent in doc.ents:
    print(ent.text, ent.label_,ent)

http://JUDIS.NIC.IN SUPREME COURT OF INDIA ORG http://JUDIS.NIC.IN SUPREME COURT OF INDIA
1 CARDINAL 1
4 CARDINAL 4
5373 CARDINAL 5373
2007 DATE 2007
Delhi Development Authority ORG Delhi Development Authority
Arjun Lal Satija PERSON Arjun Lal Satija
23/11/2007 TIME 23/11/2007
ARIJIT PASAYAT & LOKESHWAR SINGH PANTA & P. SATHASIVAM ORG ARIJIT PASAYAT & LOKESHWAR SINGH PANTA & P. SATHASIVAM
U D G PERSON U D G
2007 DATE 2007
4024 CARDINAL 4024
ARIJIT PASAYAT PERSON ARIJIT PASAYAT
J. 1 PERSON J. 1
2 CARDINAL 2
the Delhi High Court ORG the Delhi High Court
no.1 3 DATE no.1 3
The Mianwali District Cooperative House Building Society Ltd. ORG The Mianwali District Cooperative House Building Society Ltd.
Flat No ORG Flat No
New Qutab Road FAC New Qutab Road
Delhi GPE Delhi
Society ORG Society
Society ORG Society
No.1 CARDINAL No.1
22.11.1974 WORK_OF_ART 22.11.1974
No.1 CARDINAL No.1
The Managing Committee ORG The Managing Committee
Shri K.K. Satija ORG Shri K.K. Satija
No.1 CARDINAL No.1
No.1 C

### Main

In [27]:
appellant_regex = [
    r"\B([\.\w\s&]*)\s+([…\s\.]*APPELLANT)",
    r"petitioner:\v*([\W\S]*)respondent:",
]

In [28]:
respondent_regex = [
    r"(versus|v\.s\.|vs\.|v\.s|v\/s)([\/\.\w\s&\\]*)\s{2,3}([…\s\.]*respondent)",
    r"respondent:\v*(.*)date of judgment:",
]

In [29]:
date_of_judgement_regex = [r"date of judgment:\v*(.*)bench:"]

In [30]:
bench_regex = [r"bench:\v*(.*)judgment"]

In [31]:
not_found_files = []

In [32]:
df.head()

Unnamed: 0,file-name,text,tag
0,35177_2015_Judgement_08-Dec-2017.txt,...,
1,40216.txt,...,
2,30829.txt,http://JUDIS.NIC.IN SUPREME COU...,
3,33421.txt,...,
4,43039.txt,...,


In [33]:
def tags(row):
    text = row["text"]
    appellant = re.findall(
        appellant_regex[1], text, re.MULTILINE | re.IGNORECASE
    )
    if len(appellant) > 0:
        print(appellant[0], row["file-name"])
        respondent = re.findall(
            respondent_regex[1], text, re.MULTILINE | re.IGNORECASE
        )
        date = re.findall(
            date_of_judgement_regex[0], text, re.MULTILINE | re.IGNORECASE
        )
        bench = re.findall(bench_regex[0], text, re.MULTILINE | re.IGNORECASE)
        print(appellant,respondent,date,bench)
    else:
        appellant = re.findall(
            appellant_regex[0], text, re.MULTILINE | re.IGNORECASE
        )
        if len(appellant) > 0:
            print(appellant[0][0], row["file-name"])
            respondent = re.findall(
                respondent_regex[0], text, re.MULTILINE | re.IGNORECASE
            )
        else:
            not_found_files.append(row["file-name"])

In [34]:
t = df.iloc[:10, :]

In [35]:
a = t.apply(tags, axis=1)




                         VIJAY KUMAR AND ANOTHER                    35177_2015_Judgement_08-Dec-2017.txt


Sandeep Kumar Chourasia                                  40216.txt

M/S TEXMACO LTD. CEMENT DIVISION

 30829.txt
['\nM/S TEXMACO LTD. CEMENT DIVISION\n\n'] [] [] []



Mahesh Yadav & Anr.                                    33421.txt
TAR INDUSTRIES                                      43039.txt


          SHAMSHER SINGH & ANR.                                       2251_2010_11_1501_15425_Judgement_29-Jul-2019.txt


MANEESH BAWA AND ORS.                                           44440.txt
 B.C.C.L.         .....         36775.txt
 SPORTS & LEISURE APPAREL LTD.          43794.txt



UNION OF INDIA & ORS.                       40617.txt


In [148]:
not_found_files

[]

In [36]:
df[df.iloc[:, 1].str.contains("PETITIONER")].shape

(2442, 3)

In [76]:
df.iloc[5, 0]

'2251_2010_11_1501_15425_Judgement_29-Jul-2019.txt'

In [30]:
df.iloc[295, 1]

"                                                               1\n\n                                                                                                 REPORTABLE\n\n                                           IN THE SUPREME COURT OF INDIA\n                                            CIVIL APPELLATE JURISDICTION\n\n                                        CIVIL APPEAL NO. 1820 OF 2018\n                              [@ SPECIAL LEAVE PETITION (C) NOS. 9820 OF 2016]\n\n                         R K ARORA GENERAL MANAGER & ANR.                                 Appellant(s)\n\n                                                              VERSUS\n\n                         M/S ACE ENTERPRISES                                               Respondent(s)\n\n                                                   J U D G M E N T\n\n\n                         KURIAN, J.\n\n                         1.   Leave granted.\n\n                         2.   On    22.06.2011,        the     Court    