In [1]:
import pandas as pd
import re
import spacy
from pandarallel import pandarallel
from spacy.matcher import Matcher
import numpy as np
import pickle

pandarallel.initialize(progress_bar=True)
nlp = spacy.load("en_core_web_sm")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
df_all = pd.read_csv(r"../data/cases-tagging-raw.csv")

In [3]:
df = (
    df_all[df_all["civil"]]
    .drop(["civil", "criminal", "not-defined"], axis=1)
    .reset_index(drop=True)
)

In [4]:
df = df[df.iloc[:, 1].str.contains("PETITIONER:")].reset_index(drop=True)

In [5]:
df = df[df.iloc[:, 1].str.contains("CASE NO.:")].reset_index(drop=True)

In [6]:
df = df.drop([df[df["file-name"] == "18946.txt"].index][0]).reset_index(
    drop=True
)

In [7]:
df.shape

(1410, 3)

In [8]:
df.head()

Unnamed: 0,file-name,text,tag
0,30829.txt,http://JUDIS.NIC.IN SUPREME COU...,
1,30541.txt,http://JUDIS.NIC.IN SUPREME COU...,
2,29170.txt,http://JUDIS.NIC.IN SUPREME C...,
3,27555.txt,http://JUDIS.NIC.IN SUPREME CO...,
4,29668.txt,http://JUDIS.NIC.IN SUPREME COU...,


In [9]:
def clean_text(row):
    text = row["text"]
    #     text = re.sub(r"\.", "", row["text"])
    return re.sub(r"\s+", " ", text).strip()

In [10]:
%time df['clean-text']=df.parallel_apply(clean_text,axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=177), Label(value='0 / 177'))), HB…

CPU times: user 380 ms, sys: 309 ms, total: 689 ms
Wall time: 1.1 s


In [11]:
appellant_regex = [
    r"petitioner:\v*([\W\S]*?)(respondent:|Vs.)",
]
respondent_regex = [
    r"respondent:\v*([\W\S]*?)date of judgment:*\v*\s*[0-9]",
]
date_of_judgement_regex = [r"date of judgment:*\v*([\W\S]*?)(.*):"]
bench_regex = [r"bench:\v*([\W\S]*?)(.*):"]
case_no_regex = [r"case no.:(.*?)petitioner:|judgment:"]

In [12]:
not_found_files = []

In [13]:
def tags(row):
    text = row["text"]
    appellant = re.findall(
        appellant_regex[0], text, re.MULTILINE | re.IGNORECASE
    )
    respondent = re.findall(
        respondent_regex[0], text, re.MULTILINE | re.IGNORECASE
    )
    date = re.findall(
        date_of_judgement_regex[0], text, re.MULTILINE | re.IGNORECASE
    )
    bench = re.findall(bench_regex[0], text, re.MULTILINE | re.IGNORECASE)
    case_no = re.findall(
        case_no_regex[0], row["clean-text"], re.MULTILINE | re.IGNORECASE
    )
    #     print(appellant, respondent, date, bench)
    #     print(case_no[0])
    if (
        len(appellant) == 0
        or len(respondent) == 0
        or len(date) == 0
        or len(bench) == 0
        or len(case_no) == 0
    ):
        not_found_files.append(row["file-name"])
        return np.NaN
    elif (
        len(appellant[0][0].strip()) == 0
        or len(respondent[0].strip()) == 0
        or len(date[0][0].strip()) == 0
        or len(bench[0][0].strip()) == 0
        or len(case_no[0].strip()) == 0
    ):
        not_found_files.append(row["file-name"])
        return np.NaN
    else:
        #         print(appellant[0],respondent,date,bench)
        return [
            appellant[0][0].strip(),
            respondent[0].strip(),
            date[0][0].strip(),
            bench[0][0].strip(),
            case_no[0].strip(),
        ]

In [14]:
df["tag"] = df.apply(tags, axis=1)

In [15]:
len(not_found_files)

0

In [16]:
not_found_files = []

In [17]:
df.iloc[0, 2]

['M/S TEXMACO LTD. CEMENT DIVISION',
 'A.S. NARASIMHAM',
 '07/03/2008',
 'ALTAMAS KABIR & J.M. PANCHAL',
 'Appeal (civil) 1815 of 2008']

In [18]:
df["text"] = df["clean-text"]
df = df.drop(["clean-text"], axis=1)

In [19]:
def get_ent_pos(start, end):
    loc = []
    fir_v = start[0]
    end_v = end[0]
    for (i, j) in zip(start[1:], end[1:]):
        if i == end_v:
            end_v = j
        else:
            loc.append([fir_v, end_v])
            fir_v = i
            end_v = j
    return loc

In [20]:
def tags_spacy_matcher(row, matcher, pattern, TYPE_V):
    res = []
    with nlp.disable_pipes("tagger", "parser"):
        doc = nlp(row["text"])
        matches = matcher(doc)
        start = 0
        end = 0
        pos_s = []
        pos_e = []
        for m_id, start, end in matches:
            pos_s.append(start)
            pos_e.append(end)
        loc = get_ent_pos(pos_s, pos_e)
        for i, j in loc:
            pos = row["text"].find(doc[i:j].text)
            if pos == -1:
                not_found_files.append(row["file-name"])
                return np.NaN
            res.append((pos, pos + len(doc[i:j].text), TYPE_V))
    return res

In [21]:
def matcher_tags(row):
    matcher = Matcher(nlp.vocab)
    pattern = [{"ENT_TYPE": "DATE"}]
    matcher.add("DATE_PATTERN", None, pattern)
    return tags_spacy_matcher(row, matcher, pattern, "DATE")

In [22]:
def find_pos(row, index, tag):
    value = re.sub(r"\s+", " ", row["tag"][index]).strip()
    pos = row["text"].find(value)
    if pos == -1:
        not_found_files.append(row["file-name"])
        return np.NaN
    return (pos, pos + len(row["tag"][index]), tag)

In [23]:
def replace_val_pos(row):
    res = [
        find_pos(row, 0, "PETITIONER"),
        find_pos(row, 1, "RESPONDENT"),
        find_pos(row, 2, "JUDGEMENT_DATE"),
        find_pos(row, 3, "JUDGES_BENCH"),
    ]
    res.extend(matcher_tags(row))
    return res

In [24]:
df["tag"] = df.parallel_apply(replace_val_pos, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=177), Label(value='0 / 177'))), HB…

In [None]:
def remove_duplicates_overlaps(rows):
    tags_list = set(rows["tag"])

    return tags_list

In [31]:
set(df.iloc[2, 2])

{(88, 92, 'DATE'),
 (105, 147, 'PETITIONER'),
 (160, 198, 'RESPONDENT'),
 (217, 227, 'DATE'),
 (217, 227, 'JUDGEMENT_DATE'),
 (235, 269, 'JUDGES_BENCH'),
 (354, 366, 'DATE'),
 (753, 757, 'DATE'),
 (1012, 1032, 'DATE'),
 (1024, 1032, 'DATE'),
 (1083, 1092, 'DATE'),
 (1116, 1130, 'DATE'),
 (1135, 1144, 'DATE'),
 (1270, 1290, 'DATE'),
 (1282, 1290, 'DATE'),
 (1455, 1459, 'DATE'),
 (1618, 1636, 'DATE'),
 (1641, 1659, 'DATE'),
 (1859, 1863, 'DATE'),
 (4038, 4051, 'DATE'),
 (4402, 4415, 'DATE'),
 (4726, 4730, 'DATE'),
 (5773, 5796, 'DATE'),
 (5847, 5856, 'DATE')}

In [28]:
df["tag"] = df["tag"].apply(lambda x: {"entities": x})

In [29]:
df["tag"]

0       {'entities': [(105, 137, 'PETITIONER'), (150, ...
1       {'entities': [(105, 126, 'PETITIONER'), (139, ...
2       {'entities': [(105, 147, 'PETITIONER'), (160, ...
3       {'entities': [(105, 137, 'PETITIONER'), (150, ...
4       {'entities': [(105, 127, 'PETITIONER'), (140, ...
                              ...                        
1405    {'entities': [(106, 118, 'PETITIONER'), (131, ...
1406    {'entities': [(105, 131, 'PETITIONER'), (144, ...
1407    {'entities': [(106, 142, 'PETITIONER'), (155, ...
1408    {'entities': [(105, 118, 'PETITIONER'), (131, ...
1409    {'entities': [(104, 125, 'PETITIONER'), (138, ...
Name: tag, Length: 1410, dtype: object

In [28]:
df.to_csv(r"../data/processed/cases-tag-civil-petitioner.csv", index=False)

In [30]:
with open("cases-tagged", "wb") as pic_f:
    pickle.dump(df, pic_f)

In [31]:
with open("cases-tagged", "rb") as pic_r:
    df = pickle.load(pic_r)