# Simple Vectorization and Classification
This notebook shows a simple solution to the RR task (https://sites.google.com/view/legaleval/home?pli=1)
1. Use Scikit-learn Tfidf to vectorize the sentences
2. Apply Naive Bayes (NB) classifier to classify the sentences into 13 categories
3. Evalute the NB classifier on the dev data

In [1]:
# train and dev files
## Change the following paths to your paths:
train_file = "/content/drive/MyDrive/Colab Notebooks/semEval/legalEval/taskA-RR/data/train.csv"
dev_file = "/content/drive/MyDrive/Colab Notebooks/semEval/legalEval/taskA-RR/data/dev.csv"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read in the train data
train = pd.read_csv(train_file)
train.shape

(28986, 9)

In [4]:
train.head()

Unnamed: 0,id,type,to_name,from_name,value.start,value.end,value.text,value.labels,document_id
0,d7a902fe9c23417499a7ef782f9fbdeb,labels,text,label,0,116,"IN THE HIGH COURT OF KARNATAKA,\n ...",['PREAMBLE'],1735
1,8d41599e98424d9480c25109556a7d14,labels,text,label,116,678,\n\n BEFORE\n\nTHE HON'BLE MR.JUSTICE ANA...,['PREAMBLE'],1735
2,e501424117da40a7935c2d9f2fb2fe38,labels,text,label,678,964,This Criminal Appeal is filed under Section 37...,['PREAMBLE'],1735
3,4825806388fe43d39f73354b10b5b32d,labels,text,label,964,1093,\n\n This appeal coming on for hearing t...,['PREAMBLE'],1735
4,d6893a25f82948f8be17fc9e876fb716,labels,text,label,1093,1180,\n Heard the learned Counsel for the app...,['NONE'],1735


In [5]:
train['value.labels'].unique()

array(["['PREAMBLE']", "['NONE']", "['FAC']", "['ARG_RESPONDENT']",
       "['RLC']", "['ARG_PETITIONER']", "['ANALYSIS']", "['PRE_RELIED']",
       "['RATIO']", "['RPC']", "['ISSUE']", "['STA']",
       "['PRE_NOT_RELIED']"], dtype=object)

## Map the labels to numbers

In [6]:
lab2id = {"['PREAMBLE']":1, "['NONE']":2, "['FAC']":3, "['ARG_RESPONDENT']":4,
       "['RLC']":5, "['ARG_PETITIONER']":6, "['ANALYSIS']":7, "['PRE_RELIED']":8,
       "['RATIO']":9, "['RPC']":10, "['ISSUE']":11, "['STA']":12,
       "['PRE_NOT_RELIED']":13}

In [7]:
id2lab = {1:"['PREAMBLE']", 2:"['NONE']", 3:"['FAC']", 4:"['ARG_RESPONDENT']",
       5:"['RLC']", 6:"['ARG_PETITIONER']", 7:"['ANALYSIS']", 8:"['PRE_RELIED']",
       9:"['RATIO']", 10:"['RPC']", 11:"['ISSUE']", 12:"['STA']",
       13:"['PRE_NOT_RELIED']"}

## Extract Training sentences and labels

In [8]:
sents = train["value.text"].str.replace('\n', "").apply(lambda x: x.lower())
sents

0              in the high court of karnataka,         ...
1              beforethe hon'ble mr.justice anand byrar...
2        this criminal appeal is filed under section 37...
3               this appeal coming on for hearing this ...
4               heard the learned counsel for the appel...
                               ...                        
28981     so section 132 of the evidence act sufficient...
28982     for the reasons aforesaid, the appeal is allo...
28983    the judgment and order dated april 27, 1987 pa...
28984                                               r.s.s.
28985                                      appeal allowed.
Name: value.text, Length: 28986, dtype: object

In [9]:
y = train['value.labels'].map(lab2id)
y

0         1
1         1
2         1
3         1
4         2
         ..
28981     9
28982    10
28983    10
28984     2
28985    10
Name: value.labels, Length: 28986, dtype: int64

## Conver the sentences to vectors

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', max_features=5000)

In [35]:
features_transformer = tfidf.fit(sents)
features = features_transformer.transform(sents).toarray()
features.shape

(28986, 5000)

## Naive Bayes Classification

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
nbcls = MultinomialNB()

In [38]:
nbcls.fit(features, y)

MultinomialNB()

In [39]:
predicts_train = nbcls.predict(features)

In [40]:
from sklearn.metrics import precision_recall_fscore_support

In [41]:
evals = precision_recall_fscore_support(predicts_train, y, average='weighted')
evals

  _warn_prf(average, modifier, msg_start, len(result))


(0.746991089247239, 0.5881805009314842, 0.6296663899419641, None)

In [26]:
print('weighted precision on training: {}'.format(evals[0]))
print('weighted recall on training: {}'.format(evals[1]))
print('weighted f1score on training: {}'.format(evals[2]))

weighted precision on training: 0.6959864357657609
weighted recall on training: 0.5043814255157663
weighted f1score on training: 0.554005023673026


## Evaluate the dev data

In [42]:
# Read in the train data
dev = pd.read_csv(dev_file)
dev.shape

(2890, 9)

In [48]:
sents_dev = dev["value.text"].str.replace('\n', "").apply(lambda x: x.lower())
sents_dev

0       petitioner:the commissioner of income-taxnew d...
1              date of judgment:05/05/1961bench:das, s.k.
2       bench:das, s.k.hidayatullah, m.shah, j.c.citat...
3       itentered into transactions in the nature of f...
4       the assessee claimed deduction of theselosses ...
                              ...                        
2885                           the petitions are allowed.
2886    the impugned orders are set aside with directi...
2887     the respondent having challenged the judgment...
2888    therefore, having regard to the law laid down ...
2889                                        sd/- judge nv
Name: value.text, Length: 2890, dtype: object

In [49]:
y_dev = dev['value.labels'].map(lab2id)
y_dev

0        1
1        1
2        1
3        1
4        1
        ..
2885    10
2886    10
2887    10
2888    10
2889     2
Name: value.labels, Length: 2890, dtype: int64

In [50]:
features_dev = features_transformer.transform(sents_dev).toarray()
features_dev.shape

(2890, 5000)

In [51]:
predicts = nbcls.predict(features_dev)

In [52]:
evals_dev = precision_recall_fscore_support(predicts, y_dev, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
print('weighted precision on dev: {}'.format(evals_dev[0]))
print('weighted recall on dev: {}'.format(evals_dev[1]))
print('weighted f1score on dev: {}'.format(evals_dev[2]))

weighted precision on dev: 0.7010339420321985
weighted recall on dev: 0.5141868512110727
weighted f1score on dev: 0.5620552512797896


## Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

In [55]:
lr = LogisticRegression()

In [56]:
lr.fit(features, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [57]:
predicts = lr.predict(features_dev)

In [58]:
evals_dev = precision_recall_fscore_support(predicts, y_dev, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
evals_dev

(0.6520520451227161, 0.5539792387543253, 0.5814886909919296, None)