In [1]:
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.model_selection import train_test_split

In [2]:
# Read the matched document contents and report how many rows were loaded.
df_text = pd.read_csv("data/trackGBV_xls_match.csv")
print(df_text.shape[0])

809


In [3]:
# Read the label table and report its row count.
df_labels = pd.read_csv("data/trackGBV_labels.csv")
print(df_labels.shape[0])

809


## There are 5 types of document

* Judgment
* Sentence
* Ruling
* Decision
* Other

In [4]:
# For each document-type keyword, store the character offset of its first
# occurrence in `contents`; str.find yields -1 when the keyword is absent.
type_keywords = {
    "j_idx": "JUDGMENT",
    "s_idx": "SENTENCE",
    "r_idx": "RULING",
    "d_idx": "DECISION",
    "o_idx": 'APPLICATION FOR BAIL PENDING APPEAL',
}
for idx_col, keyword in type_keywords.items():
    df_text[idx_col] = df_text["contents"].str.find(keyword)

In [5]:
# Report how many documents contain each type keyword, then how many contain
# two keywords at once (a document with two hits has an ambiguous type).
found = df_text[["j_idx", "s_idx", "r_idx", "d_idx", "o_idx"]] != -1

print("Number of docs that are:")
print()
single_reports = [
    ("JUDGMENT:", "j_idx"),
    ("SENTENCE:", "s_idx"),
    ("RULING:", "r_idx"),
    ("DECISION:", "d_idx"),
    ("OTHER:", "o_idx"),
]
for label, idx_col in single_reports:
    print(label, int(found[idx_col].sum()))

pair_reports = [
    ("JUDGMENT & SENTENCE:", "j_idx", "s_idx"),
    ("JUDGMENT & RULING:", "j_idx", "r_idx"),
    ("JUDGMENT & DECISION:", "j_idx", "d_idx"),
    ("SENTENCE & RULING:", "s_idx", "r_idx"),
    ("SENTENCE & DECISION:", "s_idx", "d_idx"),
    ("RULING & DECISION:", "r_idx", "d_idx"),
]
for label, first_col, second_col in pair_reports:
    print(label, int((found[first_col] & found[second_col]).sum()))

Number of docs that are:

JUDGMENT: 196
SENTENCE: 555
RULING: 46
DECISION: 4
OTHER: 1
JUDGMENT & SENTENCE: 9
JUDGMENT & RULING: 1
JUDGMENT & DECISION: 0
SENTENCE & RULING: 2
SENTENCE & DECISION: 0
RULING & DECISION: 0


In [6]:
# Documents with no JUDGMENT/SENTENCE/RULING/DECISION hit are retried with the
# letter-spaced and the alternative spelling of the judgment heading.
for variant in ('J U D G M E N T', 'JUDGEMENT'):
    # Recomputed each pass: a hit for the first variant removes the row from
    # the "untyped" set before the second variant is tried.
    untyped = (
        (df_text.j_idx == -1) & (df_text.s_idx == -1)
        & (df_text.r_idx == -1) & (df_text.d_idx == -1)
    )
    hits = untyped & df_text.contents.str.contains(variant)
    df_text.loc[hits, 'j_idx'] = df_text.loc[hits, 'contents'].str.find(variant)

In [7]:
# Retry documents that still have no type offset with the letter-spaced
# ruling heading.
ruling_variant = 'R U L I N G'
untyped = (
    (df_text.j_idx == -1) & (df_text.s_idx == -1)
    & (df_text.r_idx == -1) & (df_text.d_idx == -1)
)
hits = untyped & df_text.contents.str.contains(ruling_variant)
df_text.loc[hits, 'r_idx'] = df_text.loc[hits, 'contents'].str.find(ruling_variant)

In [8]:
# Retry documents that still have no type offset with the letter-spaced
# sentence heading.
sentence_variant = 'S E N T E N C E'
untyped = (
    (df_text.j_idx == -1) & (df_text.s_idx == -1)
    & (df_text.r_idx == -1) & (df_text.d_idx == -1)
)
hits = untyped & df_text.contents.str.contains(sentence_variant)
df_text.loc[hits, 's_idx'] = df_text.loc[hits, 'contents'].str.find(sentence_variant)

In [9]:
# Documents with no judgment/sentence/ruling/decision offset are classed as
# "other" when they contain one of these headings.
# o_idx is not part of the "untyped" test, so when a document contains several
# of these keywords the offset of the last matching one is what remains.
for other_keyword in ('CHARGE', 'PUNISHMENT', 'ORDER OF SUMMARY DISMISSAL'):
    untyped = (
        (df_text.j_idx == -1) & (df_text.s_idx == -1)
        & (df_text.r_idx == -1) & (df_text.d_idx == -1)
    )
    hits = untyped & df_text.contents.str.contains(other_keyword)
    df_text.loc[hits, 'o_idx'] = df_text.loc[hits, 'contents'].str.find(other_keyword)

In [10]:
# Last resort for documents that are still untyped: look for the title-case
# spellings 'Sentence' and 'Judgment'.
# Order matters: a 'Sentence' hit removes the row from the untyped set, so the
# 'Judgment' pass never touches it.
for variant, idx_col in (('Sentence', 's_idx'), ('Judgment', 'j_idx')):
    untyped = (
        (df_text.j_idx == -1) & (df_text.s_idx == -1)
        & (df_text.r_idx == -1) & (df_text.d_idx == -1)
    )
    hits = untyped & df_text.contents.str.contains(variant)
    df_text.loc[hits, idx_col] = df_text.loc[hits, 'contents'].str.find(variant)

In [11]:
# Sanity check: show documents with no offset in any of the five type columns.
# The displayed frame is expected to be empty.
idx_cols = ['j_idx', 's_idx', 'r_idx', 'd_idx', 'o_idx']
df_text.loc[(df_text[idx_cols] == -1).all(axis=1)]

Unnamed: 0,docid,contents,j_idx,s_idx,r_idx,d_idx,o_idx


In [12]:
# Swap the -1 "not found" sentinel for a missing value, so that a row-wise
# minimum over the offset columns only considers keywords that were found.
for idx_col in ('s_idx', 'j_idx', 'r_idx', 'd_idx', 'o_idx'):
    df_text.loc[df_text[idx_col] == -1, idx_col] = None

In [13]:
# The smallest offset among the type keywords found in a document is used as
# the cut point for trimming its text.
offset_cols = ['s_idx', 'j_idx', 'r_idx', 'd_idx', 'o_idx']
earliest_offset = df_text[offset_cols].min(axis=1)
df_text['type_idx'] = earliest_offset.astype(int)

In [14]:
# Drop everything that precedes the document-type heading by keeping
# contents[type_idx:] for every row.
df_text['cleaned_contents'] = [
    body[start:]
    for body, start in zip(df_text['contents'], df_text['type_idx'])
]

In [15]:
# Replace PacLII links with a <URL> placeholder. The trailing ".*" also swallows
# whatever follows the link on the same line ("." does not match a newline).
# The dots in the host name are escaped so they only match a literal ".", and
# both http and https links are covered.
df_text['cleaned_contents'] = df_text['cleaned_contents'].replace(
    r'https?://www\.paclii\.org.*', '<URL>', regex=True
)

In [16]:
# Eyeball the cleaned text of the row labelled 0.
print(df_text.loc[0, 'cleaned_contents'])

SENTENCE
[Child Rape]

[1] On the 11th October 2011 the accused entered a plea of guilty to one charge of rape contrary to Section 207(1) & (2) (b) of the Crimes Decree 2009, the particulars being that on the 14th March 2010 at Uciwai settlement, Korolevu he raped a child with his finger. He agreed to facts put to him, whereupon he was found guilty and convicted.

[2] The facts admitted by the accused were that in March 2010 he was staying with the victims' family, he being related to the victims' father. On the 14th March 2010, the mother had left home to visit the father in hospital. She left the victim, who at the time was 9 years old and her elder brother in the care of the accused.

[3] After dinner the two children went to sleep on a mattress. The boy asked the accused to sleep on the mattress with them as his mother has not yet returned. The accused blew out the candle and joined them on the mattress. The boy told the victim to lie in the middle between him and the accused. Whil

In [20]:
# lower case - DO NOT DO
# df_text['cleaned_contents'] = df_text['cleaned_contents'].str.lower()

In [21]:
# Attach labels to the cleaned texts (inner join on the document id) and hold
# out 20% of the rows as a test set, with a fixed seed for repeatability.
df_labels = pd.read_csv("data/trackGBV_labels.csv")
df = df_text.merge(df_labels, left_on='docid', right_on='DocID')
model_cols = ['docid', 'cleaned_contents', 'Discrimination_Label']
df_split = df.loc[:, model_cols].copy()
train, test = train_test_split(df_split, test_size=0.2, random_state=42, shuffle=True)

In [22]:
# Preview the first rows of the merged text + label frame.
df.head()

Unnamed: 0,docid,contents,j_idx,s_idx,r_idx,d_idx,o_idx,type_idx,cleaned_contents,DocID,Customary_Practices,Gender_Stereotypes,Other_Factors,Num_Factors,Discrimination_Label
0,80380,Home | Databases | WorldLII | Search | Feedbac...,,592.0,,,,592,SENTENCE\n[Child Rape]\n\n[1] On the 11th Octo...,80380,0,0,1,1,1
1,78839,Home | Databases | WorldLII | Search | Feedbac...,601.0,,,,,601,"JUDGMENT\n\nOn the 12th of August 2004, the Ap...",78839,0,0,1,1,1
2,248796,State v Lagivere - Sentence [2017] FJHC 386...,,874.0,,,,874,"SENTENCE\n \n \n• Inoke Lagivere, you stand c...",248796,0,0,0,0,0
3,257586,State v Goundar - Sentence [2018] FJHC 438;...,,875.0,,,,875,SENTENCE\n \n \n (The name of the complainant ...,257586,1,1,0,2,1
4,80121,Home | Databases | WorldLII | Search | Feedbac...,,645.0,,,,645,SENTENCE\nBackground\n[1] On the 17th July 201...,80121,0,1,0,1,1


In [23]:
# Persist both splits without the pandas index column.
for split_frame, split_path in ((train, r'data/train.csv'), (test, r'data/test.csv')):
    split_frame.to_csv(split_path, index=False)

In [15]:
# LOWER CASE AND REMOVE NEWLINE
import pandas as pd
import re
# Building blocks of the date pattern. Text is lower-cased before matching, so
# month and ordinal names are spelled in lower case.
_MONTH = (
    r'(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?'
    r'|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)'
)
_ORDINAL_DAY = (
    r'(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth'
    r'|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth'
    r'|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third'
    r'|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth'
    r'|twenty-ninth|thirtieth|thirty-first)'
)
# Optional "day of" filler between the day and the month.
_DAY_OF = r'([\s]{1,3})?(day)?([\s]{1,3})?(of)?([\s]{1,3})?'
# Optional separator followed by a four-digit year.
_YEAR_TAIL = r'((,)|(.))?(\s{1,3})?([0-9]{4})'

_DATE_PATTERN = re.compile('|'.join([
    # month day year
    _MONTH + r'([\s]{1,3})?([0-9]{1,2})(.{1,3})?((,)|(.))?([\s]{1,3})?([0-9]{4})',
    # numeric day [day of] month year
    r'([0-9]{1,2})(.{1,3})?' + _DAY_OF + _MONTH + _YEAR_TAIL,
    # spelled-out ordinal day [day of] month year
    _ORDINAL_DAY + _DAY_OF + _MONTH + _YEAR_TAIL,
    # d-m-y and y-m-d style numeric dates, separated by "-" or "/"
    r'(\b[0-9]{1,2}(\-|\/)[0-9]{1,2}(\-|\/)[0-9]{2,4}\b)',
    r'(\b[0-9]{2,4}(\-|\/)[0-9]{1,2}(\-|\/)[0-9]{1,2}\b)',
]))


def lcase_text(df, min_words=250, max_words=20000):
    """Filter documents by length and normalise their text.

    Keeps the rows whose whitespace-separated word count lies in
    ``[min_words, max_words)``, renames ``cleaned_contents`` to ``text`` and
    ``Discrimination_Label`` to ``label``, then lower-cases the text, replaces
    dates with ``[DATE]``, deletes apostrophes and collapses every run of
    non-word characters into one space. Because that last step also strips the
    square brackets, the date marker ends up in the text as the bare token
    ``DATE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cleaned_contents`` column. It is not modified.
    min_words : int, default 250
        Smallest word count that is kept (inclusive).
    max_words : int, default 20000
        Word counts at or above this value are dropped (exclusive bound).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the renamed columns and an
        extra ``len_txt`` column holding the word count.
    """
    # Non-string cells (e.g. NaN from an empty CSV field) count as zero words
    # and are therefore removed by the length filter.
    word_counts = df['cleaned_contents'].map(
        lambda doc: len(doc.split()) if isinstance(doc, str) else 0
    )
    keep = (word_counts >= min_words) & (word_counts < max_words)

    out = (
        df.assign(len_txt=word_counts)
        .loc[keep]
        .rename(columns={'cleaned_contents': 'text', 'Discrimination_Label': 'label'})
        .reset_index(drop=True)
    )

    text = out['text'].str.lower()
    # Series.map keeps each result on its own row. Building a brand-new
    # pd.Series here would carry a fresh 0..n-1 index, and assigning it to a
    # filtered frame aligns on index, pairing texts with the wrong labels.
    text = text.map(lambda doc: _DATE_PATTERN.sub('[DATE]', doc))
    text = text.str.replace("'", "", regex=False)
    text = text.str.replace(r"(\W)+", " ", regex=True)
    out['text'] = text

    return out.dropna(subset=["text"]).reset_index(drop=True)

# Normalise both splits, stack them into one frame, and write all three to disk.
# NOTE(review): the inputs are read from ../w266_project/data/ while the outputs
# go to data/ -- confirm both resolve to the same directory.
df_train, df_test = (
    lcase_text(pd.read_csv(split_csv))
    for split_csv in ("../w266_project/data/train.csv", "../w266_project/data/test.csv")
)
df_all = pd.concat([df_train, df_test])

# Save to disk
for lcase_frame, lcase_path in (
    (df_train, r'data/train_lcase.csv'),
    (df_test, r'data/test_lcase.csv'),
    (df_all, r'data/all_lcase.csv'),
):
    lcase_frame.to_csv(lcase_path, index=False)