# A very fast and loose first attempt with the universal sentence encoder
#### Load packages

In [1]:
import pandas as pd
import numpy as np
import nltk
import tensorflow_hub as hub
from sklearn.cluster import MiniBatchKMeans

#### Load the pre-trained model from Tensor-Hub

In [2]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#### Load our data

In this case, I'm loading all of the text.

In [3]:
data = pd.read_csv('./data/cleaned_text.csv')

In [4]:
data.head()

Unnamed: 0,docid,contents,j_idx,s_idx,r_idx,d_idx,o_idx,type_idx,cleaned_contents
0,80380,Home | Databases | WorldLII | Search | Feedbac...,,592.0,,,,592,SENTENCE\n[Child Rape]\n\n[1] On the 11th Octo...
1,78839,Home | Databases | WorldLII | Search | Feedbac...,601.0,,,,,601,"JUDGMENT\n\nOn the 12th of August 2004, the Ap..."
2,248796,State v Lagivere - Sentence [2017] FJHC 386...,,874.0,,,,874,"SENTENCE\n \n \n• Inoke Lagivere, you stand c..."
3,257586,State v Goundar - Sentence [2018] FJHC 438;...,,875.0,,,,875,SENTENCE\n \n \n (The name of the complainant ...
4,80121,Home | Databases | WorldLII | Search | Feedbac...,,645.0,,,,645,SENTENCE\nBackground\n[1] On the 17th July 201...


In [5]:
data.shape

(809, 9)

#### Split the documents into sentences using a custom function based on `punkt`

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def split_sent(doc):
    doc_carriage_return = doc.split('\n')
    doc_carriage_return = [line for line in doc_carriage_return if line != '']
    sentences = []
    for line in doc_carriage_return:
        for sent in nltk.tokenize.sent_tokenize(line):
            sentences.append(sent)
    return sentences

#### Build embeddings from a subset of the documents

In [8]:
docs = list(data.cleaned_contents)

In [9]:
sentences = []
for doc in docs[:1000]:
    for sent in split_sent(doc):
        sentences.append(sent)

In [10]:
len(sentences)

80692

In [11]:
embeddings = embed(sentences[:30000]).numpy()

In [12]:
embeddings.shape

(30000, 512)

#### Cluster the embeddings

In [13]:
clust = MiniBatchKMeans(n_clusters=500, verbose=1)
clust.fit(embeddings)
y_hat = clust.predict(embeddings)

Init 1/3 with method: k-means++


  init_size=init_size)


Inertia for init 1/3: 52.849937
Init 2/3 with method: k-means++


  init_size=init_size)


Inertia for init 2/3: 54.443264
Init 3/3 with method: k-means++


  init_size=init_size)


Inertia for init 3/3: 51.134789
Minibatch iteration 1/30000: mean batch inertia: 0.704741, ewa inertia: 0.704741 
Minibatch iteration 2/30000: mean batch inertia: 0.654610, ewa inertia: 0.704407 
Minibatch iteration 3/30000: mean batch inertia: 0.594080, ewa inertia: 0.703672 
Minibatch iteration 4/30000: mean batch inertia: 0.745141, ewa inertia: 0.703948 
Minibatch iteration 5/30000: mean batch inertia: 0.620763, ewa inertia: 0.703393 
Minibatch iteration 6/30000: mean batch inertia: 0.644112, ewa inertia: 0.702998 
Minibatch iteration 7/30000: mean batch inertia: 0.654428, ewa inertia: 0.702674 
Minibatch iteration 8/30000: mean batch inertia: 0.616427, ewa inertia: 0.702099 
Minibatch iteration 9/30000: mean batch inertia: 0.550556, ewa inertia: 0.701089 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 10/30000: mean batch inertia: 0.533098, ewa inertia: 0.699969 
Minibatch iteration 11/30000: mean batch inertia: 0.608938, ewa inertia: 0.699362 
Minibatch iter

Minibatch iteration 101/30000: mean batch inertia: 0.567808, ewa inertia: 0.633699 
Minibatch iteration 102/30000: mean batch inertia: 0.554562, ewa inertia: 0.633171 
Minibatch iteration 103/30000: mean batch inertia: 0.554966, ewa inertia: 0.632650 
Minibatch iteration 104/30000: mean batch inertia: 0.548016, ewa inertia: 0.632086 
Minibatch iteration 105/30000: mean batch inertia: 0.532095, ewa inertia: 0.631419 
Minibatch iteration 106/30000: mean batch inertia: 0.538608, ewa inertia: 0.630800 
Minibatch iteration 107/30000: mean batch inertia: 0.520479, ewa inertia: 0.630065 
Minibatch iteration 108/30000: mean batch inertia: 0.540788, ewa inertia: 0.629470 
Minibatch iteration 109/30000: mean batch inertia: 0.548566, ewa inertia: 0.628930 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 110/30000: mean batch inertia: 0.521161, ewa inertia: 0.628212 
Minibatch iteration 111/30000: mean batch inertia: 0.545456, ewa inertia: 0.627660 
Minibatch iteration 112/30

Minibatch iteration 201/30000: mean batch inertia: 0.505329, ewa inertia: 0.580338 
Minibatch iteration 202/30000: mean batch inertia: 0.574217, ewa inertia: 0.580297 
Minibatch iteration 203/30000: mean batch inertia: 0.540272, ewa inertia: 0.580030 
Minibatch iteration 204/30000: mean batch inertia: 0.493756, ewa inertia: 0.579455 
Minibatch iteration 205/30000: mean batch inertia: 0.509238, ewa inertia: 0.578987 
Minibatch iteration 206/30000: mean batch inertia: 0.514698, ewa inertia: 0.578558 
Minibatch iteration 207/30000: mean batch inertia: 0.479397, ewa inertia: 0.577897 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 208/30000: mean batch inertia: 0.521993, ewa inertia: 0.577525 
Minibatch iteration 209/30000: mean batch inertia: 0.520615, ewa inertia: 0.577145 
Minibatch iteration 210/30000: mean batch inertia: 0.494093, ewa inertia: 0.576592 
Minibatch iteration 211/30000: mean batch inertia: 0.538769, ewa inertia: 0.576339 
Minibatch iteration 212/30

Minibatch iteration 299/30000: mean batch inertia: 0.498265, ewa inertia: 0.549085 
Minibatch iteration 300/30000: mean batch inertia: 0.528430, ewa inertia: 0.548948 
Minibatch iteration 301/30000: mean batch inertia: 0.484920, ewa inertia: 0.548521 
Minibatch iteration 302/30000: mean batch inertia: 0.485560, ewa inertia: 0.548101 
Minibatch iteration 303/30000: mean batch inertia: 0.505903, ewa inertia: 0.547820 
Minibatch iteration 304/30000: mean batch inertia: 0.535530, ewa inertia: 0.547738 
Minibatch iteration 305/30000: mean batch inertia: 0.508244, ewa inertia: 0.547475 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 306/30000: mean batch inertia: 0.536383, ewa inertia: 0.547401 
Minibatch iteration 307/30000: mean batch inertia: 0.555150, ewa inertia: 0.547452 
Minibatch iteration 308/30000: mean batch inertia: 0.508655, ewa inertia: 0.547194 
Minibatch iteration 309/30000: mean batch inertia: 0.474615, ewa inertia: 0.546710 
Minibatch iteration 310/30

Minibatch iteration 400/30000: mean batch inertia: 0.520002, ewa inertia: 0.529241 
Minibatch iteration 401/30000: mean batch inertia: 0.503591, ewa inertia: 0.529070 
Minibatch iteration 402/30000: mean batch inertia: 0.485238, ewa inertia: 0.528778 
Minibatch iteration 403/30000: mean batch inertia: 0.475612, ewa inertia: 0.528423 
Minibatch iteration 404/30000: mean batch inertia: 0.581736, ewa inertia: 0.528779 
Minibatch iteration 405/30000: mean batch inertia: 0.481922, ewa inertia: 0.528466 
Minibatch iteration 406/30000: mean batch inertia: 0.470610, ewa inertia: 0.528081 
Minibatch iteration 407/30000: mean batch inertia: 0.495433, ewa inertia: 0.527863 
Minibatch iteration 408/30000: mean batch inertia: 0.508932, ewa inertia: 0.527737 
Minibatch iteration 409/30000: mean batch inertia: 0.482952, ewa inertia: 0.527438 
Minibatch iteration 410/30000: mean batch inertia: 0.487758, ewa inertia: 0.527174 
Minibatch iteration 411/30000: mean batch inertia: 0.490584, ewa inertia: 0.

#### Examine some results

In [14]:
index = 8
sentence_index = np.where(y_hat==index)[0]
for i in sentence_index:
    print('-----------------------------------------------------------------')
    print(sentences[i])
    print('\n')

In [15]:
index = 33
sentence_index = np.where(y_hat==index)[0]
for i in sentence_index:
    print('-----------------------------------------------------------------')
    print(sentences[i])
    print('\n')

-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
DECISION


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
JUDGMENT AND SENTENCE


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
RULING


-----------------------------------------------------------------
Decision


-----------------------------------------------------------------
RULING


------

In [18]:
index = 800
sentence_index = np.where(y_hat==index)[0]
for i in sentence_index:
    print('-----------------------------------------------------------------')
    print(sentences[i])
    print('\n')

In [19]:
index = 54
sentence_index = np.where(y_hat==index)[0]
for i in sentence_index:
    print('-----------------------------------------------------------------')
    print(sentences[i])
    print('\n')

-----------------------------------------------------------------
Statement of Offence [a]


-----------------------------------------------------------------
Statement of Offence [a]


-----------------------------------------------------------------
Statement of Offence (a)


-----------------------------------------------------------------
Statement of Offence (a)


-----------------------------------------------------------------
Statement of offence [a]


-----------------------------------------------------------------
(a) the nature of the offence;


-----------------------------------------------------------------
Statement of Offence (a)


-----------------------------------------------------------------
	◦	Prevalence of offence in community.


-----------------------------------------------------------------
		(i) attitude to the offence;


-----------------------------------------------------------------
Statement of Offence [a]


--------------------------------------------