In [1]:
import os, re
import xml.etree.ElementTree as ET
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# Preprocessing

In [2]:
dataset_path = 'data/legal_dataset/corpus/fulltext/'
raw_files = os.listdir(dataset_path)
preprocessed_files_path = dataset_path + 'preprocessed/'
preprocessed_files = os.listdir(preprocessed_files_path)

def preprocess(files):
    for i, file in enumerate(files):
        try:
            print('%d%%' % ((i/len(files) * 100))) if i % 100 == 0 else None

            # open file with r+b (allow read and binary mode)
            with open(dataset_path + file, 'r+b') as f:
                # read entire content of file into memory
                f_content = f.read()
                # replace the desired part of the content using re.sub
                f_content = re.sub(r'<catchphrases>(\n.*)*</catchphrases>', '', f_content.decode())
                f_content = re.sub('&eacute;','',f_content)
                # return pointer to top of file so we can re-write the content with replaced string

                # re-write the content with the updated content

#             print('writing')
            with open(preprocessed_files_path + file, 'wb') as f2:
                f2.write(f_content.encode())
        except Exception as e:
            print(e)
            print('%d, %s' % (i, file))
    
        
def parse(files):
    texts = ['judgment of Federal Magistrate Barnes dismissal application for judicial review of Refugee Review Tribunal decision',
        'This is an example sentence', 'This is another example sentence', '''
    Fenerbahçe Spor Kulübü (Turkish: [feˈnæɾbahtʃe], Fenerbahçe Sports Club) is a Turkish professional football club based in Istanbul, Turkey. They are the men's football department of Fenerbahçe SK, a major professional multi-sport club. Fenerbahçe, known informally as Fener, are one of the most successful and best supported football teams in Turkey, having never been relegated, and currently compete in the Turkish Super League, the Turkish Cup and UEFA Europa League.

They are nicknamed Sarı Kanaryalar (Turkish for "Yellow Canaries") and play their home games at Şükrü Saracoğlu Stadium, their own traditional home ground in Kadıköy, Istanbul. The club's name translates as "Lighthouse in the Garden" and comes from the Fenerbahçe neighbourhood of the Kadıköy district in Istanbul.

Fenerbahçe have won 19 Turkish Super League titles, 6 Turkish Cups and 9 Turkish Super Cups, among others.[4]

In international club football, Fenerbahçe have won the Balkans Cup in 1968, which is marked as the first ever non-domestic trophy won by a Turkish football club. In UEFA competitions, Fenerbahçe have reached the quarter-finals in the 1963–64 UEFA Cup Winners' Cup and in the 2007–08 UEFA Champions League. The club's semi-final performance in the 2012–13 UEFA Europa League is marked as one of its greatest achievements in European competitions. Fenerbahçe are a member of the European Club Association.

They are one of the most popular clubs in Turkey, and the most popular in Istanbul and Ankara.[5] Fenerbahçe have a large fanbase throughout the country, in Northern Cyprus,[6][7] Azerbaijan,[8] South Korea[9][10] and in the Turkish diaspora.[11][12] In their home at the Şükrü Saracoğlu Stadium, Fenerbahçe's average attendances have been among the highest in Turkey.[13][14] Fenerbahçe's longest-running and deepest rivalry is with their nearest major neighbours, Galatasaray, with matches between the two being referred to as Intercontinental derby, being considered to be one of the fiercest and most intense derbies in the world.[15][16] Matches against Beşiktaş are also derbys, but the rivalry is not as intense and fierce.

    ''', '''
    Galatasaray Spor Kulübü (Turkish pronunciation: [ɡaɫatasaˈɾaj ˈspoɾ kulyˈby], Galatasaray Sports Club) is a Turkish sports club based on the European side of the city of Istanbul in Turkey. Most notable for its association football department, the club also consists of various other departments including basketball, wheelchair basketball, volleyball, water polo, handball, athletics, swimming, rowing, sailing, judo, bridge, motorsport, equestrian,[1][2] esports, and chess.[3][4] Galatasaray S.K. is among the key members of the Galatasaray Community Cooperation Committee together with Galatasaray University and the prestigious Galatasaray High School.

The football branch of Galatasaray has accumulated the most Süper Lig (22), Turkish Cup (18) and Turkish Super Cup (16) titles in Turkey,[5][6][7] thus making them the most decorated football club in Turkey, as those competitions are the top nationwide professional leagues and cups within the Turkish football system that is recognized and accounted for in accordance to the regulations set by the Turkish Football Federation[8] and UEFA.[9][10][11][12]

In the year 2000 Galatasaray also took claim of the UEFA Cup title by defeating Arsenal F.C.[13][14] and the UEFA Super Cup, by defeating Real Madrid C.F.[15] These accomplishments makes Galatasaray the only Turkish football club to have reached that level of European success in the history of Turkish Football.

The club's wheelchair basketball team won the Champions Cup in 2008, 2009, 2011, 2013 and 2014. They also won the Kitakyushu Champions Cup and became world champions in 2008, 2009, 2011, and 2012.[16][17] Galatasaray women's basketball team won the 2013–14 EuroLeague Women and FIBA Eurocup in 2009 and 2018.[18] Galatasaray men's basketball team claimed their first EuroCup championship after defeating Strasbourg in 2016.[19]


    ''']
    
    for i,xmlfile in enumerate(files):
        try:
            print('%d%%' % ((i/len(files) * 100))) if i % 100 == 0 else None

            tree = ET.parse(preprocessed_files_path + xmlfile)
            root = tree.getroot()
            text = ''
            
            for sentence in root.findall('sentences')[0].findall('sentence'):
                text += sentence.text

            texts.append(text)
        except Exception as e:
#             print(e)
            pass

    return texts

# preprocess(raw_files)
texts = parse(preprocessed_files)
# texts[0]

0%
2%
5%
7%
10%
12%
15%
18%
20%
23%
25%
28%
30%
33%
36%
38%
41%
43%
46%
48%
51%
54%
56%
59%
61%
64%
66%
69%
72%
74%
77%
79%
82%
84%
87%
90%
92%
95%
97%


# SBERT Method 1

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('bert-base-nli-mean-tokens') #'bert-base-nli-mean-tokens' 'nlpaueb/legal-bert-base-uncased' 'bert-base-uncased'

document_embeddings = model.encode(texts[:1000])

In [None]:
# scores = cosine_similarity(document_embeddings)
# scores = scores[0]

In [6]:
query = "judgment of Federal Magistrate Barnes dismissal application for judicial review of Refugee Review Tribunal decision"
query_embedding = model.encode(query)
query_embedding = query_embedding.reshape(1,-1)

In [7]:
scores = cosine_similarity(query_embedding, document_embeddings)
scores = scores.reshape(-1)
# scores = scores[1:]

In [40]:
print(scores.argmax())
plt.plot(scores)
plt.show()

0


In [8]:
scores.argsort()[:10], scores[scores.argsort()[:10]],texts[756]

(array([902,   3, 905,   4, 731,   1, 864,   2, 669, 737]), array([0.14845721, 0.1821603 , 0.27451792, 0.30216554, 0.30486718,
       0.30494103, 0.30557644, 0.3224008 , 0.32298714, 0.32717836],
      dtype=float32), '\n INTRODUCTION \n \n1 The applicant ("OMI") holds Australian Patent No 775427 entitled "A Single Use Syringe" (the "OMI patent").The respondent ("RTI") is entitled to exploit the invention which is the subject of Australian Patent No 701878, entitled "Tamperproof Retractable Syringe" (the "RTI patent").2 In letters dated 5 October 2004 and 24 March 2006, RTI\'s legal advisers asserted or implied that OMI was infringing, or proposing to infringe, RTI\'s intellectual property and other rights including, in particular, those held in connection with the RTI patent.These allegations apparently arose out of the proposed exploitation by OMI of the OMI patent.OMI commenced proceedings for a declaration that RTI had made unjustifiable threats and for interlocutory and permanent i

In [10]:
scores.argsort()[-10:], scores[scores.argsort()[-10:]],texts[644]

(array([650, 131, 177, 514, 589, 318,  27, 127, 644,   0]), array([0.78481853, 0.7849215 , 0.78616905, 0.79047096, 0.7966834 ,
       0.79779863, 0.7982595 , 0.7989042 , 0.80154324, 1.0000001 ],
      dtype=float32), "\n \n1 This arises as the consequence of a notice to appeal against a judgment of a Federal Magistrate (McInnis FM) dated 11 December 2006 dismissing an application for review of an earlier decision of the Federal Magistrates Court on 13 November 2006.The earlier application to the Federal Magistrates Court sought judicial review of a decision of the Refugee Review Tribunal (the Tribunal) dated 11 July 2003.In view of the need for leave to appeal to be granted, the 'appellants' are described as 'applicants' in these reasons.At the hearing, the application for leave to appeal was dismissed.These are my reasons for that decision.Applicants' claims \n \n2 The applicants are a husband, wife and daughter and are citizens of Fiji with Indian ethnic origin.They arrived in Austra

In [11]:
from sentence_transformers.cross_encoder import CrossEncoder
model = CrossEncoder('sentence-transformers/bert-base-nli-mean-tokens')
final_scores = model.predict([[query, texts[i]] for i in scores.argsort()[-10:]])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/bert-base-nli-mean-tokens and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: The size of tensor a (11194) must match the size of tensor b (512) at non-singleton dimension 1

# SBERT Method 2

In [44]:
from sentence_transformers import CrossEncoder
model = CrossEncoder('nlpaueb/legal-bert-base-uncased', max_length=512)

Downloading: 100%|##########| 0.99k/0.99k [00:00<00:00, 1.18MB/s]
Downloading: 100%|##########| 420M/420M [04:51<00:00, 1.51MB/s] 
Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a

In [12]:
scores = model.predict([('Corruption', text) for text in texts[:10]])

RuntimeError: CUDA out of memory. Tried to allocate 1.67 GiB (GPU 0; 3.95 GiB total capacity; 2.09 GiB already allocated; 1.11 GiB free; 2.15 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [46]:
print(scores.argmax())
plt.plot(scores)
plt.show()

1678


# BERT 

In [None]:
# https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1

In [17]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained BERT model and tokenizer
model_name = 'nlpaueb/legal-bert-base-uncased' # 'nlpaueb/legal-bert-base-uncased' 'bert-base-nli-mean-tokens'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Tokenize Documents

In [18]:
# Document set
documents = texts[:1000]

# Document IDs
document_ids = np.arange(len(documents))

# Encode documents
encoded_documents = []
max_seq_length = 128  # Maximum sequence length for encoding
# for document in documents:
#     tokens = tokenizer.encode(document, add_special_tokens=True, max_length=max_seq_length, truncation=True)
#     encoded_documents.append(tokens)

document_tensors = []
attention_masks = []
for document in documents:
    encoded_document = tokenizer.encode_plus(
    document,
    padding='max_length',
    truncation=True,
    max_length=max_seq_length,
    return_tensors='pt'
    )
    
    document_tensors.append(encoded_document['input_ids'])
    attention_masks.append(encoded_document['attention_mask'][0])

attention_masks = torch.stack(attention_masks)

document_tensors_concat = torch.cat(document_tensors, dim=0)   

# Obtain document embeddings
with torch.no_grad():
    document_embeddings = model(document_tensors_concat).last_hidden_state

## Tokenize Query

In [20]:
# Encode query
query = "judgment of Federal Magistrate Barnes dismissal application for judicial review of Refugee Review Tribunal decision"
tokenized_input = tokenizer.tokenize(query)

encoded_query = tokenizer.encode_plus(
    query,
    padding='max_length',
    truncation=True,
    max_length=max_seq_length,
    return_tensors='pt'
)

query_tensor = encoded_query['input_ids']
query_attention_mask = encoded_query['attention_mask'][0]

# Obtain query embedding
with torch.no_grad():
    query_embedding = model(query_tensor).last_hidden_state

### Reshape

In [21]:
# Reshape query embedding to 2D
query_embedding_reshaped = query_embedding.view(query_embedding.size(0), -1)
document_embeddings_reshaped = document_embeddings.view(document_embeddings.size(0), -1)

## SIM Method 1

In [22]:
mask = attention_masks.unsqueeze(-1).expand(document_embeddings.size()).float()
masked_embeddings = document_embeddings * mask

query_mask = query_attention_mask.unsqueeze(-1).expand(document_embeddings.size()[1:]).squeeze(0).unsqueeze(0).float()
query_masked_embedding = query_embedding * query_mask

In [26]:
# summed = torch.sum(torch.concat((query_masked_embedding, masked_embeddings), dim=0), 1)
# summed.shape
# summed_mask = torch.clamp(torch.cat((query_mask, mask), dim=0).sum(1), min=1e-9)
# summed_mask.shape

summed = torch.sum(masked_embeddings, 1)
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
mean_pooled = summed / summed_mask

from sklearn.metrics.pairwise import cosine_similarity

# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
scores = cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[0:]
)[0]

# scores = np.zeros(mean_pooled.shape[0])
# for i in range(1,mean_pooled.shape[0]):
#     scores[i] = cosine(mean_pooled[0], mean_pooled[i])

scores

array([1.0000001 , 0.79411674, 0.79244554, 0.7010117 , 0.71611655,
       0.69792706, 0.7189583 , 0.7017555 , 0.71419394, 0.75241804,
       0.7391576 , 0.75593334, 0.68441045, 0.74855024, 0.7275633 ,
       0.752017  , 0.72863054, 0.71613914, 0.7261239 , 0.6723285 ,
       0.72263163, 0.74687904, 0.71604764, 0.7185921 , 0.7684561 ,
       0.72937334, 0.7503905 , 0.73332137, 0.7614093 , 0.73588634,
       0.69950855, 0.72406733, 0.7337389 , 0.76481986, 0.73846745,
       0.6856747 , 0.7087175 , 0.72619575, 0.75121176, 0.7144312 ,
       0.726323  , 0.72365206, 0.7178132 , 0.7422186 , 0.7196129 ,
       0.72061193, 0.698955  , 0.73053426, 0.72937304, 0.66393745,
       0.7394515 , 0.70418614, 0.7223366 , 0.73327374, 0.69572425,
       0.7211205 , 0.70567936, 0.71382606, 0.7258346 , 0.7259089 ,
       0.746806  , 0.76050264, 0.73572147, 0.67348653, 0.715235  ,
       0.74663895, 0.72701496, 0.74061346, 0.7444164 , 0.74540645,
       0.7543613 , 0.7179845 , 0.75047284, 0.72431815, 0.71123

## SIM Method 2

In [342]:
query_embedding[0].shape

torch.Size([128, 768])

In [337]:
# WRONG
scores = np.zeros(document_embeddings.shape[0])
scores2 = np.zeros((document_embeddings.shape[0],128))
for i in range(document_embeddings.shape[0]):
    s = torch.cosine_similarity(query_embedding[0], document_embeddings[i])
    scores2[i] = s
    scores[i] = np.average(s)

In [330]:
scores

array([1.        , 0.89010519, 0.89075196, 0.12275868, 0.14294639,
       0.14806464, 0.12272596, 0.11478725, 0.12670414, 0.13692893,
       0.13282262, 0.12389939, 0.12208994, 0.13973805, 0.13389388,
       0.12719598, 0.11889848, 0.12900034, 0.13886778, 0.13508305,
       0.11373245, 0.12672706, 0.12646922, 0.13797303, 0.11922668,
       0.1149366 , 0.124601  , 0.12972263, 0.1474006 , 0.13184483,
       0.14523295, 0.1232686 , 0.13834915, 0.12350786, 0.12633866,
       0.14742944, 0.1408343 , 0.14019439, 0.12012345, 0.11717765,
       0.11672356, 0.11192389, 0.1327011 , 0.1294349 , 0.13835438,
       0.13497426, 0.12331019, 0.10615107, 0.11652105, 0.11623657,
       0.13560934, 0.13457033, 0.12660711, 0.13515848, 0.13302284,
       0.12761134, 0.1248692 , 0.14226186, 0.12899603, 0.12719749,
       0.1420988 , 0.12803198, 0.13117558, 0.10812522, 0.12976371,
       0.12252453, 0.13160533, 0.15971881, 0.13345188, 0.1249274 ,
       0.11554302, 0.13873097, 0.1275629 , 0.13498758, 0.11466

## SIM Method 3

In [336]:
# WRONG
from scipy.spatial.distance import cosine

import numpy as np

scores = np.zeros(document_embeddings_reshaped.shape[0])
for i in range(document_embeddings_reshaped.shape[0]):
    scores[i] = cosine(document_embeddings_reshaped[i], query_embedding_reshaped[0])
    
scores

array([0.        , 0.1023764 , 0.10163164, 0.88029122, 0.85915545,
       0.8551745 , 0.88104099, 0.88852546, 0.87608012, 0.8665117 ,
       0.87224704, 0.87969436, 0.8801052 , 0.86566646, 0.86887749,
       0.87845059, 0.88679469, 0.87397705, 0.86571679, 0.86854863,
       0.8903373 , 0.87736714, 0.87831299, 0.8650855 , 0.8838554 ,
       0.88731602, 0.87863312, 0.8741001 , 0.85842369, 0.87060997,
       0.85809451, 0.88061447, 0.86561437, 0.87983277, 0.87806676,
       0.85599968, 0.8625924 , 0.86225118, 0.88524967, 0.88418981,
       0.88808688, 0.8901003 , 0.8721451 , 0.87537636, 0.86585066,
       0.86825886, 0.87944272, 0.89574091, 0.88629049, 0.88725238,
       0.86802213, 0.86832   , 0.87755852, 0.86793433, 0.86939624,
       0.875181  , 0.8783331 , 0.86274038, 0.87554495, 0.87717299,
       0.86163421, 0.87534113, 0.87159322, 0.89495312, 0.87492907,
       0.88241933, 0.87175538, 0.84561631, 0.87100197, 0.87943908,
       0.88758799, 0.86412509, 0.87665365, 0.86913978, 0.88761

## SIM Method 4

In [335]:
# Calculate similarity scores
scores = torch.cosine_similarity(query_embedding_reshaped, document_embeddings_reshaped,dim=1)

scores

tensor([1.0000, 0.8976, 0.8984, 0.1197, 0.1408, 0.1448, 0.1190, 0.1115, 0.1239,
        0.1335, 0.1278, 0.1203, 0.1199, 0.1343, 0.1311, 0.1215, 0.1132, 0.1260,
        0.1343, 0.1315, 0.1097, 0.1226, 0.1217, 0.1349, 0.1161, 0.1127, 0.1214,
        0.1259, 0.1416, 0.1294, 0.1419, 0.1194, 0.1344, 0.1202, 0.1219, 0.1440,
        0.1374, 0.1377, 0.1148, 0.1158, 0.1119, 0.1099, 0.1279, 0.1246, 0.1341,
        0.1317, 0.1206, 0.1043, 0.1137, 0.1127, 0.1320, 0.1317, 0.1224, 0.1321,
        0.1306, 0.1248, 0.1217, 0.1373, 0.1245, 0.1228, 0.1384, 0.1247, 0.1284,
        0.1050, 0.1251, 0.1176, 0.1282, 0.1544, 0.1290, 0.1206, 0.1124, 0.1359,
        0.1233, 0.1309, 0.1124, 0.1388, 0.1268, 0.1324, 0.1219, 0.1286, 0.1270,
        0.1236, 0.1237, 0.1157, 0.1094, 0.1240, 0.1476, 0.1150, 0.1445, 0.1225,
        0.1113, 0.1244, 0.1297, 0.1329, 0.1172, 0.1280, 0.1275, 0.1397, 0.1158,
        0.1393])

### SIM Method 5

In [358]:
A = query_embedding[0]
scores = np.zeros(document_embeddings_reshaped.shape[0])
for i in range(0,len(document_embeddings)):
    B = document_embeddings[i]
    sim = np.dot(A, B.T) / (np.linalg.norm(A, axis=1)[:, np.newaxis] * np.linalg.norm(B, axis=1))
    scores[i] = np.average(sim)
scores

array([0.76399904, 0.80013692, 0.80091947, 0.10667   , 0.12639645,
       0.1314173 , 0.10055168, 0.09774111, 0.10451196, 0.11408865,
       0.11078496, 0.10296004, 0.10464472, 0.11716688, 0.12646358,
       0.10630678, 0.0983218 , 0.10288702, 0.11903179, 0.11396606,
       0.09228052, 0.1108508 , 0.10636325, 0.12292249, 0.10474528,
       0.09532519, 0.10516456, 0.11434706, 0.1220199 , 0.11284091,
       0.11627553, 0.109974  , 0.11487019, 0.10432309, 0.10700107,
       0.12982172, 0.12320539, 0.11987582, 0.09539286, 0.1049379 ,
       0.09761845, 0.09054795, 0.11332791, 0.1049604 , 0.11264222,
       0.11761877, 0.10053931, 0.09271242, 0.09466819, 0.09784672,
       0.12230437, 0.11560668, 0.10331388, 0.11804398, 0.11340613,
       0.10558806, 0.10494338, 0.12448291, 0.10720426, 0.10922113,
       0.12377775, 0.1122538 , 0.10880842, 0.09150621, 0.1056627 ,
       0.10015669, 0.10573197, 0.13889599, 0.11594585, 0.10615143,
       0.09951828, 0.11205003, 0.10604049, 0.12154702, 0.09602

### SIM METHOD 6

In [378]:
meaned_query_embedding = torch.mean(query_masked_embedding[0],dim=0)
meaned_embeddings = torch.mean(masked_embeddings, dim=1)  # Calculate average of token embeddings

# Calculate similarity between documents
scores = torch.cosine_similarity(meaned_query_embedding, meaned_embeddings,dim=1)

scores

tensor([1.0000, 0.5741, 0.5907, 0.4186, 0.5013, 0.6274, 0.6221, 0.6180, 0.4855,
        0.6737, 0.6742, 0.6552, 0.5368, 0.6982, 0.5994, 0.6526, 0.6463, 0.6092,
        0.6895, 0.5608, 0.5722, 0.6878, 0.6622, 0.6447, 0.6818, 0.6060, 0.6486,
        0.6956, 0.6306, 0.5943, 0.5755, 0.6526, 0.6413, 0.6476, 0.6225, 0.5169,
        0.6301, 0.6039, 0.6557, 0.5972, 0.6837, 0.5438, 0.5580, 0.6277, 0.6193,
        0.5820, 0.5706, 0.6363, 0.6117, 0.5492, 0.6144, 0.5999, 0.6284, 0.6287,
        0.5110, 0.6125, 0.6025, 0.6339, 0.6131, 0.6775, 0.6660, 0.6746, 0.6882,
        0.5284, 0.6355, 0.6079, 0.6100, 0.6687, 0.7016, 0.6592, 0.6234, 0.6021,
        0.6708, 0.6009, 0.5408, 0.6666, 0.6091, 0.6743, 0.5689, 0.6976, 0.6358,
        0.6833, 0.5496, 0.6331, 0.6260, 0.6410, 0.5980, 0.6391, 0.6251, 0.6071,
        0.6839, 0.6598, 0.6140, 0.6962, 0.6830, 0.6880, 0.6370, 0.5687, 0.6207,
        0.6671])

## Plot

In [379]:
print('Index: %d' % scores.argmax())
plt.plot(scores)
plt.show()

Index: 0


### Below text is used for example query

In [195]:
texts[5]

"\n \n1 This is an appeal against the judgment of Federal Magistrate Barnes given on 6 April 2006, which included the dismissal of an application for judicial review of a decision of the Refugee Review Tribunal handed down on 18 November 2003.The Tribunal affirmed a decision of a delegate of the Minister for Immigration and Multicultural Affairs to refuse to grant a protection visa to the appellant.2 The appellants are a husband and wife and are citizens of India.They claim a well-founded fear of persecution in that country of their origin by reason of the appellant husband's political association and activity with the Congress Party in Gujarat.The appellant husband claimed in his statement to the delegate that he strongly condemned the Godhra train tragedy in February 2002, and because of his activities in helping victims and arranging rallies to have the culprits of that incident arrested and punished, he became the target of extremists from the Bharatiya Janata Party (BJP) and the R