In [1]:
import pandas as pd
import os
os.chdir('/users/sgdbareh/volatile/ECHR_Importance')
from API_key import openai_key
from openai import OpenAI
client = OpenAI(api_key=openai_key)
import torch 
import faiss
import pickle
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
import shutil # Importing shutil module for high-level file operations
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")



In [3]:
def embed_text(text,mean=True):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    if mean:
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    else:
        return outputs.last_hidden_state[:,0].squeeze().numpy()

In [25]:
# Load FAISS index
index = faiss.read_index("/users/sgdbareh/volatile/ECHR_Importance/faiss_index_TEST.bin")

# Load document store and metadata
with open("/users/sgdbareh/volatile/ECHR_Importance/docstore_TEST.pkl", "rb") as f:
    docstore = pickle.load(f)
with open("/users/sgdbareh/volatile/ECHR_Importance/index_to_docstore_id_TEST.pkl", "rb") as f:
    index_to_docstore_id = pickle.load(f)

In [26]:
from langchain.vectorstores import FAISS

vector_store_NEW = FAISS(
    embedding_function=embed_text,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    distance_strategy=DistanceStrategy.COSINE,
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [27]:
user_query = "Hello world, article 3"

In [28]:
vector_store_NEW.similarity_search(user_query, 5)

[Document(metadata={'Word Count': 2525, 'The Law': '1. The applicant complains that the Hungarian authorities’ procedure concerning his request for asylum and his potential deportation amounted to a breach of Article 3 of the Convention.Article 3 of the Convention provides that no one shall be subjected to torture or to inhuman or degrading treatment or punishment.Article 34, so far as relevant, provides that the Court may receive applications from any person claiming to be the victim of a violation by one of the High Contracting Parties of the rights set forth in the Convention or the protocols thereto.The Government submit that on 25 January 1999 the Office ruled that, in pursuance of S. 32 § 1 of the 1993 Aliens Act, the applicant could not be deported to Iraq or Egypt. According to S. 25 of Government Decree no. 64/1994, the applicant should not in the future be deported from Hungarian territory as long as the relevant situation in his country of origin persists, i.e. the reasons p

In [29]:
newset_QUERY = vector_store_NEW.similarity_search_with_relevance_scores(user_query, k=5)



In [34]:
newset_QUERY

[(Document(metadata={'Word Count': 2525, 'The Law': '1. The applicant complains that the Hungarian authorities’ procedure concerning his request for asylum and his potential deportation amounted to a breach of Article 3 of the Convention.Article 3 of the Convention provides that no one shall be subjected to torture or to inhuman or degrading treatment or punishment.Article 34, so far as relevant, provides that the Court may receive applications from any person claiming to be the victim of a violation by one of the High Contracting Parties of the rights set forth in the Convention or the protocols thereto.The Government submit that on 25 January 1999 the Office ruled that, in pursuance of S. 32 § 1 of the 1993 Aliens Act, the applicant could not be deported to Iraq or Egypt. According to S. 25 of Government Decree no. 64/1994, the applicant should not in the future be deported from Hungarian territory as long as the relevant situation in his country of origin persists, i.e. the reasons 

In [35]:
# Access the FAISS index from the vector store
faiss_index = vector_store_NEW.index

# Get the total number of vectors (chunks) in the FAISS index
num_chunks = faiss_index.ntotal

print(f"Number of chunks in the FAISS vector store: {num_chunks}")

Number of chunks in the FAISS vector store: 1096


### Test 

In [1]:
import pandas as pd
import os
os.chdir('/users/sgdbareh/volatile/ECHR_Importance')
from API_key import openai_key
from openai import OpenAI
client = OpenAI(api_key=openai_key)
import torch 
import faiss
import pickle
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain_openai import OpenAIEmbeddings
from sklearn.model_selection import ParameterGrid

  from .autonotebook import tqdm as notebook_tqdm


In [12]:

def embed_text(text,embedding_name="nlpaueb/legal-bert-base-uncased",mean=True):
    tokenizer = AutoTokenizer.from_pretrained(embedding_name)
    model = AutoModel.from_pretrained(embedding_name)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    if mean:
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    else:
        return outputs.last_hidden_state[:,0].squeeze().numpy()

def save_vectors(vector_store, filename):
    #save the retriever to disk
    faiss.write_index(vector_store.index, f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/faiss_index_{filename}.bin")

    # Save document store and metadata
    with open(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/docstore_{filename}.pkl", "wb") as f:
        pickle.dump(vector_store.docstore, f)
    with open(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/index_to_docstore_id_{filename}.pkl", "wb") as f:
        pickle.dump(vector_store.index_to_docstore_id, f)

#
def load_text():
    text = pd.read_pickle('/users/sgdbareh/volatile/ECHR_Importance/Art_3_Data_Process/outcome_cases.pkl')
    text['Facts'] = text['Facts'].str.replace('\n', ' ')
    # load documents
    text = text[:100]
    loader = DataFrameLoader(text,page_content_column='Facts')
    # docs loaded
    documents = loader.load()
    return documents
#
def load_embeddings(chunk_size=512, chunk_overlap=50,embedding_name="nlpaueb/legal-bert-base-uncased"):
    #load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(embedding_name)
    model = AutoModel.from_pretrained(embedding_name)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    return tokenizer, model, text_splitter
#
def load_openai_embeddings(chunk_size=512, chunk_overlap=50):
    #load embeddings
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(model_name='gpt-4o',chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    return embeddings, text_splitter





In [5]:

print('start')
documents = load_text()
    
    



start


In [8]:
chunk_size = 512
chunk_overlap = 50
embedding_name = "nlpaueb/legal-bert-base-uncased"

In [13]:
tokenizer, model, text_splitter = load_embeddings(chunk_size, chunk_overlap, embedding_name)
index = faiss.IndexFlatL2(len(embed_text("hello world")))

vector_store = FAISS(
    embedding_function=lambda x: embed_text(x),
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    distance_strategy=DistanceStrategy.COSINE,
)

print('initialised vector store')

store = InMemoryStore()

retriever_new = ParentDocumentRetriever(vectorstore=vector_store,docstore=store,child_splitter=text_splitter)
   

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


initialised vector store


In [14]:
retriever_new.add_documents(documents)
    
print('vector store created')

   

KeyboardInterrupt: 

In [None]:
# Save the vector store
save_vectors(vector_store, f"chunk_{chunk_size}_embedding_{embedding_name}")

print('vector store saved')


### sentece transformers

In [3]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models


In [4]:
# Load the LegalBert model and tokenizer
model_name = 'nlpaueb/legal-bert-base-uncased'
bert_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a SentenceTransformers model
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_transformer_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Save the model
sentence_transformer_model.save('legal-bert-sentence-transformer')




In [5]:
model = SentenceTransformer('legal-bert-sentence-transformer')


In [6]:
from langchain_huggingface import HuggingFaceEmbeddings



In [7]:
embeddings = HuggingFaceEmbeddings(model_name="legal-bert-sentence-transformer")


In [8]:
import pandas as pd
import os
os.chdir('/users/sgdbareh/volatile/ECHR_Importance')
from API_key import openai_key
from openai import OpenAI
client = OpenAI(api_key=openai_key)
import torch 
import faiss
import pickle
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain_openai import OpenAIEmbeddings
from sklearn.model_selection import ParameterGrid
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models



def load_text():
    text = pd.read_pickle('/users/sgdbareh/volatile/ECHR_Importance/Art_3_Data_Process/outcome_cases.pkl')
    text['Facts'] = text['Facts'].str.replace('\n', ' ')
    text = text[:10]
    # load documents
    loader = DataFrameLoader(text,page_content_column='Facts')
    # docs loaded
    documents = loader.load()
    return documents
#
def load_embeddings(chunk_size=512, chunk_overlap=50,embedding_name="nlpaueb/legal-bert-base-uncased"):
    #load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(embedding_name)
    model = AutoModel.from_pretrained(embedding_name)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    return tokenizer, model, text_splitter
#
def load_openai_embeddings(chunk_size=512, chunk_overlap=50):
    #load embeddings
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(model_name='gpt-4o',chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    return embeddings, text_splitter
   


In [9]:
short_name = 'legal_bert'

In [10]:
print('start')
documents = load_text()
chunk_size = 512
chunk_overlap = 50
embedding_name = "nlpaueb/legal-bert-base-uncased"

setup = 'raw'

# Load the LegalBert model and tokenizer
model = AutoModel.from_pretrained(embedding_name)
tokenizer = AutoTokenizer.from_pretrained(embedding_name)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=chunk_overlap)


# Create a SentenceTransformers model
word_embedding_model = models.Transformer(embedding_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_transformer_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Save the model
sentence_transformer_model.save('legal-bert-sentence-transformer')

print('embedding_name:', embedding_name, 'chunk_size:', chunk_size, 'chunk_overlap:', chunk_overlap)

embeddings = HuggingFaceEmbeddings(model_name="legal-bert-sentence-transformer")

print('vector store created')

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    distance_strategy=DistanceStrategy.COSINE,
)

print('initialised vector store')

store = InMemoryStore()

retriever_new = ParentDocumentRetriever(vectorstore=vector_store,docstore=store,child_splitter=text_splitter)
retriever_new.add_documents(documents)

print('docs added')



start




embedding_name: nlpaueb/legal-bert-base-uncased chunk_size: 512 chunk_overlap: 50
vector store created
initialised vector store
docs added


In [20]:
def save_vectors(vector_store, filename):
    #save the retriever to disk
    faiss.write_index(vector_store.index, f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/faiss_index_{filename}.bin")

    # Save document store and metadata
    with open(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/docstore_{filename}.pkl", "wb") as f:
        pickle.dump(vector_store.docstore, f)
    with open(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/index_to_docstore_id_{filename}.pkl", "wb") as f:
        pickle.dump(vector_store.index_to_docstore_id, f)

In [21]:
# Save the vector store
save_vectors(vector_store, f"chunk_{chunk_size}_embedding_{short_name}_{setup}")

print('vector store saved')

vector store saved


In [24]:
import torch

torch.cuda.is_available()

True

### Test Load-IN

In [11]:
# Load FAISS index
index = faiss.read_index("/users/sgdbareh/volatile/ECHR_Importance/VectorDB/faiss_index_chunk_512_embedding_bert_raw.bin")

# Load document store and metadata
with open("/users/sgdbareh/volatile/ECHR_Importance/VectorDB/docstore_chunk_512_embedding_bert_raw.pkl", "rb") as f:
    docstore = pickle.load(f)
with open("/users/sgdbareh/volatile/ECHR_Importance/VectorDB/index_to_docstore_id_chunk_512_embedding_bert_raw.pkl", "rb") as f:
    index_to_docstore_id = pickle.load(f)

In [12]:
embedding_name = "google-bert/bert-base-uncased"
setup = 'raw'
short_name = 'bert' 

# Load the LegalBert model and tokenizer
model = AutoModel.from_pretrained(embedding_name)
tokenizer = AutoTokenizer.from_pretrained(embedding_name)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=chunk_overlap)

# Create a SentenceTransformers model
word_embedding_model = models.Transformer(embedding_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_transformer_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Save the model
sentence_transformer_model.save('bert-sentence-transformer')

print('embedding_name:', embedding_name, 'chunk_size:', chunk_size, 'chunk_overlap:', chunk_overlap)

embeddings = HuggingFaceEmbeddings(
model_name="bert-sentence-transformer",
multi_process=True,
model_kwargs={"device": "cuda"},
encode_kwargs={"normalize_embeddings": True}  # Set `True` for cosine similarity
)

print('vector store created')



embedding_name: google-bert/bert-base-uncased chunk_size: 512 chunk_overlap: 50
vector store created


In [13]:
from langchain.vectorstores import FAISS

vector_store_NEW = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    distance_strategy=DistanceStrategy.COSINE,
)

In [14]:
comm_cases = pd.read_pickle('/users/sgdbareh/volatile/ECHR_Importance/Art_3_Data_Process/comm_cases_valid.pkl')


In [148]:
def filter_new(comm_case,docs):
    start_date = comm_case['doc_date']
    print(start_date)


In [150]:
comm_cases.iloc[0]

Filename                                                      001-146030
Questions              1. Having regard to the cumulative effect of t...
Subject Matter         The applicant, Mr Ryszard Lipczyński, is a Pol...
appno                                                           44027/12
source_file                            pruned_ADMISSIBILITYCOM_meta.json
doc_date                                                      2015-09-14
importance                                                             4
keywords_art_3                                                     [350]
keywords_art_3_text                      (Art. 3) Prohibition of torture
Subj_Count                                                           643
Name: 1988, dtype: object

In [149]:
comm_cases.apply(lambda x: filter_new(x,vector_store_NEW.similarity_search(x['Subject Matter'], 5)),axis=1)

2015-09-14


Process SpawnProcess-77:
Process SpawnProcess-76:
Traceback (most recent call last):
  File "/users/sgdbareh/volatile/miniconda3/envs/ECHR/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/users/sgdbareh/volatile/miniconda3/envs/ECHR/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/users/sgdbareh/volatile/miniconda3/envs/ECHR/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py", line 865, in _encode_multi_process_worker
    embeddings = model.encode(
                 ^^^^^^^^^^^^^
  File "/users/sgdbareh/volatile/miniconda3/envs/ECHR/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py", line 477, in encode
    self.to(device)
  File "/users/sgdbareh/volatile/miniconda3/envs/ECHR/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1174, in to
    return self._apply(convert)
           ^^^^^^^^^^^^^^^^^^^^
  File "/users/sgdba

KeyboardInterrupt: 

In [28]:
user_query = "Hello world, article 3"

In [67]:
def date_filter(doc, start_date):
    doc_date = doc['date']
    return doc_date < start_date

In [59]:
from datetime import datetime


In [68]:
start_date = datetime(2000, 1, 1)

In [61]:
vector_store_NEW.similarity_search(user_query, 5, filter=date_filter)


[]

In [99]:
# Define the filter dictionary with a "less than" condition

# Convert the date string to a timestamp
date_string = '2020-01-01'
timestamp = pd.Timestamp(date_string)


filter_dict = {
    "date": {"$lt": timestamp},
    'respondent': 'TUR'       
    }  # Replace 'field_name' and 'some_value' with actual field and value


In [102]:
newset_QUERY = vector_store_NEW.similarity_search_with_relevance_scores(user_query, k=5)



In [103]:
newset_QUERY

  -58.76423645019531),
  -58.9892692565918),
 (Document(metadata={'Word Count': 1891, 'The Law': 'I. ALLEGED VIOLATION OF ARTICLE 3 OF THE CONVENTION31. The applicant alleged that if he were returned to Uzbekistan he would run a real risk of being subjected to torture and ill-treatment in breach of Article 3 of the Convention, which provides as follows:“No one shall be subjected to torture or to inhuman or degrading treatment or punishment.”A. Submissions by the parties1. The Government32. The Government submitted that in the course of the extradition proceedings the competent Russian authorities had duly examined the applicant’s situation with regard to his return to Uzbekistan, where he would be prosecuted. In the Government’s opinion, the diplomatic assurances given by the Uzbek authorities were sufficient and compatible with the relevant provisions of international law.33. The Government further submitted that the applicant’s allegations about the risk of ill-treatment in the event

In [115]:
start_date = datetime(2015, 1, 1)


In [19]:
# Define the filter function
def date_filter(doc, start_date):
    #print(doc)
    doc_date = doc['date']
    #print(doc_date)
    #print(start_date)
    return doc_date < start_date

In [116]:
filtered_results = [result for result in newset_QUERY if date_filter(result[0], start_date)]

2013-10-15 00:00:00
2015-01-01 00:00:00
2013-06-27 00:00:00
2015-01-01 00:00:00
2015-02-26 00:00:00
2015-01-01 00:00:00
2013-04-23 00:00:00
2015-01-01 00:00:00
2012-02-14 00:00:00
2015-01-01 00:00:00


In [117]:
filtered_results

  -58.76423645019531),
  -58.9892692565918),
 (Document(metadata={'Word Count': 973, 'The Law': 'I. ALLEGED VIOLATIONS OF ARTICLES 3 AND 13 OF THE CONVENTION21. The applicant complained, under Articles 3 and 13 of the Convention, of having contracted TB in prison, of insufficient medical care for his various diseases throughout the entire period of his detention, and of having been deprived of effective remedies in this respect. He further challenged the conditions of his detention in Rustavi nos. 1 and 6 Prisons in the periods between 18 July 2008 and 4 March 2010 and 23 July 2010 up to the present as inadequate. Articles 3 and 13 of the Convention read as follows:Article 3“No one shall be subjected to torture or to inhuman or degrading treatment or punishment.”Article 13“Everyone whose rights and freedoms as set forth in [the] Convention are violated shall have an effective remedy before a national authority notwithstanding that the violation has been committed by persons acting in a

In [39]:
appno_list = [doc[0].metadata['appno'] for doc in newset_QUERY]

In [40]:
appno_list

['34529/10', '71680/10', '66373/13', '65391/09', '2613/05']

In [17]:
REAL_K = 5

results_dict = {}

In [154]:
filtered_results

  -58.76423645019531),
  -58.9892692565918),
 (Document(metadata={'Word Count': 973, 'The Law': 'I. ALLEGED VIOLATIONS OF ARTICLES 3 AND 13 OF THE CONVENTION21. The applicant complained, under Articles 3 and 13 of the Convention, of having contracted TB in prison, of insufficient medical care for his various diseases throughout the entire period of his detention, and of having been deprived of effective remedies in this respect. He further challenged the conditions of his detention in Rustavi nos. 1 and 6 Prisons in the periods between 18 July 2008 and 4 March 2010 and 23 July 2010 up to the present as inadequate. Articles 3 and 13 of the Convention read as follows:Article 3“No one shall be subjected to torture or to inhuman or degrading treatment or punishment.”Article 13“Everyone whose rights and freedoms as set forth in [the] Convention are violated shall have an effective remedy before a national authority notwithstanding that the violation has been committed by persons acting in a

In [156]:
[doc[0] for doc in filtered_results]

 Document(metadata={'Word Count': 973, 'The Law': 'I. ALLEGED VIOLATIONS OF ARTICLES 3 AND 13 OF THE CONVENTION21. The applicant complained, under Articles 3 and 13 of the Convention, of having contracted TB in prison, of insufficient medical care for his various diseases throughout the entire period of his detention, and of having been deprived of effective remedies in this respect. He further challenged the conditions of his detention in Rustavi nos. 1 and 6 Prisons in the periods between 18 July 2008 and 4 March 2010 and 23 July 2010 up to the present as inadequate. Articles 3 and 13 of the Convention read as follows:Article 3“No one shall be subjected to torture or to inhuman or degrading treatment or punishment.”Article 13“Everyone whose rights and freedoms as set forth in [the] Convention are violated shall have an effective remedy before a national authority notwithstanding that the violation has been committed by persons acting in an official capacity.”A. Admissibility1. The pa

In [21]:
def filter_new(comm_case,docs):
    start_date = pd.Timestamp(comm_case['doc_date'])
    #[print(doc.metadata) for doc in docs]
    filtered_results = [result for result in docs if date_filter(result.metadata, start_date)]
    appno_list = [doc.metadata['appno'] for doc in filtered_results]
    appno_list = list(set(appno_list))
    results = appno_list[:5]

    if len(results) < REAL_K:
        docs = vector_store_NEW.similarity_search(comm_case['Subject Matter'],REAL_K*4 )
        filtered_results = [result for result in docs if date_filter(result.metadata, start_date)]
        appno_list = [doc.metadata['appno'] for doc in filtered_results]
        appno_list = list(set(appno_list))
        results = appno_list[:5]


    results_dict[comm_case['Filename']] = results


In [164]:
filtered_results = [print(result[0]) for result in filtered_results if date_filter(result[0], start_date)]


TypeError: 'NoneType' object is not subscriptable

In [22]:
comm_cases.apply(lambda x: filter_new(x,vector_store_NEW.similarity_search(x['Subject Matter'], REAL_K*2)),axis=1)


1988    None
3228    None
2059    None
1356    None
3393    None
4698    None
200     None
6775    None
1546    None
207     None
3581    None
1909    None
3445    None
4483    None
3597    None
5041    None
695     None
1924    None
2650    None
929     None
3681    None
2700    None
1047    None
2422    None
607     None
2029    None
4656    None
4327    None
1918    None
722     None
1936    None
3327    None
3118    None
1294    None
2194    None
1598    None
6501    None
1186    None
2815    None
3649    None
2368    None
5813    None
2868    None
160     None
6354    None
831     None
5419    None
4033    None
1347    None
541     None
dtype: object

In [24]:
results_dict

{'001-146030': ['29254/06', '13421/03', '9599/13', '36321/08', '2627/09'],
 '001-163508': ['13579/09', '8741/15', '49111/08', '41541/05', '29070/15'],
 '001-146625': ['55264/00', '40207/05', '16381/05', '33229/96', '8461/03'],
 '001-126745': ['52392/99',
  '72174/10',
  '50973/06;8672/07;8722/07',
  '31300/05',
  '51480/99'],
 '001-166878': ['41559/06', '66942/09', '16730/14', '17914/09', '26662/05'],
 '001-182962': ['22318/10',
  '63130/15;63133/15;63138/15;478/16;480/16;891/16;901/16;1905/16;2005/16;2105/16;2200/16;3758/16;4159/16;4353/16;4552/16;4684/16;4817/16;5237/16;5317/16;5332/16;5628/16;6758/16;6990/16;8536/16;8699/16;9414/16;9712/16;10073/16;10079/16;10085/16;10088/16;39419/16',
  '36218/97',
  '57953/00;37392/03',
  '55768/11'],
 '001-110650': ['40631/02', '23893/03', '43109/05', '37213/02', '11830/03'],
 '001-217180': ['40035/98', '25404/09', '10226/13', '14021/10', '49662/07'],
 '001-139546': ['49910/06', '8227/04', '41384/98', '56185/07', '52442/09'],
 '001-110661': ['210

In [25]:
comm_cases

Unnamed: 0,Filename,Questions,Subject Matter,appno,source_file,doc_date,importance,keywords_art_3,keywords_art_3_text,Subj_Count
1988,001-146030,1. Having regard to the cumulative effect of t...,"The applicant, Mr Ryszard Lipczyński, is a Pol...",44027/12,pruned_ADMISSIBILITYCOM_meta.json,2015-09-14,4,[350],(Art. 3) Prohibition of torture,643
3228,001-163508,1. Has the applicant been subjected to inhuman...,"The applicant, Mr V.S., is a stateless person,...",8685/15,pruned_ADMISSIBILITYCOM_meta.json,2018-02-20,4,[350],(Art. 3) Prohibition of torture,1814
2059,001-146625,1. Have the applicants been subjected to tortu...,"The applicants, Mr Kazimierz Wołkowski (“the f...",2037/14,pruned_ADMISSIBILITYCOM_meta.json,2015-09-14,4,[350],(Art. 3) Prohibition of torture,2024
1356,001-126745,"1. Was Onur Yaser Can, the first two applicant...",A. The circumstances of the caseThe applicants...,59683/12,pruned_ADMISSIBILITYCOM_meta.json,2020-12-15,4,[350],(Art. 3) Prohibition of torture,1869
3393,001-166878,1. Has the applicant been subjected to ill-tre...,"The applicant, Mr Damir Perkov, is a Croatian ...",33754/16,pruned_CHAMBER_meta.json,2022-09-20,3,"[350, 90, 596]","(Art. 3) Prohibition of torture, (Art. 3) Degr...",144
4698,001-182962,"1. Have the applicants’ right to life, ensured...",The application concerns the security operatio...,74941/12,pruned_COMMITTEE_meta.json,2019-10-15,4,"[350, 193]","(Art. 3) Prohibition of torture, (Art. 3) Inhu...",84
200,001-110650,1. Has the applicant been subjected to ill-tre...,"The applicant, Mr Sergey Aleksandrovich Savenk...",59731/09,pruned_CHAMBER_meta.json,2013-10-24,4,"[350, 90, 596, 193]","(Art. 3) Prohibition of torture, (Art. 3) Degr...",703
6775,001-217180,1. Is the applicant currently under a threat o...,"The applicant is an Iranian national, who alle...",14820/19,pruned_COMMITTEE_meta.json,2024-03-21,4,"[350, 90, 193]","(Art. 3) Prohibition of torture, (Art. 3) Degr...",199
1546,001-139546,1. Have the State authorities complied with th...,A list of the applicants is set out in the app...,38435/13,pruned_ADMISSIBILITY_meta.json,2015-12-15,4,[350],(Art. 3) Prohibition of torture,781
207,001-110661,1. Has the applicant been subjected to ill-tre...,"The applicant, Mr Vitaliy Vladimirovich Kulik,...",10397/10,pruned_CHAMBER_meta.json,2015-03-19,4,"[350, 90, 596, 193]","(Art. 3) Prohibition of torture, (Art. 3) Degr...",833


### TEST LEGALBERT/LEGALLONGFORMER AND OPENAI

In [9]:
comm_cases = pd.read_pickle('/users/sgdbareh/volatile/ECHR_Importance/VectorDB/train.pkl')

chunk_size = 512
chunk_overlap = 50
embedding_name = 'nlpaueb/legal-bert-base-uncased'
short_name = 'legal-bert_raw'
similarity = 'cosine'

In [10]:
with open(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/docstore_chunk_{chunk_size}_embedding_{short_name}.pkl", "rb") as f:
        docstore = pickle.load(f)
with open(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/index_to_docstore_id_chunk_{chunk_size}_embedding_{short_name}.pkl", "rb") as f:
        index_to_docstore_id = pickle.load(f)

In [12]:
from sentence_transformers import SentenceTransformer, models

In [11]:
# Create a SentenceTransformers model
word_embedding_model = models.Transformer(embedding_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_transformer_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Save the model
sentence_transformer_model.save(f'{short_name}_sentence_transformer_model')

embeddings = HuggingFaceEmbeddings(
        model_name="bert-sentence-transformer",
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={"normalize_embeddings": True}  # Set `True` for cosine similarity
        )


index = faiss.read_index(f"/users/sgdbareh/volatile/ECHR_Importance/VectorDB/faiss_index_chunk_{chunk_size}_embedding_{short_name}.bin")


NameError: name 'models' is not defined

In [5]:
vector_store_NEW = FAISS(
embedding_function=embeddings,
index=index,
docstore=docstore,
index_to_docstore_id=index_to_docstore_id,
distance_strategy=DistanceStrategy.COSINE,
)

In [7]:
vector_store_NEW.similarity_search('tortue abhorrent', 5)

[Document(metadata={'Word Count': 1845, 'The Law': '1. The applicant complains that the prosecution should have been declared inadmissible on grounds that the applicant\'s rights under Article 3 had been violated in that the police officers had forced the applicant to lower his pants and underpants in a public place. He further complains under Article 3 of the Convention that he would have to serve his sentence in the Koraalspecht prison on the isle of Curaçao, where the conditions of detention, according to the findings of the European Committee for the Prevention of Torture and Inhuman or Degrading Treatment in its Report CPT/Inf (96)1, constitute inhuman and degrading treatment.Article 3 of the Convention reads as follows:"No one shall be subjected to torture or to inhuman or degrading treatment or punishment."The Court observes that the applicant raised these two complaints under Article 3 of the Convention for the first time in his appeal in cassation to the Supreme Court, which a

In [8]:
len(vector_store_NEW.index_to_docstore_id)

50195