# Embedding + QA Maltese law notes using openai 

Step 1. Importing and chunking the text

In [1]:
import os
import pandas as pd
import docx


In [2]:
def read_docx(file_path):
    doc = docx.Document(file_path)
    paragraphs = [p.text for p in doc.paragraphs]
    return '\n\n'.join(paragraphs)

In [3]:
def chunk_text(file_path):
    text = read_docx(file_path)
    
    paragraphs = text.split('\n\n')  # Split text into paragraphs
    
    chunks = []
    current_chunk = ''
    overlap = []
    
    for paragraph in paragraphs:
        sentences = paragraph.split('. ')  # Split paragraph into sentences
        
        if len(current_chunk.split()) + len(paragraph.split()) <= 500:
            current_chunk += ' ' + ' '.join(overlap) + ' ' + paragraph
        else:
            chunks.append(current_chunk.strip())
            current_chunk = ' '.join(overlap) + ' ' + paragraph
        
        overlap = sentences[-2:]  # Store last 2 sentences of the paragraph as overlap
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

In [4]:
def process_folder(folder_path):
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.docx')]
    data = []
    
    for file_path in file_paths:
        chunks = chunk_text(file_path)
        file_name = os.path.basename(file_path)
        
        for chunk in chunks:
            data.append([file_name, chunk])
    
    df = pd.DataFrame(data, columns=['File Name', 'Chunk'])
    return df

In [34]:
# Usage example
folder_path = r"C:\Users\grupp\Python Files\0. Law LLM\Data to embed"
df = process_folder(folder_path)

In [35]:
# Display the DataFrame
print(df.head())


  File Name                                              Chunk
0  100.docx  100. In this sub-title "criminal proceedings" ...
1  100.docx  Then the law prescribes the punishment. This o...
2  100.docx  The law lays down certain formalities as to ho...
3  100.docx  There is conflicting case law that says that t...
4  100.docx  Otherwise there is no agreement between author...


In [47]:
df.dropna(inplace = True)

In [48]:
df.shape

(340, 2)

In [56]:
import re

def remove_special_characters(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text


# The next steps are taken from OpenAI's cookbook (embedding wikipedia articles)

Step 2. Embedding the chunks

In [50]:
df.Chunk.astype(str)

0      100 In this subtitle criminal proceedings incl...
1      Then the law prescribes the punishment This of...
2      The law lays down certain formalities as to ho...
3      There is conflicting case law that says that t...
4      Otherwise there is no agreement between author...
                             ...                        
335    Knowledge that the victim does not consent or ...
336    There were situations where although they were...
337    It is surprising that the civil and continenta...
338    Usual solution  Marital rape is excluded since...
339    In a time when marital rape in principle was e...
Name: Chunk, Length: 340, dtype: object

In [8]:
import re

In [51]:
def clean_section(section):
    if isinstance(section, tuple):
        if len(section) == 2:
            titles, text = section
        else:
            titles, *text = section
            text = ' '.join(text)
    else:
        text = section

    try:
       
        # Remove special characters using regular expressions
        text = re.sub(r"[^\w\s]", "", text)
        
        # Strip leading/trailing whitespace
        text = text.strip()

        return text
    except Exception as e:
        print(f"Error cleaning section: {e}")
        return None

In [52]:
df['Chunk'] = df['Chunk'].apply(clean_section)

In [53]:
EMBEDDING_MODEL = "text-embedding-ada-002"
BATCH_SIZE = 20

In [13]:
import openai

In [96]:
openai.api_key = "xxxx"  # Replace with your actual OpenAI API key


In [58]:
embeddings = []
errors = []
for batch_start in range(0, len(df['Chunk']), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch = df['Chunk'][batch_start:batch_end].tolist()  # convert Series to list
    cleaned_batch = [remove_special_characters(chunk) for chunk in batch]  # Remove special characters
    input_data = {"input": cleaned_batch}
    print(f"Batch {batch_start} to {batch_end-1}")
    try:
        response = openai.Embedding.create(model=EMBEDDING_MODEL, **input_data)
        for i, be in enumerate(response["data"]):
            assert i == be["index"]  # double check embeddings are in the same order as input
        batch_embeddings = [e["embedding"] for e in response["data"]]
        embeddings.extend(batch_embeddings)
    except Exception as e:
        print(f"Error in batch {batch_start} to {batch_end-1}: {e}")
        errors.extend(range(batch_start, batch_end))
        continue

df = df.drop(errors)  # Remove rows with errors
df.loc[:, 'embedding'] = embeddings


Batch 0 to 19
Batch 20 to 39
Batch 40 to 59
Batch 60 to 79
Batch 80 to 99
Batch 100 to 119
Batch 120 to 139
Batch 140 to 159
Error in batch 140 to 159: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.
Batch 160 to 179
Batch 180 to 199
Batch 200 to 219
Batch 220 to 239
Batch 240 to 259
Batch 260 to 279
Batch 280 to 299
Batch 300 to 319
Batch 320 to 339


In [59]:
df.head()

Unnamed: 0,File Name,Chunk,embedding
0,100.docx,100 In this subtitle criminal proceedings incl...,"[0.0020429801661521196, 0.0176449753344059, -0..."
1,100.docx,Then the law prescribes the punishment This of...,"[0.003880781354382634, 0.03524026274681091, 0...."
2,100.docx,The law lays down certain formalities as to ho...,"[0.007246658205986023, 0.023993026465177536, 0..."
3,100.docx,There is conflicting case law that says that t...,"[0.0009440628928132355, 0.016755245625972748, ..."
4,100.docx,Otherwise there is no agreement between author...,"[0.0015908252680674195, -0.0016626475844532251..."


In [28]:
from scipy import spatial  # for calculating vector similarities for search

In [60]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["Chunk"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [94]:
# examples
strings, relatednesses = strings_ranked_by_relatedness("mens rea", df, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.850


'Criminal Intent  Mens Rea for the offence of Rape  Criminal Intent  Mens Rea for the offence of Rape  Just because in the definition of the offence of rape we find violence this is part of the material element and is not to be taken to constitute the mens rea The mens rea is a reference to the state of mind of the agent and the violence in this case refers to the passive subject As in most other definitions there is no reference to the state of mind of the agent this does not mean that no state of mind is required or that no criminal intent is required The minimum state of mind required of the agent in all criminal offences is always the generic intent consisting in will and understanding In certain definitions sometimes a special purpose besides the generic ingredient is required and this special purpose transforms the criminal intent from a generic to a specific one There is no such explicit requirement in this case  There is no such explicit requirement in this case  If notwithstan

relatedness=0.833


'You must at least prove the material conduct but you must also prove that this conduct was committed by will and understanding  MANZINI goes a little further than Crivellari He agrees that what is required is the conscious and free will to do the act and the intention to cause the event but he adds that this must be done in order to deceive others as to the author or the content or tenor of the document Whilst Crivelleri stops at requiring general ingredients of criminal intent but Manzini goes further as to say that you must intent to decieve others or the content Without this special kind of intention to deceive intention is lacking Whilst Crivelleri stops at requiring general ingredients of criminal intent but Manzini goes further as to say that you must intent to decieve others or the content Without this special kind of intention to deceive intention is lacking ANTOLISEI makes a brief review of the opinions of text writers and points out that the current trend taken by Italian Co

relatedness=0.832


'We require criminal intent According to English Law an honest belief that one is not married even though not really proven is a defense  In our law the more reasonable approach would be to make an evaluation of the situation in terms of the requirement of the defense of mistake of fact We have to refer to the principles which underline this defense of mistake of fact ie the mistake must concern an essential fact as in belief that wife is dead and not that the wife is committing adultery The mistake must also be inevitable in the sense that it could not have been avoided by the exercise of reasonable care Smith and Hogan suggest a method in order to overcome this particular difficulty ie where the defense of mistake is raised then it would be sufficient for the accused of introducing evidence capable of raising a reasonable doubt as to the existence of the required criminal intent and then it would be up to the prosecution to persuade regarding the exclusion of the doubt Once the accus

relatedness=0.830


'Would this amount to a fraudulent device Would this amount to a fraudulent device   The agent has done nothing at to induce it how can it be a fraudulent device This is a mistake which has been voluntarily assumed by the passive subject There is one situation were this might develop into a fraudulent device Let us say that the agent becomes aware that the passive subject believes him to be his partner or husband and he goes along with this mistake that voluntary going along would or could amount to the fraudulent device mentioned here  Let us say that the agent becomes aware that the passive subject believes him to be his partner or husband and he goes along with this mistake that voluntary going along would or could amount to the fraudulent device mentioned here      The Mens Rea of this Offence The Mens Rea of this Offence   Dissent is not required even when the lack of consent is not required neither in the case of violence nor in that rape based on lack of consent This strengths t

relatedness=0.829


'There is the inference that the conduct must have been done with the special intention to deceive  There is the inference that the conduct must have been done with the special intention to deceive  183 Any other person who shall commit forgery of any authentic and public instrument or of any commercial document or private bank document by counterfeiting or altering the writing or signature by feigning any fictitious agreement disposition obligation or discharge or by the insertion of any such agreement disposition obligation or discharge in any of the said instruments or documents after the formation thereof or by any addition to or alteration of any clause declaration or fact which such instruments or documents were intended to contain or prove shall on conviction be liable to imprisonment for a term from thirteen months to four years with or without solitary confinement 183 Any other person who shall commit forgery of any authentic and public instrument or of any commercial document

In [71]:
import tiktoken

In [72]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

In [63]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below articles on the Maltese Criminal Law to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nLaw Notes Section:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question


In [64]:
GPT_MODEL = "gpt-3.5-turbo"

In [67]:
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You are an excellent law student answering questions of a law exam."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message

** LLB 1 Year Criminal Law ** 

In [73]:
ask('Discusss how will and understanding are the building blocks with which the general theory of criminal intent in Maltese Law is constructed. Furthermore explain the relevance of motive, premeditation and good faith in this context. ')

"In Maltese Law, the general theory of criminal intent is constructed based on the building blocks of will and understanding. Will refers to the conscious and voluntary decision to commit a criminal act, while understanding refers to the knowledge and awareness of the consequences of that act.\n\nWill is an essential element in criminal intent as it demonstrates the deliberate choice to engage in the prohibited conduct. It requires a conscious and free decision to commit the act, showing that the individual had control over their actions and intentionally chose to engage in the criminal behavior.\n\nUnderstanding, on the other hand, involves the knowledge and awareness of the consequences of the act. It requires the individual to have a clear understanding of the nature and potential harm caused by their actions. This element ensures that the person committing the act is aware of the potential consequences and is therefore responsible for their actions.\n\nMotive, premeditation, and go

In [75]:
ask('Expain in detail in which self-defence bay be successfully pleaded as a defence against a criminal charge under Maltese law')

"Under Maltese law, self-defense may be successfully pleaded as a defense against a criminal charge if certain conditions are met. The relevant articles to consider are Article 223 and Article 224 of the Maltese Criminal Code.\n\nArticle 223 states that not all those who act to defend themselves can invoke self-defense. In order to plead self-defense, the aggression suffered must be unjust, grave, and inevitable. This means that the person claiming self-defense must have been subjected to an unjust and serious attack that could not have been avoided or averted in any other way. If the person instead confronts the aggressor without a valid reason and participates in a physical confrontation that could have been reasonably avoided, the element of inevitability is lacking, and self-defense cannot be successfully pleaded.\n\nFurthermore, the act of self-defense must be proportional. This means that the person acting in self-defense cannot use more force than necessary to repel the attack. 

# For anyone who just want to query a dataset. It stops here, next is testing some abilities such as clustering using openai. 

We will be stemming the text and removing stopwords to reduce the text count (costs [although costs are almost negligible])

In [78]:
import numpy as np
import pandas as pd
from ast import literal_eval

In [81]:
matrix = np.vstack(df.embedding.values)
matrix.shape


(320, 1536)

In [93]:
import openai

# Reading a review which belongs to each group.
n_clusters = 10
rev_per_cluster = 10
sample_text_length = 50

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")
    
    cluster_df = df[df.Cluster == i]
    cluster_sample = cluster_df.sample(min(rev_per_cluster, len(cluster_df)), random_state=42)
    
    reviews = "\n".join(
        cluster_sample["Chunk"]
        .str[:sample_text_length]
        .values
    )
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=f'What do the following Law notes have in common? Name the common Law topics for each cluster \n\nLaw notes:\n"""\n{reviews}\n"""\n\nTheme:',
        temperature=0,
        max_tokens=64,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    print(response["choices"][0]["text"].replace("\n", ""))
    
    for index, row in cluster_sample.iterrows():
        print(row["File Name"], end=", ")  # Replace "File Name" with the correct column name
        print(row["Chunk"][:sample_text_length])  # Replace "Chunk" with the correct column name
    
    print("-" * 100)


Cluster 0 Theme:  Evidence LawCommon Law Topics: Authentication, Forgery, Fraud, Juridical Relevance, Public Authentic Instruments.
criminal 9th january camilleri.docx, 9th January 2008  9th January 2008  A copy is auth
Criminal Law - camilleri 12th december.docx, Any one of these means will be sufficient to const
Criminal Law - camilleri.docx, Thus it is a document when it is attributed public
Criminal Law - camilleri.docx, As long as the writing is invisible it cannot be a
Criminal Law - camilleri.docx, There is fraud yet no forgery  One has to distingu
Criminal Law - Tuesday 15th January 2008.docx, It must be a document which is bound to be issued 
Criminal Law - camilleri.docx, If it is not juridically relevant then the offence
Criminal Law - camilleri.docx, The document is already a public authentic instrum
Criminal Law - camilleri.docx, Criminal Law 14th November 2007 Wednesday  Crimina
Criminal Law - camilleri 12th december.docx, e articles 627 to 633 and articles 635 to 637 rel

Not quite as interesting as would have liked 

In [89]:
df.columns

Index(['File Name', 'Chunk', 'embedding', 'Cluster'], dtype='object')