# ACL 2023
## Semantic Legal Searcher(SLS) : Neural Information Retrieval-based Semantic Search for Case Law
Modify for Taiwan Case Law

## PIP

In [1]:
# ! pip install transformers
# ! pip install -U sentence-transformers
# ! pip install sentencepiece
# ! pip install faiss-gpu
# ! pip install funcy pickle5

## STEP1. Load Dataframe & KRLawBERT

In [2]:
import os

def list_json_files(directory):
    json_files = []
    # Iterate through all the files and directories
    for root, dirs, files in os.walk(directory):
        for name in files:
            if name.endswith('.json'):
                # Append the absolute path of each .json file to the list
                file_path = os.path.abspath(os.path.join(root, name))
                json_files.append(file_path)
    return json_files

# Specify the directory you want to list .json files from
directory_path = '/home/tzuchichen/Mou/SLS/data-tw-decompress'
json_files = list_json_files(directory_path)
print(f"Total JSON files found: {len(json_files)}")

Total JSON files found: 84497


In [3]:
import json
def extract_fields(json_files):
    all_fields = set()
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                # Get the keys of the JSON object
                fields = data.keys()
                all_fields.update(fields)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return all_fields
# Extract and print fields in JSON files
fields = extract_fields(json_files)
print(f"Fields found in JSON files: {fields}")

Fields found in JSON files: {'JYEAR', 'JNO', 'JCASE', 'JFULL', 'JPDF', 'JDATE', 'JID', 'JTITLE'}


In [4]:
import pandas as pd
def load_json_files_to_dataframe(json_files):
    data_frames = []
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                # flatten the JSON nested structure to a table so that can be stored in DataFrame
                df = pd.json_normalize(data)
                data_frames.append(df)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    # Concatenate all DataFrames into one
    if data_frames:
        combined_df = pd.concat(data_frames, ignore_index=True)
    else:
        combined_df = pd.DataFrame()
    return combined_df
# Load JSON files and combine into a single DataFrame
df = load_json_files_to_dataframe(json_files)
df.dropna(how='any', inplace=True)
print(f">> Law cases data size : {df.shape[0]}")
display(df.head())

>> Law cases data size : 84497


Unnamed: 0,JID,JYEAR,JCASE,JNO,JDATE,JTITLE,JFULL,JPDF
0,"NHEM,113,湖秩聲,2,20240329,2",113,湖秩聲,2,20240329,聲明異議,臺灣士林地方法院裁定\r\n113年度湖秩聲字第2號\r\n原處分機關 臺北市政府警察局南...,https://data.judicial.gov.tw/opendl/JDocFile/N...
1,"NHEM,113,湖秩聲,1,20240319,1",113,湖秩聲,1,20240319,聲明異議,臺灣士林地方法院裁定\r\n113年度湖秩聲字第1號\r\n原處分機關 臺北市政府警察局內...,https://data.judicial.gov.tw/opendl/JDocFile/N...
2,"NHEM,113,湖秩,4,20240311,1",113,湖秩,4,20240311,違反社會秩序維護法,臺灣士林地方法院裁定\r\n113年度湖秩字第4號 \r\n移送機關 臺北市政府警察局...,https://data.judicial.gov.tw/opendl/JDocFile/N...
3,"NHEM,113,湖秩,9,20240313,1",113,湖秩,9,20240313,違反社會秩序維護法,臺灣士林地方法院裁定\r\n113年度湖秩字第9號\r\n移送機關 新北市政府警察局汐...,https://data.judicial.gov.tw/opendl/JDocFile/N...
4,"NHEM,112,湖秩,49,20240311,1",112,湖秩,49,20240311,違反社會秩序維護法,臺灣士林地方法院裁定\r\n112年度湖秩字第49號 \r\n移送機關 臺北市政府警察...,https://data.judicial.gov.tw/opendl/JDocFile/N...


In [5]:
# 3. Load pre-trained & fine-tuned models(KRLawBERT)
my_plms = 'distiluse-base-multilingual-cased-v1'

## STEP 2. Parallel clustering-based Topic Modeling

In [6]:
from models.parallel_clustering_TM import *
df=df.head(1000) #太多資料會跑不動

  from tqdm.autonotebook import tqdm, trange


In [7]:
# 1. Obtain Legal Embeddings
target_text = 'JFULL'

cluster = ParallelCluster(
    dataframe = df,
    tgt_col = target_text,
    model_name = my_plms,
    use_sentence_bert = True,
    )

Batches: 100%|██████████| 32/32 [00:13<00:00,  2.36it/s]

>> Data embeddings shape(Items x PLM_output) : (1000, 512)





In [8]:
# 2. Parallel Clustering
clusters, unclusters = cluster.parallel_cluster(
    clusters = None,
    threshold = 0.71,
    page_size = 1000,
    iterations = 20
    )

===== Iteration 1 / 20 =====


>> Number of Total Clusters :  19
>> Percentage clusted Doc Embeddings : 75.60%


===== Iteration 2 / 20 =====


>> Number of Total Clusters :  26
>> Percentage clusted Doc Embeddings : 89.60%


===== Iteration 3 / 20 =====


>> Number of Total Clusters :  31
>> Percentage clusted Doc Embeddings : 93.70%


===== Iteration 4 / 20 =====


>> Number of Total Clusters :  35
>> Percentage clusted Doc Embeddings : 95.50%


===== Iteration 5 / 20 =====


>> Precautions! Reduce the number of Iterations or the Threshold!
>> Number of Total Clusters :  35
>> Percentage clusted Doc Embeddings : 95.50%


===== Iteration 6 / 20 =====


>> Precautions! Reduce the number of Iterations or the Threshold!
>> Number of Total Clusters :  35
>> Percentage clusted Doc Embeddings : 95.50%


===== Iteration 7 / 20 =====


>> Precautions! Reduce the number of Iterations or the Threshold!
>> Number of Total Clusters :  35
>> Percentage clusted Doc Embeddings : 95.50%


===== Itera

In [9]:
# 3. Stack : Stack the clustered results in order of cluster size
col_list = ['JID', 'JTITLE', 'JFULL']
new_df = cluster.cluster_stack(
    col_list = col_list,
    clusters = clusters,
    unclusters = unclusters
    )

# 4. Extract Keywords from each documents
top_n_words = cluster.extract_top_n_words_per_topic(
    dataframe = new_df,
    n = 10,
    en = False
    )
new_df['keywords'] = [', '.join(top_n_words[i]) for i in new_df['Topic'].values]
display(new_df.head())

# 5. Save the Parallel Clusted Dataset 
new_df.to_csv("./data-tw/clusted_df.csv", sep=',', na_rep="NaN")

Unnamed: 0,JID,JTITLE,JFULL,Topic,keywords
11,"KMDV,113,司促,293,20240331,1",支付命令,福建金門地方法院支付命令\r\n113年度司促字第293號\r\n債 權 人 台新國際...,0,"臺灣南投地方法院支付命令, 請債權人, 支付命令及確定證明書聲請強制執行, 債務人未於不變期..."
13,"KMDV,113,司促,249,20240331,1",支付命令,福建金門地方法院支付命令\r\n113年度司促字第249號\r\n債 權 人 滙豐(台...,0,"臺灣南投地方法院支付命令, 請債權人, 支付命令及確定證明書聲請強制執行, 債務人未於不變期..."
19,"KMDV,113,司促,215,20240308,1",支付命令,福建金門地方法院支付命令\r\n113年度司促字第215號\r\n債 權 人 中華電信...,0,"臺灣南投地方法院支付命令, 請債權人, 支付命令及確定證明書聲請強制執行, 債務人未於不變期..."
21,"KMDV,113,司促,322,20240330,1",支付命令,福建金門地方法院支付命令\r\n113年度司促字第322號\r\n債 權 人 仲信資融...,0,"臺灣南投地方法院支付命令, 請債權人, 支付命令及確定證明書聲請強制執行, 債務人未於不變期..."
22,"KMDV,113,司促,303,20240331,1",支付命令,福建金門地方法院支付命令\r\n113年度司促字第303號\r\n債 權 人 中國信託...,0,"臺灣南投地方法院支付命令, 請債權人, 支付命令及確定證明書聲請強制執行, 債務人未於不變期..."


## STEP 3. Embedding modelization(split-merge) and scoring(multi-interactions)

In [10]:
import importlib
import sys
from models.semantic_legal_searcher import *
from models.semantic_legal_searcher import SLS
importlib.reload(sys.modules['models.semantic_legal_searcher'])

<module 'models.semantic_legal_searcher' from '/home/tzuchichen/Mou/SLS/models/semantic_legal_searcher.py'>

In [11]:
# 1. Obtain query, documents, keywords embeddings from KRLawBERT
my_plms = 'paraphrase-multilingual-MiniLM-L12-v2'
sls = SLS(
    dataframe = new_df,
    doc_col = 'JFULL',
    key_col = 'keywords',
    model_name = my_plms,
    use_sentence_bert = True,
    split_and_merge = True,
    multi_inter = True,
    )

# 2. Build the Index
# (Strategy 1) : All Distance Metric
all_index = sls.all_distance_metric()
# (Strategy 2) : Restricted Distance Metric
restricted_index = sls.restricted_distance_metric(nlist = 10, nprobe = 6)
# restricted_index = sls.restricted_distance_metric(nlist = 200, nprobe = 6)

>> Embedding dimension of the model : 384
>> Split and Merage embeddings shape(Items x PLMs_dim) : (1000, 384)


Batches: 100%|██████████| 32/32 [00:04<00:00,  7.23it/s]

>> Keywords embeddings shape(Items x PLMs_dim) : (1000, 384)





## STEP 4. Semantic law case search with SLS

In [12]:
my_query = "被控告販賣毒品 怎麼上訴 刑事判決 最高法院 案例分析"

In [13]:
# 3. Semantic case law search (Legal Question-Answering)
search_results = sls.semantic_search(
    user_query = my_query,
    top_k = 5,
    index = all_index
    )
search_results_df = pd.DataFrame(search_results)
display(search_results_df)


 === Calculate run time : 20.684 ms === 



Unnamed: 0,JID,JTITLE,JFULL,Topic,keywords,L2 Distance
0,"SYEV,113,營簡,31,20240301,1",損害賠償,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營簡字第31號\r\n原 告 ...,1,"為有理由, 臺灣臺南地方法院柳營簡易庭, 應予准許, 計算式, 按週年利率百分之5計算之利息...",6.766739
1,"SYEV,113,營小,83,20240312,1",損害賠償,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營小字第83號\r\n原 告 ...,1,"為有理由, 臺灣臺南地方法院柳營簡易庭, 應予准許, 計算式, 按週年利率百分之5計算之利息...",6.806182
2,"SYEV,113,營簡,193,20240326,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事裁定\r\n113年度營簡字第193號\r\n原 告...,3,"存款, 權利範圍, 家事法庭, 耕作權, 訴訟代理人, 段000地號土地, 6分之1, 許鈞...",6.839384
3,"NTDV,112,司聲,192,20240311,1",確定訴訟費用額,臺灣南投地方法院民事裁定\r\n112年度司聲字第192號\r\n聲 請 人 林清安 ...,10,"被移送人, 臺灣士林地方法院裁定, 上列被移送人因違反社會秩序維護法案件, 內湖簡易庭, 以...",6.848912
4,"SYEV,113,營小,20,20240308,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營小字第20號\r\n原 告 ...,12,"折舊額, 殘價, 耐用年數, 小數點以下四捨五入, 交通, 事件, 取得成本, 按年息百分之...",6.850277


## Compare Result
### 1. Without Multi-interactions mechanism(Single-interaction)

In [14]:
# 1. Obtain query, documents, keywords embeddings from KRLawBERT
sls = SLS(
    dataframe = new_df,
    doc_col = 'JFULL',
    key_col = 'keywords',
    model_name = my_plms,
    use_sentence_bert = True,
    split_and_merge = False,
    multi_inter = False,
    )

# 2. Build the Index
all_index = sls.all_distance_metric()

# 3. Semantic case law search (Legal Question-Answering)
search_results = sls.semantic_search(
    user_query = my_query,
    top_k = 5,
    index = all_index
    )
search_results_df = pd.DataFrame(search_results)
display(search_results_df)

>> Embedding dimension of the model : 384


Batches: 100%|██████████| 32/32 [00:06<00:00,  4.96it/s]

>> Documents embeddings shape(Items x PLMs_dim) : (1000, 384)



Batches: 100%|██████████| 32/32 [00:04<00:00,  7.14it/s]

>> Keywords embeddings shape(Items x PLMs_dim) : (1000, 384)

 === Calculate run time : 7.4139 ms === 






Unnamed: 0,JID,JTITLE,JFULL,Topic,keywords,L2 Distance
0,"SYEV,111,營簡,377,20240322,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事判決\r\n111年度營簡字第377號\r\n原 告...,1,"為有理由, 臺灣臺南地方法院柳營簡易庭, 應予准許, 計算式, 按週年利率百分之5計算之利息...",6.989759
1,"SYEV,113,營簡,20,20240315,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營簡字第20號\r\n原 告 ...,1,"為有理由, 臺灣臺南地方法院柳營簡易庭, 應予准許, 計算式, 按週年利率百分之5計算之利息...",7.054715
2,"SYEV,113,營簡,193,20240326,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事裁定\r\n113年度營簡字第193號\r\n原 告...,3,"存款, 權利範圍, 家事法庭, 耕作權, 訴訟代理人, 段000地號土地, 6分之1, 許鈞...",7.124142
3,"NTDV,111,訴,409,20240320,1",分割共有物,臺灣南投地方法院民事裁定\r\n111年度訴字第409號\r\n原 告 張學本 ...,3,"存款, 權利範圍, 家事法庭, 耕作權, 訴訟代理人, 段000地號土地, 6分之1, 許鈞...",7.127678
4,"SYEV,113,營簡,85,20240305,1",侵權行為損害賠償（交通）,宣 示 判 決 筆 錄\r\n ...,12,"折舊額, 殘價, 耐用年數, 小數點以下四捨五入, 交通, 事件, 取得成本, 按年息百分之...",7.167915


### 2. Without Split-merge mechanism(Encoding documents-level)

In [15]:
# 1. Obtain query, documents, keywords embeddings from KRLawBERT
sls = SLS(
    dataframe = new_df,
    doc_col = 'JFULL',
    key_col = 'keywords',
    model_name = my_plms,
    use_sentence_bert = True,
    split_and_merge = False,
    multi_inter = True,
    )

# 2. Build the Index
all_index = sls.all_distance_metric()

# 3. Semantic case law search (Legal Question-Answering)
search_results = sls.semantic_search(
    user_query = my_query,
    top_k = 5,
    index = all_index
    )
search_results_df = pd.DataFrame(search_results)
display(search_results_df)

>> Embedding dimension of the model : 384


Batches: 100%|██████████| 32/32 [00:06<00:00,  4.96it/s]

>> Documents embeddings shape(Items x PLMs_dim) : (1000, 384)



Batches: 100%|██████████| 32/32 [00:04<00:00,  7.26it/s]

>> Keywords embeddings shape(Items x PLMs_dim) : (1000, 384)

 === Calculate run time : 7.0236 ms === 






Unnamed: 0,JID,JTITLE,JFULL,Topic,keywords,L2 Distance
0,"SYEV,113,營簡,31,20240301,1",損害賠償,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營簡字第31號\r\n原 告 ...,1,"為有理由, 臺灣臺南地方法院柳營簡易庭, 應予准許, 計算式, 按週年利率百分之5計算之利息...",6.534093
1,"SYEV,113,營簡,20,20240315,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營簡字第20號\r\n原 告 ...,1,"為有理由, 臺灣臺南地方法院柳營簡易庭, 應予准許, 計算式, 按週年利率百分之5計算之利息...",6.766739
2,"SYEV,113,營簡,193,20240326,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事裁定\r\n113年度營簡字第193號\r\n原 告...,3,"存款, 權利範圍, 家事法庭, 耕作權, 訴訟代理人, 段000地號土地, 6分之1, 許鈞...",6.806181
3,"NTDV,112,司聲,192,20240311,1",確定訴訟費用額,臺灣南投地方法院民事裁定\r\n112年度司聲字第192號\r\n聲 請 人 林清安 ...,10,"被移送人, 臺灣士林地方法院裁定, 上列被移送人因違反社會秩序維護法案件, 內湖簡易庭, 以...",6.839384
4,"SYEV,113,營小,20,20240308,1",侵權行為損害賠償（交通）,臺灣臺南地方法院柳營簡易庭民事判決\r\n113年度營小字第20號\r\n原 告 ...,12,"折舊額, 殘價, 耐用年數, 小數點以下四捨五入, 交通, 事件, 取得成本, 按年息百分之...",6.848912
