#### COLIEE 2024

This is Damian Curran and Mike Conway's implementation for Task 1 of the 2024 Competition on Legal Information Extraction/Entailment (COLIEE).

Details of the implementation can be found in our paper 'Similarity Ranking of Case Law Using Propositions as Features' (2024).

#### Imports

In [1]:
# Standard library first, then the project's helper modules (one per line).
# importlib is needed by later cells to reload the helper modules.

import importlib

import t5train_code
import file_code
import pairs_code
import model_code

#### T5 Proposition Extraction Model

In [None]:
# Fine-tune t5-base on training data.
# The helper module is reloaded first so that edits made to t5train_code.py
# after the initial import are picked up without restarting the kernel.

importlib.reload(t5train_code)
from t5train_code import (
    get_trainer,
    train_save_model,
)

trainer = get_trainer()
train_save_model(trainer)

#### Files

In [None]:
# Reload file_code so recent edits are picked up, then pull in the
# file-level feature helpers used by the next cell.
importlib.reload(file_code)
from file_code import (
    get_files,
    add_paragraphs,
    get_paragraphs_formatted,
    add_suppressed_sections,
    add_propositions,
    get_english_propositions,
    add_sentences,
    get_english_sentences,
    add_quotes,
    add_entities,
    add_strings_sets,
    add_set_lists,
    add_judge_name,
    add_year,
    get_embeddings,
)

In [None]:
# Generate files dataframe, one row per file, then run the feature-extraction
# steps over it in the fixed order listed below.
# Return values are discarded, so each step is presumably expected to update
# `files` in place — confirm in file_code.

files = get_files()

file_feature_steps = (
    add_paragraphs,
    get_paragraphs_formatted,
    add_suppressed_sections,
    add_propositions,
    get_english_propositions,
    add_sentences,
    get_english_sentences,
    add_quotes,
    add_entities,
    add_strings_sets,
    add_set_lists,
    add_judge_name,
    add_year,
    get_embeddings,
)
for extract_step in file_feature_steps:
    extract_step(files)

#### Pairs

In [None]:
# Reload pairs_code so recent edits are picked up, then pull in the
# pair-level feature helpers used by the next cell.
importlib.reload(pairs_code)
from pairs_code import (
    get_pairs,
    add_bins,
    get_prop_max_cos_sim_sents,
    get_prop_max_cos_sim_paras,
    get_prop_max_jaccard_sents,
    get_prop_max_jaccard_paras,
    get_prop_max_overlap_sents,
    get_prop_max_overlap_paras,
    add_max_overall,
    get_case_jaccard_sims,
    check_same_case,
    get_case_tfidf_scores,
    get_num_quotes,
    binarize_quotes,
    check_years,
    add_judge_checks,
)

In [None]:
# Generate pairs dataframe. One query-candidate case pair per row. Compare file features from files df to generate pair features:
# Every call after get_pairs discards its return value, so each step is
# presumably expected to add its feature(s) to `pairs` in place — confirm in
# pairs_code. The calls run in the order written; keep that order unless the
# helpers are known to be independent.

pairs = get_pairs(files)
# Proposition-level features (going by the function names: max cosine
# similarity, Jaccard and overlap against sentences and paragraphs).
get_prop_max_cos_sim_sents(files, pairs)
get_prop_max_cos_sim_paras(files, pairs)
get_prop_max_jaccard_sents(files,pairs)
get_prop_max_jaccard_paras(files,pairs)
get_prop_max_overlap_sents(files,pairs)
get_prop_max_overlap_paras(files,pairs)
# NOTE(review): argument order here is (pairs, files), unlike the
# (files, pairs) order of the neighbouring calls — verify against the
# signature in pairs_code.
add_max_overall(pairs,files)
# Case-level features and checks.
get_case_jaccard_sims(files,pairs)
check_same_case(pairs)
get_case_tfidf_scores(files,pairs)
get_num_quotes(files,pairs)
# Runs after get_num_quotes; presumably derives a binary flag from the quote
# counts — TODO confirm.
binarize_quotes(pairs)
check_years(files,pairs)
add_judge_checks(files,pairs)
add_bins(files, pairs)

#### Model

In [None]:
# Do k-fold validation on train set to identify best hyperparameters.
# model_code is reloaded first so edits to the helper module take effect.
# The result is kept in model_df_pairs (used by the following cells) and is
# also passed to save_model_df_pairs.

importlib.reload(model_code)
from model_code import (
    get_k_fold_model_dev_pairs,
    save_model_df_pairs,
)

model_df_pairs = get_k_fold_model_dev_pairs(pairs)
save_model_df_pairs(model_df_pairs)

In [None]:
# Reload model_code so recent edits are picked up, then pass the k-fold output
# (model_df_pairs) to apply_models_to_dfs with infer_type=1.
# NOTE(review): the meaning of infer_type (1 vs 2) is defined in model_code —
# confirm there.
importlib.reload(model_code)
from model_code import apply_models_to_dfs
apply_models_to_dfs(model_df_pairs, infer_type=1)

In [None]:
# Reload model_code so recent edits are picked up, then pass the k-fold output
# (model_df_pairs) to apply_models_to_dfs with infer_type=2.
# NOTE(review): the meaning of infer_type (1 vs 2) is defined in model_code —
# confirm there.
importlib.reload(model_code)
from model_code import apply_models_to_dfs
apply_models_to_dfs(model_df_pairs, infer_type=2)

#### Final Inference

In [1]:
# Train model
# Only the rows of `pairs` whose 'set' column equals 'train' are passed to
# build_train_model. It returns the model together with a second value, which
# is kept as train_df.

importlib.reload(model_code)
from model_code import build_train_model

model, train_df = build_train_model(pairs[pairs['set'] == 'train'])

In [2]:
# Generate final results
# Run inference on the rows of `pairs` whose 'set' column equals 'test', once
# per inference type. results_df is rebound on every iteration, so after the
# loop it holds the result of the last run only.

importlib.reload(model_code)
from model_code import inference_on_test

test_df = pairs[pairs['set'] == 'test']

for infer_type in (1, 2):
    results_df = inference_on_test(model, test_df, infer_type)
    print()  # blank line after each run's output