In [13]:
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


### Read in Data

In [14]:
data_directory = './Data/'

In [15]:
# Directories holding the question and subject-matter txt files
questions = data_directory +'questions/'
subject_matter = data_directory + 'subject_matter/'


def _read_text_dir(directory):
    '''Return a list of [filename, file contents] pairs for every entry in `directory`.

    Entries are read in sorted filename order so that the resulting row order
    (and any seeded sampling done on it later) does not depend on the
    arbitrary order in which os.listdir returns names.
    '''
    records = []
    for name in sorted(os.listdir(directory)):
        # NOTE(review): the files are assumed to be UTF-8 encoded -- confirm.
        # Being explicit avoids depending on the platform's locale encoding.
        with open(directory + name, 'r', encoding='utf-8') as handle:
            records.append([name, handle.read()])
    return records


# Raw [filename, text] records for each corpus
subject_matter_list = _read_text_dir(subject_matter)
questions_list = _read_text_dir(questions)

# One row per file: the filename plus the full text of that file
questions_df = pd.DataFrame(questions_list,columns=['Filename','Questions'])
subject_matter_df = pd.DataFrame(subject_matter_list,columns=['Filename','Subject Matter'])

In [16]:
# Drop the '.txt' extension so that 'Filename' can serve as the merge key.
# regex=False matches '.txt' literally; under pandas < 2.0 the default treated
# the pattern as a regular expression, where '.' matches any character.
subject_matter_df['Filename'] = subject_matter_df['Filename'].str.replace('.txt', '', regex=False)
questions_df['Filename'] = questions_df['Filename'].str.replace('.txt', '', regex=False)

### Word Count

In [43]:
# Number of whitespace-delimited tokens in each document
question_tokens = questions_df['Questions'].str.split()
questions_df['Word Count'] = question_tokens.str.len()

subject_tokens = subject_matter_df['Subject Matter'].str.split()
subject_matter_df['Word Count'] = subject_tokens.str.len()

In [44]:
subject_matter_df[subject_matter_df['Word Count'] > 10000].sort_values(by='Word Count', ascending=False)

Unnamed: 0,Filename,Subject Matter,Word Count
1260,001-123768,"\n1. The applicant, Mr Zayn Al-Abidin Muhammad...",41339
408,001-112302,"\n1. The applicant, Mr Abd Al Rahim Hussayn Mu...",36320
551,001-113814,"THE FACTS\n1. The applicant, Mr Abd al Rahim H...",31110
1805,001-142597,\n1. Mr Khodorkovskiy (the first applicant) wa...,22623
1244,001-122872,"\n\n1. The applicant, Ms Yuliya Volodymyrivna ...",21128
5028,001-187541,"The applicant, Mr Leonid Borisovich Nevzlin, i...",20084
780,001-115816,"\nThe applicant, Mr Abu Zubaydah, is a statele...",16538
2748,001-157725,"The applicant, Mr Zaven Volodyayevich Naltakya...",16013
2244,001-149050,\n1. Application no. 32631/09 was lodged on 11...,14619
2476,001-153895,"The first applicant, Times Newspapers Ltd, is ...",14480


### Link Data

In [80]:
# Keep only the cases that have both a questions file and a subject-matter file
df = questions_df.merge(subject_matter_df, on='Filename')

In [46]:
# Outer-join the two frames; the '_merge' indicator marks which side each row came from,
# so rows not flagged 'both' are cases missing one of the two files
unmatched_df = questions_df.merge(subject_matter_df, on='Filename', how='outer', indicator=True)
is_unmatched = unmatched_df['_merge'] != 'both'
unmatched_rows = unmatched_df[is_unmatched]
len(unmatched_rows)

273

In [47]:
# See how missingness is distributed according to the listing files
missing_data = []

# Iterate in sorted order: os.listdir gives no ordering guarantee, and the
# frames are later picked out of `missing_data` by position
# ('questions_missing.txt' sorts before 'subject_missing.txt').
for file in sorted(os.listdir(data_directory)):
    if file.endswith('.txt'):
        # Space-separated values, no header row
        data = pd.read_csv(data_directory + file, sep=' ', header=None)
        missing_data.append(data)
        print(f'filename {file}; count {len(data)}')

filename questions_missing.txt; count 141
filename subject_missing.txt; count 339


In [48]:
questions_missing = missing_data[0]
subject_missing = missing_data[1]

In [29]:
# Merge the two missing-file listings to see whether a case is missing from both or from only one
x = subject_missing.merge(questions_missing, on=0, how='outer', indicator=True)

In [30]:
x.rename(columns={0: 'Filename'}, inplace=True)

In [33]:
# Split the merged listing by merge outcome; these groups are used to check data quality by hand
merge_status = x['_merge']
both = x[merge_status == 'both']
right = x[merge_status == 'right_only']
left = x[merge_status == 'left_only']

### Data Processing

In [52]:
df = df.rename(columns={'Word Count_x': 'Question_Count', 'Word Count_y': 'Subject_Matter_Count'})


In [81]:
# Trim leading and trailing whitespace from both text columns
for text_column in ('Questions', 'Subject Matter'):
    df[text_column] = df[text_column].str.strip()

In [82]:
# Remove the repeated 'THE FACTS' heading (literal match, independent of the
# pandas version's regex default)
df['Subject Matter'] = df['Subject Matter'].str.replace('THE FACTS\n', '', regex=False)
# Flatten each text onto one line. Runs of line breaks become a single space
# rather than an empty string, so the words on either side of a break are not
# fused together (e.g. 'Convention?\n2. Has' -> 'Convention? 2. Has').
df['Subject Matter'] = df['Subject Matter'].str.replace(r'\n+', ' ', regex=True)
df['Questions'] = df['Questions'].str.replace(r'\n+', ' ', regex=True)

In [83]:
# Load the importance labels and expose their 'itemid' column under the merge key name 'Filename'
labels = (
    pd.read_csv(data_directory + 'importance_labels.csv')
    .rename(columns={'itemid': 'Filename'})
)

In [84]:
df = pd.merge(df, labels, on='Filename')

In [45]:
df[df['appno'] == '29233/15']

Unnamed: 0,Filename,Questions,Subject Matter,appno,source_file,importance


In [121]:
# check importance counts
importance_counts = df['importance'].value_counts()


In [122]:
importance_counts = pd.DataFrame(importance_counts)

In [130]:
# Share of all cases at each importance level.
# Computed from the counts column only (the first column), so the cell still
# works when re-run after 'pct' / 'sample_cases' have been added, and does not
# depend on how the pandas version names the value_counts column.
case_counts = importance_counts.iloc[:, 0]
importance_counts['pct'] = case_counts / case_counts.sum()

In [138]:
importance_counts['sample_cases'] = importance_counts['pct'].apply(lambda x: round(x * 50))

In [139]:
importance_counts

Unnamed: 0_level_0,count,pct,sample_cases
importance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,5680,0.812009,41
3,1132,0.16183,8
1,150,0.021444,1
2,33,0.004718,0


In [144]:
def sampleHelper(df, importance, size):
    '''Draw `size` rows from the rows of `df` whose 'importance' equals `importance`.

    The draw uses a fixed random_state (154), so the same input frame always
    yields the same rows.
    '''
    at_level = df['importance'] == importance
    level_rows = df.loc[at_level]
    return level_rows.sample(n=size, random_state=154)

In [157]:
def sample(df):
    '''Build a sample of `df` with a fixed number of rows per importance level.

    Quotas: 6 rows for level 1, 6 for level 2, 11 for level 3, and 31 for any
    other level found in the 'importance' column. Levels are processed in the
    order they first appear in `df`, and the per-level draws are concatenated.
    '''
    quotas = {1: 6, 2: 6, 3: 11}
    fallback_quota = 31

    per_level = [
        sampleHelper(df, level, quotas.get(level, fallback_quota))
        for level in df['importance'].unique()
    ]

    return pd.concat(per_level)

In [162]:
sample_df = sample(df)

In [169]:
valid_data = sample_df[['Filename','Questions','Subject Matter','importance']]

In [165]:
# Every row whose index label was not drawn into sample_df, restricted to the modelling columns
held_out = df.drop(index=sample_df.index)
test_data = held_out[['Filename','Questions','Subject Matter','importance']]

In [85]:
df

Unnamed: 0,Filename,Questions,Subject Matter,appno,source_file,importance
0,001-109309,1. In the light of the applicants’ claims and ...,"The applicants, M.M., his wife and a minor chi...",72861/11,pruned_ADMISSIBILITY_meta.json,4
1,001-109315,1. Did the applicant have a fair hearing in th...,"The applicant, Mr Aurelian Anghel, is a Romani...",5968/09,pruned_CHAMBER_meta.json,4
2,001-109317,1. Has there been a violation of the applicant...,"The applicant, Mr Vladimirs Oderovs, is a Latv...",21979/08,pruned_COMMITTEE_meta.json,4
3,001-109386,1. Has there been a violation of Article 6 § 1...,"1. The applicant, Mr Victor Sandu, is a Moldov...",16463/08,pruned_CHAMBER_meta.json,3
4,001-109387,Has there been a violation of Article 3 of the...,"1. The applicant, Mr Sergiu Buhaniuc, is a Mol...",56074/10,pruned_CHAMBER_meta.json,4
...,...,...,...,...,...,...
6990,001-229304,1. Has there been a violation of the applicant...,"The applicants are Italian nationals, living i...",54648/21,pruned_ADMISSIBILITYCOM_meta.json,4
6991,001-229359,Has there been a violation of Article 5 § 3 of...,"The applicant’s pre-trial detention, which sta...",9573/23,pruned_ADMISSIBILITYCOM_meta.json,4
6992,001-229708,Has the applicant suffered discrimination in t...,The application concerns the refusal to grant ...,19191/19,pruned_ADMISSIBILITYCOM_meta.json,4
6993,001-229713,Has the applicant suffered a violation of Arti...,"By a judgment of 29 November 2016, the La Spez...",53715/20,pruned_ADMISSIBILITYCOM_meta.json,4


### GPT Tests

In [11]:
import os

# Move into the project directory. The path is relative, so running this cell
# a second time from inside the project would raise FileNotFoundError; the
# guard makes the cell safe to re-run.
if os.path.basename(os.getcwd()) != 'ECHR_Importance':
    os.chdir('./volatile/ECHR_Importance')

In [12]:
from API_key import openai_key
from openai import OpenAI
client = OpenAI(api_key=openai_key)

In [96]:
# read in data
df = pd.read_pickle('valid_data.pkl')

In [100]:
# The last four rows are held apart as df_examples; all rows before them form df_main
df_main = df.iloc[:-4]
df_examples = df.iloc[-4:]

In [273]:
JSON_SCHEMAS = [{"Case Importance":"int (1-4)","Summary":"string (description of the case)","Reasoning":"string (give your reason for the importance)" },
                {"Case Importance":"int (1-4)"}
                ]

In [None]:
#To Do:
#- explore different roles
#- explore different prompts, both few-shot and zero-shot
#- explore different temperature and max_tokens settings

### Experiment 1: What does GPT-4 know about the ECtHR? Give it the case id and appNo and see if it knows the importance of the case already...
#TO DO:
## Write the classification code
## Write the code
## COST it

### Experiment 2: Prediction of the importance of the case based on the questions and subject matter
#TO DO:
## Write the code - inc processing the JSON responses and saving them
## COST it

##### Experiment 1

In [106]:
metadata = pd.read_json('/users/sgdbareh/volatile/ECHR_Importance/Data/overlap_cases/pruned_COMMUNICATEDCASES_meta.json',lines=True)

In [115]:
metadata.rename(columns={'itemid':'Filename'}, inplace=True)

In [305]:
exp1_data = pd.merge(df_main, metadata, on='Filename')

In [118]:
exp1_data.columns

Index(['Filename', 'Questions', 'Subject Matter', 'importance_x',
       'kpthesaurus', 'meetingnumber', 'resolutionnumber', 'conclusion', 'scl',
       'violation', 'ecli', 'application', 'languagenumber', 'sclappnos',
       'nonviolation', 'judgementdate', 'externalsources', 'doctype',
       'advopstatus', 'referencedate', 'doctypebranch', 'extractedappno',
       'advopidentifier', 'reportdate', 'kpdate', 'applicability',
       'representedby', 'appnoparts', 'docname', 'publishedby',
       'introductiondate', 'isplaceholder', 'rulesofcourt', 'resolutiondate',
       'languageisocode', 'appno', 'documentcollectionid2', 'article',
       'decisiondate', 'separateopinion', 'typedescription', 'importance_y',
       'respondent', 'originatingbody', 'documentcollectionid', 'issue',
       'rank'],
      dtype='object')

In [306]:
exp1_data = exp1_data[['Filename','importance_x','appno','docname']]

In [307]:
exp1_data.rename(columns={'importance_x':'importance'}, inplace=True)

In [284]:
def exp1_get_prompt(schema, name, appnos, prompt_type='first'):

    match prompt_type:
        #option to say don't know, no info on HUDOC given
        case 'first':
            exp1_prompt = f''' You are going to tell me the importance of the cases in the European Court of Human Rights. These values are given to every case after a judgment
                or decision has been received. 
                Using the information given to you tell me the case importance giving a response of either key case, 1, 2 or 3. 
                If you do not know the importance, state that you do not have enough information.
                The output should be given directly in JSON format, with the following schema: {schema}.
                The name of the case is {name} and the application number/s is/are {appno}.
                '''
        #option to say don't know, info on HUDOC given
        case 'second':
            exp1_prompt = f''' You are going to tell me the importance of the cases in the European Court of Human Rights. These values are given to every case after a judgment
                or decision has been received. The case importance is part of the metadata on HUDOC.
                Using the information given to you tell me the importance of the case giving a response of either key case, 1, 2 or 3. 
                If you do not know the importance, state that you do not have enough information.
                The output should be given directly in JSON format, with the following schema: {schema}.
                The name of the case is {name} and the application number/s is/are {appno}.
                '''
        #no option to say don't know, info on HUDOC given
        case 'third':
            exp1_prompt = f''' You are going to tell me the importance of the cases in the European Court of Human Rights. These values are given to every case after a judgment
                or decision has been received. The case importance is part of the metadata on HUDOC.
                Using the information given to you tell me the importance of the case giving a response of either key case, 1, 2 or 3. 
                The output should be given directly in JSON format, with the following schema: {schema}.
                The name of the case is {name} and the application number/s is/are {appno}.
                '''
        #no option to say don't know, no info on HUDOC given
        case 'fourth':
            exp1_prompt = f''' You are going to tell me the importance of the cases in the European Court of Human Rights. These values are given to every case after a judgment
                or decision has been received. 
                Using the information given to you tell me the importance of the case giving a response of either key case, 1, 2 or 3. 
                The output should be given directly in JSON format, with the following schema: {schema}.
                The name of the case is {name} and the application number/s is/are {appno}.
                '''
        #demands LLM to use HUDOC information, no option to say don't know
        case 'fifth':
            exp1_prompt = f''' You are going to tell me the importance of the cases in the European Court of Human Rights.
                Find the information from HUDOC and tell me the importance of the case giving a response of either key case, 1, 2 or 3. 
                The output should be given directly in JSON format, with the following schema: {schema}.
                The name of the case is {name} and the application number/s is/are {appno}.
                '''

    return exp1_prompt

In [336]:
exp1_data.index[1]

1

In [162]:
test_case = exp1_data.iloc[0]

In [163]:
test_case

Filename                 001-179979
importance                        4
appno                      73487/12
docname       TAŞ (ÇAKAR) v. TURKEY
Name: 0, dtype: object

In [331]:
prompt_exp1 = exp1_get_prompt(JSON_SCHEMAS[1], test_case['docname'], test_case['appno'],'first')

In [296]:
prompt_exp1

" You are going to tell me the importance of the cases in the European Court of Human Rights. These values are given to every case after a judgment\n                or decision has been received. The case importance is part of the metadata on HUDOC.\n                Using the information given to you tell me the importance of the case giving a response of either key case, 1, 2 or 3. \n                The output should be given directly in JSON format, with the following schema: {'Case Importance': 'int (1-4)', 'Summary': 'string (description of the case)', 'Reasoning': 'string (give your reason for the importance)'}.\n                The name of the case is TAŞ (ÇAKAR) v. TURKEY and the application number/s is/are 6697/18.\n                "

In [332]:
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[{"role": "user", "content": prompt_exp1}],
  response_format={'type': 'json_object'},
  max_tokens=500,
  temperature=0
)

In [333]:
response

ChatCompletion(id='chatcmpl-9brm8VjQpd6MMvtgh1pg6yYaNrK7N', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "Case Importance": "I do not have enough information"\n}', role='assistant', function_call=None, tool_calls=None))], created=1718811876, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_f4e629d0a5', usage=CompletionUsage(completion_tokens=15, prompt_tokens=155, total_tokens=170))

In [298]:
x = response.choices[0].message.content

In [299]:
# Parse the model's reply, which is expected to be a single JSON document.
# A failure is re-raised as ValueError (what a malformed value actually is,
# and the parent class of JSONDecodeError) with the original error chained.
try:
    data = json.loads(x)
except (json.JSONDecodeError, TypeError) as err:
    # TypeError covers a reply with no text content (x is None)
    raise ValueError(f'Error in decoding JSON response: {x}') from err

In [320]:
test_df = pd.DataFrame()

In [325]:
# Create a DataFrame from the loaded JSON data
temp_df = pd.DataFrame(data,index=[0])

# Concatenate the DataFrame to 'test_df'
test_df = pd.concat([test_df, temp_df], ignore_index=True)


In [301]:

pd.DataFrame(data, index=[0])

Unnamed: 0,Case Importance,Summary,Reasoning
0,2,The case TAŞ (ÇAKAR) v. TURKEY (application nu...,The case has been assigned an importance level...


In [328]:
#evaluation of json files
pd.read_pickle('exp_1_output.pkl')

Unnamed: 0,Case Importance,Summary,Reasoning
0,3,"The case TAŞ (ÇAKAR) v. TURKEY, application nu...","Based on the information provided, there is no..."
1,I do not have enough information,The case KRDŽALIJA AND OTHERS v. MONTENEGRO wi...,The importance of the case is determined based...


##### Experiment 2

In [132]:
JSON_SCHEMAS = [{
    "Case Importance": {"Importance":"int (1-4)","Summary":"string (description of the case)","Reasoning":"string (give your reason for the importance)" }},
    {"Case Importance": {"Importance":"int (1-4)"}}]

In [217]:
def get_prompt(row, zero_shot:bool =True, text:int = 3, examples:list = [], schema:dict = JSON_SCHEMAS[1]):

    '''
    Function to generate a prompt for the GPT-4o model.
    
    Parameters: 
    row: pd.Series
        A row from the dataframe containing the data.
    zero_shot: bool
        A boolean to determine if the prompt is for zero-shot learning.
    text: int
        The section/s of the text to include in the prompt:
            1 = Subject Matter
            2 = Questions
            3 = Both
    examples: list
        A list of the examples to include in the prompt.
        
    Returns:
    prompt: str
        The prompt to be used for the GPT-4o model.
    '''

    match text:
        case 1:
            text = row['Subject Matter']
            text_amount = 'subject matter of the case'
        case 2:
            text = row['Questions']
            text_amount = 'questions asked to the parties'
        case 3:
            text = row['Subject Matter'] + ' ' + row['Questions']
            text_amount = 'subject matter of the case and the questions asked to the parties'
        case _:
            raise ValueError('Invalid text value. Please enter a value between 1 and 3.')

    if zero_shot:
        additional_context = ''
    else:
        #examples = [f'Importance: {i}\n{e}' for i, e in zip(row['importance'], examples)]
        additional_context = f'''You are also given a number of examples for each level of importance. 
                                 1: {examples[0]}; 2: {examples[1]}; 3: {examples[2]}; 4: {examples[3]}'''

    importance_levels = '''1: These are the most important and have been selected as key cases and have been selected for publication in the Court\'s official reports; 
                           2: The case is of high importance. The case makes a significant contribution to the development, clarification or modification of its case law, either generally or in relation to a particular case; 
                           3: The case is of medium importance. The case while not making a significant contribution to the case-law, nevertheless it goes beyond merely applying existing case law; 
                           4: The case is of low importance. The case is of limited interest and simply applies existing case law'''
    
    prompt = f''' You are a lawyer in the European Court of Human Rights, and your goal is to predict the importance of a case, based on information provided from a communicated case. 
    The following information is provided to you:
    You will be given a communicated case, including the {text_amount}.
    You are given a description of the different levels of importance: {importance_levels}.
    {additional_context}.
    Based only on the information given to you predict the importance of the case according to the criteria given, giving a response of either 1, 2, 3 or 4. 
    If you do not have enough information to make a prediction, state that you do not have enough information.
    The output should be given directly in JSON format, with the following schema: {schema}.
    The communicated case information you should base your judgement on is as follows: {text}.
    '''

    return prompt

In [222]:
#basic test
prompt = get_prompt(df.iloc[0],True,3,schema=JSON_SCHEMAS[0])

In [223]:
prompt.strip()

"You are a lawyer in the European Court of Human Rights, and your goal is to predict the importance of a case, based on information provided from a communicated case. \n    The following information is provided to you:\n    You will be given a communicated case, including the subject matter of the case and the questions asked to the parties.\n    You are given a description of the different levels of importance: 1: These are the most important and have been selected as key cases and have been selected for publication in the Court's official reports; \n                           2: The case is of high importance. The case makes a significant contribution to the development, clarification or modification of its case law, either generally or in relation to a particular case; \n                           3: The case is of medium importance. The case while not making a significant contribution to the case-law, nevertheless it goes beyond merely applying existing case law; \n                

In [224]:
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[{"role": "user", "content": prompt}],
  response_format={'type': 'json_object'},
  max_tokens=500,
  temperature=0
  
)

In [225]:
print(response.choices[0].message.content)


{
  "Case Importance": {
    "Importance": 2,
    "Summary": "The applicant, a Turkish national, was convicted for disseminating propaganda in favor of an illegal armed organization during a demonstration. She was sentenced to ten months' imprisonment, with the pronouncement of her conviction suspended on the condition of not committing another intentional offense for five years. The applicant claims violations of her rights under Articles 6, 10, and 11 of the Convention.",
    "Reasoning": "The case raises significant questions regarding the balance between national security concerns and the fundamental rights to freedom of expression and assembly. The reference to a previous case (Gülcü v. Turkey) suggests that this case could contribute to the development or clarification of the Court's case law on these issues, particularly in the context of Turkey's Prevention of Terrorism Act. Therefore, it is of high importance."
  }
}


#### MONITOR BATCH API

In [367]:
client.batches.list(limit=10)

SyncCursorPage[Batch](data=[Batch(id='batch_mKqY9rqtexQZCZOtGQakwehZ', completion_window='24h', created_at=1718816633, endpoint='/v1/chat/completions', input_file_id='file-QI0yrMIEo031Ugkq7XV7zVQJ', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1718816643, error_file_id=None, errors=None, expired_at=None, expires_at=1718903033, failed_at=None, finalizing_at=1718816640, in_progress_at=1718816633, metadata={'description': 'Experiment 1 Valid_4:  experiment_1_third.jsonl'}, output_file_id='file-nggCEpQaQpVN9oa0Aj9cZFKv', request_counts=BatchRequestCounts(completed=50, failed=0, total=50)), Batch(id='batch_qGlMxlC1TKehLdriBbgDZLfS', completion_window='24h', created_at=1718816632, endpoint='/v1/chat/completions', input_file_id='file-6vsDA9yztLOuZNygHrC7bx4Q', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1718903032, failed_at=None, finalizi