In [102]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import numpy as np


### Read in Data

In [22]:
data_directory = './Data/'

In [23]:
# Set the directory for the txt files
questions = data_directory +'questions/'
subject_matter = data_directory + 'subject_matter/'

# Initialize an empty list to store the data
questions_list = []
subject_matter_list = []

# Iterate over the subject matter txt files
for file in os.listdir(subject_matter):
    # Read the txt file into a dataframe
    with open(subject_matter + file, 'r') as text:
        data = [file, text.read()]
    # Append the data to the respective lists
    subject_matter_list.append(data)

# Iterate over the question txt files
for file in os.listdir(questions):
    # Read the txt file into a dataframe
    with open(questions + file, 'r') as text:
        data = [file, text.read()]
    # Append the data to the respective lists
    questions_list.append(data)

#questions_list
questions_df = pd.DataFrame(questions_list,columns=['Filename','Questions'])
subject_matter_df = pd.DataFrame(subject_matter_list,columns=['Filename','Subject Matter'])

In [24]:
subject_matter_df['Filename'] = subject_matter_df['Filename'].str.replace('.txt', '')
questions_df['Filename'] = questions_df['Filename'].str.replace('.txt', '')

### Word Count

In [43]:
questions_df['Word Count'] = questions_df['Questions'].str.split().str.len()
subject_matter_df['Word Count'] = subject_matter_df['Subject Matter'].str.split().str.len()

In [44]:
subject_matter_df[subject_matter_df['Word Count'] > 10000].sort_values(by='Word Count', ascending=False)

Unnamed: 0,Filename,Subject Matter,Word Count
1260,001-123768,"\n1. The applicant, Mr Zayn Al-Abidin Muhammad...",41339
408,001-112302,"\n1. The applicant, Mr Abd Al Rahim Hussayn Mu...",36320
551,001-113814,"THE FACTS\n1. The applicant, Mr Abd al Rahim H...",31110
1805,001-142597,\n1. Mr Khodorkovskiy (the first applicant) wa...,22623
1244,001-122872,"\n\n1. The applicant, Ms Yuliya Volodymyrivna ...",21128
5028,001-187541,"The applicant, Mr Leonid Borisovich Nevzlin, i...",20084
780,001-115816,"\nThe applicant, Mr Abu Zubaydah, is a statele...",16538
2748,001-157725,"The applicant, Mr Zaven Volodyayevich Naltakya...",16013
2244,001-149050,\n1. Application no. 32631/09 was lodged on 11...,14619
2476,001-153895,"The first applicant, Times Newspapers Ltd, is ...",14480


### Link Data

In [45]:
# create merged df
df = pd.merge(questions_df, subject_matter_df, on='Filename')

In [46]:
#create unmatched df to check for missing data
unmatched_df = pd.merge(questions_df, subject_matter_df, on='Filename', how='outer', indicator=True)
unmatched_rows = unmatched_df[unmatched_df['_merge'] != 'both']
len(unmatched_rows)

273

In [47]:
#see how missingness is distributed according to the files
missing_data = []

for file in os.listdir(data_directory):
    # Read the txt file into a dataframe
    if file.endswith('.txt'):
        with open(data_directory + file, 'r') as text:
            data = pd.read_csv(text, sep=' ',header=None)
            missing_data.append(data)
            print(f'filename {file}; count {len(data)}')

filename questions_missing.txt; count 141
filename subject_missing.txt; count 339


In [48]:
questions_missing = missing_data[0]
subject_missing = missing_data[1]

In [29]:
#merge togehter the missing data to see if it affects both files or just one for a case
x = pd.merge(subject_missing, questions_missing, on=0, how='outer', indicator=True)

In [30]:
x.rename(columns={0: 'Filename'}, inplace=True)

In [33]:
#use these lists to manually check data quality
both = x[x['_merge'] == 'both']
right = x[x['_merge'] == 'right_only']
left = x[x['_merge'] == 'left_only']

### Data Processing

In [52]:
df = df.rename(columns={'Word Count_x': 'Question_Count', 'Word Count_y': 'Subject_Matter_Count'})


In [54]:
#remove whitespace
df['Questions'] = df['Questions'].str.strip()
df['Subject Matter'] = df['Subject Matter'].str.strip()

In [71]:
#remove repeat phrases
df['Subject Matter'] = df['Subject Matter'].str.replace('THE FACTS\n', '')
df['Subject Matter'] = df['Subject Matter'].str.replace('\n', '')
df['Questions'] = df['Questions'].str.replace('\n', '')

In [97]:
#link outcome label
labels = pd.read_csv(data_directory + 'importance_labels.csv')
labels = labels.rename(columns={'itemid':'Filename'})

In [99]:
df = pd.merge(df, labels, on='Filename')

In [100]:
df

Unnamed: 0,Filename,Questions,Question_Count,Subject Matter,Subject_Matter_Count,appno,source_file,importance
0,001-109309,1. In the light of the applicants’ claims and ...,94,"The applicants, M.M., his wife and a minor chi...",935,72861/11,pruned_ADMISSIBILITY_meta.json,4
1,001-109315,1. Did the applicant have a fair hearing in th...,216,"The applicant, Mr Aurelian Anghel, is a Romani...",3392,5968/09,pruned_CHAMBER_meta.json,4
2,001-109317,1. Has there been a violation of the applicant...,61,"The applicant, Mr Vladimirs Oderovs, is a Latv...",780,21979/08,pruned_COMMITTEE_meta.json,4
3,001-109386,1. Has there been a violation of Article 6 § 1...,146,"1. The applicant, Mr Victor Sandu, is a Moldov...",1641,16463/08,pruned_CHAMBER_meta.json,3
4,001-109387,Has there been a violation of Article 3 of the...,48,"1. The applicant, Mr Sergiu Buhaniuc, is a Mol...",805,56074/10,pruned_CHAMBER_meta.json,4
...,...,...,...,...,...,...,...,...
6990,001-229304,1. Has there been a violation of the applicant...,167,"The applicants are Italian nationals, living i...",54,54648/21,pruned_ADMISSIBILITYCOM_meta.json,4
6991,001-229359,Has there been a violation of Article 5 § 3 of...,98,"The applicant’s pre-trial detention, which sta...",300,9573/23,pruned_ADMISSIBILITYCOM_meta.json,4
6992,001-229708,Has the applicant suffered discrimination in t...,102,The application concerns the refusal to grant ...,130,19191/19,pruned_ADMISSIBILITYCOM_meta.json,4
6993,001-229713,Has the applicant suffered a violation of Arti...,135,"By a judgment of 29 November 2016, the La Spez...",293,53715/20,pruned_ADMISSIBILITYCOM_meta.json,4


In [121]:
# check importance counts
importance_counts = df['importance'].value_counts()


In [122]:
importance_counts = pd.DataFrame(importance_counts)

In [130]:
importance_counts['pct'] = importance_counts.apply(lambda x: x / x.sum())

In [138]:
importance_counts['sample_cases'] = importance_counts['pct'].apply(lambda x: round(x * 50))

In [139]:
importance_counts

Unnamed: 0_level_0,count,pct,sample_cases
importance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,5680,0.812009,41
3,1132,0.16183,8
1,150,0.021444,1
2,33,0.004718,0


In [144]:
def sampleHelper(df, importance, size):

    return df[df['importance'] == importance].sample(size, random_state=154)

In [157]:
def sample(df):

    sample_list = []

    for imp in df['importance'].unique():
        if imp == 1:
            sample_list.append(sampleHelper(df, imp, 6))
        elif imp == 2:
            sample_list.append(sampleHelper(df, imp, 6))
        elif imp == 3:
            sample_list.append(sampleHelper(df, imp, 11))
        else:
            sample_list.append(sampleHelper(df, imp, 31))

    return pd.concat(sample_list)

In [162]:
sample_df = sample(df)

In [169]:
valid_data = sample_df[['Filename','Questions','Subject Matter','importance']]

In [165]:
test_data = df.drop(sample_df.index)
test_data = test_data[['Filename','Questions','Subject Matter','importance']]

In [None]:
valid_data.to_pickle('valid_data.pkl')
test_data.to_pickle('test_data.pkl')

### GPT Tests

In [178]:
from API_key import openai_key
from openai import OpenAI
client = OpenAI(api_key=openai_key)

In [172]:
# read in data
df = pd.read_pickle('valid_data.pkl')

In [184]:
df

Unnamed: 0,Filename,Questions,Subject Matter,importance
4430,001-179979,Has there been a violation of the applicant’s ...,The applicant is a Turkish national who was bo...,4
4497,001-180761,1. Have the applicants complied with the six-m...,The application concerns the deportation of th...,4
3593,001-169580,Was the applicant subjected to inhuman and deg...,"The applicant, Mr Mihai Basturea, is a Romania...",4
3329,001-165054,"Was Demyan Khadzhyradov’s right to life, ensur...","The applicants, Mr Mykola Demyanovych Khadzhyr...",4
6377,001-209011,Has there been an interference with the applic...,"The applicant complains, under Article 1 of Pr...",4
158,001-110558,1. Was the applicant subjected to ill-treatmen...,"The applicant, Mr Iurie Craciuneac, is a Moldo...",4
4968,001-187172,1. Was Article 6 § 1 of the Convention under i...,"The applicant, Mr Ali Özdemir, is a Turkish na...",4
1027,001-118892,1. Did the applicant have a fair hearing in th...,"The applicant, Mr Jerome Colloredo-Mansfeld, i...",4
5771,001-200157,Has there been a violation of the applicants’ ...,The circumstances of the caseThe facts of the ...,4
6888,001-220247,Has the applicant been convicted twice for the...,The application concerns the applicant’s right...,4


In [None]:
### Experiment 1: What does GPT-4 know about the ECtHR? Give it the case id and appNo and see if it knows the importance of the case already...
#TO DO:
## Write the classification code
## Write the code
## COST it

### Experiment 2: Prediction of the importance of the case based on the questions and subject matter

#TO DO:
## Write the code
## COST it

##### Experiment 1

In [None]:
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
)

##### Experiment 2

In [216]:
JSON_SCHEMAS = [{
    "Case Importance": {"Importance":"int (1-4)","Summary":"string (description of the case)","Reasoning":"string (give your reason for the importance)" }},
    {"Case Importance": {"Importance":"int (1-4)"}}]

In [217]:
def get_prompt(row, zero_shot:bool =True, text:int = 3, examples:list = [], schema:dict = JSON_SCHEMAS[1]):

    '''
    Function to generate a prompt for the GPT-4o model.
    
    Parameters: 
    row: pd.Series
        A row from the dataframe containing the data.
    zero_shot: bool
        A boolean to determine if the prompt is for zero-shot learning.
    text: int
        The section/s of the text to include in the prompt:
            1 = Subject Matter
            2 = Questions
            3 = Both
    examples: list
        A list of the examples to include in the prompt.
        
    Returns:
    prompt: str
        The prompt to be used for the GPT-4o model.
    '''

    match text:
        case 1:
            text = row['Subject Matter']
            text_amount = 'subject matter of the case'
        case 2:
            text = row['Questions']
            text_amount = 'questions asked to the parties'
        case 3:
            text = row['Subject Matter'] + ' ' + row['Questions']
            text_amount = 'subject matter of the case and the questions asked to the parties'
        case _:
            raise ValueError('Invalid text value. Please enter a value between 1 and 3.')

    if zero_shot:
        additional_context = ''
    else:
        #examples = [f'Importance: {i}\n{e}' for i, e in zip(row['importance'], examples)]
        additional_context = f'''You are also given a number of examples for each level of importance. 
                                 1: {examples[0]}; 2: {examples[1]}; 3: {examples[2]}; 4: {examples[3]}'''

    importance_levels = '''1: These are the most important and have been selected as key cases and have been selected for publication in the Court\'s official reports; 
                           2: The case is of high importance. The case makes a significant contribution to the development, clarification or modification of its case law, either generally or in relation to a particular case; 
                           3: The case is of medium importance. The case while not making a significant contribution to the case-law, nevertheless it goes beyond merely applying existing case law; 
                           4: The case is of low importance. The case is of limited interest and simply applies existing case law'''
    
    prompt = f''' You are a lawyer in the European Court of Human Rights, and your goal is to predict the importance of a case, based on information provided from a communicated case. 
    The following information is provided to you:
    You will be given a communicated case, including the {text_amount}.
    You are given a description of the different levels of importance: {importance_levels}.
    {additional_context}.
    Based only on the information given to you predict the importance of the case according to the criteria given, giving a response of either 1, 2, 3 or 4. 
    If you do not have enough information to make a prediction, state that you do not have enough information.
    The output should be given directly in JSON format, with the following schema: {schema}.
    The communicated case information you should base your judgement on is as follows: {text}.
    '''

    return prompt

In [222]:
#basic test
prompt = get_prompt(df.iloc[0],True,3,schema=JSON_SCHEMAS[0])

In [223]:
prompt.strip()

"You are a lawyer in the European Court of Human Rights, and your goal is to predict the importance of a case, based on information provided from a communicated case. \n    The following information is provided to you:\n    You will be given a communicated case, including the subject matter of the case and the questions asked to the parties.\n    You are given a description of the different levels of importance: 1: These are the most important and have been selected as key cases and have been selected for publication in the Court's official reports; \n                           2: The case is of high importance. The case makes a significant contribution to the development, clarification or modification of its case law, either generally or in relation to a particular case; \n                           3: The case is of medium importance. The case while not making a significant contribution to the case-law, nevertheless it goes beyond merely applying existing case law; \n                

In [224]:
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[{"role": "user", "content": prompt}],
  response_format={'type': 'json_object'},
  max_tokens=500,
  temperature=0
  
)

In [225]:
print(response.choices[0].message.content)


{
  "Case Importance": {
    "Importance": 2,
    "Summary": "The applicant, a Turkish national, was convicted for disseminating propaganda in favor of an illegal armed organization during a demonstration. She was sentenced to ten months' imprisonment, with the pronouncement of her conviction suspended on the condition of not committing another intentional offense for five years. The applicant claims violations of her rights under Articles 6, 10, and 11 of the Convention.",
    "Reasoning": "The case raises significant questions regarding the balance between national security concerns and the fundamental rights to freedom of expression and assembly. The reference to a previous case (Gülcü v. Turkey) suggests that this case could contribute to the development or clarification of the Court's case law on these issues, particularly in the context of Turkey's Prevention of Terrorism Act. Therefore, it is of high importance."
  }
}
