In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
def most_severe(offence_list, severity_scores_data):
    """
    Determine the most severe offence based on a list of offences and their severity scores.

    The ranking uses the row labelled 'mean' of the scores table, the same row that
    `increased_severity` compares on. If the table has no 'mean' row, the first row
    is used instead, so single-row tables with another label keep working.

    Parameters:
    - offence_list (iterable): Offence names to consider; each must be a column of
      `severity_scores_data`.
    - severity_scores_data (pd.DataFrame): A DataFrame with one column per offence.

    Returns:
    - str: The name of the highest severity offence.

    Raises:
    - KeyError: If an offence name is not a column of `severity_scores_data`.
    - ValueError: If `offence_list` is empty (there is nothing to take a maximum of).
    """
    # Keep only the columns for the offences under consideration
    severity_scores_relevant = severity_scores_data[list(offence_list)]

    # Select the scoring row by label rather than by position so that extra rows
    # placed above 'mean' in the table cannot silently change the ranking
    if 'mean' in severity_scores_relevant.index:
        scores_row = severity_scores_relevant.loc['mean']
    else:
        scores_row = severity_scores_relevant.iloc[0]

    # idxmax returns the column label (offence name) holding the largest score
    return scores_row.idxmax()

In [3]:
def increased_severity(offence_list_index, offence_list_previous, severity_scores_data):
    """
    Report whether the index offences are more severe than the previous offences.

    The comparison is made between the highest 'mean' severity score among the
    index offences and the highest 'mean' severity score among the previous offences.

    Parameters:
    - offence_list_index (iterable): Names of the current index offences.
    - offence_list_previous (iterable): Names of the previous offences.
    - severity_scores_data (pd.DataFrame): Severity scores with one column per
      offence and a row labelled 'mean'.

    Returns:
    - str: 'yes' when the top index score is strictly greater than the top
      previous score, 'no' otherwise.
    """
    def peak_mean_score(offences):
        # Largest entry of the 'mean' row across the requested offence columns
        return severity_scores_data[list(offences)].loc['mean'].max()

    peak_index = peak_mean_score(offence_list_index)
    peak_previous = peak_mean_score(offence_list_previous)

    return 'yes' if peak_index > peak_previous else 'no'

In [4]:
def age_group(age):
    """
    Categorise an individual's age into predefined age groups.

    Bands are contiguous, so fractional ages (e.g. 17.5) fall into the band of
    their whole-year age rather than between bands.

    Parameters:
    - age (int or float): The age of the individual.

    Returns:
    - str or None: One of '<18', '18-21', '22-39', '40-55', '56-70', '>70',
      or None when the age is missing (None / NaN).
    """
    # A missing age cannot be banded; return None explicitly
    if pd.isna(age):
        return None

    # Each test is only reached when the earlier ones failed, so an upper bound suffices
    if age < 18:
        return '<18'
    if age < 22:
        return '18-21'
    if age < 40:
        return '22-39'
    if age < 56:
        return '40-55'
    if age <= 70:
        return '56-70'
    # Label matches the actual boundary of this band (over 70)
    return '>70'

In [5]:
def get_tax_year(hearing_date):
    """
    Determine the tax year based on a given hearing date.

    A tax year runs from 6 April up to, but not including, 6 April of the
    following year, so a timestamp at any time of day on 5 April still belongs to
    the tax year that is ending.

    Parameters:
    - hearing_date (pd.Timestamp): The hearing date to evaluate.

    Returns:
    - str: '2017/18', '2018/19' or '2019/20', or 'outside range' if the date
      falls outside those tax years (this includes a missing date, pd.NaT).
    """
    # Consecutive tax-year start dates; each year ends where the next one begins
    tax_year_starts = [
        pd.Timestamp('2017-04-06'),
        pd.Timestamp('2018-04-06'),
        pd.Timestamp('2019-04-06'),
        pd.Timestamp('2020-04-06'),
    ]
    tax_year_labels = ['2017/18', '2018/19', '2019/20']

    # Half-open interval [start, next_start) keeps timestamps with a time
    # component on 5 April inside the correct year
    for label, start, next_start in zip(tax_year_labels, tax_year_starts, tax_year_starts[1:]):
        if start <= hearing_date < next_start:
            return label

    # Return a message if the hearing date is outside the defined ranges
    return 'outside range'

In [6]:
def convert_to_set(string):
    """
    Build a set from the text form of a Python list.

    The text (e.g. "[1, 2, 3]") is parsed with `ast.literal_eval`, which accepts
    Python literals only, and the parsed value is passed to `set`.

    Parameters:
    - string (str): The string representation of a list to be converted.

    Returns:
    - set: The distinct elements of the parsed list.
    """
    parsed_literal = ast.literal_eval(string)
    return set(parsed_literal)


In [7]:
def prepare_data(linked_data, severity_scores_data):
    """
    Prepare and transform linked data for analysis by applying various replacements and calculations.

    The recoding steps read the module-level dictionaries `binary_dict`,
    `sentence_type_dict`, `review_reason_dict`, `representation_dict`,
    `ethnicity_dict` and `gender_dict`, which must be defined before this
    function is called. `hearing_date` and `original_target_date` must be
    datetime columns because the `.dt` accessor is used on them.

    Parameters:
    - linked_data (pd.DataFrame): The DataFrame containing linked data.
    - severity_scores_data (pd.DataFrame): The DataFrame containing the severity scores of each offending category.

    Returns:
    - pd.DataFrame: A cleaned and prepared DataFrame ready for analysis.
    """
    # Create a copy of the linked data to avoid modifying the original DataFrame
    linked_copy = linked_data.copy()

    # Replace categorical descriptions with their corresponding dictionary mappings
    linked_copy['decision_binary'] = linked_copy['decision'].replace(binary_dict)
    linked_copy['sentence_type'] = linked_copy['custody_type_description'].replace(sentence_type_dict)
    linked_copy['review_reason'] = linked_copy['review_reason_description'].replace(review_reason_dict)
    linked_copy['representation'] = linked_copy['representation_status_description'].replace(representation_dict)
    linked_copy['ethnic_group'] = linked_copy['ethnicity_description'].replace(ethnicity_dict)
    linked_copy['gender'] = linked_copy['gender'].replace(gender_dict)

    # Apply the age group categorization to create a new column
    linked_copy['age_group'] = linked_copy['years_old_at_hearing'].apply(age_group)

    # Convert index and previous offence lists to sets
    linked_copy['index_offences'] = linked_copy['index_offences'].apply(convert_to_set)
    linked_copy['previous_offences'] = linked_copy['previous_offences'].apply(convert_to_set)

    # Determine the most severe index offence
    linked_copy['most_severe_index'] = linked_copy['index_offences'].apply(
        lambda offences: most_severe(offences, severity_scores_data)
    )

    # Determine if there were previous offences
    linked_copy['previous'] = linked_copy['previous_offences'].apply(
        lambda offences: 'yes' if len(offences) > 0 else 'no'
    )

    # Determine if the severity of offending has increased
    linked_copy['increased_severity'] = linked_copy.apply(
        lambda row: increased_severity(row['index_offences'], row['previous_offences'], severity_scores_data),
        axis=1
    )

    # Check for repeat offending by determining if any index offences intersect with previous offences
    linked_copy['repeat_offending'] = linked_copy.apply(
        lambda row: 'yes' if row['index_offences'].intersection(row['previous_offences']) else 'no',
        axis=1
    )

    # Calculate the number of days elapsed between hearing and original target dates
    linked_copy['elapsed_days'] = (linked_copy['hearing_date'] - linked_copy['original_target_date']).dt.days

    # Assign hearing status based on elapsed days. Rows whose elapsed days are
    # missing (either date absent) are left missing instead of falling through
    # to the 'early' label.
    elapsed_days = linked_copy['elapsed_days']
    status_labels = np.where(elapsed_days > 0, 'delayed',
                             np.where(elapsed_days == 0, 'on time', 'early'))
    linked_copy['hearing_status'] = pd.Series(status_labels, index=linked_copy.index).where(elapsed_days.notna())

    # Extract year from the hearing date and determine the tax year
    linked_copy['year'] = linked_copy['hearing_date'].dt.year.astype(str)
    linked_copy['tax_year'] = linked_copy['hearing_date'].apply(get_tax_year)

    # Identify if there are multiple hearings for a prisoner
    linked_copy['multiple_hearings'] = np.where(
        linked_copy['prisoner_id'].duplicated(keep=False), 'yes', 'no'
    )

    # A hearing has a predecessor exactly when its date is later than the
    # earliest hearing date recorded for the same prisoner. Comparing against the
    # per-prisoner minimum replaces a row-by-row scan of the whole frame.
    first_hearing_date = linked_copy.groupby('prisoner_id')['hearing_date'].transform('min')
    linked_copy['previous_hearing'] = np.where(
        linked_copy['hearing_date'] > first_hearing_date, 'yes', 'no'
    )

    # Drop unnecessary columns from the prepared data
    prepared_data = linked_copy.drop(
        columns=['decision', 'custody_type_description', 'review_reason_description',
                 'representation_status_description', 'nationality_description', 'ethnicity_description',
                 'index_offences', 'previous_offences', 'current_establishment_description', 'original_target_date',
                 'difference_days', 'hearing_type', 'years_old_at_hearing', 'link_type']
    )

    return prepared_data

In [8]:
# Recode tables used by prepare_data. Each maps a raw description to the
# analysis label; descriptions sharing a label are grouped with dict.fromkeys.
# Entries mapped to np.nan are recoded as missing.

# `decision` -> `decision_binary`
binary_dict = {
    'no direction for release': 'knockback',
    **dict.fromkeys(['direction for release', 'open conditions'], 'progression'),
    'no open conditions': 'knockback',
}

# `custody_type_description` -> `sentence_type`
sentence_type_dict = {
    **dict.fromkeys(
        [
            'Mandatory (MLP)',
            'Discretionary',
            'Discretionary (Tariff Expired)',
            'Automatic',
            'HMP [*]',
            'CFL (murder) (S93)',
            'CFL (non-murder) (S94)',
            'DFL',
            'Life sentence for 2nd listed offence',
            'Lifer Migration',
        ],
        'life',
    ),
    **dict.fromkeys(['IPP', 'DPP'], 'ipp'),
    **dict.fromkeys(['EDS (non parole)', 'EDS', 'EPP', 'ESP'], 'extended determinate'),
    **dict.fromkeys(['Determinate', 'DCR', 'SOPC'], 'determinate'),
}

# `review_reason_description` -> `review_reason`
review_reason_dict = {
    'Pre Tariff': 'pre tariff',
    **dict.fromkeys(['First Review [*]', 'On Tariff'], 'on tariff'),
    **dict.fromkeys(
        ['Post Tariff', 'Post tariff consideration for open conditions'],
        'post tariff',
    ),
    **dict.fromkeys(
        [
            '01 RECALL',
            '02 ESP',
            'Subsequent Review [*]',
            'Oral Lifer Recall Hearing',
            'Recall Outcome',
        ],
        'recall',
    ),
    'Advice Case': 'advice',
    **dict.fromkeys(
        [
            'Review_outcome',
            'Oral Hearing',
            'Miscellaneous Review',
            'Oral Determinate Pre Release',
        ],
        np.nan,
    ),
}

# `representation_status_description` -> `representation`
representation_dict = {
    'Not Represented': 'not represented',
    'Represented': 'represented',
    **dict.fromkeys(['Not Applicable', 'Not Specified'], np.nan),
}

# `ethnicity_description` -> `ethnic_group`
ethnicity_dict = {
    'White - British': 'white',
    **dict.fromkeys(
        ['White - Other', 'White - Irish', 'White Gypsy or Irish Traveller'],
        'white other',
    ),
    **dict.fromkeys(
        [
            'Black or Black British - Caribbean',
            'Black or Black British - Africa',
            'Black or Black British - Other',
        ],
        'black',
    ),
    **dict.fromkeys(
        [
            'Mixed - White & Black Caribbean',
            'Mixed - Other',
            'Mixed - White & Black African',
            'Mixed - white & Asian',
        ],
        'mixed',
    ),
    **dict.fromkeys(
        [
            'Asian or Asian British - Pakistani',
            'Asian or Asian British - Other',
            'Asian or Asian British - Indian',
            'Asian or Asian British - Bangladeshi',
            'Chinese',
        ],
        'asian',
    ),
    **dict.fromkeys(['Refusal', 'Not Applicable'], np.nan),
    **dict.fromkeys(['Other - Arab', 'Other Ethnic Group'], 'other'),
    'Not Known': np.nan,
}

# `gender` codes -> `gender` labels
gender_dict = {
    'M': 'male',
    'F': 'female',
    **dict.fromkeys(['F ( Was M )', 'M ( Was F )'], 'transgender'),
}

In [14]:
# Read the two linked datasets, keeping letter_id as text
linked_mcadl = pd.read_excel(
    io='../data/linked_data/mcadl/linked_mcadl.xlsx',
    dtype={'letter_id': str},
)
linked_ohdl = pd.read_excel(
    io='../data/linked_data/ohdl/linked_ohdl.xlsx',
    dtype={'letter_id': str},
)
# Read the severity score table; its first column becomes the row index
severity_scores = pd.read_excel(
    io='../data/supplementary_data/severity_scores.xlsx',
    index_col=0,
)

In [15]:
# Apply the same preparation pipeline to each linked dataset
prepared_mcadl = prepare_data(linked_data=linked_mcadl, severity_scores_data=severity_scores)
prepared_ohdl = prepare_data(linked_data=linked_ohdl, severity_scores_data=severity_scores)

In [18]:
# Write each prepared dataset next to its linked source, without the row index
prepared_mcadl.to_excel(
    excel_writer='../data/linked_data/mcadl/prepared_mcadl.xlsx',
    index=False,
)
prepared_ohdl.to_excel(
    excel_writer='../data/linked_data/ohdl/prepared_ohdl.xlsx',
    index=False,
)