In [1]:
import spacy
import re
import pandas as pd
import joblib
import sklearn

In [10]:
def simplify_crimes(extract_data, model_cat, model_type):
    """
    Simplify crime data by classifying offences into categories and types, before reducing them.

    Parameters:
    - extract_data: DataFrame containing extracted primary crime data.
    - model_cat: Model for classifying offence categories.
    - model_type: Model for classifying offence types.

    Returns:
    - DataFrame with simplified offence classifications.
    """

    extract_data_copy = extract_data.copy()
    
    # Classify crimes into categories
    extract_data_copy['offence_category'] = model_cat.predict(extract_data['crime'])

    # Remove rows where the crime is classified as a 'none' category
    extract_data_copy = extract_data_copy[extract_data_copy['offence_category'] != 'none'].reset_index(drop=True)

    # Classify crime contexts into types
    extract_data_copy['offence_type'] = model_type.predict(extract_data_copy['context'])

    # Only keep letters where there is at least one context which has been classified as index
    extract_data_copy = extract_data_copy[extract_data_copy.groupby('letter_id')['offence_type'].transform(lambda x: 'i' in x.values)].reset_index(drop=True)

    # Group offence categories and types by letter
    extract_data_simplified = extract_data_copy.groupby('letter_id').apply(
        lambda x: pd.Series({
            'index_offences': set(x.loc[x['offence_type'] == 'i', 'offence_category']), # Unique index offence categories per letter
            'previous_offences': set(x.loc[x['offence_type'] == 'o', 'offence_category']) # Unique previous offence categories per letter
        })
    ).reset_index()

    return extract_data_simplified



In [11]:
# Load extracted crime data from Excel files
extract_mcadl = pd.read_excel('../data/primary_data/extract/mcadl/extract_mcadl.xlsx', dtype={'letter_id': str})
extract_ohdl = pd.read_excel('../data/primary_data/extract/ohdl/extract_ohdl.xlsx', dtype={'letter_id': str})

In [12]:
# Load offence category and type classification models
model_path_cat = '../data/models/offence_cat_model.pkl'
model_cat = joblib.load(model_path_cat)

model_path_type_mcadl = '../data/models/offence_type_model_mcadl.pkl'
model_type_mcadl = joblib.load(model_path_type_mcadl)

model_path_type_ohdl = '../data/models/offence_type_model_ohdl.pkl'
model_type_ohdl = joblib.load(model_path_type_ohdl)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
# Simplify crimes for both data sets
simplified_mcadl = simplify_crimes(extract_mcadl, model_cat, model_type_mcadl)
simplified_ohdl = simplify_crimes(extract_ohdl, model_cat, model_type_ohdl)

  extract_data_simplified = extract_data_copy.groupby('letter_id').apply(
  extract_data_simplified = extract_data_copy.groupby('letter_id').apply(


In [14]:
simplified_mcadl.to_excel('../data/primary_data/extract/mcadl/simplified_mcadl.xlsx', index=False)
simplified_ohdl.to_excel('../data/primary_data/extract/ohdl/simplified_ohdl.xlsx', index=False)