# Law

In [None]:
import sys

# Append the directory of the law module to sys.path so that project-local
# imports in later cells can resolve (presumably this is where `arrest`
# lives -- TODO confirm).
sys.path.append('../../modules/')

In [None]:
import datetime as dt
import glob
import math
import os
import re

import arrest
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## NIBRS

In [None]:
# Relative path to the N-DEx 4.0 NIBRS code-table workbook.
nibrs_file = (
    '../../US/01_inputs/FBI/N-DEx-IEPD-4.0/documentation/nibrs/'
    'N-DEx_4.0_CodeTables_NIBRS.xlsx'
)

In [None]:
# Open the NIBRS workbook as an ExcelFile handle.
# NOTE(review): `nibrs` is not referenced again in the visible cells and the
# handle is never explicitly closed.
nibrs = pd.ExcelFile(nibrs_file)

In [None]:
# Load the 'OffenseCode' sheet with every column read as text; the first two
# rows of the sheet are skipped before parsing.
nibrs_df = pd.read_excel(
    nibrs_file,
    skiprows=2,
    sheet_name='OffenseCode',
    dtype=str,
)

In [None]:
# Discard, in place, every column that contains no values at all.
nibrs_df.dropna(how='all', axis='columns', inplace=True)

In [None]:
# Replace missing NCIC codes with empty strings.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: with pandas copy-on-write the
# in-place call acts on a temporary object and leaves `nibrs_df` unchanged
# (recent pandas versions warn about exactly this pattern).
nibrs_df['NCIC Code'] = nibrs_df['NCIC Code'].fillna('')

In [None]:
# Read the FBI offense-category lookup with all columns as text; the default
# NaN markers are disabled, so blank cells stay as empty strings.
# A column restriction was left disabled by the author:
#   usecols=['_offense_category_code', '_offense_category', '_crime_against']
fbi_offense_categories = pd.read_csv(
    '../../US/01_inputs/processed/fbi_offense_categories.csv',
    keep_default_na=False,
    dtype=str,
)

## CIBRS

In [None]:
# Open the CIBRS offense listing workbook so its sheets can be inspected.
cibrs_file = pd.ExcelFile('../01_inputs/CIBRS Offense Listing.xlsx')

In [None]:
# Show the names of the sheets available in the CIBRS workbook.
cibrs_file.sheet_names

In [None]:
# Load the 'Offense Codes' sheet of the CIBRS workbook, keeping every column
# as text.
cibrs_df = pd.read_excel(
    '../01_inputs/CIBRS Offense Listing.xlsx',
    dtype=str,
    sheet_name='Offense Codes',
)

In [None]:
# Work on an independent copy holding only the CJIS code and its NIBRS codes.
cibrs_subset = cibrs_df.loc[:, ['CJIS CODE', 'NIBRS CODES']].copy()

In [None]:
# Turn the comma-separated 'NIBRS CODES' text into a list of codes per row.
# Whitespace at the ends of the cell and around each comma is discarded, so a
# cell written like "13A, 13B" yields '13A' and '13B' rather than '13A' and
# ' 13B'; a padded code would silently fail to match in the later merge.
# The multi-character pattern is interpreted by pandas as a regular expression.
# Missing cells remain missing.
cibrs_subset['NIBRS Offense Code'] = (
    cibrs_subset['NIBRS CODES'].str.strip().str.split(r'\s*,\s*')
)

In [None]:
# Give each NIBRS code in a row's list its own row (other columns repeat).
cibrs_subset = cibrs_subset.explode(column='NIBRS Offense Code')

In [None]:
# Preview the distinct (CJIS code, NIBRS offense code) pairs; the result is
# only displayed, not stored.
cibrs_subset[['CJIS CODE', 'NIBRS Offense Code']].drop_duplicates()

In [None]:
# Load the preprocessed NIBRS category lookup with all columns as text.
nibrs_categories = pd.read_csv(
    filepath_or_buffer='../../US/01_inputs/processed/nibrs_categories.csv',
    dtype=str,
)

In [None]:
# Attach NIBRS category data to each distinct (CJIS code, NIBRS code) pair.
# No key is given, so the join uses whichever column names both frames share.
# This rebinds `nibrs_df`, replacing the frame loaded from the NIBRS workbook.
nibrs_df = (
    cibrs_subset[['CJIS CODE', 'NIBRS Offense Code']]
    .drop_duplicates()
    .merge(nibrs_categories)
)

In [None]:
# Collapse to one row per CJIS code, summarising each descriptive column with
# arrest.format_unique (helper defined outside this notebook).
cjis_nibrs_df = (
    nibrs_df
    .groupby('CJIS CODE')
    .agg(
        dict.fromkeys(
            [
                'Crime Against',
                'NIBRS Offense Description',
                'NIBRS Offense Category',
            ],
            arrest.format_unique,
        )
    )
    .reset_index()
)

In [None]:
# Rename the key column in place to 'CJISCode'.
cjis_nibrs_df.rename({'CJIS CODE': 'CJISCode'}, axis='columns', inplace=True)

## `CHSOFF`

- [Data source](https://oag.ca.gov/law/code-tables) (specifically [Offense Codes (with LEI codes)](https://oag.ca.gov/sites/all/files/agweb/law-enforcement/code-tables/chsoff.csv?041820220129))
- [Names Source (XML)](https://oag.ca.gov/sites/all/files/agweb/law-enforcement/code-tables/chsoff.xml)

In [None]:
# Parse the XML edition of the CHSOFF table; in the visible cells it is used
# only as the source of column names for the header-less CSV.
chsoff_xml = pd.read_xml('../01_inputs/chsoff_20220415.xml')

In [None]:
# Show the column names taken from the XML file.
chsoff_xml.columns

In [None]:
# The CSV has no header row, so column names are borrowed from the XML
# edition of the same table; every column is read as text.
chsoff_original = pd.read_csv(
    '../01_inputs/chsoff_20220204.csv',
    dtype=str,
    names=chsoff_xml.columns,
    header=None,
)

In [None]:
# Working copy of the offense table without the columns that the rest of the
# notebook does not use.
chsoff = chsoff_original.drop(
    columns=[
        'ValidationCode',
        'TransactionTypeCode',
        'DefaultTypeofCharge',
        'LiteralIndentifierCode',
        'ChargeDegree',
        'BCSCodesLEI',
        'BCSHierarchyCodes',
        'ALPSCognizantCode',
    ],
).copy()

In [None]:
# Convert enactment dates from text to datetimes. No `errors` argument is
# given, so an unparseable value raises here (unlike 'RepealAmendDate',
# which is converted with errors='coerce').
chsoff['EnactDate'] = pd.to_datetime(chsoff['EnactDate'])

In [None]:
# Convert repeal/amendment dates to datetimes; values that cannot be parsed
# become NaT instead of raising.
chsoff['RepealAmendDate'] = chsoff['RepealAmendDate'].pipe(
    pd.to_datetime, errors='coerce')

### Subset data to applicable time periods

In [None]:
# Display the rows whose statute type code is 'ZZ' (they are excluded by the
# filter in the next cell).
chsoff[chsoff['TypeOfStatCode'] == 'ZZ']

In [None]:
# Keep rows that
#   * are not of statute type 'ZZ',
#   * were enacted before 2021, and
#   * either have no repeal/amendment date (including dates that failed to
#     parse) or were repealed/amended before 2020-12-31.
# NOTE(review): rows with a RepealAmendDate on or after 2020-12-31 are dropped
# by this condition -- confirm that is the intended cut-off.
chsoff = (
    chsoff.loc[
        ~chsoff['TypeOfStatCode'].isin(['ZZ'])
        & (chsoff['EnactDate'] < '2021')
        & (
            chsoff['RepealAmendDate'].isnull()
            | (chsoff['RepealAmendDate'] < '2020-12-31')
        )
    ]
    .copy()
    .reset_index(drop=True)
)

### Split `StatutoryNumericCodes` into section and subparts

In [None]:
def _split_statutory_code(row):
    """Parse one row's 'StatutoryNumericCodes' with arrest.Charge.parse_code."""
    return arrest.Charge.parse_code(
        row['StatutoryNumericCodes'], state='CA', arrests=False)


# Each parse result is expanded across the five new columns, in this order.
chsoff[
    ['_section', '_subdivision', '_paragraph', '_subparagraph', '_clause']
] = chsoff.apply(_split_statutory_code, axis=1, result_type='expand')

### Reassemble into _charge_reconstructed for joining on arrest data

In [None]:
# Concatenate the statute type code with the parsed section and subparts,
# with no separator; missing pieces contribute an empty string.
chsoff['_charge_reconstructed'] = chsoff['TypeOfStatCode'].str.cat(
    others=chsoff[
        ['_section', '_subdivision', '_paragraph', '_subparagraph', '_clause']
    ],
    na_rep='',
    sep='',
)

### Join on NIBRS data for later comparison

In [None]:
# Inner-join the offense table with the per-CJIS-code NIBRS summary. No key is
# given, so the join uses the column names the two frames have in common.
chsoff_nibrs_df = chsoff.merge(cjis_nibrs_df, how='inner')

In [None]:
# Persist the joined table, omitting the index column.
chsoff_nibrs_df.to_csv(
    path_or_buf='../01_inputs/processed/c01_nibrs_chsoff.csv',
    index=False,
)

### Prepare for joining on arrest data

In [None]:
# Switch to the underscore-prefixed column names used for the rest of the
# notebook (renamed in place).
chsoff.rename(
    {
        'StatutoryNumericCodes': '_original_code',
        'TypeOfStatCode': '_code_type',
        'TypeOfCharge': '_offense_level',
        'StatuteLiteral': '_charge_description'
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Sort in place: code type, original code and offense level ascending, then
# enactment and repeal/amendment dates descending. Missing values are placed
# first.
chsoff.sort_values(
    by=['_code_type', '_original_code', '_offense_level',
        'EnactDate', 'RepealAmendDate'],
    ascending=[True] * 3 + [False] * 2,
    na_position='first',
    inplace=True,
)

### Handle amendments

In [None]:
# Charges that have at least one row carrying a repeal/amendment date.
repealed_amended = set(
    chsoff.loc[chsoff['RepealAmendDate'].notnull(), '_charge_reconstructed'])

In [None]:
# Charges that have at least one row without a repeal/amendment date.
not_repealed_amended = set(
    chsoff.loc[chsoff['RepealAmendDate'].isnull(), '_charge_reconstructed'])

In [None]:
# Rows for charges that appear both with and without a repeal/amendment date.
amended = chsoff.loc[
    chsoff['_charge_reconstructed'].isin(
        repealed_amended.intersection(not_repealed_amended))
].copy()

#### Subset those amended in 2020 because I'll use the descriptions therein

In [None]:
# Of the amended rows, keep those whose repeal/amendment date is 2020 or later.
amended_late = amended.loc[amended['RepealAmendDate'] >= '2020'].copy()

In [None]:
# Rows for charges that only ever appear with a repeal/amendment date.
not_replaced = chsoff.loc[
    chsoff['_charge_reconstructed'].isin(
        repealed_amended.difference(not_repealed_amended))
].copy()

In [None]:
# Rows for charges that never appear with a repeal/amendment date.
not_amended = chsoff.loc[
    chsoff['_charge_reconstructed'].isin(
        not_repealed_amended.difference(repealed_amended))
].copy()

#### Combine DataFrames

In [None]:
# Stack the three subsets into one frame with a fresh 0..n-1 index.
processed = pd.concat(
    objs=[amended_late, not_replaced, not_amended],
    ignore_index=True,
)

In [None]:
# Show the columns of the combined frame.
processed.columns

### Identify duplicates in processed data

In [None]:
# Every row whose (charge, offense level) pair occurs more than once in
# `processed`; all occurrences are kept.
processed_to_dedupe = processed.loc[
    processed.duplicated(
        ['_charge_reconstructed', '_offense_level'], keep=False)
].copy()

In [None]:
# All reconstructed charge codes present in `processed`.
processed_charges = {*processed['_charge_reconstructed']}

In [None]:
# Rows of `processed` whose charge code does not occur among the rows that
# need de-duplication.
unique_processed = processed.loc[
    ~processed['_charge_reconstructed'].isin(
        set(processed_to_dedupe['_charge_reconstructed']))
].copy()

In [None]:
# Sanity check: display any rows of `unique_processed` that still share a
# (charge, offense level) pair.
unique_processed[unique_processed.duplicated(
    subset=['_charge_reconstructed', '_offense_level'], keep=False)]

In [None]:
def _as_statute_object(row):
    """Bundle one row's enactment dates and description into a dict."""
    return {
        'EnactDate': row['EnactDate'],
        'RepealAmendDate': row['RepealAmendDate'],
        '_charge_description': row['_charge_description'],
    }


unique_processed['statute_object'] = unique_processed.apply(
    _as_statute_object, axis=1)

In [None]:
# Show the columns of `unique_processed`, now including 'statute_object'.
unique_processed.columns

#### Combine duplicates in processed data with unprocessed data

In [None]:
# Rows of `chsoff` whose charge code never made it into `processed`.
unprocessed_to_dedupe = chsoff.loc[
    ~chsoff['_charge_reconstructed'].isin(processed_charges)
].copy()

In [None]:
# Show the columns of `unprocessed_to_dedupe`.
unprocessed_to_dedupe.columns

In [None]:
# Everything that still needs collapsing: duplicated processed rows plus the
# rows that were never processed, under a fresh 0..n-1 index.
to_dedupe = pd.concat(
    objs=[processed_to_dedupe, unprocessed_to_dedupe],
    ignore_index=True,
)

In [None]:
# Show the columns of `to_dedupe`.
to_dedupe.columns

### Prepare for joining on arrest data with missing or incorrect levels, subparts

#### Create objects with date and description data

I want to use unique charge codes while keeping this data for retrieval later.

In [None]:
# Sort in place, descending on all three keys.
to_dedupe.sort_values(
    ['_charge_reconstructed', 'EnactDate', 'RepealAmendDate'],
    inplace=True,
    ascending=False,
)

In [None]:
# Store each row's enactment dates and description together in one dict so
# they can be collected per charge when the rows are grouped.
to_dedupe['statute_object'] = to_dedupe.apply(
    lambda row: {
        field: row[field]
        for field in ('EnactDate', 'RepealAmendDate', '_charge_description')
    },
    axis=1,
)

In [None]:
# Show the columns of `to_dedupe`, now including 'statute_object'.
to_dedupe.columns

In [None]:
# One row per distinct code / level / subpart combination. The per-row statute
# dicts are gathered into a list and the descriptions are summarised with
# arrest.format_unique (helper defined outside this notebook).
# NOTE(review): groupby drops rows with NaN in any key column by default --
# confirm the parsed subpart columns never contain NaN.
deduped = (
    to_dedupe
    .groupby([
        '_original_code',
        '_code_type',
        '_offense_level',
        '_section',
        '_subdivision',
        '_paragraph',
        '_subparagraph',
        '_clause',
        '_charge_reconstructed',
    ])
    .agg(
        _enactment_data=pd.NamedAgg(
            column='statute_object', aggfunc=list),
        _charge_description=pd.NamedAgg(
            column='_charge_description', aggfunc=arrest.format_unique),
    )
    .reset_index()
)

In [None]:
# Sanity check: display rows of `deduped` that still share a
# (charge, offense level) pair, ordered by charge code.
deduped[
    deduped.duplicated(
        subset=['_charge_reconstructed', '_offense_level'], keep=False)
].sort_values(by='_charge_reconstructed')

In [None]:
# Combine the collapsed rows with the rows that were already unique. Columns
# present in only one of the two frames are filled with NaN for the other.
chsoff_dedupe = pd.concat(
    objs=[deduped, unique_processed],
    ignore_index=True,
)

In [None]:
# Show the columns of `deduped`.
deduped.columns

In [None]:
# Show the columns of the combined `chsoff_dedupe` frame.
chsoff_dedupe.columns

In [None]:
# Write the selected columns to JSON, with dates as ISO 8601 strings at
# second precision.
chsoff_dedupe.loc[
    :,
    [
        '_original_code',
        '_code_type',
        '_offense_level',
        '_section',
        '_charge_reconstructed',
        '_enactment_data',
        '_charge_description',
        'EnactDate',
        'RepealAmendDate',
    ],
].to_json(
    path_or_buf='../01_inputs/processed/c01_chsoff.json',
    date_unit='s',
    date_format='iso',
)

In [None]:
def _level_entry(row):
    """Map the row's offense level to its statute dict."""
    return {row['_offense_level']: row['statute_object']}


# NOTE(review): rows that came from `deduped` have no 'statute_object' (that
# frame carries '_enactment_data' instead), so their entry maps the level to
# NaN -- confirm this is intended.
chsoff_dedupe['_level_data'] = chsoff_dedupe.apply(_level_entry, axis=1)

In [None]:
# One row per charge regardless of offense level. The per-level dicts are
# collected into a list; the levels and descriptions are summarised with
# arrest.format_unique (helper defined outside this notebook).
chsoff_dedupe_without_level = (
    chsoff_dedupe
    .groupby(['_code_type', '_section', '_charge_reconstructed'])
    .agg(
        _data_per_charge_level=pd.NamedAgg(
            column='_level_data', aggfunc=list),
        _potential_offense_levels=pd.NamedAgg(
            column='_offense_level', aggfunc=arrest.format_unique),
        _charge_description=pd.NamedAgg(
            column='_charge_description', aggfunc=arrest.format_unique),
    )
    .reset_index()
)

In [None]:
# Sanity check: display any charge code that still occurs on more than one row.
chsoff_dedupe_without_level[chsoff_dedupe_without_level.duplicated(
    subset=['_charge_reconstructed'], keep=False)]

In [None]:
# Show the columns of `chsoff_dedupe_without_level`.
chsoff_dedupe_without_level.columns

In [None]:
# Write the level-independent table to JSON, with dates as ISO 8601 strings
# at second precision.
chsoff_dedupe_without_level.loc[
    :,
    [
        '_code_type',
        '_section',
        '_charge_reconstructed',
        '_data_per_charge_level',
        '_potential_offense_levels',
        '_charge_description',
    ],
].to_json(
    path_or_buf='../01_inputs/processed/c01_chsoff_ignoring_level.json',
    date_unit='s',
    date_format='iso',
)

In [None]:
def _level_and_subpart_entry(row):
    """Key the row's level data by its section followed by its subdivision."""
    return {row['_section'] + row['_subdivision']: row['_level_data']}


chsoff_dedupe['_level_and_subpart_data'] = chsoff_dedupe.apply(
    _level_and_subpart_entry, axis=1)

The most common issue with data entry is the omission of subdivision data altogether where the charge actually corresponds to the first subdivision of the statute.

In [None]:
# Rows with no subdivision, or whose subdivision is '(A)' or '(1)'.
first_subdivision = chsoff_dedupe.loc[
    chsoff_dedupe['_subdivision'].isin(['', '(A)', '(1)'])
].copy()

In [None]:
# Give each offense-level letter a numeric rank (X=0, I=1, M=2, F=3) so rows
# can be ordered by level; values outside the mapping are left as they are.
first_subdivision['_level_quantified'] = (
    first_subdivision['_offense_level']
    .replace(to_replace={'X': 0, 'I': 1, 'M': 2, 'F': 3})
)

In [None]:
# Sort in place by charge code, then by numeric level rank (both ascending),
# which fixes which row the later keep='first' de-duplication retains.
first_subdivision.sort_values(
    ['_charge_reconstructed', '_level_quantified'],
    inplace=True,
)

#### Account for charges missing subparts, but including charge level

In [None]:
# For each (code type, section, offense level), keep only the first row in
# the current sort order.
first_subdivision_with_level = (
    first_subdivision
    .drop_duplicates(
        subset=['_code_type', '_section', '_offense_level'],
        keep='first',
    )
    .copy()
)

In [None]:
# Flag these rows as ones to be matched using the offense level.
first_subdivision_with_level['_match_on_level'] = True

#### Account for charges missing both subparts and charge level

In [None]:
# For each (code type, section), keep only the first row in the current sort
# order, restricted to the columns that do not depend on the offense level.
first_subdivision_without_level = (
    first_subdivision
    .drop_duplicates(subset=['_code_type', '_section'], keep='first')
    .loc[
        :,
        [
            '_original_code',
            '_code_type',
            '_section',
            '_charge_reconstructed',
            '_charge_description',
            'statute_object',
            '_level_data',
        ],
    ]
    .copy()
)

In [None]:
# Flag these rows as ones to be matched without using the offense level.
first_subdivision_without_level['_match_on_level'] = False

In [None]:
# Stack the level-aware and level-agnostic fallbacks under a fresh index.
# Columns missing from the level-agnostic frame are filled with NaN.
ignoring_subparts = pd.concat(
    objs=[first_subdivision_with_level, first_subdivision_without_level],
    ignore_index=True,
)

In [None]:
# Sanity check: display rows that share a (code type, section, offense level)
# combination.
ignoring_subparts[ignoring_subparts.duplicated(
    subset=['_code_type', '_section', '_offense_level'], keep=False)]

In [None]:
# Show the columns of `ignoring_subparts`.
ignoring_subparts.columns

In [None]:
# Remove, in place, the subpart columns and the helper rank column.
ignoring_subparts.drop(
    columns=[
        '_subdivision',
        '_paragraph',
        '_subparagraph',
        '_clause',
        '_level_quantified',
    ],
    inplace=True,
)

In [None]:
# Write the subpart-agnostic lookup to JSON, with dates as ISO 8601 strings
# at second precision.
ignoring_subparts.loc[
    :,
    [
        '_original_code',
        '_code_type',
        '_offense_level',
        '_section',
        '_charge_reconstructed',
        '_enactment_data',
        '_charge_description',
        'EnactDate',
        'RepealAmendDate',
        'statute_object',
        '_level_data',
        '_level_and_subpart_data',
        '_match_on_level',
    ],
].to_json(
    path_or_buf='../01_inputs/processed/c01_chsoff_ignoring_subparts.json',
    date_unit='s',
    date_format='iso',
)