In [1]:
import sys

# Extend the import search path with the shared modules directory
# (described in the original note as the directory of the "law module").
MODULES_DIR = '../../modules/'
sys.path.append(MODULES_DIR)

In [2]:
import datetime as dt
import glob
import math
import os
import re

import arrest
import numpy as np
import pandas as pd

## `CHSOFF`

- [Data source](https://oag.ca.gov/law/code-tables) (specifically [Offense Codes (with LEI codes)](https://oag.ca.gov/sites/all/files/agweb/law-enforcement/code-tables/chsoff.csv?041820220129))
- [Names Source (XML)](https://oag.ca.gov/sites/all/files/agweb/law-enforcement/code-tables/chsoff.xml)

In [3]:
# Parse the XML edition of the offense table. Later cells use it for its
# column names, which label the header-less CSV.
CHSOFF_XML_PATH = '../01_inputs/chsoff_20220415.xml'
chsoff_xml = pd.read_xml(CHSOFF_XML_PATH)

In [4]:
# Display the field names parsed from the XML (reused below as the CSV header).
chsoff_xml.columns

Index(['ValidationCode', 'CJISCode', 'TransactionTypeCode',
       'StatutoryNumericCodes', 'TypeOfStatCode', 'StatuteLiteral',
       'DefaultTypeofCharge', 'TypeOfCharge', 'LiteralIndentifierCode',
       'ChargeDegree', 'BCSCodesLEI', 'BCSHierarchyCodes', 'EnactDate',
       'RepealAmendDate', 'ALPSCognizantCode'],
      dtype='object')

In [5]:
# The CSV has no header row, so the field names come from the XML edition.
# Every column is read as text; dates are converted explicitly in later cells.
chsoff_original = pd.read_csv(
    filepath_or_buffer='../01_inputs/chsoff_20220204.csv',
    names=chsoff_xml.columns,
    header=None,
    dtype=str,
)

In [6]:
# Columns that the rest of this notebook does not read.
unused_columns = [
    'ValidationCode',
    'CJISCode',
    'TransactionTypeCode',
    'DefaultTypeofCharge',
    'LiteralIndentifierCode',
    'ChargeDegree',
    'BCSCodesLEI',
    'BCSHierarchyCodes',
    'ALPSCognizantCode',
]
# Work on an independent copy so `chsoff_original` keeps the raw table.
chsoff = chsoff_original.drop(columns=unused_columns).copy()

In [7]:
# Convert the enactment date from text to datetime. No `errors=` override is
# given, so an unparseable value raises instead of being silently dropped.
chsoff['EnactDate'] = chsoff['EnactDate'].pipe(pd.to_datetime)

In [8]:
# Convert the repeal/amend date; values that cannot be parsed become NaT
# (errors='coerce'), which the subsetting cell below treats as "no date".
chsoff['RepealAmendDate'] = chsoff['RepealAmendDate'].pipe(
    pd.to_datetime, errors='coerce')

## Subset data to applicable time periods

In [9]:
# Row filters, named so each condition can be inspected on its own.
enacted_before_2021 = chsoff['EnactDate'] < '2021'
repeal_date = chsoff['RepealAmendDate']
# NOTE(review): versions repealed/amended on or after 2020-12-31 are dropped
# by this condition -- confirm that is the intended cutoff.
repealed_before_cutoff_or_never = (
    (repeal_date < '2020-12-31') | repeal_date.isnull()
)
not_zz_code = ~chsoff['TypeOfStatCode'].isin(['ZZ'])

keep_rows = enacted_before_2021 & repealed_before_cutoff_or_never & not_zz_code

# Copy the selection and renumber the index from zero.
chsoff = chsoff[keep_rows].copy().reset_index(drop=True)

## Split `StatutoryNumericCodes` into section and subparts

In [10]:
# Target columns for the pieces returned by the parser, in order.
parsed_columns = ['_section', '_subdivision', '_paragraph', '_subparagraph', '_clause']


def _split_statutory_code(row):
    """Parse one row's `StatutoryNumericCodes` with the project's charge parser."""
    return arrest.Charge.parse_section_and_subparts(
        row['StatutoryNumericCodes'], arrests=False)


# result_type='expand' spreads each returned sequence across the target columns.
chsoff[parsed_columns] = chsoff.apply(
    _split_statutory_code,
    axis=1,
    result_type='expand',
)

### Reassemble into _charge_reconstructed for joining on arrest data

In [11]:
# Concatenate code type + section + subparts with no separator; missing
# pieces contribute an empty string.
subpart_columns = ['_section', '_subdivision', '_paragraph', '_subparagraph', '_clause']
chsoff['_charge_reconstructed'] = chsoff['TypeOfStatCode'].str.cat(
    others=chsoff[subpart_columns],
    sep='',
    na_rep='',
)

## Violent

In [12]:
# Load only the reconstructed charge codes from the previously prepared
# violent-offense file, as text.
chsoff_violent = pd.read_csv(
    filepath_or_buffer='../01_inputs/processed/c01_chsoff_violent.csv',
    dtype=str,
    usecols=['_charge_reconstructed'],
)

In [13]:
# Inspect rows whose reconstructed charge starts with "PC460".
# NOTE(review): the pattern is only anchored at the start, so it also matches
# section 4600 (see the PC4600(A) rows in this cell's output).
chsoff.loc[chsoff['_charge_reconstructed'].str.contains('^PC460',regex=True)]

Unnamed: 0,StatutoryNumericCodes,TypeOfStatCode,StatuteLiteral,TypeOfCharge,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed
1502,460(A),PC,BURGLARY:FIRST DEGREE,F,1992-01-01,NaT,460,(A),,,,PC460(A)
1503,460(B),PC,BURGLARY:SECOND DEGREE,F,1992-01-01,NaT,460,(B),,,,PC460(B)
1504,460(B),PC,BURGLARY:SECOND DEGREE,M,1992-01-01,NaT,460,(B),,,,PC460(B)
3249,4600(A),PC,DAMAGE JAIL/PRISON/ETC,M,1997-01-01,NaT,4600,(A),,,,PC4600(A)
3250,4600(A),PC,DAMAGE JAIL/PRISON/ETC,F,1997-01-01,2020-11-02,4600,(A),,,,PC4600(A)
3251,4600(A),PC,DAMAGE JAIL/PRISON $950+,F,2020-11-03,NaT,4600,(A),,,,PC4600(A)


In [14]:
# Mark every charge whose reconstructed code starts with "PC460" as violent.
# NOTE(review): the prefix match also hits PC4600 (see the inspection output
# above), and the following cell assigns the whole `_chsoff_violent` column
# from the violent-charge list, so flags set here do not survive a
# top-to-bottom run -- confirm whether this cell is still wanted.
_pc460_prefix_mask = chsoff['_charge_reconstructed'].str.contains(
    '^PC460', regex=True)
chsoff.loc[_pc460_prefix_mask, '_chsoff_violent'] = True

In [15]:
# Flag a charge as violent when its reconstructed code appears in the violent
# list. The assignment covers every row, so any `_chsoff_violent` values
# written earlier are replaced.
violent_charges = set(chsoff_violent['_charge_reconstructed'])
chsoff['_chsoff_violent'] = chsoff['_charge_reconstructed'].isin(violent_charges)

In [16]:
# Spot-check the combined "241/243" code and its violent flag.
chsoff.loc[chsoff['_charge_reconstructed'].str.contains('241/243')]

Unnamed: 0,StatutoryNumericCodes,TypeOfStatCode,StatuteLiteral,TypeOfCharge,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent
5463,241/243,PC,ASLT/BATT PEACE OFR/ETC,M,1982-01-01,NaT,241/243,,,,,PC241/243,True


## Prepare for joining on arrest data

In [17]:
# Switch to the underscore-prefixed column names used from here on.
join_column_names = {
    'StatutoryNumericCodes': '_original_code',
    'TypeOfStatCode': '_code_type',
    'TypeOfCharge': '_offense_level',
}
chsoff = chsoff.rename(columns=join_column_names)

In [18]:
# (column, ascending?) pairs: codes and levels ascending, dates descending.
sort_spec = [
    ('_code_type', True),
    ('_original_code', True),
    ('_offense_level', True),
    ('EnactDate', False),
    ('RepealAmendDate', False),
]
# Missing values are placed first within each sort key.
chsoff = chsoff.sort_values(
    by=[column for column, ascending_flag in sort_spec],
    ascending=[ascending_flag for column, ascending_flag in sort_spec],
    na_position='first',
)

### Handle amendments

In [19]:
# Charge codes that have at least one row carrying a repeal/amend date.
has_repeal_date = chsoff['RepealAmendDate'].notnull()
repealed_amended = set(chsoff.loc[has_repeal_date, '_charge_reconstructed'])

In [20]:
# Charge codes that have at least one row without a repeal/amend date.
lacks_repeal_date = chsoff['RepealAmendDate'].isnull()
not_repealed_amended = set(chsoff.loc[lacks_repeal_date, '_charge_reconstructed'])

In [21]:
# Codes present in both sets: some rows have a repeal/amend date, others do not.
codes_in_both_sets = repealed_amended.intersection(not_repealed_amended)
amended = chsoff[chsoff['_charge_reconstructed'].isin(codes_in_both_sets)].copy()

#### Subset those amended in 2020 because I'll use the descriptions therein

In [22]:
# Keep rows whose repeal/amend date is in 2020 or later. Rows with no date
# (NaT) fail the comparison and are left out.
dated_2020_or_later = amended['RepealAmendDate'] >= '2020'
amended_late = amended.loc[dated_2020_or_later].copy()

In [23]:
# Codes where every row carries a repeal/amend date.
only_dated_codes = repealed_amended.difference(not_repealed_amended)
not_replaced = chsoff[chsoff['_charge_reconstructed'].isin(only_dated_codes)].copy()

In [24]:
# Codes where no row carries a repeal/amend date.
only_undated_codes = not_repealed_amended.difference(repealed_amended)
not_amended = chsoff[chsoff['_charge_reconstructed'].isin(only_undated_codes)].copy()

#### Combine DataFrames

In [25]:
# Stack the three subsets into one frame with a fresh 0..n-1 index.
processed_parts = [amended_late, not_replaced, not_amended]
processed = pd.concat(objs=processed_parts, ignore_index=True)

### Identify duplicates in processed data

In [26]:
# keep=False marks every member of a repeated (charge, level) pair, not just
# the later occurrences.
is_repeated_pair = processed.duplicated(
    subset=['_charge_reconstructed', '_offense_level'],
    keep=False,
)
processed_to_dedupe = processed[is_repeated_pair].copy()

In [27]:
# Distinct charge codes present in `processed`.
processed_charges = set(processed['_charge_reconstructed'].unique())

In [28]:
# Exclude every charge code that has any row awaiting de-duplication, even
# rows of that code that are not themselves duplicated.
codes_needing_dedupe = set(processed_to_dedupe['_charge_reconstructed'])
has_pending_dedupe = processed['_charge_reconstructed'].isin(codes_needing_dedupe)
unique_processed = processed[~has_pending_dedupe].copy()

In [29]:
# Sanity check: list any (charge, level) pairs still repeated in
# `unique_processed`. The recorded output is an empty frame.
unique_processed[unique_processed.duplicated(
    subset=['_charge_reconstructed', '_offense_level'], keep=False)]

Unnamed: 0,_original_code,_code_type,StatuteLiteral,_offense_level,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent


In [30]:
def _row_to_statute_object(row):
    """Bundle a row's dates and description into a single dict."""
    return {
        key: row[key]
        for key in ('EnactDate', 'RepealAmendDate', 'StatuteLiteral')
    }


# One dict per row, stored alongside the original columns.
unique_processed['statute_object'] = unique_processed.apply(
    _row_to_statute_object, axis=1)

#### Combine duplicates in processed data with unprocessed data

In [31]:
# Rows of `chsoff` whose charge code never made it into `processed`.
already_processed = chsoff['_charge_reconstructed'].isin(processed_charges)
unprocessed_to_dedupe = chsoff[~already_processed].copy()

In [32]:
# Everything that still needs collapsing: repeated processed rows plus the
# rows that were never processed.
dedupe_parts = [processed_to_dedupe, unprocessed_to_dedupe]
to_dedupe = pd.concat(objs=dedupe_parts, ignore_index=True)

### Prepare for joining on arrest data with missing or incorrect levels, subparts

#### Create objects with date and description data

I want to use unique charge codes while keeping this data for retrieval later.

In [33]:
# Descending on all three keys, so within a charge code the rows with the
# latest enactment date come first.
dedupe_sort_keys = ['_charge_reconstructed', 'EnactDate', 'RepealAmendDate']
to_dedupe = to_dedupe.sort_values(by=dedupe_sort_keys, ascending=False)

In [34]:
# Fields bundled into each per-row dict, in this order.
statute_fields = ['EnactDate', 'RepealAmendDate', 'StatuteLiteral']
to_dedupe['statute_object'] = to_dedupe.apply(
    lambda row: {field: row[field] for field in statute_fields},
    axis=1,
)

In [35]:
# Columns that together identify one charge at one offense level.
dedupe_group_keys = [
    '_original_code', '_code_type', '_offense_level',
    '_section', '_subdivision', '_paragraph', '_subparagraph', '_clause',
    '_charge_reconstructed', '_chsoff_violent',
]
# Output column -> (source column, aggregator). Each group's statute objects
# are gathered into a list and its descriptions are passed to
# `arrest.format_unique`.
dedupe_aggregations = {
    'enactment_data': ('statute_object', list),
    'StatuteLiteral': ('StatuteLiteral', arrest.format_unique),
}
deduped = (
    to_dedupe
    .groupby(dedupe_group_keys)
    .agg(**dedupe_aggregations)
    .reset_index()
)

In [36]:
# Sanity check: list any (charge, level) pairs that still appear more than
# once after grouping. The recorded output is an empty frame.
deduped[deduped.duplicated(
    subset=['_charge_reconstructed', '_offense_level'], keep=False)].sort_values(by='_charge_reconstructed')

Unnamed: 0,_original_code,_code_type,_offense_level,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent,enactment_data,StatuteLiteral


In [37]:
# Stack the grouped rows and the already-unique rows. The two frames do not
# share all columns (see the column listing in the next cell), so columns
# missing from one side are left empty for that side's rows.
dedupe_frames = [deduped, unique_processed]
chsoff_dedupe = pd.concat(objs=dedupe_frames, ignore_index=True)

In [38]:
# Display the combined column set of `chsoff_dedupe`.
chsoff_dedupe.columns

Index(['_original_code', '_code_type', '_offense_level', '_section',
       '_subdivision', '_paragraph', '_subparagraph', '_clause',
       '_charge_reconstructed', '_chsoff_violent', 'enactment_data',
       'StatuteLiteral', 'EnactDate', 'RepealAmendDate', 'statute_object'],
      dtype='object')

In [39]:
# Copy the violent-flagged rows (selected columns only) to the system
# clipboard for manual review.
violent_review_columns = [
    '_original_code', '_code_type', '_offense_level', '_section',
    '_charge_reconstructed', '_chsoff_violent', 'StatuteLiteral', 'EnactDate',
    'RepealAmendDate',
]
is_flagged_violent = chsoff_dedupe['_chsoff_violent'] == True
chsoff_dedupe.loc[is_flagged_violent, violent_review_columns].to_clipboard()

In [40]:
# Columns written to the main JSON export.
json_export_columns = [
    '_original_code', '_code_type', '_offense_level', '_section', '_subdivision',
    '_charge_reconstructed', '_chsoff_violent', 'enactment_data', 'StatuteLiteral',
    'EnactDate', 'RepealAmendDate',
]
# Dates are serialised as ISO strings at one-second precision.
chsoff_dedupe[json_export_columns].to_json(
    path_or_buf='../01_inputs/processed/c01_chsoff.json',
    date_format='iso',
    date_unit='s',
)

In [41]:
# chsoff_dedupe.to_csv('../01_inputs/processed/c01_chsoff.csv', index=False)

In [42]:
def _statute_payload(row):
    """Return the date/description data available for one `chsoff_dedupe` row.

    Rows that came from `unique_processed` carry a single dict in
    `statute_object`. Rows that came from `deduped` have no `statute_object`
    (it is NaN after the concat); their data lives in `enactment_data` as a
    list of statute dicts. Falling back to that list keeps those rows from
    being stored as `{level: NaN}`.
    """
    statute = row['statute_object']
    if isinstance(statute, dict):
        return statute
    return row['enactment_data']


# Key each row's statute data by its offense level. The value is a dict for
# single-version rows and a list of dicts for rows collapsed from several
# versions.
chsoff_dedupe['_level_data'] = chsoff_dedupe.apply(
    lambda x: {x['_offense_level']: _statute_payload(x)}, axis=1)

In [43]:
# One output row per charge code, regardless of offense level.
charge_group_keys = ['_code_type', '_section', '_charge_reconstructed']
# Output column -> (source column, aggregator).
per_charge_aggregations = {
    'data_per_charge_level': ('_level_data', list),
    'potential_offense_levels': ('_offense_level', arrest.format_unique),
    'StatuteLiteral': ('StatuteLiteral', arrest.format_unique),
}
chsoff_dedupe_without_level = (
    chsoff_dedupe
    .groupby(charge_group_keys)
    .agg(**per_charge_aggregations)
    .reset_index()
)

In [44]:
# Sanity check: list any charge code that appears more than once after
# collapsing offense levels. The recorded output is an empty frame.
chsoff_dedupe_without_level[chsoff_dedupe_without_level.duplicated(
    subset=['_charge_reconstructed'], keep=False)]

Unnamed: 0,_code_type,_section,_charge_reconstructed,data_per_charge_level,potential_offense_levels,StatuteLiteral


In [45]:
# Write the level-agnostic table; dates as ISO strings at one-second precision.
chsoff_dedupe_without_level.to_json(
    path_or_buf='../01_inputs/processed/c01_chsoff_ignoring_level.json',
    date_unit='s',
    date_format='iso',
)

In [46]:
def _key_by_section_and_subdivision(row):
    """Wrap a row's `_level_data` under its section+subdivision string."""
    section_with_subdivision = row['_section'] + row['_subdivision']
    return {section_with_subdivision: row['_level_data']}


chsoff_dedupe['_level_and_subpart_data'] = chsoff_dedupe.apply(
    _key_by_section_and_subdivision, axis=1)

In [47]:
# One output row per (code type, section): gather the per-subdivision data
# into a list and summarise the subdivisions with `arrest.format_unique`.
per_section_aggregations = {
    'data_per_subdivision_and_level': ('_level_and_subpart_data', list),
    'subdivisions': ('_subdivision', arrest.format_unique),
}
chsoff_dedupe_without_level_or_subparts = (
    chsoff_dedupe
    .groupby(['_code_type', '_section'])
    .agg(**per_section_aggregations)
    .reset_index()
)

In [48]:
# Display the per-section table (pandas truncates the middle rows).
chsoff_dedupe_without_level_or_subparts

Unnamed: 0,_code_type,_section,data_per_subdivision_and_level,subdivisions
0,BP,10085.5,[{'10085.5': {'M': {'EnactDate': Timestamp('19...,
1,BP,10139,[{'10139': {'M': {'EnactDate': Timestamp('1943...,
2,BP,10153.1,[{'10153.1': {'M': {'EnactDate': Timestamp('19...,
3,BP,10167.11,[{'10167.11(A)': {'M': {'EnactDate': Timestamp...,"(B), (A)"
4,BP,10167.2,[{'10167.2': {'M': {'EnactDate': Timestamp('19...,
...,...,...,...,...
2655,WI,777,[{'777': {'X': {'EnactDate': Timestamp('1961-0...,(A)
2656,WI,8101,[{'8101(A)': {'F': {'EnactDate': Timestamp('19...,"(B), (A)"
2657,WI,8103,[{'8103(A)': {'F': {'EnactDate': Timestamp('19...,"(F), (A), (I)"
2658,WI,871,[{'871(A)': {'M': {'EnactDate': Timestamp('198...,"(B), (A), (C), (D)"


The most common issue with data entry is the omission of subdivision data altogether where the charge actually corresponds to the first subdivision of the statute.

In [49]:
# Subdivision values treated as "first": none, (A), or (1).
first_subdivision_markers = ['', '(A)', '(1)']
is_first_subdivision = chsoff_dedupe['_subdivision'].isin(first_subdivision_markers)
first_subdivision = chsoff_dedupe[is_first_subdivision].copy()

In [50]:
# Numeric rank for each offense-level letter, used as a sort key below.
# Values not listed in the mapping are left unchanged by `replace`.
offense_level_rank = {'X': 0, 'I': 1, 'M': 2, 'F': 3}
first_subdivision['_level_quantified'] = (
    first_subdivision['_offense_level'].replace(offense_level_rank)
)

In [51]:
# Order by charge code, then by ascending level rank, so the later
# keep='first' de-duplication picks rows deterministically.
first_subdivision = first_subdivision.sort_values(
    by=['_charge_reconstructed', '_level_quantified'],
)

#### Account for charges missing subparts, but including charge level

In [52]:
# One row per (code type, section, offense level); with the frame already
# sorted, keep='first' retains the earliest-sorted charge code for each.
first_subdivision_with_level = (
    first_subdivision
    .drop_duplicates(
        subset=['_code_type', '_section', '_offense_level'],
        keep='first',
    )
    .copy()
)

In [53]:
# Display the columns available on `first_subdivision`.
first_subdivision.columns

Index(['_original_code', '_code_type', '_offense_level', '_section',
       '_subdivision', '_paragraph', '_subparagraph', '_clause',
       '_charge_reconstructed', '_chsoff_violent', 'enactment_data',
       'StatuteLiteral', 'EnactDate', 'RepealAmendDate', 'statute_object',
       '_level_data', '_level_and_subpart_data', '_level_quantified'],
      dtype='object')

#### Account for charges missing both subparts and charge level

In [54]:
# Columns kept for the level-agnostic fallback rows (no `_offense_level`).
without_level_columns = [
    '_original_code', '_code_type', '_section', '_charge_reconstructed',
    '_chsoff_violent', 'StatuteLiteral', 'statute_object', '_level_data',
]
# One row per (code type, section): the first row in the current sort order.
first_subdivision_without_level = (
    first_subdivision
    .drop_duplicates(subset=['_code_type', '_section'], keep='first')
    .loc[:, without_level_columns]
    .copy()
)

In [55]:
# Stack the with-level and without-level fallback rows into one lookup table.
subpart_fallback_frames = [
    first_subdivision_with_level,
    first_subdivision_without_level,
]
ignoring_subparts = pd.concat(objs=subpart_fallback_frames, ignore_index=True)

In [56]:
# Sanity check: list any repeated (code type, section, offense level)
# combinations. The recorded output is an empty frame.
ignoring_subparts[ignoring_subparts.duplicated(
    subset=['_code_type', '_section', '_offense_level'], keep=False)]

Unnamed: 0,_original_code,_code_type,_offense_level,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent,enactment_data,StatuteLiteral,EnactDate,RepealAmendDate,statute_object,_level_data,_level_and_subpart_data,_level_quantified


In [57]:
# Write the subpart-agnostic table; dates as ISO strings at one-second precision.
ignoring_subparts.to_json(
    path_or_buf='../01_inputs/processed/c01_chsoff_ignoring_subparts.json',
    date_unit='s',
    date_format='iso',
)