In [1]:
import sys

# append the directory of law module to sys.path list
sys.path.append('../../modules/')

In [2]:
import datetime as dt
import glob
import math
import os
import re

import arrest
import numpy as np
import pandas as pd

## `CHSOFF`

- [Data source](https://oag.ca.gov/law/code-tables) (specifically [Offense Codes (with LEI codes)](https://oag.ca.gov/sites/all/files/agweb/law-enforcement/code-tables/chsoff.csv?041820220129))
- [Names Source (XML)](https://oag.ca.gov/sites/all/files/agweb/law-enforcement/code-tables/chsoff.xml)

In [3]:
# Load the XML edition of the offense-code table; its column names are reused
# below to label the header-less CSV edition.
chsoff_xml = pd.read_xml('../01_inputs/chsoff_20220415.xml')

In [4]:
# List the XML column names that will label the CSV columns.
chsoff_xml.columns

Index(['ValidationCode', 'CJISCode', 'TransactionTypeCode',
       'StatutoryNumericCodes', 'TypeOfStatCode', 'StatuteLiteral',
       'DefaultTypeofCharge', 'TypeOfCharge', 'LiteralIndentifierCode',
       'ChargeDegree', 'BCSCodesLEI', 'BCSHierarchyCodes', 'EnactDate',
       'RepealAmendDate', 'ALPSCognizantCode'],
      dtype='object')

In [5]:
# Read the header-less CSV edition with every value kept as a string, labelling
# its columns with the names taken from the XML edition.
chsoff_original = pd.read_csv(
    filepath_or_buffer='../01_inputs/chsoff_20220204.csv',
    names=chsoff_xml.columns,
    header=None,
    dtype=str,
)

In [6]:
# Work on a copy that keeps only the statutory code, code type, description,
# charge level and the two date columns.
chsoff = chsoff_original.drop(
    columns=[
        'ValidationCode',
        'CJISCode',
        'TransactionTypeCode',
        'DefaultTypeofCharge',
        'LiteralIndentifierCode',
        'ChargeDegree',
        'BCSCodesLEI',
        'BCSHierarchyCodes',
        'ALPSCognizantCode',
    ],
).copy()

In [7]:
# Parse enactment dates. No `errors=` argument is given, so an unparseable
# value raises here instead of becoming NaT.
chsoff['EnactDate'] = pd.to_datetime(chsoff['EnactDate'])

In [8]:
# Parse repeal/amendment dates; values that cannot be parsed become NaT.
chsoff['RepealAmendDate'] = pd.to_datetime(
    chsoff['RepealAmendDate'], errors='coerce')

## Subset data to applicable time periods

In [9]:
# Keep rows enacted before 2021 whose repeal/amendment date is either missing
# or earlier than 2020-12-31, and whose statute type is not 'ZZ'.
# NOTE(review): rows repealed/amended on or after 2020-12-31 are dropped, and
# the two cutoffs differ by one day ('2021' vs '2020-12-31') — confirm both
# are intended.
chsoff = (
    chsoff.loc[
        (chsoff['EnactDate'] < '2021')
        & (
            chsoff['RepealAmendDate'].isna()
            | (chsoff['RepealAmendDate'] < '2020-12-31')
        )
        & (chsoff['TypeOfStatCode'] != 'ZZ')
    ]
    .copy()
    .reset_index(drop=True)
)

## Split `StatutoryNumericCodes` into section and subparts

In [10]:
# Split each statutory code into five new columns (section, subdivision,
# paragraph, subparagraph, clause), one per value returned for the row.
# NOTE(review): `arrest.Charge.parse_code` is defined outside this notebook; it
# presumably returns those five parts in that order — confirm.
chsoff[
    ['_section', '_subdivision', '_paragraph', '_subparagraph', '_clause']
] = chsoff.apply(
    lambda x: arrest.Charge.parse_code(
        x['StatutoryNumericCodes'], arrests=False),
    axis=1,
    result_type='expand',
)

### Reassemble into _charge_reconstructed for joining on arrest data

In [11]:
# Build the join key: code type followed by section and every subpart, with no
# separator; missing parts contribute an empty string.
chsoff['_charge_reconstructed'] = chsoff['TypeOfStatCode'].str.cat(
    others=chsoff[
        ['_section', '_subdivision', '_paragraph', '_subparagraph', '_clause']
    ],
    sep='',
    na_rep='',
)

## Violent

In [12]:
# Load the list of violent charges; only the reconstructed charge code column
# is read, as strings.
chsoff_violent = pd.read_csv(
    '../01_inputs/processed/c01_chsoff_violent.csv',
    usecols=['_charge_reconstructed'],
    dtype=str,
)

In [13]:
# Show charges whose reconstructed code starts with 'PC460'. This is a prefix
# match, so section 4600 rows are included as well (see the output).
chsoff.loc[chsoff['_charge_reconstructed'].str.contains('^PC460', regex=True)]

Unnamed: 0,StatutoryNumericCodes,TypeOfStatCode,StatuteLiteral,TypeOfCharge,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed
1502,460(A),PC,BURGLARY:FIRST DEGREE,F,1992-01-01,NaT,460,(A),,,,PC460(A)
1503,460(B),PC,BURGLARY:SECOND DEGREE,F,1992-01-01,NaT,460,(B),,,,PC460(B)
1504,460(B),PC,BURGLARY:SECOND DEGREE,M,1992-01-01,NaT,460,(B),,,,PC460(B)
3249,4600(A),PC,DAMAGE JAIL/PRISON/ETC,M,1997-01-01,NaT,4600,(A),,,,PC4600(A)
3250,4600(A),PC,DAMAGE JAIL/PRISON/ETC,F,1997-01-01,2020-11-02,4600,(A),,,,PC4600(A)
3251,4600(A),PC,DAMAGE JAIL/PRISON $950+,F,2020-11-03,NaT,4600,(A),,,,PC4600(A)


In [14]:
# Flag Penal Code section 460 (burglary) charges as violent. The negative
# lookahead keeps the prefix from also matching longer section numbers such as
# PC4600 (damage to a jail/prison) or a decimal section like PC460.x.
# NOTE(review): the next cell assigns the whole `_chsoff_violent` column from
# the violent-charge list, so this flag does not survive — confirm intent.
chsoff.loc[
    chsoff['_charge_reconstructed'].str.contains(
        r'^PC460(?![\d.])', regex=True),
    '_chsoff_violent',
] = True

In [15]:
# Mark a charge as violent when its reconstructed code appears in the
# violent-charge list. This assigns the entire column, replacing any values
# previously stored in `_chsoff_violent`.
chsoff['_chsoff_violent'] = chsoff['_charge_reconstructed'].isin(
    set(chsoff_violent['_charge_reconstructed']))

In [16]:
# Spot-check how the combined '241/243' charge was flagged.
chsoff.loc[chsoff['_charge_reconstructed'].str.contains('241/243')]

Unnamed: 0,StatutoryNumericCodes,TypeOfStatCode,StatuteLiteral,TypeOfCharge,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent
5463,241/243,PC,ASLT/BATT PEACE OFR/ETC,M,1982-01-01,NaT,241/243,,,,,PC241/243,True


## Prepare for joining on arrest data

In [17]:
# Give the source columns the underscore-prefixed names used from here on.
chsoff = chsoff.rename(
    columns={
        'StatutoryNumericCodes': '_original_code',
        'TypeOfStatCode': '_code_type',
        'TypeOfCharge': '_offense_level',
        'StatuteLiteral': '_charge_description',
    }
)

In [18]:
# Confirm the column names after renaming.
chsoff.columns

Index(['_original_code', '_code_type', '_charge_description', '_offense_level',
       'EnactDate', 'RepealAmendDate', '_section', '_subdivision',
       '_paragraph', '_subparagraph', '_clause', '_charge_reconstructed',
       '_chsoff_violent'],
      dtype='object')

In [19]:
# Order by code type, original code and level (ascending), then by enactment
# and repeal/amendment date (descending); missing values sort first.
chsoff = chsoff.sort_values(
    by=[
        '_code_type',
        '_original_code',
        '_offense_level',
        'EnactDate',
        'RepealAmendDate',
    ],
    ascending=[True, True, True, False, False],
    na_position='first',
)

In [20]:
# Look at every row whose original code contains '664'.
chsoff[chsoff['_original_code'].str.contains('664')]

Unnamed: 0,_original_code,_code_type,_charge_description,_offense_level,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent
4795,25664,BP,ADV:ENCORAGE MNR TO DRINK,M,1953-09-09,NaT,25664.0,,,,,BP25664,False
5206,66499.30(B),GC,UNLAW SELL/LEASE PROPERTY,F,1975-03-01,NaT,66499.3,(B),,,,GC66499.30(B),False


### Handle amendments

In [21]:
# Charge codes with at least one row that carries a repeal/amendment date.
repealed_amended = set(
    chsoff.loc[chsoff['RepealAmendDate'].notna(), '_charge_reconstructed']
)

In [22]:
# Charge codes with at least one row that has no repeal/amendment date.
not_repealed_amended = set(
    chsoff.loc[chsoff['RepealAmendDate'].isna(), '_charge_reconstructed']
)

In [23]:
# Charges present in both sets: some rows have a repeal/amendment date and
# some do not.
amended = chsoff.loc[
    chsoff['_charge_reconstructed'].isin(
        repealed_amended.intersection(not_repealed_amended))
].copy()

#### Subset those amended in 2020 because I'll use the descriptions therein

In [24]:
# Rows of amended charges whose repeal/amendment date is 2020 or later. Rows
# with a missing RepealAmendDate compare False here and are left out.
# NOTE(review): confirm that omitting the still-current rows of these charges
# is intended.
amended_late = amended.loc[amended['RepealAmendDate'] >= '2020'].copy()

In [25]:
# Charges whose rows all carry a repeal/amendment date.
not_replaced = chsoff.loc[
    chsoff['_charge_reconstructed'].isin(
        repealed_amended.difference(not_repealed_amended))
].copy()

In [26]:
# Charges for which no row carries a repeal/amendment date.
not_amended = chsoff.loc[
    chsoff['_charge_reconstructed'].isin(
        not_repealed_amended.difference(repealed_amended))
].copy()

#### Combine DataFrames

In [27]:
# Stack the three subsets into one frame with a fresh 0..n-1 index.
processed = pd.concat(
    objs=[amended_late, not_replaced, not_amended],
    ignore_index=True,
)

In [28]:
# Confirm the combined frame kept the expected columns.
processed.columns

Index(['_original_code', '_code_type', '_charge_description', '_offense_level',
       'EnactDate', 'RepealAmendDate', '_section', '_subdivision',
       '_paragraph', '_subparagraph', '_clause', '_charge_reconstructed',
       '_chsoff_violent'],
      dtype='object')

### Identify duplicates in processed data

In [29]:
# Every row that shares its (charge code, offense level) pair with another row.
processed_to_dedupe = processed.loc[
    processed.duplicated(
        subset=['_charge_reconstructed', '_offense_level'],
        keep=False,
    )
].copy()

In [30]:
# All charge codes present in `processed`; used later to find the charges in
# `chsoff` that `processed` does not cover.
processed_charges = set(processed['_charge_reconstructed'])

In [31]:
# Rows whose charge code has no duplicated (charge code, level) pair at all.
unique_processed = processed.loc[
    ~processed['_charge_reconstructed'].isin(
        set(processed_to_dedupe['_charge_reconstructed']))
].copy()

In [32]:
# Sanity check: this should display no rows if (charge code, level) pairs are
# unique in `unique_processed`.
unique_processed[unique_processed.duplicated(
    subset=['_charge_reconstructed', '_offense_level'], keep=False)]

Unnamed: 0,_original_code,_code_type,_charge_description,_offense_level,EnactDate,RepealAmendDate,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent


In [33]:
# Bundle each row's enactment date, repeal/amendment date and description into
# a single dict stored in `statute_object`.
unique_processed['statute_object'] = unique_processed.apply(
    lambda x: {
        'EnactDate': x['EnactDate'],
        'RepealAmendDate': x['RepealAmendDate'],
        '_charge_description': x['_charge_description'],
    },
    axis=1,
)

In [34]:
# Confirm that `statute_object` was added.
unique_processed.columns

Index(['_original_code', '_code_type', '_charge_description', '_offense_level',
       'EnactDate', 'RepealAmendDate', '_section', '_subdivision',
       '_paragraph', '_subparagraph', '_clause', '_charge_reconstructed',
       '_chsoff_violent', 'statute_object'],
      dtype='object')

#### Combine duplicates in processed data with unprocessed data

In [35]:
# Rows of `chsoff` whose charge code never made it into `processed`.
unprocessed_to_dedupe = chsoff.loc[
    ~chsoff['_charge_reconstructed'].isin(processed_charges)
].copy()

In [36]:
# Confirm the columns match those of `processed_to_dedupe` before stacking.
unprocessed_to_dedupe.columns

Index(['_original_code', '_code_type', '_charge_description', '_offense_level',
       'EnactDate', 'RepealAmendDate', '_section', '_subdivision',
       '_paragraph', '_subparagraph', '_clause', '_charge_reconstructed',
       '_chsoff_violent'],
      dtype='object')

In [37]:
# Stack the duplicated processed rows and the unprocessed rows, renumbering
# the index.
to_dedupe = pd.concat(
    objs=[processed_to_dedupe, unprocessed_to_dedupe],
    ignore_index=True,
)

In [38]:
# Confirm the columns of the combined frame.
to_dedupe.columns

Index(['_original_code', '_code_type', '_charge_description', '_offense_level',
       'EnactDate', 'RepealAmendDate', '_section', '_subdivision',
       '_paragraph', '_subparagraph', '_clause', '_charge_reconstructed',
       '_chsoff_violent'],
      dtype='object')

### Prepare for joining on arrest data with missing or incorrect levels, subparts

#### Create objects with date and description data

I want to use unique charge codes while keeping this data for retrieval later.

In [39]:
# Sort descending by charge code, then enactment date, then repeal/amendment
# date, so the most recently enacted row of each charge comes first.
to_dedupe = to_dedupe.sort_values(
    by=['_charge_reconstructed', 'EnactDate', 'RepealAmendDate'],
    ascending=False,
)

In [40]:
# Bundle each row's dates and description into one dict so they can still be
# retrieved after rows are collapsed to unique charge codes.
to_dedupe['statute_object'] = to_dedupe.apply(
    lambda x: {
        'EnactDate': x['EnactDate'],
        'RepealAmendDate': x['RepealAmendDate'],
        '_charge_description': x['_charge_description'],
    },
    axis=1,
)

In [41]:
# Collapse rows that share code, level, subparts and violent flag into one row:
# the per-row statute dicts are gathered into a list, and the descriptions are
# combined by `arrest.format_unique` (defined outside this notebook —
# presumably joins the distinct values; confirm).
# NOTE(review): `groupby` drops groups whose key contains NaN by default; this
# assumes the subpart columns hold '' rather than NaN — confirm.
deduped = (
    to_dedupe.groupby(
        ['_original_code', '_code_type', '_offense_level', '_section', '_subdivision',
         '_paragraph', '_subparagraph', '_clause', '_charge_reconstructed',
         '_chsoff_violent']
    )
    .agg(enactment_data=('statute_object', list),
         charge_description=('_charge_description', arrest.format_unique))
    .reset_index()
)

In [42]:
# Add the leading underscore to the two aggregated column names.
deduped = deduped.rename(
    columns={
        'enactment_data': '_enactment_data',
        'charge_description': '_charge_description',
    }
)

In [43]:
# Sanity check: this should display no rows if (charge code, level) pairs are
# unique after grouping.
deduped[
    deduped.duplicated(
        subset=['_charge_reconstructed', '_offense_level'], keep=False)
].sort_values(by='_charge_reconstructed')

Unnamed: 0,_original_code,_code_type,_offense_level,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent,_enactment_data,_charge_description


In [44]:
# Stack the grouped rows with the already-unique rows. The two frames do not
# share all columns: `deduped` carries `_enactment_data`, while
# `unique_processed` carries `statute_object`, `EnactDate` and
# `RepealAmendDate`; cells missing from either side become NaN.
chsoff_dedupe = pd.concat([deduped, unique_processed], ignore_index=True)

In [45]:
# List the columns of `deduped` (the grouped frame, not the combined
# `chsoff_dedupe`).
deduped.columns

Index(['_original_code', '_code_type', '_offense_level', '_section',
       '_subdivision', '_paragraph', '_subparagraph', '_clause',
       '_charge_reconstructed', '_chsoff_violent', '_enactment_data',
       '_charge_description'],
      dtype='object')

In [46]:
# Export the per-level lookup to JSON with ISO-formatted dates at second
# precision. The subpart columns are not included in this export.
chsoff_dedupe[
    [
        '_original_code',
        '_code_type',
        '_offense_level',
        '_section',
        '_charge_reconstructed',
        '_chsoff_violent',
        '_enactment_data',
        '_charge_description',
        'EnactDate',
        'RepealAmendDate',
    ]
].to_json('../01_inputs/processed/c01_chsoff.json', date_format='iso', date_unit='s')

In [47]:
# Key each row's statute data by its offense level.
# Rows that came from `unique_processed` hold a single dict in
# `statute_object`. Rows that came from `deduped` have no `statute_object`
# (NaN after the concat) and keep their list of statute dicts in
# `_enactment_data`, so fall back to that list instead of storing NaN.
chsoff_dedupe['_level_data'] = chsoff_dedupe.apply(
    lambda x: {
        x['_offense_level']: (
            x['statute_object']
            if isinstance(x['statute_object'], dict)
            else x['_enactment_data']
        )
    },
    axis=1,
)

In [48]:
# Collapse to one row per charge code regardless of offense level: gather the
# per-level dicts into a list and combine the levels and descriptions with
# `arrest.format_unique` (defined outside this notebook).
# NOTE(review): `groupby` drops groups whose key contains NaN by default —
# confirm none of the key columns can be NaN here.
chsoff_dedupe_without_level = (
    chsoff_dedupe.groupby(['_code_type', '_section', '_charge_reconstructed','_chsoff_violent'])
    .agg(data_per_charge_level=('_level_data', list),
         potential_offense_levels=('_offense_level', arrest.format_unique),
         charge_description=('_charge_description', arrest.format_unique))
    .reset_index()
)

In [49]:
# Restore the underscore-prefixed name of the description column.
chsoff_dedupe_without_level = chsoff_dedupe_without_level.rename(
    columns={'charge_description': '_charge_description'}
)

In [50]:
# Sanity check: this should display no rows if every charge code now appears
# once.
chsoff_dedupe_without_level[chsoff_dedupe_without_level.duplicated(
    subset=['_charge_reconstructed'], keep=False)]

Unnamed: 0,_code_type,_section,_charge_reconstructed,_chsoff_violent,data_per_charge_level,potential_offense_levels,_charge_description


In [51]:
# Add the leading underscore to the remaining aggregated column names.
chsoff_dedupe_without_level = chsoff_dedupe_without_level.rename(
    columns={
        'potential_offense_levels': '_potential_offense_levels',
        'data_per_charge_level': '_data_per_charge_level',
    }
)

In [52]:
# Preview the level-agnostic lookup table.
chsoff_dedupe_without_level

Unnamed: 0,_code_type,_section,_charge_reconstructed,_chsoff_violent,_data_per_charge_level,_potential_offense_levels,_charge_description
0,BP,10085.5,BP10085.5,False,"[{'M': {'EnactDate': 1998-08-28 00:00:00, 'Rep...",M,ADV FEE/SECURE LOAN
1,BP,10139,BP10139,False,"[{'M': {'EnactDate': 1943-08-04 00:00:00, 'Rep...",M,ACT/ETC RELATR/ETC:NO LIC
2,BP,10153.1,BP10153.1,False,"[{'M': {'EnactDate': 1979-01-01 00:00:00, 'Rep...",M,ID FRAUD:EXAMINATION/APPL
3,BP,10167.11,BP10167.11(A),False,"[{'M': {'EnactDate': 1994-01-01 00:00:00, 'Rep...",M,FALSE ADV/ETC:RENTAL LIST
4,BP,10167.11,BP10167.11(B),False,"[{'M': {'EnactDate': 1994-01-01 00:00:00, 'Rep...",M,REFER FALSE/ETC RENTALS
...,...,...,...,...,...,...,...
4201,WI,871,WI871(C),False,"[{'M': {'EnactDate': 1994-01-01 00:00:00, 'Rep...",M,ESC:MNR NOT RET:FURLO/ETC
4202,WI,871,WI871(D),False,"[{'M': {'EnactDate': 1998-01-01 00:00:00, 'Rep...",M,MNR REMOVE ELECT MONITOR
4203,WI,871.5,WI871.5(A),False,"[{'F': {'EnactDate': 1982-01-01 00:00:00, 'Rep...","M, F","BRNG/ETC C/SUB/ETC:J/HALL, BRNG/ETC CNT SUB:JU..."
4204,WI,871.5,WI871.5(B),False,"[{'F': {'EnactDate': 1986-01-01 00:00:00, 'Rep...",F,USE TEAR GAS/WPN:HALL/ETC


In [53]:
# Export the level-agnostic lookup to JSON with ISO-formatted dates at second
# precision.
chsoff_dedupe_without_level.to_json(
    '../01_inputs/processed/c01_chsoff_ignoring_level.json', date_format='iso', date_unit='s')

In [54]:
# Key each row's level data by section + subdivision (e.g. '460(A)').
# The `+` concatenation requires both values to be strings on every row.
chsoff_dedupe['_level_and_subpart_data'] = chsoff_dedupe.apply(
    lambda x: {x['_section']+x['_subdivision']: x['_level_data']}, axis=1
)

The most common issue with data entry is the omission of subdivision data altogether where the charge actually corresponds to the first subdivision of the statute.

In [55]:
# Rows with no subdivision, or with the first one written as '(A)' or '(1)'.
first_subdivision = chsoff_dedupe.loc[
    chsoff_dedupe['_subdivision'].isin(['', '(A)', '(1)'])
].copy()

In [56]:
# Rank the offense levels numerically (X=0 < I=1 < M=2 < F=3) so rows can be
# sorted by level. Values absent from the mapping are left unchanged by
# `replace`.
first_subdivision['_level_quantified'] = first_subdivision['_offense_level'].replace(
    {'X': 0, 'I': 1, 'M': 2, 'F': 3})

In [57]:
# Order by charge code and then by numeric level, lowest level first.
first_subdivision = first_subdivision.sort_values(
    by=['_charge_reconstructed', '_level_quantified'],
)

#### Account for charges missing subparts, but including charge level

In [58]:
# For each (code type, section, offense level) keep only the first row in the
# current row order.
first_subdivision_with_level = first_subdivision.drop_duplicates(
    subset=['_code_type', '_section', '_offense_level'], keep='first'
).copy()

In [59]:
# Tag these rows as the ones to match when the offense level is known.
first_subdivision_with_level['_match_on_level'] = True

#### Account for charges missing both subparts and charge level

In [60]:
# For each (code type, section) keep only the first row in the current row
# order, and only the columns that do not depend on a known offense level.
first_subdivision_without_level = first_subdivision.drop_duplicates(
    subset=['_code_type', '_section'], keep='first'
)[['_original_code', '_code_type', '_section', '_charge_reconstructed',
   '_chsoff_violent', '_charge_description', 'statute_object', '_level_data']].copy()

In [61]:
# Tag these rows as the ones to match when the offense level is unknown.
first_subdivision_without_level['_match_on_level'] = False

In [62]:
# Stack the level-aware and level-agnostic rows into one lookup frame.
ignoring_subparts = pd.concat(
    objs=[first_subdivision_with_level, first_subdivision_without_level],
    ignore_index=True,
)

In [63]:
# Sanity check: this should display no rows if (code type, section, level) is
# unique in the combined frame.
ignoring_subparts[ignoring_subparts.duplicated(
    subset=['_code_type', '_section', '_offense_level'], keep=False)]

Unnamed: 0,_original_code,_code_type,_offense_level,_section,_subdivision,_paragraph,_subparagraph,_clause,_charge_reconstructed,_chsoff_violent,_enactment_data,_charge_description,EnactDate,RepealAmendDate,statute_object,_level_data,_level_and_subpart_data,_level_quantified,_match_on_level


In [64]:
# List all columns before deciding which ones to drop.
ignoring_subparts.columns

Index(['_original_code', '_code_type', '_offense_level', '_section',
       '_subdivision', '_paragraph', '_subparagraph', '_clause',
       '_charge_reconstructed', '_chsoff_violent', '_enactment_data',
       '_charge_description', 'EnactDate', 'RepealAmendDate', 'statute_object',
       '_level_data', '_level_and_subpart_data', '_level_quantified',
       '_match_on_level'],
      dtype='object')

In [65]:
# Columns to keep in the exported lookup; the next cell lists every other
# column of `ignoring_subparts`.
y = [
    '_original_code',
    '_code_type',
    '_offense_level',
    '_section',
    '_charge_reconstructed',
    '_chsoff_violent',
    '_enactment_data',
    '_charge_description',
    'EnactDate',
    'RepealAmendDate',
    'statute_object',
    '_level_data',
    '_level_and_subpart_data',
    '_match_on_level',
]

In [66]:
# Columns of `ignoring_subparts` that are not in the keep-list `y`.
[z for z in ignoring_subparts.columns if z not in y]

['_subdivision', '_paragraph', '_subparagraph', '_clause', '_level_quantified']

In [67]:
# Remove the subpart columns and the helper ranking column before exporting.
ignoring_subparts = ignoring_subparts.drop(
    columns=[
        '_subdivision',
        '_paragraph',
        '_subparagraph',
        '_clause',
        '_level_quantified',
    ]
)



In [68]:
# Export the subpart-agnostic lookup to JSON with ISO-formatted dates at
# second precision.
ignoring_subparts.to_json(
    '../01_inputs/processed/c01_chsoff_ignoring_subparts.json',
    date_format='iso',
    date_unit='s',
)