In [1]:
import dataclasses
import datetime as dt
import re

import pandas as pd
import pdfplumber

In [2]:
with pdfplumber.open("Offenders_Sentenced_to_20_yrs_or_More.pdf") as pdf:
    print (pdf.pages[0].extract_text())

Mississippi Department of Correctons
Ofenders Sentenced to 20 years or more, Habitual and No Parole Date
Report Date :  August 4, 2021
MDOC Number : 140035 Offender Name : ABSTON,DARIUS Admission Type : RETURN PAROLE Race: BLACK
Age : 31 Gender : MALE Total Term to Serve : 24 YRS 6 Mnths 0 Days
INDEX NCIC OFFENSE DATE CRIME  DATE HABITUAL COUNTY OF
CASE NUMBER  NUMBER DESCRIPTION COMMITTED SENTENCED FLAG CONVICTION SENTENCE TERM
061-15  1  POSS CONTRABAND IN PRISON 08/12/2014 09/30/2015 N LAUDERDALE 4 YRS 6 Mnths 0 Days
BOI 2018-10,023  2  BURGLARY-RESIDENTIAL 08/17/2017 03/02/2018 Y JACKSON 20 YRS 0 Mnths 0 Days
BOI 2018-10,023  3  KIDNAP- 08/17/2017 03/02/2018 Y JACKSON 20 YRS 0 Mnths 0 Days
BOI 2018-10,023  4  KIDNAP MINOR 08/17/2017 03/02/2018 Y JACKSON 20 YRS 0 Mnths 0 Days
BOI 2018-10,023  5  KIDNAP MINOR 08/17/2017 03/02/2018 Y JACKSON 20 YRS 0 Mnths 0 Days
BOI 2018-10,023  6  ARMED ROBBERY 08/17/2017 03/02/2018 Y JACKSON 20 YRS 0 Mnths 0 Days
BOI 2018-10,024  7  BURGLARY-RESIDE

To extract the data of interest, we will use the following strategy:

1. Extract text from each page of the PDF using the pdfplumber package.
2. Split the text on \n to get each line separately.
3. Use the case header rows to delimit each case.
4. Use the regular expression for the charge rows to delimit each charge.
5. Extract relevant data.


### Data Model

In [3]:
@dataclasses.dataclass
class Charge:
    mdoc_no: str
    prisoner_name: str
    case_no: str
    index_no: str
    offense_desc: str
    offense_date: dt.datetime
    sentence_date: dt.datetime
    habitual_flag: str
    conviction_county: str
    sentence_term: float
        
        
@dataclasses.dataclass
class Prisoner:
    mdoc_no: str
    name: str
    admission_type: str
    race: str
    age: int
    gender: str
    time_to_serve: float

In [4]:
def get_regex(pattern, text, match_group=0, alt_value="NULL"):
    match = pattern.search(text)
    if match is not None:
        return match.group(match_group)
    return alt_value

### Tokens
We define tokens using the following regular expressions.

In [5]:
MDOC_NO = re.compile("MDOC\s*Number\s*:\s*([A-Z\d]+)")
NAME = re.compile("Offender\s*Name\s*:\s*([A-Z,]+)")
ADMIT_TYPE = re.compile("Admission\s*Type\s*:\s*([A-Z\s]+)Race")
RACE = re.compile("Race\s*:\s*(\w+)")
AGE = re.compile("Age\s*:\s*(\d+)")
GENDER = re.compile("Gender\s*:\s*(MALE|FEMALE)")
TOTAL_SENTENCE_TERM = re.compile("Total\s*Term\s*to\s*Serve\s*:\s*(\d+\s*YRS\s*\d+\s*Mnths\s*\d+\s*Days)")
CASE_NO = re.compile("^(.+?)\s+\d{1,2}\s+[A-Z]{3,}")  # Fails on 'Age: \d+ Gender'
INDEX_NO = re.compile("\s+(\d+)\s+[A-Z]{3,}.{40,}$")
OFFENSE_DESC = re.compile("\d+\s+([A-Z]+[-\sA-Z/\d]*?)\s*\d+/\d+/\d+")
OFFENSE_DATE = re.compile("[A-Z]+[-\sA-Z]*\s*(\d+/\d+/\d+)")
SENTENCE_DATE = re.compile("\d+/\d+/\d+\s*(\d+/\d+/\d+)")
HABITUAL_FLAG = re.compile("\d+/\d+/\d+\s*(Y|N)")
CONVICTION_COUNTY = re.compile("\d+/\d+/\d+\s*(?:Y|N)\s*([A-Z ]+)")
SENTENCE_TERM = re.compile("(\d+\s*YRS\s*\d+\s*Mnths\s*\d+\s*Days)")

In [6]:
def create_charge(line, prisoner):
    return Charge(
        mdoc_no=prisoner.mdoc_no,
        prisoner_name=prisoner.name,
        case_no=get_regex(CASE_NO, line, 1),
        index_no=get_regex(INDEX_NO, line, 1),
        offense_desc=get_regex(OFFENSE_DESC, line, 1),
        offense_date=parse_date(get_regex(OFFENSE_DATE, line, 1)),
        sentence_date=parse_date(get_regex(SENTENCE_DATE, line, 1)),
        habitual_flag=get_regex(HABITUAL_FLAG, line, 1),
        conviction_county=get_regex(CONVICTION_COUNTY, line, 1),
        sentence_term=parse_sentence(get_regex(SENTENCE_TERM, line, 1))
    )


def create_prisoner(lines):
    return Prisoner(
        mdoc_no=get_regex(MDOC_NO, lines[0], 1),
        name=get_regex(NAME, lines[0], 1),
        admission_type=get_regex(ADMIT_TYPE, lines[0], 1),
        race=get_regex(RACE, lines[0], 1),
        age=int(get_regex(AGE, lines[1], 1, alt_value=-5)),
        gender=get_regex(GENDER, lines[1], 1),
        time_to_serve=parse_sentence(get_regex(TOTAL_SENTENCE_TERM, lines[1], 1)),
    )


def parse_date(date_str):
    if date_str == "NULL":
        return dt.datetime(1000, 1, 1)
    return dt.datetime.strptime(date_str, "%m/%d/%Y")


def parse_sentence(sentence_str):
    try:
        parsed = re.search(
            "(?P<years>\d+)\s*YRS\s*(?P<months>\d+)\s*Mnths\s*(?P<days>\d+)\s*Days",
            sentence_str
        )
        return (
            float(parsed.group("years"))
            + float(parsed.group("months")) / 12
            + float(parsed.group("days")) / 365.24
        )
    except ValueError:
        return -5.0

In [7]:
prisoners = []
charges = []
status = "IN HEADER"
prisoner_buffer = []
curr_charge = None
curr_prisoner = None
with pdfplumber.open("Offenders_Sentenced_to_20_yrs_or_More.pdf") as pdf:
    for page in pdf.pages:
        lines = page.extract_text().split("\n")
        for line in lines[3:-1]:  # skip first three and last lines of each page
            if line.strip().startswith("MDOC Number"):
                status = "IN HEADER"
                prisoner_buffer.append(line.strip())
            elif line.strip().startswith("Age"):
                prisoner_buffer.append(line.strip())
                prisoner = create_prisoner(prisoner_buffer)
                curr_prisoner = prisoner
                prisoners.append(prisoner)
                prisoner_buffer = []
            elif line.strip().startswith("CASE"):
                status = "IN CHARGES"
            elif (
                (status == "IN CHARGES") 
                and (len(line) < 50)
                and (curr_charge is not None)
            ):
                curr_charge.offense_desc += line.strip()
            elif (status == "IN CHARGES") and (curr_charge is not None):
                charges.append(curr_charge)
                curr_charge = create_charge(line.strip(), curr_prisoner)
            elif status == "IN CHARGES":
                curr_charge = create_charge(line.strip(), curr_prisoner)
    charges.append(curr_charge)

In [8]:
len(prisoners)

596

In [9]:
len(charges)

1788

### Charges

In [10]:
charge_df = pd.DataFrame(charges)
charge_df.head()

Unnamed: 0,mdoc_no,prisoner_name,case_no,index_no,offense_desc,offense_date,sentence_date,habitual_flag,conviction_county,sentence_term
0,140035,"ABSTON,DARIUS",061-15,1,POSS CONTRABAND IN PRISON,2014-08-12,2015-09-30 00:00:00,N,LAUDERDALE,4.5
1,140035,"ABSTON,DARIUS","BOI 2018-10,023",2,BURGLARY-RESIDENTIAL,2017-08-17,2018-03-02 00:00:00,Y,JACKSON,20.0
2,140035,"ABSTON,DARIUS","BOI 2018-10,023",3,KIDNAP-,2017-08-17,2018-03-02 00:00:00,Y,JACKSON,20.0
3,140035,"ABSTON,DARIUS","BOI 2018-10,023",4,KIDNAP MINOR,2017-08-17,2018-03-02 00:00:00,Y,JACKSON,20.0
4,140035,"ABSTON,DARIUS","BOI 2018-10,023",5,KIDNAP MINOR,2017-08-17,2018-03-02 00:00:00,Y,JACKSON,20.0


There are a few inaccurate sentence dates:

In [11]:
charge_df.loc[lambda df: df["sentence_date"] == dt.datetime(1000, 1, 1)]

Unnamed: 0,mdoc_no,prisoner_name,case_no,index_no,offense_desc,offense_date,sentence_date,habitual_flag,conviction_county,sentence_term
549,98904,"HALL,PATRICIA",2214,1,ARMED ROBBERY,1995-03-01,1000-01-01 00:00:00,Y,CHOCTAW,50.0
550,98904,"HALL,PATRICIA",2213,2,AGGRAVATED ASSAULT,1995-03-01,1000-01-01 00:00:00,Y,CHOCTAW,20.0
551,98904,"HALL,PATRICIA",2214,3,UTTERING FORGERY,1996-02-21,1000-01-01 00:00:00,N,CHOCTAW,3.0
647,K9185,"HEBERT,FRANK",DATACONV-1,1,SEXUAL BATTERY,2001-12-14,1000-01-01 00:00:00,Y,LINCOLN,20.0
648,K9185,"HEBERT,FRANK",DATACONV-2,2,EXPLOITATION/CHILD,2001-12-14,1000-01-01 00:00:00,Y,LINCOLN,2.0
649,K9185,"HEBERT,FRANK",DATACONV-3,3,FONDLING A CHILD,2001-12-14,1000-01-01 00:00:00,Y,LINCOLN,1.0
961,44393,"MACKLIN,THADDIUS",4414,3,COCAINE-SELL,1993-02-11,1000-01-01 00:00:00,Y,SCOTT,30.0
1334,57368,"RUFFIN,TRAVIS",DATACONV-2,2,MARIJUANA-POSSESS,2002-09-09,1000-01-01 00:00:00,N,LEAKE,3.0
1484,31857,"THIGPEN,JOHNNY",DATACONV-1,1,ARMED ROBBERY,1985-06-18,1000-01-01 00:00:00,Y,BOLIVAR,38.0
1577,36932,"TOWNSEND,WILLIE",DATACONV-1,1,ARMED ROBBERY,1993-06-16,1000-01-01 00:00:00,Y,SCOTT,33.0


Inspecting the PDF, it appears that this happened whenever the offense date was blank. We can rectify it by swapping the offense date and charge date for these rows, then replacing the missing offense dates with `pd.NA`

In [12]:
old_sentence_dates = charge_df["sentence_date"]
charge_df["sentence_date"] = charge_df["offense_date"].where(
    charge_df["sentence_date"] == dt.datetime(1000, 1, 1),
    charge_df["sentence_date"]
)
charge_df["offense_date"] = charge_df["offense_date"].mask(
    old_sentence_dates == dt.datetime(1000, 1, 1),
)

In [13]:
charge_df.iloc[1680]

mdoc_no                               47571
prisoner_name                   WHITE,EDDIE
case_no                          DATACONV-3
index_no                                  3
offense_desc         CARRY CONCEALED WEAPON
offense_date                            NaT
sentence_date           1989-03-23 00:00:00
habitual_flag                             Y
conviction_county                    HINDS 
sentence_term                           5.0
Name: 1680, dtype: object

We also want to separate out the charges with the habitual flag from those without it:

In [14]:
habitual_only = charge_df.loc[charge_df["habitual_flag"] == "Y"]

### Prisoners

In [15]:
prisoner_df = pd.DataFrame(prisoners)
prisoner_df.head()

Unnamed: 0,mdoc_no,name,admission_type,race,age,gender,time_to_serve
0,140035,"ABSTON,DARIUS",RETURN PAROLE,BLACK,31,MALE,24.5
1,T9658,"ADAMS,ANDREW",RETURN PAROLE,BLACK,40,MALE,30.0
2,201081,"ADAMS,BLESSED",PROBATION REVOKED,BLACK,27,MALE,35.0
3,39486,"ADAMS,JAMES",NEW PRISONER,BLACK,45,MALE,30.0
4,194121,"ADAMS,LATERRANCE",NEW PRISONER,BLACK,31,MALE,25.0


## Calculations!
Now that we've extracted the data, we can do our calculations.

#### Number prisoners

In [50]:
no_prisoners = len(prisoner_df["mdoc_no"].unique())
no_prisoners

596

#### Prisoner Race

In [17]:
prisoner_df["race"].value_counts(normalize=True)*100

BLACK      75.167785
WHITE      24.161074
SPANISH     0.335570
NULL        0.167785
ASIAN       0.167785
Name: race, dtype: float64

#### Charges

In [18]:
charge_df["offense_desc"].value_counts().head(50)

ARMED ROBBERY                              136
BURGLARY-RESIDENTIAL                       126
AGGRAVATED ASSAULT                          89
POSSESSION OF FIREARM BY CONVICTEDFELON     85
COCAINE-SELL                                73
BURGLARY LARCENY-UNOCCUPIEDDWELLING         71
POSSESSION OF CONTROLLEDSUBSTANCE           57
POSS OF CNTLD SUBST WITH INTENT             53
SEXUAL BATTERY                              50
GRAND LARCENY                               47
BURGLARY-NONRESID                           45
COCAINE-POSSESS                             44
KIDNAP-                                     41
BURGLARY-VEHICLE                            40
ROBBERY                                     35
UTTERING FORGERY                            30
MANSLAUGHTER                                30
METHAMPHETAMINE - POSSESSION                28
CONSPIRACY TO COMMIT A CRIME                26
POSS CONTRABAND IN PRISON                   25
ESCAPE-JAIL                                 22
DELIVERY OF C

##### Habitual only

In [19]:
habitual_only["offense_desc"].value_counts().head(50)

ARMED ROBBERY                                    102
BURGLARY-RESIDENTIAL                              94
AGGRAVATED ASSAULT                                74
POSSESSION OF FIREARM BY CONVICTEDFELON           64
COCAINE-SELL                                      44
SEXUAL BATTERY                                    37
POSS OF CNTLD SUBST WITH INTENT                   32
KIDNAP-                                           31
POSSESSION OF CONTROLLEDSUBSTANCE                 30
MANSLAUGHTER                                      28
BURGLARY LARCENY-UNOCCUPIEDDWELLING               25
GRAND LARCENY                                     22
BURGLARY-NONRESID                                 20
ROBBERY                                           20
METHAMPHETAMINE - POSSESSION                      19
DELIVERY OF CONTROLLED SUBSTANCE                  18
COCAINE-POSSESS                                   18
CONSPIRACY TO COMMIT A CRIME                      17
BURGLARY-VEHICLE                              

#### Murder/manslaughter

In [20]:
charge_df["offense_desc"].loc[
    charge_df["offense_desc"].str.contains("MURDER|HOMICIDE|MANSLAUGHTER")
].value_counts()

MANSLAUGHTER                 30
MURDER 2ND DEGREE             9
HOMICIDE/MURDER               5
WLK  1  MANSLAUGHTER          1
VEHICULAR HOMICIDE            1
JAW  2  MURDER 2ND DEGREE     1
TTG  1  MANSLAUGHTER          1
Name: offense_desc, dtype: int64

#### Habitual only

In [21]:
habitual_only["offense_desc"].loc[
    habitual_only["offense_desc"].str.contains("MURDER|HOMICIDE|MANSLAUGHTER")
].value_counts()

MANSLAUGHTER                 28
MURDER 2ND DEGREE             6
HOMICIDE/MURDER               3
WLK  1  MANSLAUGHTER          1
VEHICULAR HOMICIDE            1
JAW  2  MURDER 2ND DEGREE     1
TTG  1  MANSLAUGHTER          1
Name: offense_desc, dtype: int64

In [22]:
charge_df.loc[charge_df["offense_desc"] == "HOMICIDE/MURDER"]

Unnamed: 0,mdoc_no,prisoner_name,case_no,index_no,offense_desc,offense_date,sentence_date,habitual_flag,conviction_county,sentence_term
786,115718,"JARVIS,VIRGIL",55:16-C R-0460,6,HOMICIDE/MURDER,2016-02-24,2017-08-08 00:00:00,N,PEARL RIVER,27.0
1156,146899,"NEWELL,JACOLBY",53213,2,HOMICIDE/MURDER,2013-03-26,2014-04-07 00:00:00,Y,LAUDERDALE,40.0
1324,174763,"ROBERTS,TROY",B2402-2012-191,2,HOMICIDE/MURDER,2011-06-01,2013-11-19 00:00:00,Y,HARRISON,30.0
1400,121310,"SMITH,JAMES",B2301-14-0149,6,HOMICIDE/MURDER,2013-12-18,2016-01-11 00:00:00,Y,HANCOCK,36.0
1537,168075,"THORNTON,DEVIN",2015-CR-016-SC-G,2,HOMICIDE/MURDER,2014-02-17,2015-06-01 00:00:00,N,SCOTT,30.0


In [23]:
charge_df.loc[charge_df["offense_desc"].str.contains("MURDER 2")]

Unnamed: 0,mdoc_no,prisoner_name,case_no,index_no,offense_desc,offense_date,sentence_date,habitual_flag,conviction_county,sentence_term
286,K2733,"COOPER,JONATHAN",2018-0169,2,MURDER 2ND DEGREE,2018-01-06,2020-08-21 00:00:00,Y,WASHINGTON,30.0
335,163246,"DAVIS,JOHN",46:18-CR-00196AM,1,MURDER 2ND DEGREE,2017-02-17,2019-09-18 00:00:00,N,MARION,27.0
525,151954,"HAGIN,STEPHEN",B2401-2015-384,2,MURDER 2ND DEGREE,2014-06-02,2018-01-09 00:00:00,Y,HARRISON,40.0
798,192469,"JOHNSON,BRANDON",2014-0029CR,1,MURDER 2ND DEGREE,2013-10-23,2014-08-05 00:00:00,Y,WEBSTER,40.0
885,178843,"KELLY,TRAVIS",CR2019-06,2,MURDER 2ND DEGREE,2018-08-03,2019-09-24 00:00:00,Y,CLAIBORN,30.0
975,146877,"MANUEL,LORENZO",15-0-660 JAW,2,JAW 2 MURDER 2ND DEGREE,2015-04-01,2017-10-06 00:00:00,Y,HINDS,40.0
1158,157230,"NEWELL,ORLAND",B2402-2017-467,3,MURDER 2ND DEGREE,2016-09-24,2018-11-09 00:00:00,N,HARRISON,22.0
1414,M4034,"SPARKMAN,TYRONE",B2402-2014-239,2,MURDER 2ND DEGREE,2013-08-11,2016-06-27 00:00:00,N,HARRISON,27.0
1739,180227,"WILSON,DONALD",17-0009,1,MURDER 2ND DEGREE,2016-07-03,2017-10-10 00:00:00,Y,HOLMES,25.0
1751,146322,"WOODARD,JAMES",B2401-2018-447,1,MURDER 2ND DEGREE,2017-07-14,2019-07-15 00:00:00,Y,HARRISON,30.0


We can calculate the number of people charged with a specific offense (or set of offenses) by counting the number of unique MDOC numbers associated with that offense.

In [24]:
def number_charged_with(df, offense):
    return len(
        df.loc[df["offense_desc"].str.contains(offense), "mdoc_no"].unique()
    )

In [25]:
number_charged_with(charge_df, "RAPE")

28

In [26]:
number_charged_with(habitual_only, "RAPE")

20

In [27]:
habitual_only.loc[
    habitual_only["offense_desc"].str.contains("RAPE")
]

Unnamed: 0,mdoc_no,prisoner_name,case_no,index_no,offense_desc,offense_date,sentence_date,habitual_flag,conviction_county,sentence_term
104,05996,"BELL,SYLVESTER",2004-0195,3,STATUTORY RAPE,2003-07-20,2007-07-03 00:00:00,Y,TUNICA,30.0
177,50639,"BRYANT,CHRISTOPHER",2000K-397 CTII,2,CTII 2 RAPE,2000-02-04,2001-07-26 00:00:00,Y,LAMAR,45.0
182,44734,"BRYANT,KENNETH",4160-CT3,5,RAPE,1989-03-28,1990-02-09 00:00:00,Y,HINDS,7.5
206,29963,"BURTON,DONALD",01-0-539-01 TTG,3,TTG 3 RAPE,2000-09-12,2004-04-01 00:00:00,Y,HINDS,20.0
412,T9212,"EVANS,MATTHEW",CR08076GM,2,STATUTORY RAPE,2007-01-05,2008-10-31 00:00:00,Y,MONROE,30.0
476,75345,"GARNER,TYRONE",CR-99-81-B(P2),3,RAPE,1999-06-24,2000-01-19 00:00:00,Y,PANOLA,39.0
652,128950,"HENDERSON,WILLIAM",2012-1300,1,STATUTORY RAPE,2011-01-21,2013-04-04 00:00:00,Y,YAZOO,30.0
662,41881,"HERRING,ERNEST",B-372,5,RAPE,1985-07-15,1986-09-12 00:00:00,Y,HINDS,40.0
699,78179,"HUBBARD,ANZIO",4990,2,RAPE,1996-01-01,1997-02-20 00:00:00,Y,WINSTON,32.0
730,158936,"HUNT,KENNETH",20080086CR1,1,STATUTORY RAPE,2003-12-01,2010-05-18 00:00:00,Y,LOWNDES,25.0


In [28]:
number_charged_with(habitual_only, "STATUTORY RAPE")

7

In [29]:
number_charged_with(habitual_only, "MARIJUANA|METHAMPHETAMINE|COCAINE|HEROIN|SUBST")

172

In [30]:
number_charged_with(habitual_only, "FIREARM|WEAPON")

76

In [31]:
number_charged_with(habitual_only, "CONTRABAND")

7

In [32]:
172+76+7

255

We can figure out the number of people who were only charged with non-violent offenses by subtracting the total number of people who were charged with _violent_ offenses from the total number of people incarcerated on HO modifiers.

In [47]:
violent_off = number_charged_with(
    habitual_only, 
    "RAPE|MURDER|HOMICIDE|ASSAULT|BATTERY|MANSLAUGHTER|"
    "ROBBERY|ASSLT|FORCIBLE|KIDNAP|SHOOTING"
)
violent_off

273

In [51]:
nonviolent_off = no_prisoners - violent_off
nonviolent_off

323