## ODOC Public Inmate Data


This notebook is intended as a start for research of the ODOC data.  To use this notebook, follow the setup instructiion found [here](https://github.com/codefortulsa/odoc-parse). Then download the data published [here](http://doc.publishpath.com/odoc-public-inmate-data).  Unzip the file and place the files in a subdirectory called 'data'.

The set of files includes a ReadMe.txt which describes the files and their fixed formats. The sections of this notebook show description of each file and how to import it into pandas dataframes.  NOTE: the widths variables differ slighty from the description to handle some difference in the data.

In [1]:
import pandas as pd

In [72]:
file = 'data/Vendor_Profile_Sample_Text.DAT'

# uncomment this line to use the full dataset
file = 'data/Vendor_Profile_Extract_Text.DAT'

names = [
    "DOC_NUM"       
    ,"LAST_NAME"     
    ,"FIRST_NAME"    
    ,"MIDDLE_NAME"   
    ,"SUFFIX"        
    ,"LAST_MOVE_DATE"
    ,"FACILITY"      
    ,"BIRTH_DATE"    
    ,"SEX"           
    ,"RACE"          
    ,"HAIR"          
    ,"HEIGHT_FT"     
    ,"HEIGHT_IN"     
    ,"WEIGHT"        
    ,"EYE"           
    ,"STATUS"
]
widths = [
    11,
    30,
    30,
    30,
    5,
    9,
    40,
    9,
    1,
    40,
    40,
    2,
    2,
    4,
    40,10
]

profile_df = pd.read_fwf(file, 
    header=None,
    widths=widths,
    names=names)

profile_df

from datetime import datetime

file = 'data/Vendor_sentence_Sample_Text.dat'

# uncomment this line to use the full dataset
file = 'data/Vendor_sentence_Extract_Text.dat'


names =[
    "DOC_NUM",                   
    "STATUTE_CODE",              
    "SENTENCING_COUNTY",         
    "JS_DATE",                   
    "CRF_NUMBER",                
    "INCARCERATED_TERM_IN_YEARS",
    "PROBATION_TERM_IN_YEARS"
]

widths = [
    11,
    40,
    40,
    9,
    40,
    13,
    13
]

sentence_df = pd.read_fwf(file, 
    header=None,
    widths=widths,
    names=names)

# merge, but keep all sentencing data (profile doc_nums are unique. Sentencing doc_nums are not)
combined_data = pd.merge(sentence_df, profile_df, on='DOC_NUM', how='left')
# drop invalid data
combined_data.dropna(subset=['LAST_NAME'], inplace=True)  
combined_data

Unnamed: 0,DOC_NUM,STATUTE_CODE,SENTENCING_COUNTY,JS_DATE,CRF_NUMBER,INCARCERATED_TERM_IN_YEARS,PROBATION_TERM_IN_YEARS,LAST_NAME,FIRST_NAME,MIDDLE_NAME,...,FACILITY,BIRTH_DATE,SEX,RACE,HAIR,HEIGHT_FT,HEIGHT_IN,WEIGHT,EYE,STATUS
0,8359,21-1720,POTTAWATOMIE COUNTY COURT,13-MAR-13,2010-300,,1.5,MESLES,ARDELL,,...,INACTIVE,06-APR-87,M,,,,,,,Inactive
1,10337,21-701.7,BRYAN COUNTY COURT,13-APR-20,1-0,7777.0,,SAWYER,FRANK,N,...,INACTIVE,01-JAN-00,M,WHITE,UNK,5.0,1.0,,BLACK,Inactive
2,10337,21-701.7,BRYAN COUNTY COURT,14-DEC-62,20-15645,,40.0,SAWYER,FRANK,N,...,INACTIVE,01-JAN-00,M,WHITE,UNK,5.0,1.0,,BLACK,Inactive
3,20413,21-1731,TULSA COUNTY COURT,19-APR-77,77-240,,3.0,KNIGHTON,RAYMOND,J,...,INACTIVE,20-AUG-48,M,BLACK,BROWN,,,,BROWN,Inactive
4,22574,21-741,TULSA COUNTY COURT,03-MAR-64,20-393,7777.0,,STOCKTON,MELVIN,H,...,INACTIVE,07-DEC-45,M,WHITE,BROWN,5.0,10.0,150.0,GRAY,Inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335214,996281,21-798,UNKNOWN COUNTY COURT,08-JUN-82,9-897,7777.0,,PILON,GARY,V,...,INACTIVE,18-SEP-53,M,WHITE,BROWN,6.0,2.0,185.0,GREEN,Inactive
1335215,996281,21-444,WASHINGTON JURISDICTION,29-MAY-80,79-1236,10.0,,PILON,GARY,V,...,INACTIVE,18-SEP-53,M,WHITE,BROWN,6.0,2.0,185.0,GREEN,Inactive
1335216,997494,21-799,CALIFORNIA JURISDICTION,01-FEB-94,94-8235,5.0,,AKERS,MARK,W,...,INACTIVE,17-JUL-70,M,WHITE,BROWN,5.0,9.0,178.0,BLUE,Inactive
1335217,997494,21-799,CALIFORNIA JURISDICTION,01-FEB-94,94-8235,3.0,,AKERS,MARK,W,...,INACTIVE,17-JUL-70,M,WHITE,BROWN,5.0,9.0,178.0,BLUE,Inactive


In [82]:
import oscn
import re


counties = oscn.counties
debug = False  # make True to print bad data that will be dropped

def parse_county(dataCounty):
    if "JURISDICTION" in dataCounty:
        # should be mostly out of state cases
        if debug: print(dataCounty)
        return None
    county = dataCounty.split(" ")[0].lower()
    if county in counties:
        return county
    if debug: print(dataCounty)
    return None

def parse_year(crf_number):
    split_number = re.split("-| ", crf_number)
    for i in range(2):
        try:
            maybe_year = int(split_number[i])
            if (maybe_year < 2021 and maybe_year > 1900):
                return split_number[i]
            if (maybe_year < 2021 and maybe_year > 1900):
                return split_number[i]
            if (maybe_year < 100 and maybe_year > 20):
                return "19" + split_number[i]
            if (maybe_year < 10 and maybe_year >= 0):
                return "200" + split_number[i]
            if (maybe_year <= 20 and maybe_year >= 10):
                return "20" + split_number[i]
        except:
            continue
    if debug: print(crf_number)
    return None
    
def parse_number(crf_number):
    # this could be cleaned up more. Sometimes has letters in it (invalid) and other things wrong
    split_number = crf_number.split("-")
    return split_number[len(split_number)-1]


combined_data['oscn_county'] = combined_data['SENTENCING_COUNTY'].apply(parse_county)
combined_data.dropna(subset=['oscn_county'], inplace=True)  
combined_data['oscn_year'] = combined_data['CRF_NUMBER'].apply(parse_year)
combined_data.dropna(subset=['oscn_year'], inplace=True)  
combined_data['oscn_number'] = combined_data['CRF_NUMBER'].apply(parse_number)
combined_data.dropna(subset=['oscn_number'], inplace=True)  


combined_data

Unnamed: 0,DOC_NUM,STATUTE_CODE,SENTENCING_COUNTY,JS_DATE,CRF_NUMBER,INCARCERATED_TERM_IN_YEARS,PROBATION_TERM_IN_YEARS,LAST_NAME,FIRST_NAME,MIDDLE_NAME,...,RACE,HAIR,HEIGHT_FT,HEIGHT_IN,WEIGHT,EYE,STATUS,oscn_county,oscn_year,oscn_number
0,8359,21-1720,POTTAWATOMIE COUNTY COURT,13-MAR-13,2010-300,,1.5,MESLES,ARDELL,,...,,,,,,,Inactive,pottawatomie,2010,300
1,10337,21-701.7,BRYAN COUNTY COURT,13-APR-20,1-0,7777.0,,SAWYER,FRANK,N,...,WHITE,UNK,5.0,1.0,,BLACK,Inactive,bryan,2001,0
2,10337,21-701.7,BRYAN COUNTY COURT,14-DEC-62,20-15645,,40.0,SAWYER,FRANK,N,...,WHITE,UNK,5.0,1.0,,BLACK,Inactive,bryan,2020,15645
3,20413,21-1731,TULSA COUNTY COURT,19-APR-77,77-240,,3.0,KNIGHTON,RAYMOND,J,...,BLACK,BROWN,,,,BROWN,Inactive,tulsa,1977,240
4,22574,21-741,TULSA COUNTY COURT,03-MAR-64,20-393,7777.0,,STOCKTON,MELVIN,H,...,WHITE,BROWN,5.0,10.0,150.0,GRAY,Inactive,tulsa,2020,393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335036,939190,63-2-407,JEFFERSON COUNTY COURT,24-AUG-89,88-1835,1.0,,TURNER,ALICE,M,...,WHITE,BLACK,5.0,3.0,175.0,BROWN,Inactive,jefferson,1988,1835
1335037,939190,63-2-407,JEFFERSON COUNTY COURT,24-AUG-89,88-1835,1.0,,TURNER,ALICE,M,...,WHITE,BLACK,5.0,3.0,175.0,BROWN,Inactive,jefferson,1988,1835
1335188,990184,21-798,WASHINGTON COUNTY COURT,16-MAY-84,84-6869,20.0,,MOLINA,RUDOLFO,V,...,M,BLACK,6.0,1.0,185.0,BROWN,Inactive,washington,1984,6869
1335189,990184,21-645,WASHINGTON COUNTY COURT,16-MAY-84,84-1686,10.0,,MOLINA,RUDOLFO,V,...,M,BLACK,6.0,1.0,185.0,BROWN,Inactive,washington,1984,1686


In [102]:
sample_df = combined_data.sample(1000)
sample_df

Unnamed: 0,DOC_NUM,STATUTE_CODE,SENTENCING_COUNTY,JS_DATE,CRF_NUMBER,INCARCERATED_TERM_IN_YEARS,PROBATION_TERM_IN_YEARS,LAST_NAME,FIRST_NAME,MIDDLE_NAME,...,RACE,HAIR,HEIGHT_FT,HEIGHT_IN,WEIGHT,EYE,STATUS,oscn_county,oscn_year,oscn_number
1279959,771746,21-1713,CADDO COUNTY COURT,25-MAY-17,2015-289,,5.00,KENEMER,CODY,MARSHALL,...,WHITE,BROWN,5.0,11.0,218.0,BLUE,Active,caddo,2015,289
638685,384982,63-2-401,PITTSBURG COUNTY COURT,07-MAY-15,CF-2014-707,,20.00,CRAIG,STEVEN,R,...,WHITE,BROWN,5.0,10.0,231.0,BLUE,Active,pittsburg,2014,707
410937,232755,63-2-405,OKLAHOMA COUNTY COURT,21-FEB-96,95-203,1.0,,JOHNSON,DAVID,I,...,BLACK,BLACK,6.0,0.0,187.0,BROWN,Active,oklahoma,1995,203
694338,414028,21-1435,MCCURTAIN COUNTY COURT,07-JAN-02,2001-285,3.0,,MAGUIRE,STEVEN,BRANDON,...,WHITE,BROWN,6.0,0.0,209.0,BROWN,Inactive,mccurtain,2001,285
439780,240295,63-2-402,OKLAHOMA COUNTY COURT,16-APR-04,2004-602,,,KUYKENDALL,ROY,GEORGE,...,WHITE,BROWN,5.0,5.0,200.0,BLUE,Inactive,oklahoma,2004,602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327658,209287,63-2-401.G,CREEK COUNTY COURT,03-SEP-03,2003-118,12.0,,LITTLE,JEREMY,SCOTT,...,WHITE,BROWN,6.0,0.0,185.0,BROWN,Active,creek,2003,118
774168,456779,21-1592,COMANCHE COUNTY COURT,18-SEP-03,2003-388,,7.55,CHAO,MARILYN,MONICA,...,HISPANIC,BROWN,5.0,8.0,202.0,BROWN,Inactive,comanche,2003,388
499441,256012,63-2-407,WAGONER COUNTY COURT,21-APR-97,1996-397,,3.00,HOLLEY,TERRI,E,...,NATIVE AMERICAN,BROWN,5.0,6.0,200.0,BROWN,Inactive,wagoner,1996,397
894502,538835,21-1713,COTTON COUNTY COURT,10-FEB-06,2004-62D,,5.00,DENHAM,KEVIN,S,...,WHITE,BROWN,5.0,10.0,180.0,BLUE,Inactive,cotton,2004,62D


In [103]:
def get_oscn_judges(oscn_county, oscn_year, oscn_number, last_name):
    try:
        case = oscn.request.Case(f'{oscn_county}-CF-{oscn_year}-{oscn_number}')
        if case.valid:
            defendants = find_defendants(case)
            if defendants:
                match = name_match(defendants, last_name)
                return (defendants, match, case.judge)
        if debug: print(f'{oscn_county}-CF-{oscn_year}-{oscn_number}')
        return (None, None, None)
    except:
        if debug: print(f'{oscn_county}-CF-{oscn_year}-{oscn_number}')
        return (None, None, None)

def find_defendants(case):
    if case.valid:
        defendants = []
        for party in case.parties:
            if party["type"] == "Defendant":
                defendants.append(party["name"])
        if defendants == []:
            return None
        return defendants
    return None

def name_match(defendants, last_name):
    for defendant in defendants:
        defendant_last_name = defendant.split(",")[0]
        if last_name.lower() == defendant_last_name.lower():
            return True
    return False
    
    
sample_df['defendants'], sample_df['defendant_match'], sample_df['judge'] = zip(*sample_df.apply(
    lambda x: get_oscn_judges(x['oscn_county'], x['oscn_year'], x['oscn_number'], x['LAST_NAME']), axis=1
))
sample_df

[]
[]
[]
[]
[]
oklahoma-CF-2014-661R
delaware-CF-2014-18A
[]
[]
oklahoma-CF-2011-6448R
oklahoma-CF-2008-3357R
craig-CF-2000-13A
[]
[]
carter-CF-2015-83B
rogers-CF-2019-135R
delaware-CF-2005-307R
oklahoma-CF-2013-3087R
oklahoma-CF-2000-636R
[]
pittsburg-CF-2017-452R
[]
[]
[]
[]
tulsa-CF-2008-1077R
[]
[]
oklahoma-CF-2011-5064R
[]
[]
tulsa-CF-1983-732 CT 2
[]
marshall-CF-2014-101R
woodward-CF-2009-411R
oklahoma-CF-2013-132R
oklahoma-CF-2009-2920R
okmulgee-CF-2008-110R
oklahoma-CF-2017-5627R
[]
[]
osage-CF-2007-313C
[]
[]
payne-CF-2010-267R
cleveland-CF-2012-1759R
[]
[]
[]
delaware-CF-2017-138R
[]
carter-CF-2019-372A
[]
cleveland-CF-2001-1641R
oklahoma-CF-2013-1261R
cotton-CF-2004-62D


Unnamed: 0,DOC_NUM,STATUTE_CODE,SENTENCING_COUNTY,JS_DATE,CRF_NUMBER,INCARCERATED_TERM_IN_YEARS,PROBATION_TERM_IN_YEARS,LAST_NAME,FIRST_NAME,MIDDLE_NAME,...,HEIGHT_IN,WEIGHT,EYE,STATUS,oscn_county,oscn_year,oscn_number,defendants,defendant_match,judge
1279959,771746,21-1713,CADDO COUNTY COURT,25-MAY-17,2015-289,,5.00,KENEMER,CODY,MARSHALL,...,11.0,218.0,BLUE,Active,caddo,2015,289,"[KENEMER, CODY MARSHALL]",True,"STEPHENS, DAVID A"
638685,384982,63-2-401,PITTSBURG COUNTY COURT,07-MAY-15,CF-2014-707,,20.00,CRAIG,STEVEN,R,...,10.0,231.0,BLUE,Active,pittsburg,2014,707,"[CRAIG, STEVEN R. II]",True,"MILLS, TIM"
410937,232755,63-2-405,OKLAHOMA COUNTY COURT,21-FEB-96,95-203,1.0,,JOHNSON,DAVID,I,...,0.0,187.0,BROWN,Active,oklahoma,1995,203,"[ISSAC, DAVID]",False,"CASWELL, SUSAN P"
694338,414028,21-1435,MCCURTAIN COUNTY COURT,07-JAN-02,2001-285,3.0,,MAGUIRE,STEVEN,BRANDON,...,0.0,209.0,BROWN,Inactive,mccurtain,2001,285,"[MAGUIRE, STEVEN BRANDON]",True,UNASSIGNED
439780,240295,63-2-402,OKLAHOMA COUNTY COURT,16-APR-04,2004-602,,,KUYKENDALL,ROY,GEORGE,...,5.0,200.0,BLUE,Inactive,oklahoma,2004,602,"[Kuykendall, Roy G]",True,"BRAGG, SUSAN W"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327658,209287,63-2-401.G,CREEK COUNTY COURT,03-SEP-03,2003-118,12.0,,LITTLE,JEREMY,SCOTT,...,0.0,185.0,BROWN,Active,creek,2003,118,"[WIECK, KEVIN CROCKER]",False,"VASSAR, JOE SAM"
774168,456779,21-1592,COMANCHE COUNTY COURT,18-SEP-03,2003-388,,7.55,CHAO,MARILYN,MONICA,...,8.0,202.0,BROWN,Inactive,comanche,2003,388,"[CHAO, MARILYN MONICA]",True,"MCCALL, C"
499441,256012,63-2-407,WAGONER COUNTY COURT,21-APR-97,1996-397,,3.00,HOLLEY,TERRI,E,...,6.0,200.0,BROWN,Inactive,wagoner,1996,397,"[HOLLEY, TERRI ELIZABETH]",True,"SEWELL, BRUCE"
894502,538835,21-1713,COTTON COUNTY COURT,10-FEB-06,2004-62D,,5.00,DENHAM,KEVIN,S,...,10.0,180.0,BLUE,Inactive,cotton,2004,62D,,,


In [106]:
sample_df.to_csv (r'/home/kendall/src/odoc-parse/sentencingData.csv', index = False, header=True)

In [110]:
sample_df['defendant_match'].value_counts()

True     743
False     92
Name: defendant_match, dtype: int64

In [111]:
sample_df['judge'].value_counts()

UNASSIGNED            94
PALUMBO, AMY          50
ELLIOTT, RAY C        47
COYLE, HEATHER        41
KIRBY, RICHARD        39
                      ..
THYGESEN, NORMAN       1
SEWELL, BRUCE          1
LOVELL, BRIAN N        1
HAWORTH, TIMOTHY       1
ROPER,  MICHELLE K     1
Name: judge, Length: 194, dtype: int64