In [67]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [68]:
# Import csv
inmate = pd.read_csv("../data/INMT4AA1.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [69]:
# Check number of observations, columns
print("Number of observations:", inmate.shape[0])
print("Number of columns:", inmate.shape[1])

Number of observations: 468172
Number of columns: 67


In [70]:
# List all variables
inmate.columns

Index(['INMATE_DOC_NUMBER', 'INMATE_LAST_NAME', 'INMATE_FIRST_NAME',
       'INMATE_MIDDLE_INITIAL', 'INMATE_NAME_SUFFIX',
       'INMATE_NAME_SOUNDEX_CODE', 'INMATE_GENDER_CODE', 'INMATE_RACE_CODE',
       'INMATE_BIRTH_DATE', 'INMATE_ETHNIC_AFFILIATION',
       'INMATE_RECORD_STATUS_CODE', 'INMATE_ADMIN._STATUS_CODE',
       'CUSTODY_CLASS_CODE', 'NEXT_CUSTODY_REVIEW_DATE',
       'INMATE_CONTROL_STATUS_CODE', 'NEXT_SECURITY_REVIEW_DATE',
       'INMATE_SPECIAL_CHARACTERISTICS', 'PAROLE_CASE_ANALYST',
       'NEXT_PAROLE_COMM._REVIEW_DATE', 'INMATE_PRIMARY_ASSIGNMENT',
       'INMATE_ADMISSION_DATE', 'ADMITTING_DIAGNOSTIC_CENTER',
       'DATE_OF_LAST_INMATE_MOVEMENT', 'TYPE_OF_LAST_INMATE_MOVEMENT',
       'OTHER_FACILITY_CODE', 'CURRENT_DOP_COMMAND_CODE',
       'CURRENT_DOP_AREA_CODE', 'INMATE_FACILITY_CODE',
       'INMATE_TIME_COMP_STATUS_CODE', 'OLDEST_COMMIT.OF_CURRENT_INCAR',
       'OLDEST_SNT.CMP._OF_CURR.INCAR.', 'OLDEST_CONVICTION_DATE',
       'TOTAL_SENTENCE_COUNT', 'MO

In [84]:
# Check if the DOC ID number is unique
inmate['INMATE_DOC_NUMBER'].is_unique

True

In [71]:
# For project proposal, only want:
#  - Age: INMATE_BIRTH_DATE
#  - Gender: INMATE_GENDER_CODE
#  - Race: INMATE_RACE_CODE
#  - Active (or not): INMATE_RECORD_STATUS_CODE
#  - Felony or misdemeanor: INMATE_IS_FELON/MISDEMEANANT

inmate_explore = inmate[['INMATE_BIRTH_DATE', 'INMATE_GENDER_CODE', 'INMATE_RACE_CODE', 'INMATE_RECORD_STATUS_CODE', 'INMATE_IS_FELON/MISDEMEANANT']]

In [72]:
# Sanity check: headers
inmate_explore.head()

Unnamed: 0,INMATE_BIRTH_DATE,INMATE_GENDER_CODE,INMATE_RACE_CODE,INMATE_RECORD_STATUS_CODE,INMATE_IS_FELON/MISDEMEANANT
0,1961-10-15,MALE,WHITE,INACTIVE,FELON
1,1951-07-17,MALE,WHITE,INACTIVE,MISD.
2,1963-12-29,MALE,WHITE,INACTIVE,FELON
3,1953-05-18,MALE,BLACK,INACTIVE,FELON
4,1921-08-26,MALE,WHITE,INACTIVE,MISD.


In [73]:
# Sanity check: number of observations
inmate_explore.shape

(468172, 5)

In [None]:
# Define a function to help calculate percentages
def calc_pct(df):
    total_count = sum(df['count'])
    df['pct'] = df['count'] * 100 / total_count
    return df.sort_values(['pct'], ascending=False)

In [85]:
# Summarize gender
by_gender = pd.DataFrame(inmate_explore.groupby(['INMATE_GENDER_CODE']).size().reset_index(name='count'))
calc_pct(by_gender)

Unnamed: 0,INMATE_GENDER_CODE,count,pct
1,MALE,408537,87.26216
0,FEMALE,59635,12.73784


In [86]:
# Summarize race
by_race = pd.DataFrame(inmate_explore.groupby(['INMATE_RACE_CODE']).size().reset_index(name='count'))
calc_pct(by_race)

Unnamed: 0,INMATE_RACE_CODE,count,pct
1,BLACK,231567,49.462056
5,WHITE,207389,44.297703
3,OTHER,17297,3.69459
2,INDIAN,9395,2.006745
4,UNKNOWN,1602,0.342183
0,ASIAN/ORTL,921,0.196723


In [77]:
# Summarize by active or not
by_status = pd.DataFrame(inmate_explore.groupby(['INMATE_RECORD_STATUS_CODE']).size().reset_index(name='count'))
calc_pct(by_status)

Unnamed: 0,INMATE_RECORD_STATUS_CODE,count,pct
1,INACTIVE,421662,90.065617
0,ACTIVE,33566,7.169587
2,PAROLED,12944,2.764796


In [78]:
# Summarize by felony/misdemeanor
by_felon = pd.DataFrame(inmate_explore.groupby(['INMATE_IS_FELON/MISDEMEANANT']).size().reset_index(name='count'))
calc_pct(by_felon)

Unnamed: 0,INMATE_IS_FELON/MISDEMEANANT,count,pct
0,FELON,303536,69.983515
1,MISD.,130189,30.016485


In [79]:
# Convert DOB to age
now = pd.Timestamp('now')
inmate_explore['dob'] = pd.to_datetime(inmate_explore['INMATE_BIRTH_DATE'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [80]:
inmate_explore['dob'].head()

0   1961-10-15
1   1951-07-17
2   1963-12-29
3   1953-05-18
4   1921-08-26
Name: dob, dtype: datetime64[ns]

In [81]:
inmate_explore['age'] = (now - inmate_explore['dob']).astype('<m8[Y]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [82]:
inmate_explore['age'].head()

0    58.0
1    68.0
2    56.0
3    66.0
4    98.0
Name: age, dtype: float64

In [83]:
# Summary statistics for age
inmate_explore['age'].describe()

count    467954.000000
mean         52.290370
std          16.072996
min          15.000000
25%          40.000000
50%          52.000000
75%          63.000000
max         120.000000
Name: age, dtype: float64