# Expungement Exploration

## Connect to PostgreSQL Database

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Connection settings for the PostgreSQL database. The password is read from
# the POSTGRES_PASS environment variable (KeyError if it is not set) so it is
# never written into the notebook source.
USER = 'jupyter'
PASSWORD = os.environ['POSTGRES_PASS']
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

# Percent-encode the password: characters such as '@', ':', '/' or '%' would
# otherwise be parsed as URL delimiters and break the connection string.
DATABASE_URI = f"postgresql://{USER}:{quote(PASSWORD, safe='')}@{HOST}:{PORT}/{DB}"
engine = create_engine(DATABASE_URI)

I'm using the SQL Jupyter magic so I can do some initial exploration in pure SQL. Running the cells below requires installing two Python packages:
```bash
pip install --user ipython-sql pgspecial
```

In [3]:
# Load the SQL magic extension, then point it at the database URI defined in
# the configuration cell.
%load_ext sql
%sql {DATABASE_URI}

List all tables in `expunge` database

In [4]:
# Backslash meta-command: list the tables in the connected database.
%sql \dt

 * postgresql://jupyter:***@localhost:5432/expunge
8 rows affected.


Schema,Name,Type,Owner
public,data_100k_sample,table,jupyter
public,data_10k_sample,table,jupyter
public,data_1k_sample,table,jupyter
public,expunge,table,jupyter
public,ids_100k_sample,table,jupyter
public,ids_10k_sample,table,jupyter
public,ids_1k_sample,table,jupyter
public,test_table_jupyter_linshavers,table,jupyter


Column names and types for main `expunge` table

In [5]:
# Backslash meta-command: show the columns and types of the expunge table.
%sql \d expunge

 * postgresql://jupyter:***@localhost:5432/expunge
28 rows affected.


Column,Type,Modifiers
person_id,text,
HearingDate,date,
CodeSection,text,
codesection,text,
ChargeType,text,
chargetype,text,
Class,text,
DispositionCode,text,
disposition,text,
Plea,text,


In [6]:
%%sql
SELECT *
FROM pg_catalog.pg_indexes
-- Skip indexes on tables whose names start with 'pg'.
WHERE tablename NOT LIKE 'pg%';

 * postgresql://jupyter:***@localhost:5432/expunge
0 rows affected.


schemaname,tablename,indexname,tablespace,indexdef


In [6]:
%%sql
SELECT *
FROM expunge
-- Peek at a few rows only; with no ORDER BY, which rows come back is arbitrary.
LIMIT 3

 * postgresql://jupyter:***@localhost:5432/expunge
3 rows affected.


person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,Race,Sex,fips,convictions,arrests,felony10,sevenyear,tenyear,within7,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime
292030000000115,2016-06-20,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,White Caucasian(Non-Hispanic),Male,163,True,False,False,False,False,True,True,False,False,Automatic (pending),False,Automatic (pending),"Conviction of misdemeanor charges listed in 19.2-392.6 B with no convictions since the disposition date. However, because the disposition date is within 7 years of the current date, the record is not yet eligible for expungement",False,False
147170000000107,2012-05-23,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1.0,Guilty,Conviction,Not Guilty,Hispanic,Male,712,True,False,False,False,False,False,True,False,False,Automatic,False,Automatic,Conviction of misdemeanor charges listed in 19.2-392.6 B with no convictions within 7 years from disposition date,False,False
147170000000107,2015-04-22,A.46.2-865,covered elsewhere,Misdemeanor,Misdemeanor,,Dismissed,Dismissed,Not Guilty,Hispanic,Male,712,True,True,False,False,False,True,True,False,False,Petition,True,Petition,"Dismissal of misdemeanor charges, but with arrests or charges in the past 3 years",False,False


There are ~3M unique persons in `expunge` (perhaps slightly more due to the imperfect anonymization method)

In [7]:
%%sql
SELECT COUNT(DISTINCT person_id)
-- Number of distinct person_id values across all rows of expunge.
FROM expunge;

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


count
3082954


In [8]:
%%sql
SELECT "Race", COUNT(*)
FROM expunge
-- Positional references: 1 is "Race", 2 is COUNT(*).
-- Ten most frequent raw race labels, largest first.
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;

 * postgresql://jupyter:***@localhost:5432/expunge
10 rows affected.


Race,count
White Caucasian(Non-Hispanic),3517348
Black(Non-Hispanic),2501329
White Caucasian (Non-Hispanic),1280318
Black (Non-Hispanic),1006077
Hispanic,291017
Other(Includes Not Applicable.. Unknown),109833
White,101797
Asian Or Pacific Islander,88771
Black,71787
MISSING,31319


## Common Code Sections
There are 28,084 distinct Code Sections in `expunge`

In [9]:
%%sql
SELECT COUNT(DISTINCT "CodeSection")
-- Number of distinct "CodeSection" values across all rows of expunge.
FROM expunge;

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


count
28084


However, as seen below in the `cumulative_percent` column, 60%+ of the cases appear to fall under the 22 most common code sections (those that each account for more than 1% of all cases)

In [10]:
%%sql
DROP TABLE IF EXISTS temp_top_codes;
-- The DROP above makes this cell safe to re-run within one session; without
-- it the CREATE fails because the temporary table already exists.
--
-- temp_top_codes holds, for every code section covering more than 1% of all
-- rows in expunge: its row count, its percentage of all rows (2 decimals),
-- and a running total of those percentages from most to least common.
CREATE TEMPORARY TABLE temp_top_codes AS
WITH code_distribution AS (
    SELECT
        "CodeSection",
        COUNT(*),
        ROUND(
            -- SUM(COUNT(*)) OVER () is the grand total across all groups.
            COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (),
        2) AS percent
    FROM expunge
    GROUP BY "CodeSection"
)
SELECT
    *,
    SUM(percent) OVER (
        -- "CodeSection" breaks ties between equal rounded percents so the
        -- running total is deterministic.
        ORDER BY percent DESC, "CodeSection"
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative_percent
FROM code_distribution
WHERE percent > 1.0
ORDER BY percent DESC, "CodeSection";

 * postgresql://jupyter:***@localhost:5432/expunge
22 rows affected.


[]

In [11]:
%%sql
SELECT *
FROM temp_top_codes
-- Table rows have no inherent order; sort explicitly so the ranking and the
-- running total read top-down.
ORDER BY cumulative_percent;

 * postgresql://jupyter:***@localhost:5432/expunge
22 rows affected.


CodeSection,count,percent,cumulative_percent
A.46.2-862,820869,9.07,9.07
B.46.2-301,808278,8.93,18.0
46.2-300,557710,6.16,24.16
C.46.2-862,360087,3.98,28.14
18.2-250.1,319184,3.53,31.67
A.18.2-266,268080,2.96,34.63
18.2-95,232460,2.57,37.2
18.2-250,214542,2.37,39.57
A.46.2-852,207907,2.3,41.87
18.2-57,206629,2.28,44.15


In [12]:
# Load the race recorded on every row charged under code section A.46.2-862
# into pandas so the labels can be cleaned.
# Plain (non-f) string: the query contains no placeholders to interpolate.
df = pd.read_sql("""
    SELECT
        "CodeSection",
        "Race"
    FROM expunge
    WHERE "CodeSection" IN ('A.46.2-862')
""", engine)

df.head()

Unnamed: 0,CodeSection,Race
0,A.46.2-862,White Caucasian(Non-Hispanic)
1,A.46.2-862,Hispanic
2,A.46.2-862,Black(Non-Hispanic)
3,A.46.2-862,Black(Non-Hispanic)
4,A.46.2-862,Black(Non-Hispanic)


In [13]:
# Frequency of each raw race label, most common first.
df.loc[:, 'Race'].value_counts()

White Caucasian(Non-Hispanic)                  412730
Black(Non-Hispanic)                            263714
Other(Includes Not Applicable.. Unknown)        41579
Hispanic                                        41557
Asian Or Pacific Islander                       15602
MISSING                                         12228
White Caucasian (Non-Hispanic)                   8428
Unknown (Includes Not Applicable.. Unknown)      7883
Black (Non-Hispanic)                             6093
American Indian                                  3422
White                                            3399
Black                                            2500
Other (Includes Not Applicable.. Unknown)        1034
Unknown                                           552
Asian or Pacific Islander                         109
American Indian or Alaskan Native                  31
American Indian Or Alaskan Native                   8
Name: Race, dtype: int64

In [14]:
def standardize_race_field(race: pd.Series) -> pd.Series:
    """Collapse free-text race labels into a small set of upper-case categories.

    Each value is upper-cased, any parenthesised qualifier such as
    ``(Non-Hispanic)`` is removed, surrounding whitespace is stripped, and a
    few synonymous labels are merged (``WHITE CAUCASIAN`` -> ``WHITE``,
    ``AMERICAN INDIAN`` -> ``AMERICAN INDIAN OR ALASKAN NATIVE``,
    ``OTHER`` / ``MISSING`` -> ``UNKNOWN``).

    Args:
        race: Series of raw race labels (strings).

    Returns:
        A new Series with the same index holding the standardized labels.
        Labels not covered by the synonym mapping are returned in their
        upper-cased, stripped form.
    """
    # Whole-value synonyms, matched after upper-casing and stripping.
    synonyms = {
        'WHITE CAUCASIAN': 'WHITE',
        'AMERICAN INDIAN': 'AMERICAN INDIAN OR ALASKAN NATIVE',
        'OTHER': 'UNKNOWN',
        'MISSING': 'UNKNOWN',
    }
    return (
        race.str.upper()
            # Raw string: '\(' is not a valid Python string escape. The
            # non-greedy match removes each parenthesised group on its own.
            .str.replace(r'\(.*?\)', '', regex=True)
            .str.strip()
            .replace(synonyms)
    )

In [15]:
# Frequency of each race label after standardization.
standardize_race_field(df['Race']).value_counts()

WHITE                                424557
BLACK                                272307
UNKNOWN                               63276
HISPANIC                              41557
ASIAN OR PACIFIC ISLANDER             15711
AMERICAN INDIAN OR ALASKAN NATIVE      3461
Name: Race, dtype: int64