# Code Section Analysis

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Connection settings for the local PostgreSQL instance.
USER = 'jupyter'
# Read the secret from the environment so it never lives in the notebook;
# a missing variable raises KeyError here rather than failing later on connect.
PASSWORD = os.environ['POSTGRES_PASS']
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

# Percent-encode the password before embedding it in the URI: characters such
# as '@', ':', '/' or '%' would otherwise be read as URI delimiters/escapes and
# produce a wrong host or password. Passwords without special characters are
# left unchanged by the encoding.
DATABASE_URI = f"postgresql://{USER}:{quote(PASSWORD, safe='')}@{HOST}:{PORT}/{DB}"
engine = create_engine(DATABASE_URI)

In [3]:
# Load the `sql` IPython extension, then hand it DATABASE_URI (expanded from the
# Python variable defined in the configuration cell) as its connection string.
# Comments stay on their own lines: text after a line magic is passed to it.
%load_ext sql
%sql {DATABASE_URI}

Create a temporary table containing all code sections that each account for more than 1% of records

In [9]:
%%sql
DROP TABLE IF EXISTS pg_temp.temp_top_codes;

CREATE TEMPORARY TABLE temp_top_codes AS
-- The DROP above makes this cell safe to re-run in the same session. It is
-- qualified with pg_temp so that only the session-local temporary table can
-- ever be removed, never a permanent table of the same name.
--
-- code_distribution holds one row per code section with its record count and
-- its share of all records, rounded to two decimals. No ORDER BY is used
-- inside the CTE because ordering there is not guaranteed to carry through.
WITH code_distribution AS (
    SELECT
        "CodeSection",
        COUNT(*),
        ROUND(
            COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(),
        2) AS percent
    FROM expunge_clean
    GROUP BY "CodeSection"
)
-- Keep the sections whose rounded share is above one percent and add a running
-- total from the largest share downwards. "CodeSection" is a tie-breaker so
-- the running total is reproducible when two sections share the same percent.
SELECT
    *,
    SUM(percent) OVER(
        ORDER BY percent DESC, "CodeSection"
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative_percent
FROM code_distribution
WHERE percent > 1.0;

 * postgresql://jupyter:***@localhost:5432/expunge
22 rows affected.


[]

These top 22 code sections account for ~62% of the records

In [10]:
%%sql
SELECT *
FROM temp_top_codes
-- Without ORDER BY the row order of a table scan is unspecified. Every kept
-- row has a positive percent, so the running total grows with each row and
-- sorting on it reproduces the largest-share-first ranking deterministically.
ORDER BY cumulative_percent

 * postgresql://jupyter:***@localhost:5432/expunge
22 rows affected.


CodeSection,count,percent,cumulative_percent
A.46.2-862,820869,9.07,9.07
B.46.2-301,808278,8.93,18.0
46.2-300,557710,6.16,24.16
C.46.2-862,360087,3.98,28.14
18.2-250.1,319184,3.53,31.67
A.18.2-266,268080,2.96,34.63
18.2-95,232460,2.57,37.2
18.2-250,214542,2.37,39.57
A.46.2-852,207907,2.3,41.87
18.2-57,206629,2.28,44.15


## Code Sections by Race

Overall distribution of race in `expunge`

In [19]:
%%sql
WITH race_counts AS (
    -- One row per race with its number of records.
    SELECT
        "Race",
        COUNT(*) AS count
    FROM expunge_clean
    GROUP BY "Race"
)
-- Express each count as a share of the grand total, largest group first.
SELECT
    "Race",
    count,
    ROUND(100.0 * count / SUM(count) OVER (), 2) AS percent
FROM race_counts
ORDER BY count DESC

 * postgresql://jupyter:***@localhost:5432/expunge
6 rows affected.


Race,count,percent
White,4899463,54.11
Black,3579193,39.53
Hispanic,291017,3.21
Unknown,186198,2.06
Asian or Pacific Islander,89262,0.99
American Indian or Alaskan Native,9133,0.1


Below are the racial breakdowns for the top 22 code sections (as found above), with both counts and percentages.

In [18]:
%%sql
SELECT
    top_codes."CodeSection",
    records."Race",
    COUNT(*),
    -- Share of each race within its own code section, so the percentages of
    -- one section add up to roughly 100.
    ROUND(
        100.0 * COUNT(*)
            / SUM(COUNT(*)) OVER (PARTITION BY top_codes."CodeSection"),
        2
    ) AS percent
FROM expunge_clean AS records
-- The inner join limits the records to the code sections in temp_top_codes.
INNER JOIN temp_top_codes AS top_codes
    ON top_codes."CodeSection" = records."CodeSection"
GROUP BY
    top_codes."CodeSection",
    records."Race"
ORDER BY
    top_codes."CodeSection",
    COUNT(*) DESC

 * postgresql://jupyter:***@localhost:5432/expunge
132 rows affected.


CodeSection,Race,count,percent
18.2-103,White,67089,62.54
18.2-103,Black,37831,35.26
18.2-103,Hispanic,971,0.91
18.2-103,Asian or Pacific Islander,701,0.65
18.2-103,Unknown,633,0.59
18.2-103,American Indian or Alaskan Native,57,0.05
18.2-119,Black,58075,53.76
18.2-119,White,47865,44.31
18.2-119,Hispanic,732,0.68
18.2-119,Unknown,669,0.62
