In [1]:
from sqlalchemy import create_engine
import psycopg2 as db
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.options.display.max_rows = 999
pd.options.display.max_columns= 999

In [3]:
postPass=os.environ["POSTGRES_PASS"]

In [4]:
engine = create_engine("postgresql+psycopg2://{user}:{pw}@localhost/{db}"
                       .format(user="jupyter", pw=postPass, db="expunge"))

# Table 1: Metrics by FIPS code, code section, and race
One row for every code_section/fips/race, with columns for count within race, count overall, proportion, and disparity index. At the individual level

## a. individual level these results do not make sense

In [None]:
table_1a = """
WITH byrace AS (
    SELECT fips,race, code_section, COUNT(DISTINCT(person_id)) AS distinct_count_race 
    FROM charges
    GROUP BY fips, race, code_section),
overall AS (
    SELECT fips, code_section, COUNT(DISTINCT(person_id)) AS distinct_count_overall
    FROM charges
    GROUP BY fips, code_section)
SELECT c.race, c.code_section,c.fips, c.distinct_count_race , d.distinct_count_overall, l.proportion,
    (cast(c.distinct_count_race as decimal)/cast(d.distinct_count_overall as decimal))/cast(l.proportion as decimal) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.code_section=d.code_section
INNER JOIN census_long l
    ON l.race=c.race
WHERE distinct_count_overall > 1000
ORDER BY disparity_index DESC
"""
table1a = pd.read_sql_query(table_1a, con=engine)

In [None]:
table1a

## b. charge level

In [None]:
table_1b = """
WITH byrace AS (
    SELECT fips, race, code_section, COUNT(*) AS total_count_race 
    FROM charges
    GROUP BY fips, race, code_section),
overall AS (
    SELECT fips, code_section, COUNT(*) AS total_count_overall
    FROM charges
    GROUP BY fips, code_section)
SELECT c.fips, c.race, c.code_section,  c.total_count_race, d.total_count_overall, l.proportion,
    (cast(c.total_count_race as decimal)/cast(d.total_count_overall as decimal))/cast(l.proportion as decimal) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.code_section=d.code_section
INNER JOIN census_long l
    ON l.race=c.race
WHERE total_count_overall > 1000
ORDER BY disparity_index DESC
"""
table1b = pd.read_sql_query(table_1b, con=engine)

In [None]:
table1b

# Table 2: Metrics by code section and race
One row for every code_section/race, with columns for count within race, count overall, proportion, and disparity index. At the individual level

## a. individual level (for distinct_count_overall > 1000)

In [42]:
table_2a = """
WITH byrace AS (
    SELECT race, code_section, COUNT(DISTINCT(person_id)) AS distinct_count_race 
    FROM charges
    GROUP BY race, code_section),
overall AS (
    SELECT code_section, COUNT(DISTINCT(person_id)) AS distinct_count_overall
    FROM charges
    GROUP BY  code_section)
SELECT c.race, c.code_section, m.max_fips, c.distinct_count_race , d.distinct_count_overall, l.proportion,
    (cast(c.distinct_count_race as decimal)/cast(d.distinct_count_overall as decimal))/cast(l.proportion as decimal) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.code_section=d.code_section
INNER JOIN census_va l
    ON l.race=c.race
INNER JOIN maxfips m 
    ON m.code_section = c.code_section
WHERE distinct_count_overall > 1000
ORDER BY disparity_index DESC
"""
table2a = pd.read_sql_query(table_2a, con=engine)

In [43]:
table2a

Unnamed: 0,race,code_section,max_fips,distinct_count_race,distinct_count_overall,proportion,disparity_index
0,Unknown,13-60,041,1807,2348,0.038223,20.134436
1,Unknown,16-3.1,810,261,1462,0.038223,4.670599
2,Black,24-253,740,1648,1936,0.198806,4.281764
3,Black,29-48,711,3585,4382,0.198806,4.115169
4,Black,60.2-632,761,2006,2519,0.198806,4.005656
...,...,...,...,...,...,...,...
1772,Asian or Pacific Islander,24-253,740,1,1936,0.069089,0.007476
1773,Hispanic,18.2-308.1:4,810,1,1412,0.097759,0.007245
1774,Hispanic,29-8,711,2,2834,0.097759,0.007219
1775,Hispanic,46-157,550,2,3401,0.097759,0.006015


## b. charge level (where total_count_overall > 1000)

In [44]:
table_2b = """
WITH byrace AS (
    SELECT race, code_section, COUNT(*) AS total_count_race 
    FROM charges
    GROUP BY race, code_section),
overall AS (
    SELECT code_section, COUNT(*) AS total_count_overall
    FROM charges
    GROUP BY  code_section)
SELECT c.race, c.code_section, m.max_fips, c.total_count_race, d.total_count_overall, l.proportion,
    (cast(c.total_count_race as decimal)/cast(d.total_count_overall as decimal))/cast(l.proportion as decimal) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.code_section=d.code_section
INNER JOIN census_va l
    ON l.race=c.race
INNER JOIN maxfips m 
    ON m.code_section = c.code_section
WHERE total_count_overall > 1000
ORDER BY disparity_index DESC
"""
table2b = pd.read_sql_query(table_2b, con=engine)

In [45]:
table2b

Unnamed: 0,race,code_section,max_fips,total_count_race,total_count_overall,proportion,disparity_index
0,Unknown,13-60,041,1990,2638,0.038223,19.735935
1,Black,24-253,740,3372,3818,0.198806,4.442450
2,Unknown,16-3.1,810,482,2925,0.038223,4.311224
3,Black,29-48,711,6106,7170,0.198806,4.283596
4,Black,18.2-53.1,760,42793,51333,0.198806,4.193213
...,...,...,...,...,...,...,...
2008,Hispanic,24-13,650,3,6687,0.097759,0.004589
2009,Hispanic,46-157,550,2,4761,0.097759,0.004297
2010,Asian or Pacific Islander,24-253,740,1,3818,0.069089,0.003791
2011,Hispanic,19.2-152.4:1,153,1,3209,0.097759,0.003188


# Table 3: Metrics by FIPS code and race
One row for every fips/race, with columns for count within race, count overall, proportion, and disparity index. At the individual level

## a. individual level

In [11]:
table_3a = """
WITH byrace AS (
    SELECT fips,race, COUNT(DISTINCT(person_id)) AS distinct_count_race 
    FROM charges
    GROUP BY fips, race),
overall AS (
    SELECT fips, COUNT(DISTINCT(person_id)) AS distinct_count_overall
    FROM charges
    GROUP BY fips)
SELECT c.fips, c.race, c.distinct_count_race , d.distinct_count_overall, l.proportion,
    (cast(c.distinct_count_race as decimal)/cast(d.distinct_count_overall as decimal))/cast(l.proportion as decimal) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.fips = d.fips
INNER JOIN census_long l
    ON l.race=c.race
WHERE distinct_count_overall > 1000
ORDER BY disparity_index DESC
"""
table3a = pd.read_sql_query(table_3a, con=engine)

In [40]:
table_3a_test1 = """
WITH byrace AS (
    SELECT fips,race, COUNT(DISTINCT(person_id)) AS distinct_count_race 
    FROM charges
    GROUP BY fips, race),
overall AS (
    SELECT fips, COUNT(DISTINCT(person_id)) AS distinct_count_overall
    FROM charges
    GROUP BY fips)
SELECT c.fips, c.race, c.distinct_count_race , d.distinct_count_overall, l.proportion,
    (cast(c.distinct_count_race as decimal)/cast(d.distinct_count_overall as decimal))/cast(l.proportion as decimal) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.fips = d.fips
INNER JOIN census_long l
    ON l.race=c.race
WHERE distinct_count_overall > 1000 AND d.fips = '760'
ORDER BY disparity_index DESC
"""
table3a_test1 = pd.read_sql_query(table_3a_test1, con=engine)

In [41]:
table3a_test1.shape

(798, 6)

Our census_long table has 798 rows. For one FIPS code in table3a_test1 there are 798 rows. It is probably not a coincidence that these row values are the same. 

In [36]:
table3a_test1.shape

(798, 6)

In [20]:
table3a.query("fips == '760'").shape

(798, 6)

## b. charge level