# Building a Racial Disparity Index for the Virginia Court Dataset
## 6/27/2022

In [1]:
from sqlalchemy import create_engine
import psycopg2 as db
import pandas as pd
import numpy as np
import os

In [2]:
# Read the PostgreSQL password from the POSTGRES_PASS environment variable so
# it is not hard-coded in the notebook. Raises KeyError if the variable is unset.
postPass=os.environ["POSTGRES_PASS"]

In [3]:
from urllib.parse import quote

# SQLAlchemy engine for the local "expunge" PostgreSQL database, connecting as
# user "jupyter" through the psycopg2 driver.
#
# The password is percent-encoded before it is placed in the URL: characters
# such as "@", ":", "/" or "%" would otherwise be parsed as URL syntax and
# yield a wrong host or a failed login. safe="" makes quote() encode "/" too.
# NOTE(review): assumes POSTGRES_PASS holds the raw (not already URL-encoded)
# password — confirm.
engine = create_engine(
    "postgresql+psycopg2://{user}:{pw}@localhost/{db}".format(
        user="jupyter", pw=quote(postPass, safe=""), db="expunge"
    )
)

## This is the 'census_long' table:

In [4]:
# Load every row and column of the census_long table into a DataFrame.
# As the last expression of the cell, the DataFrame is displayed as the output.
myquery = """
SELECT * from census_long
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,FIPS,Jurisdiction,total_pop,race,population,proportion
0,1,Accomack County,32316.0,White,21899.0,0.677652
1,3,Albemarle County,109330.0,White,89388.0,0.817598
2,5,Alleghany County,14860.0,White,13783.0,0.927524
3,7,Amelia County,13145.0,White,10050.0,0.764549
4,9,Amherst County,31605.0,White,24299.0,0.768834
...,...,...,...,...,...,...
793,800,Suffolk city,92108.0,Hispanic,4300.0,0.046684
794,810,Virginia Beach city,449974.0,Hispanic,38235.0,0.084972
795,820,Waynesboro city,22630.0,Hispanic,1966.0,0.086876
796,830,Williamsburg city,14954.0,Hispanic,1069.0,0.071486


## Here is the count of the number of cases within each FIPS/race/code section combination:

In [5]:
# Count the rows of `charges` for each (fips, race, code_section) combination.
# The count is returned as total_count_race.
myquery = """
SELECT fips, race, code_section, COUNT(*) AS total_count_race 
FROM charges
GROUP BY fips, race, code_section
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,race,code_section,total_count_race
0,001,American Indian or Alaskan Native,18.2-102,1
1,001,American Indian or Alaskan Native,18.2-388,4
2,001,American Indian or Alaskan Native,18.2-57,1
3,001,American Indian or Alaskan Native,46.2-300,1
4,001,American Indian or Alaskan Native,46.2-862,2
...,...,...,...,...
187104,840,White,NO DMV,10
187105,840,White,Z.18.2-47,4
187106,840,White,Z.18.2-67.5:1,1
187107,840,White,Z.18.2-89,7


## Here's the count by just FIPS/code section:

In [6]:
# Count the rows of `charges` for each (fips, code_section) combination,
# pooling all races. The count is returned as total_count_overall.
myquery = """
SELECT fips, code_section, COUNT(*) AS total_count_overall 
FROM charges
GROUP BY fips, code_section
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,code_section,total_count_overall
0,001,00-3.1,1
1,001,002-06,1
2,001,002-14,1
3,001,003.1,1
4,001,01-1,1
...,...,...,...
109451,840,Z.18.2-67.5:1,1
109452,840,Z.18.2-89,7
109453,840,Z.18.2-90,1
109454,840,Z.18.2-91,116


## Merging the previous two tables to get the fips/race/code section and fips/code section counts side by side:

In [7]:
# Join the per-race counts (subquery c) to the all-race counts (subquery d)
# on fips and code_section, so every (fips, race, code_section) row carries
# both total_count_race and the matching total_count_overall.
myquery = """
SELECT c.fips, c.race, c.code_section, c.total_count_race, d.total_count_overall
FROM (SELECT fips, race, code_section, COUNT(*) AS total_count_race 
    FROM charges
    GROUP BY fips, race, code_section) c
INNER JOIN (SELECT fips, code_section, COUNT(*) AS total_count_overall 
    FROM charges
    GROUP BY fips, code_section) d
    ON c.fips = d.fips AND c.code_section=d.code_section
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,race,code_section,total_count_race,total_count_overall
0,001,American Indian or Alaskan Native,18.2-102,1,54
1,001,American Indian or Alaskan Native,18.2-388,4,1067
2,001,American Indian or Alaskan Native,46.2-300,1,5438
3,001,American Indian or Alaskan Native,46.2-862,2,116
4,001,American Indian or Alaskan Native,A.46.2-853,1,409
...,...,...,...,...,...
187104,840,White,B.46.2-873,1,1
187105,840,White,B.46.2-878,1,1
187106,840,White,C.18.2-266,183,233
187107,840,White,C.46.2-862,201,287


We can write the same query more readably with common table expressions (a `WITH` clause) instead of nested subqueries:

In [12]:
# Same join as the nested-subquery version, with the two aggregates named as
# common table expressions: `byrace` (counts per fips/race/code_section) and
# `overall` (counts per fips/code_section).
myquery = """
WITH byrace AS (
    SELECT fips, race, code_section, COUNT(*) AS total_count_race 
    FROM charges
    GROUP BY fips, race, code_section),
overall AS (
    SELECT fips, code_section, COUNT(*) AS total_count_overall 
    FROM charges
    GROUP BY fips, code_section)
SELECT c.fips, c.race, c.code_section, c.total_count_race, d.total_count_overall
FROM byrace c
INNER JOIN overall d
    ON c.fips = d.fips AND c.code_section=d.code_section
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,race,code_section,total_count_race,total_count_overall
0,001,Black,00-3.1,1,1
1,001,White,002-06,1,1
2,001,White,002-14,1,1
3,001,Black,003.1,1,1
4,001,White,01-1,1,1
...,...,...,...,...,...
187104,840,Black,Z.18.2-90,1,1
187105,840,White,Z.18.2-91,90,116
187106,840,Hispanic,Z.18.2-91,3,116
187107,840,Black,Z.18.2-91,23,116


## Joining both tables with census long:

In [15]:
# Extend the byrace/overall join with an inner join to census_long on FIPS
# and race. charges.fips is cast to int to compare with census_long."FIPS"
# (presumably because fips is stored as zero-padded text such as '001' —
# TODO confirm the column types).
# Because the join is INNER, (fips, race) combinations with no matching
# census_long row are dropped from the result.
# No census_long column is selected yet; this cell only checks the join.
myquery = """
WITH byrace AS (
    SELECT fips, race, code_section, COUNT(*) AS total_count_race 
    FROM charges
    GROUP BY fips, race, code_section),
overall AS (
    SELECT fips, code_section, COUNT(*) AS total_count_overall 
    FROM charges
    GROUP BY fips, code_section)
SELECT c.fips, c.race, c.code_section, c.total_count_race, d.total_count_overall
FROM byrace c
INNER JOIN overall d
    ON c.fips = d.fips AND c.code_section=d.code_section
INNER JOIN census_long l
    ON cast(c.fips as int) = l."FIPS" AND l.race=c.race
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,race,code_section,total_count_race,total_count_overall
0,001,Black,00-3.1,1,1
1,001,White,002-06,1,1
2,001,White,002-14,1,1
3,001,Black,003.1,1,1
4,001,White,01-1,1,1
...,...,...,...,...,...
185595,840,Black,Z.18.2-90,1,1
185596,840,Black,Z.18.2-91,23,116
185597,840,White,Z.18.2-91,90,116
185598,840,Hispanic,Z.18.2-91,3,116


## Generating the Disparity Index

In [46]:
# Disparity index for each (fips, race, code_section):
#
#     (total_count_race / total_count_overall) / proportion
#
# i.e. the race's share of the charges under a code section in a locality,
# divided by that race's share of the locality's population (census_long).
# A value of 1 means the two shares are equal; values above 1 mean the race
# is over-represented among those charged.
#
# The counts are cast to decimal so the division is not integer division.
# NULLIF(..., 0) guards the final division: a census proportion of exactly 0
# would otherwise raise "division by zero" and abort the whole query. Such
# rows now get a NULL disparity_index instead. Note that PostgreSQL sorts
# NULLs first under ORDER BY ... DESC, so downstream rankings may want
# NULLS LAST if any such rows exist.
myquery = """
WITH byrace AS (
    SELECT fips, race, code_section, COUNT(*) AS total_count_race 
    FROM charges
    GROUP BY fips, race, code_section),
overall AS (
    SELECT fips, code_section, COUNT(*) AS total_count_overall 
    FROM charges
    GROUP BY fips, code_section)
SELECT c.fips, l."Jurisdiction" as jurisdiction, c.race, c.code_section, c.total_count_race, d.total_count_overall, l.proportion,
    (cast(c.total_count_race as decimal)/cast(d.total_count_overall as decimal))/NULLIF(cast(l.proportion as decimal), 0) as disparity_index
FROM byrace c
INNER JOIN overall d
    ON c.fips = d.fips AND c.code_section=d.code_section
INNER JOIN census_long l
    ON cast(c.fips as int) = l."FIPS" AND l.race=c.race
"""
pd.read_sql_query(myquery, con=engine)

In [47]:
# One-off step: run the disparity-index query held in `myquery` and write the
# result to a database table named "disparity", which the analysis cells below
# read from. if_exists='replace' overwrites any existing table of that name.
# NOTE(review): presumably left commented out so that re-running the notebook
# does not rewrite the table — confirm before deleting.
#disp = pd.read_sql_query(myquery, con=engine)
#disp.to_sql("disparity", con=engine, index=False, if_exists='replace')

# Analysis:

## Sorting the Disparity Index and Filtering to Only Charges with > 1000 Occurrences:

In [49]:
# Read the `disparity` table, keeping only fips/code_section combinations with
# more than 1000 charges overall, sorted from highest to lowest disparity_index.
myquery = """
SELECT * FROM disparity
WHERE total_count_overall > 1000
ORDER BY disparity_index DESC;
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,jurisdiction,race,code_section,total_count_race,total_count_overall,proportion,disparity_index
0,041,Chesterfield County,Unknown,13-60,1990,2638,0.007208,104.655718
1,081,Greensville County,Unknown,46.2-300,769,4345,0.002911,60.797043
2,035,Carroll County,Unknown,A.46.2-862,841,4713,0.003793,47.044103
3,113,Madison County,Unknown,A.46.2-862,164,1256,0.003016,43.288296
4,081,Greensville County,Unknown,A.46.2-862,12814,102739,0.002911,42.844520
...,...,...,...,...,...,...,...,...
8564,013,Arlington County,Hispanic,18.2-388,1,1437,0.156015,0.004460
8565,059,Fairfax County,Hispanic,18.2-108.01,2,2969,0.165150,0.004079
8566,153,Prince William County,Hispanic,18.2-192,1,1114,0.244874,0.003666
8567,059,Fairfax County,Hispanic,18.2-178,2,3512,0.165150,0.003448


## Considering just African Americans:

In [51]:
# Same ranking as the unfiltered query, restricted to rows where race = 'Black'.
myquery = """
SELECT * FROM disparity
WHERE total_count_overall > 1000 AND race='Black'
ORDER BY disparity_index DESC;
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,jurisdiction,race,code_section,total_count_race,total_count_overall,proportion,disparity_index
0,035,Carroll County,Black,46.2-300,1007,5054,0.008828,22.569585
1,035,Carroll County,Black,C.46.2-862,3058,15878,0.008828,21.815794
2,035,Carroll County,Black,18.2-250.1,256,1471,0.008828,19.713177
3,035,Carroll County,Black,A.46.2-862,677,4713,0.008828,16.271237
4,191,Washington County,Black,A.46.2-862,1150,6696,0.015184,11.310711
...,...,...,...,...,...,...,...,...
1493,027,Buchanan County,Black,B.46.2-301,5,2109,0.033803,0.070135
1494,027,Buchanan County,Black,18.2-172,4,1733,0.033803,0.068282
1495,027,Buchanan County,Black,18.2-95,3,1347,0.033803,0.065887
1496,105,Lee County,Black,18.2-172,4,1688,0.037228,0.063652


## Excluding traffic violations

In [52]:
# Black defendants only, code sections with more than 1000 charges in the
# locality, excluding traffic violations (Title 46.2 of the Code of Virginia).
#
# The title is the first number in code_section, optionally preceded by a
# non-numeric prefix (e.g. "46.2-300", "A.46.2-862", "C.46.2-862").
# The earlier filter, NOT LIKE '%%46.%%', matched "46." anywhere in the string
# and so also dropped non-traffic sections such as 18.2-46.2. The POSIX regex
# below rejects a row only when its leading number is 46.
# NOTE(review): assumes any prefix in front of the title contains no digits —
# confirm against the distinct code_section values in `charges`.
myquery = """
SELECT * FROM disparity
WHERE total_count_overall > 1000 AND race='Black' AND code_section !~ '^[^0-9]*46[.]'
ORDER BY disparity_index DESC;
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,jurisdiction,race,code_section,total_count_race,total_count_overall,proportion,disparity_index
0,035,Carroll County,Black,18.2-250.1,256,1471,0.008828,19.713177
1,165,Rockingham County,Black,18.2-248,1040,3945,0.025406,10.376335
2,035,Carroll County,Black,18.2-172,106,1396,0.008828,8.601016
3,165,Rockingham County,Black,19.2-306,377,1778,0.025406,8.345786
4,013,Arlington County,Black,18.2-195,842,1063,0.096782,8.184366
...,...,...,...,...,...,...,...,...
1002,105,Lee County,Black,18.2-388,3,1102,0.037228,0.073125
1003,027,Buchanan County,Black,18.2-172,4,1733,0.033803,0.068282
1004,027,Buchanan County,Black,18.2-95,3,1347,0.033803,0.065887
1005,105,Lee County,Black,18.2-172,4,1688,0.037228,0.063652


Additional analyses:
* Generate bar chart of top 10 (or 15, 20) code sections by racial disparity for African American, Hispanic, White, etc ...
* Think about how to create an overall disparity score across all code sections within fips/race, then plot these scores on a map
* Create table of titles for each code section and join into existing data

Additional things to clean:
* Check on what race categories are matched or not matched
* Add others here as you go

In [5]:
# Hispanic defendants only, code sections with more than 1000 charges in the
# locality, excluding traffic violations (Title 46.2 of the Code of Virginia).
#
# The title is the first number in code_section, optionally preceded by a
# non-numeric prefix (e.g. "46.2-300", "A.46.2-862", "C.46.2-862").
# The earlier filter, NOT LIKE '%%46.%%', matched "46." anywhere in the string
# and so also dropped non-traffic sections such as 18.2-46.2. The POSIX regex
# below rejects a row only when its leading number is 46.
# NOTE(review): assumes any prefix in front of the title contains no digits —
# confirm against the distinct code_section values in `charges`.
myquery = """
SELECT * FROM disparity
WHERE total_count_overall > 1000 AND race='Hispanic' AND code_section !~ '^[^0-9]*46[.]'
ORDER BY disparity_index DESC;
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,jurisdiction,race,code_section,total_count_race,total_count_overall,proportion,disparity_index
0,059,Fairfax County,Hispanic,29.1-335,378,1027,0.165150,2.228654
1,003,Albemarle County,Hispanic,A.18.2-266,254,2489,0.057743,1.767309
2,171,Shenandoah County,Hispanic,18.2-248,259,2235,0.073528,1.576047
3,019,Bedford County,Hispanic,29.1-335,49,1572,0.023672,1.316778
4,067,Franklin County,Hispanic,A.18.2-266,71,1889,0.028782,1.305887
...,...,...,...,...,...,...,...,...
907,013,Arlington County,Hispanic,18.2-388,1,1437,0.156015,0.004460
908,059,Fairfax County,Hispanic,18.2-108.01,2,2969,0.165150,0.004079
909,153,Prince William County,Hispanic,18.2-192,1,1114,0.244874,0.003666
910,059,Fairfax County,Hispanic,18.2-178,2,3512,0.165150,0.003448


In [10]:
# All races in a single locality: rows of `disparity` for Fairfax County with
# more than 1000 charges overall, sorted from highest to lowest disparity_index.
myquery = """
SELECT * FROM disparity
WHERE total_count_overall > 1000 AND jurisdiction = 'Fairfax County'
ORDER BY disparity_index DESC;
"""
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,fips,jurisdiction,race,code_section,total_count_race,total_count_overall,proportion,disparity_index
0,059,Fairfax County,Unknown,A.46.2-862,6775,56937,0.006187,19.231855
1,059,Fairfax County,Unknown,A.46.2-853,355,3371,0.006187,17.020647
2,059,Fairfax County,Unknown,C.46.2-862,3978,47257,0.006187,13.605205
3,059,Fairfax County,Unknown,B.46.2-853,109,1303,0.006187,13.520369
4,059,Fairfax County,Unknown,36-106,128,2089,0.006187,9.903256
...,...,...,...,...,...,...,...,...
405,059,Fairfax County,Hispanic,5-1-1,25,16917,0.165150,0.008948
406,059,Fairfax County,Hispanic,18.2-137,4,3695,0.165150,0.006555
407,059,Fairfax County,Hispanic,18.2-192,3,3534,0.165150,0.005140
408,059,Fairfax County,Hispanic,18.2-108.01,2,2969,0.165150,0.004079


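Example (notes): Hispanic residents of Fairfax County and code section 29.1-335 (fishing without a license)
* Total population of Fairfax County = 1,337,047
* Hispanic population = 189,515, i.e. 14.2% of the population (189,515 / 1,337,047)
* 378 Hispanic defendants were charged with 29.1-335, out of 1,027 total charged in Fairfax County
* Hispanic defendants therefore make up 36.8% of those charged (378 / 1,027)
* Less than 1% of the Hispanic population was charged (378 / 189,515 ≈ 0.2%)
* Note: the `disparity` table lists a Hispanic proportion of 0.165150 for Fairfax County rather than 14.2%, which is why its disparity index for this row is 0.368 / 0.165 ≈ 2.23. The population figures above should be checked against `census_long`.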

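Open question: how can we query the `disparity` table so that it shows only specific code sections (for example, by adding a condition such as `code_section IN ('29.1-335', '18.2-248')` to the `WHERE` clause)?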