In [8]:
from sqlalchemy import create_engine
import psycopg2 as db
import pandas as pd
import numpy as np
import os


postPass = os.environ["POSTGRES_PASS"]


engine = create_engine("postgresql+psycopg2://{user}:{pw}@localhost/{db}"
                      .format(user="jupyter", pw=postPass, db="expunge"))

In [2]:
myquery = """
WITH convicted AS (
SELECT 
    hearing_date
    , code_section
    , charge_type
    , charge_class
    , disposition_code
    , plea
    , race
    , sex
    , fips
    , 'Convicted' AS conviction
FROM charges
WHERE disposition_code = 'Guilty'
OR disposition_code = 'Guilty In Absentia'),

not_convicted AS (
SELECT 
    hearing_date
    , code_section
    , charge_type
    , charge_class
    , disposition_code
    , plea
    , race
    , sex
    , fips
    , 'Not Convicted' AS conviction
FROM charges
WHERE disposition_code <> 'Guilty'
AND disposition_code <> 'Guilty In Absentia'),

convictions AS (
SELECT * 
FROM convicted
UNION ALL
SELECT *
FROM not_convicted
)

SELECT 
    COUNT(conviction = 'Not Convicted') AS not_convicted
    , COUNT(conviction = 'Convicted') AS convicted
    , code_section
    , race
    , fips
FROM convictions
GROUP BY
    code_section
    , race
    , fips
ORDER BY convicted DESC
LIMIT 100
"""

result = pd.read_sql(myquery, con = engine)

In [3]:
result

Unnamed: 0,not_convicted,convicted,code_section,race,fips
0,57081,57081,A.46.2-862,White,081
1,53073,53073,46.2-300,White,059
2,36960,36960,B.46.2-301,White,059
3,32739,32739,B.46.2-301,Black,087
4,29946,29946,A.46.2-862,Black,081
...,...,...,...,...,...
95,7666,7666,18.2-95,Black,087
96,7569,7569,C.46.2-862,Black,810
97,7530,7530,B.46.2-301,Black,764
98,7516,7516,18.2-250.1,White,810


In [32]:
myquery = """
WITH convicted AS (
SELECT 
    CAST(COUNT(*) AS FLOAT) AS convicted
    , code_section
    , race
    , fips
FROM charges
WHERE disposition_code = 'Guilty'
OR disposition_code = 'Guilty In Absentia'
GROUP BY 
    code_section
    , race
    , fips
),

not_convicted AS (
SELECT 
    CAST(COUNT(*) AS FLOAT) AS not_convicted
    , code_section
    , race
    , fips
FROM charges
WHERE disposition_code <> 'Guilty'
AND disposition_code <> 'Guilty In Absentia'
GROUP BY
    code_section
    , race
    , fips)


SELECT 
    ROUND(CAST(convicted/(not_convicted + convicted) AS NUMERIC), 4) AS conviction_rate
    , convicted
    , not_convicted
    , c.code_section
    , c.race
    , c.fips
FROM convicted c
INNER JOIN not_convicted n
ON c.code_section = n.code_section
AND c.race = n.race
AND c.fips = n.fips
ORDER BY 
    conviction_rate

"""


convictions = pd.read_sql(myquery, con = engine)

In [33]:
convictions

Unnamed: 0,conviction_rate,convicted,not_convicted,code_section,race,fips
0,0.0030,1.0,336.0,19.2-358,Black,570
1,0.0032,1.0,310.0,19.2-124,Black,730
2,0.0033,1.0,300.0,19.2-152.4:1,Black,740
3,0.0041,3.0,721.0,18.2-53.1,Black,711
4,0.0054,1.0,185.0,18.2-48,Black,711
...,...,...,...,...,...,...
82842,0.9944,2824.0,16.0,A.46.2-862,Unknown,163
82843,0.9944,1762.0,10.0,A.46.2-862,Hispanic,025
82844,0.9950,398.0,2.0,C.46.2-862,Hispanic,025
82845,0.9964,1655.0,6.0,A.46.2-862,Unknown,117


In [None]:
myquery = """
SELECT *
FROM convictions
"""

result = pd.read_sql(myquery, con = engine)

In [9]:
result['conviction'].value_counts()

Convicted        5809895
Not Convicted    3243682
Name: conviction, dtype: int64

In [None]:
myquery = """
SELECT 
    COUNT(conviction) AS convictednum
    , conviction
    , code_section
    , race
    , fips
FROM convictions
GROUP BY
    conviction
    , code_section
    , race
    , fips
ORDER BY 
    race
    , fips
    , code_section
    , convictednum
LIMIT 200
"""

Can draw interpretation from random forest as opposed to other types of ML

In [None]:
myquery = """
SELECT *
FROM census
LIMIT 100
"""
result = pd.read_sql()