## Record Counts

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

from sqlalchemy import create_engine

In [None]:
# Open one shared connection to the "kcmo-mc" PostgreSQL database; the URL
# names only the database (no host or credentials are given in it).
DB_URL = "postgresql:///kcmo-mc"
engine = create_engine(DB_URL)
db_conn = engine.connect()

In [None]:
# raw_court.dssgcharge: total rows next to the number of distinct case_num
# values, so the two can be compared at a glance.
dssgcharge_sql = """
select 
	count (*) as num_rows,
	count(distinct case_num) as num_cases
FROM raw_court.dssgcharge
"""
charge_counts = pd.read_sql(sql=dssgcharge_sql, con=db_conn)
charge_counts

In [None]:
# raw_court.dssgcharge6222022: total rows next to the number of distinct
# case_num values. NOTE: this rebinds `charge_counts` from the previous cell.
dssgcharge6222022_sql = """
select 
	count (*) as num_rows,
	count(distinct case_num) as num_cases
FROM raw_court.dssgcharge6222022
"""
charge_counts = pd.read_sql(sql=dssgcharge6222022_sql, con=db_conn)
charge_counts

In [None]:
# Counts of records in charge_no_prob_fixed + charge_no_prob_b_fixed.
#
# UNION ALL (not UNION) is required here: UNION removes duplicate rec_ids
# before the outer query runs, which would force num_rows to equal
# num_records and hide any duplicated records within or across the files.
#   num_rows    -> raw row count of both tables combined
#   num_records -> number of distinct rec_id values
s = """
with all_recs as (
    select rec_id
    from raw_court.charge_no_prob_fixed

    UNION ALL

    select rec_id
    from raw_court.charge_no_prob_b_fixed
)
select
    count(*) as num_rows,
    count(distinct rec_id) as num_records
FROM all_recs
"""
rec_counts = pd.read_sql(s, db_conn)
rec_counts

In [None]:
# Counts of records/cases in caserec_fixed + caserec_b_fixed, and how many of
# those cases also appear in raw_court.dssgcharge.
#
# Two things keep num_rows meaningful (raw row count of both files combined):
#   * UNION ALL keeps duplicate (rec_id, case_num) rows instead of silently
#     dropping them before counting.
#   * dssgcharge is reduced to distinct case_num values before joining, so a
#     case with several charge rows does not multiply the caserec rows.
# The distinct counts (num_recs, num_cases, num_charge_probations) are not
# affected by either change.
s = """
with all_recs as (
    select rec_id, case_num
    from raw_court.caserec_fixed
    UNION ALL
    select rec_id, case_num
    from raw_court.caserec_b_fixed
),
charge_cases as (
    select distinct case_num
    from raw_court.dssgcharge
)
select
    count(*) as num_rows,
    count(distinct all_recs.rec_id) as num_recs,
    count(distinct all_recs.case_num) as num_cases,
    count(distinct d.case_num) as num_charge_probations
FROM all_recs
LEFT JOIN charge_cases d on all_recs.case_num = d.case_num;
"""
case_rec_counts = pd.read_sql(s, db_conn)
case_rec_counts

In [None]:
# Counts of records in casecont_a_fixed + casecont_b_fixed.
#
# UNION ALL (not UNION) so that num_rows is the raw combined row count;
# with UNION the rec_ids are de-duplicated first and num_rows could never
# differ meaningfully from num_recs.
s = """
with all_conts as (
    select rec_id
    from raw_court.casecont_a_fixed
    UNION ALL
    select rec_id
    from raw_court.casecont_b_fixed
)
select
    count(*) as num_rows,
    count(distinct all_conts.rec_id) as num_recs
FROM all_conts
"""
case_cont_counts = pd.read_sql(s, db_conn)
case_cont_counts

In [None]:
# raw_court.dssgname: total rows next to the number of distinct case_num
# values.
dssgname_sql = """
select 
	count (*) as num_rows,
	count(distinct case_num) as num_cases
FROM raw_court.dssgname
"""
case_counts = pd.read_sql(sql=dssgname_sql, con=db_conn)
case_counts

In [None]:
# Counts of cases in dssgnamenoprob_a + dssgnamenoprob_b.
#
# UNION ALL (not UNION) so that num_rows is the raw combined row count;
# UNION would de-duplicate case_num first, making num_rows and num_cases
# indistinguishable. NOTE: this rebinds `case_counts` from the dssgname cell.
s = """
with all_cases as (
    select case_num
    from raw_court.dssgnamenoprob_a

    UNION ALL

    select case_num
    from raw_court.dssgnamenoprob_b
)
select
    count(*) as num_rows,
    count(distinct case_num) as num_cases
FROM all_cases
"""
case_counts = pd.read_sql(s, db_conn)
case_counts

In [None]:
# How many distinct rec_ids from the caserec files also have a row in the
# charge_no_prob files. Compare the result with the total number of distinct
# caserec rec_ids to see whether every record has a charge entry.
# NOTE: this rebinds `case_rec_counts`.
recs_with_charges_sql = """
WITH all_recs AS (
    SELECT rec_id, case_num 
    FROM raw_court.caserec_fixed 
    UNION 
    SELECT rec_id, case_num 
    FROM raw_court.caserec_b_fixed
    ),
all_charges AS(
    SELECT rec_id 
   FROM raw_court.charge_no_prob_fixed
   UNION
   SELECT rec_id 
   FROM  raw_court.charge_no_prob_b_fixed
)
SELECT count(distinct all_recs.rec_id)
    FROM all_recs
    INNER JOIN
    all_charges ON all_recs.rec_id = all_charges.rec_id;
"""
case_rec_counts = pd.read_sql(sql=recs_with_charges_sql, con=db_conn)
case_rec_counts

In [None]:
# Pull the identifiers into pandas and build Python sets so the overlap
# between the caserec files, the charge_no_prob files and dssgcharge can be
# inspected with set operations.

# (rec_id, case_num) pairs from both caserec files.
s = """
WITH all_recs AS (
    SELECT rec_id, case_num
    FROM raw_court.caserec_fixed
    UNION
    SELECT rec_id, case_num
    FROM raw_court.caserec_b_fixed
    )
SELECT * FROM all_recs;
"""

# rec_ids from both charge_no_prob files.
s2 = """
WITH no_prob_charges AS(
    SELECT rec_id
    FROM raw_court.charge_no_prob_fixed
    UNION
    SELECT rec_id
    FROM raw_court.charge_no_prob_b_fixed
)
SELECT * from no_prob_charges
"""

# case_num of every row in dssgcharge (duplicates collapse in the set below).
s3 = """
SELECT case_num
FROM raw_court.dssgcharge;
"""

# Each query is executed exactly once (the caserec query used to run twice).
all_recs = pd.read_sql(s, db_conn)
no_prob_charges = pd.read_sql(s2, db_conn)
prob_charges = pd.read_sql(s3, db_conn)

all_recs_set = set(all_recs['rec_id'])
no_prob_charges_set = set(no_prob_charges['rec_id'])
prob_charge_set = set(prob_charges['case_num'])
recs_cases_set = set(all_recs['case_num'])

In [None]:
# Report the size of each set comparison: rec_id overlap between the caserec
# and charge_no_prob data, then case_num overlap between caserec and
# dssgcharge. Each entry pairs the printed label with the set to be measured.
overlap_report = [
    ("number of total records:", all_recs_set | no_prob_charges_set),
    ("number of shared records no prob:", all_recs_set & no_prob_charges_set),
    ("number of records only in case_recs:", all_recs_set - no_prob_charges_set),
    ("number of records only in no_prob charges:", no_prob_charges_set - all_recs_set),
    ("number of shared cases in records and dssgcharge: ", recs_cases_set & prob_charge_set),
]
for label, members in overlap_report:
    print(label, len(members))