In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

from sqlalchemy import create_engine
# Database URL for the "kcmo-mc" PostgreSQL database. No host, user or password
# is given in the URL itself, so the driver's defaults apply.
DB_URL = "postgresql:///kcmo-mc"

# Shared engine; each query cell below opens its own short-lived connection
# from it with `with engine.connect() as db_conn:`.
engine = create_engine(DB_URL)


In [None]:
# Count rows per statute/ordinance. The query LEFT JOINs pipeline.cohort to
# clean.dispositions on person_id and on cohort_date == disp_date (both cast to
# date), then groups by statute_ord and orders by the count, largest first.
# Because it is a LEFT JOIN, cohort rows without a matching disposition are
# counted under a NULL statute_ord.
q_dat = """
with data as(
select 
    c.*,
    d.disp_date::date,
    d.statute_ord 
from pipeline.cohort c
left join clean.dispositions d
on c.person_id = d.person_id
    and c.cohort_date::date = d.disp_date::date
)
select 
    statute_ord,
    count(*)
from data
group by statute_ord
order by count desc;
"""
with engine.connect() as db_conn:
    dat = pd.read_sql(sql=q_dat, con=db_conn)

# Total number of counted rows (the original author recorded this as 114024),
# used to turn each statute's count into a share of the total.
counts_sum = dat['count'].sum()
dat = dat.assign(proportion=dat['count'].div(counts_sum))
dat

In [None]:
# Row-level version of the cohort/disposition join: for every cohort row, pull
# the disposition date, statute ordinance and charge description of the
# dispositions recorded for the same person on the cohort date.
# The join is a LEFT JOIN, so cohort rows with no matching disposition come
# back with NULL disp_date / statute_ord / charge_desc.
q_dat_disp = """
select 
    c.person_id, 
    disp_date::date,
    d.statute_ord, 
    d.chrg_desc as charge_desc
from pipeline.cohort c
left join clean.dispositions d
on c.person_id = d.person_id
    and c.cohort_date::date = d.disp_date::date
"""
with engine.connect() as db_conn:
    disp = pd.read_sql(sql=q_dat_disp, con=db_conn)

In [None]:
# Create the list of the top 20 statute ordinances.
# `dat` comes back from the query ordered by count (descending), so the first
# 20 rows are the 20 most frequent ordinances. Rows with a NULL statute_ord
# (cohort rows that had no matching disposition in the LEFT JOIN) are dropped
# first: otherwise a NULL could occupy one of the 20 slots and later produce an
# indicator column literally named "None" that is always 0.
# `.head(20)` is used instead of `dat.index < 20` so the selection is by
# position and does not depend on `dat` still having its default 0..n-1 index.
top20 = dat['statute_ord'].dropna().head(20).tolist()

# Populate a column statute_ord_top20 with the ordinance when it is in the
# top 20, and code everything else (including missing ordinances) as 'other'.
disp['statute_ord_top20'] = disp['statute_ord'].where(
    disp['statute_ord'].isin(top20), 'other'
)

# 'other' is appended last so the downstream loop also builds an indicator
# column for the catch-all category (21 columns in total).
top20.append('other')

In [None]:
# One 0/1 indicator column per entry of `top20` (the 20 ordinances plus
# 'other'): 1 where the row's statute_ord_top20 equals that entry, else 0.
# The new column is named after the entry's string form.
top20_labels = disp['statute_ord_top20']
for statute in top20:
    is_this_statute = top20_labels == statute
    disp[str(statute)] = np.where(is_this_statute, 1, 0)

In [None]:
# Keyword groups used to flag charge descriptions. Each key becomes the name of
# a 0/1 indicator column; a row is flagged when its lower-cased charge
# description contains ANY of the group's terms as a substring.
#
# Notes:
#   * Every term must be lower-case, because it is compared against the
#     lower-cased description ("MV" could never match and is now "mv").
#   * Every term must be followed by a comma. Two adjacent string literals
#     without a comma are silently concatenated by Python; this previously
#     merged "retail alco"+"intoxicated" and "drove slow"+"fail to stop" into
#     single terms that never matched.
#   * Matching is plain substring matching, so short terms such as "age",
#     "cat" or "mv" also hit inside longer words.
keywords = {
    "kw_minor": ["minor", "age", "child", "chld", "school", "schl"],
    "kw_liquor": [
        "alcohol", "alc", "liq", "liquor", "intox", "retail alco",
        "intoxicated", "intoxication",
    ],
    "kw_animal": [
        "animal", "cat", "dog", "fowl", "livestock", "pigs",
        "anml", "neutering", "breeding", "pit bull",
    ],
    "kw_traffic": [
        "improper passing", "impr pass cutting in", "incr speed", "drove left",
        "no pass zone", "traffic", "speed", "sped", "yield", "stopsign",
        "mph", "drove", "drvr", "driving", "mv", "operating mv", "mtr",
        "follow too close", "chng lan", "rdce spd", "one way sign",
        "right turn", "pass veh", "drove slow",
        "fail to stop", "fail to yld", "fail to yield",
    ],
    "kw_speed": [
        "speed", "sped", "mph", "racing", "speeding", "sped const zn",
        "careless drive",
    ],
    "kw_trespass": [
        "tresp", "tress", "trespass", "trespas", "tresspass", "tresspas",
    ],
    "kw_housing": [
        "landlord", "hous", "housing", "occupancy", "building",
        "build code", "roof", "elec", "waste", "sewage", "sewer",
        "structure", "trash",
    ],
    "kw_weapon": [
        "wpn", "weapon", "gun", "missile", "handgun", "explosive", "bomb",
    ],
    "kw_stealing": ["larceny", "steal", "stealing", "theft", "stole", "stolen"],
    "kw_disturbance": [
        "peace", "loud", "noise", "music", "disturb", "disturbance", "nuisance",
    ],
    "kw_compliance": [
        "impeding", "impede", "obstruct", "resist", "contempt", "interfere",
        "comply", "fail comp", "fail comply", "hinder", "failure to comply",
        "fail to correct", "order to leave nuis", "nuisance",
    ],
    "kw_safety": [
        "inspect", "maintenance", "tamper", "open burning", "open flame",
        "freestand", "incendiary burn", "unsafe",
    ],
    "kw_prostitution": [
        "nude", "lewd", "indec", "indecent", "adlt entrtnmnt", "unclothed",
        "sex", "prost", "prostitution",
    ],
    "kw_smoking": [
        "smoking", "smoke", "smok", "vape", "vaping", "tobacco",
        "individual cig", "poss substance",
    ],
}

# Add one 0/1 indicator column per keyword group: 1 when the row's charge
# description contains any of the group's terms (case-insensitive substring
# match), else 0.
#
# The descriptions are normalised once, up front:
#   * missing values become '' so they are flagged 0 instead of raising
#     AttributeError on `.lower()` -- the LEFT JOIN that builds `disp` yields a
#     NULL charge_desc for cohort rows without a matching disposition;
#   * lower-casing is done a single time rather than once per row per group.
charge_desc_lower = disp['charge_desc'].fillna('').astype(str).str.lower()

for kw, terms in keywords.items():
    # Lower-case the terms as well, so a term written in upper case (e.g. "MV")
    # can still match the lower-cased description.
    terms_lower = [term.lower() for term in terms]
    has_any_term = [
        any(term in desc for term in terms_lower) for desc in charge_desc_lower
    ]
    disp[kw] = np.where(has_any_term, 1, 0)

In [None]:
# Non-null value counts of every column per (person_id, disp_date) pair.
# NOTE(review): only the last expression of a cell is rendered, so this result
# is not displayed; it is bound to a name here so it can be inspected in a
# later cell. The cell output itself is still `disp`.
group_keys = ['person_id', 'disp_date']
disp_counts_by_person_date = disp.groupby(group_keys).count()
disp

In [None]:
# Collapse `disp` to one row per (person_id, disp_date) by summing the 0/1
# indicator columns, giving per-person-per-date counts of each statute and
# keyword flag.
#
# `numeric_only=True` restricts the sum to numeric columns. Without it the text
# columns (statute_ord, charge_desc, statute_ord_top20) are summed too, which,
# depending on the pandas version, either concatenates the strings into
# meaningless values, silently drops the column, or raises a TypeError.
#
# NOTE(review): groupby drops rows whose key is missing by default, so cohort
# rows with a NULL disp_date (no matching disposition) do not appear in the
# result -- confirm that this is intended.
disp_grouped = (
    disp
    .groupby(['person_id', 'disp_date'])
    .sum(numeric_only=True)
    .reset_index()
)
disp_grouped

In [None]:
# Number of rows in `disp` for each person_id, most frequent first.
person_ids = disp['person_id']
person_ids.value_counts()