In [1]:
import pandas as pd
from PIL import Image,ImageDraw,ImageFont
import os
import numpy as np
import math

In [2]:
# Load the feature table; later cells read its `feature` and
# `feature_importance` columns.
feature_table = pd.read_csv("170728_2015-01-01_alphasort.csv")

In [3]:
# Keep only features whose importance exceeds the 0.00135 cut-off, renumber
# the surviving rows from 0, and collect their names in table order.
is_important = feature_table['feature_importance'] > 0.00135
feature_table = feature_table.loc[is_important].reset_index(drop=True)
top_feats = list(feature_table['feature'])

In [4]:
# Number of features that passed the importance threshold.
len(top_feats)

18

In [5]:
# Display the names of the retained features.
top_feats

['demarrests_id_p5y_populationdensity_avg',
 'dispatch_id_p1m_dispatchinitiatiationtype_ci_sum',
 'dispatch_id_p1y_dispatchinitiatiationtype_ci_sum',
 'dispatch_id_p3m_dispatchinitiatiationtype_ci_sum',
 'dispatch_id_p6m_dispatchinitiatiationtype_ci_sum',
 'dispatch_id_p3m_dispatchtype_assist_sum',
 'dispatch_id_p6m_dispatchtype_assist_sum',
 'dispatch_id_p3m_dispatchtype_disorder_sum',
 'dispatch_id_p6m_dispatchtype_disorder_sum',
 'dispatch_id_p3m_dispatchtype_domestic_disturb_sum',
 'dispatch_id_p3m_dispatchtype_theft_sum',
 'dispatch_id_p1m_dispatchtype_traffic_sum',
 'dispatch_id_p3m_dispatchtype_traffic_sum',
 'ocag_id_all_officerage_max',
 'ts_id_p1m_trafficstops_count',
 'ts_id_p3m_trafficstops_count',
 'ts_id_p3m_trafficstopsbyrace_black_sum',

#### Color settings

In [6]:
# Colour lookup keyed by short category codes; several keys ('ts', 'dispatch',
# 'demarrests') match the prefixes of feature names listed above.
# Values are kept as strings -- presumably hue angles in degrees; TODO confirm
# against the code that consumes this mapping.
cathue = dict(
    ts='0',
    fi='32',
    ir='60',
    shifts='115',
    dispatch='180',
    arrests='240',
    ic='270',
    demarrests='300',
    ocnd='200',
)

#### Train matrix

In [7]:
# Load the training matrix; its `outcome` column holds the labels.
train_matrix = pd.read_csv("170728_2015-01-01_train-matrix-and-labels.csv")
# Captured here, before any row filtering of train_matrix, so `labels`
# carries the index of the full, unfiltered matrix.
labels = train_matrix.outcome
# Deletion of `outcome` is disabled: later cells read train_matrix.outcome.
#del train_matrix['outcome']

In [8]:
# Drop rows whose officer-age feature is exactly 0
# (presumably a placeholder for a missing age -- TODO confirm).
train_matrix = train_matrix[train_matrix['ocag_id_all_officerage_max']!=0]
# `labels` was captured from the unfiltered matrix; re-derive it so it has
# exactly the same rows (and index) as the filtered train_matrix. Without
# this, boolean masks such as `train_matrix[labels == 1]` depend on implicit
# index re-alignment of a longer Series.
labels = train_matrix.outcome

In [9]:
# Class balance of the filtered training matrix (rows per outcome value).
train_matrix.outcome.value_counts()

0    7862
1     780
Name: outcome, dtype: int64

In [10]:
# Two-column view used for quick inspection: officer age alongside the outcome.
tmp = train_matrix.loc[:, ['ocag_id_all_officerage_max', 'outcome']]

In [11]:
# Ratio of positive (outcome == 1) to negative (outcome == 0) rows, computed
# from the data instead of hard-coding the counts shown by value_counts(), so
# it stays correct if the input file or the row filter changes.
# NOTE(review): this is positives / negatives, not positives / total -- if a
# class probability is intended, divide by the total row count instead.
prior = float((train_matrix.outcome == 1).sum()) / float((train_matrix.outcome == 0).sum())

In [12]:
# Spread (max - min) of the officer-age feature in the filtered matrix.
max(tmp.ocag_id_all_officerage_max) - min(tmp.ocag_id_all_officerage_max)

46.0

## Binning

In [17]:
def binticks_label(matrix,feat):
    """Return 101 evenly spaced bin edges spanning column ``feat`` of ``matrix``.

    The edges run from the column minimum to the column maximum inclusive,
    which defines 100 equal-width bins.

    ``np.linspace`` is used instead of ``np.arange(min, max + step, step)``:
    with a floating-point step, ``arange`` can emit one edge too many and its
    last edge can overshoot the maximum, so the number of bins was not
    guaranteed.  ``linspace`` always returns exactly 101 edges whose first and
    last values are the column minimum and maximum.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Table containing the column ``feat``.
    feat : column label
        Column whose value range is divided into bins.

    Returns
    -------
    numpy.ndarray
        101 float edges.  For a constant column all edges are equal, which
        ``pd.cut`` rejects unless duplicate edges are dropped.
    """
    n_edges = 101  # 100 bins
    values = matrix.loc[:,feat]
    # Series.min/max skip NaN, unlike the builtin min/max.
    return np.linspace(values.min(),values.max(),n_edges)

In [18]:
def labelticks(matrix,feat):
    """Return 21 evenly spaced axis-label values spanning column ``feat``.

    The values run from the column minimum to the column maximum inclusive,
    i.e. 20 equal intervals.

    ``np.linspace`` is used instead of ``np.arange(min, max + step, step)``:
    with a floating-point step, ``arange`` can emit an extra trailing value
    beyond the maximum, so the tick count was not guaranteed.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Table containing the column ``feat``.
    feat : column label
        Column whose value range is labelled.

    Returns
    -------
    numpy.ndarray
        21 float tick values; the first is the column minimum and the last
        the column maximum.
    """
    n_ticks = 21  # 20 intervals
    values = matrix.loc[:,feat]
    # Series.min/max skip NaN, unlike the builtin min/max.
    return np.linspace(values.min(),values.max(),n_ticks)

In [19]:
def minmax_label(matrix,feat):
    """Return ``(smallest, largest)`` for column ``feat`` of ``matrix``.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Table containing the column ``feat``.
    feat : column label
        Column to inspect.

    Returns
    -------
    tuple
        The minimum followed by the maximum of the column's values.
    """
    column = matrix.loc[:,feat]
    return (min(column), max(column))

### Axis labels

In [20]:
def addlabels(canvas,ticks):
    """Return a new image: ``canvas`` with a white strip of tick labels below it.

    The result is 28 pixels taller than ``canvas``.  ``canvas`` is pasted at
    the top-left corner and ``ticks[i]`` is written (as ``str``) in black with
    its left edge at ``x = i * rectwidth * 5`` and its top at the old bottom
    edge of ``canvas``.

    Depends on the notebook-level globals ``rectwidth`` (label spacing) and
    ``rectheight`` (font size is ``rectheight - 8``), and on the font file
    'VeraMono.ttf' being loadable by PIL.

    Parameters
    ----------
    canvas : PIL image to extend; it is not modified.
    ticks : indexable sequence of values to print.
    """
    margin = 28  # height of the label strip in pixels
    label_top = canvas.height  # top of the strip == old bottom edge of canvas

    labelled = Image.new('RGB',(canvas.width,canvas.height + margin),(255,255,255))
    labelled.paste(canvas,(0,0))

    # Font size is tied to the square height; measure the rendered text
    # instead if labels ever collide or clip.
    font = ImageFont.truetype('VeraMono.ttf', rectheight - 8 )
    pen = ImageDraw.Draw(labelled,'RGB')

    tick_spacing = rectwidth * 5  # one label every five squares
    for i in range(len(ticks)):
        pen.text((i * tick_spacing,label_top),text=str(ticks[i]),font=font,fill=(0,0,0))

    return labelled

# Train plot

### Overlaid histograms

In [21]:
def overhist_label(X,labels,feat,rectwidth,rectheight,pad,subset):
    """Draw overlaid unit-square histograms of ``feat`` for both outcome classes.

    Rows of ``subset`` are binned on ``feat`` using edges computed from ``X``
    (``binticks_label``), so different subsets share one x-axis.  Stacked
    upward from the bottom of each bin column the image shows:

    * one grey outlined square per negative row (``labels == 0``);
    * one translucent red square per positive row (``labels == 1``);
    * translucent blue squares for the positive count the bin would hold if
      positives were distributed like the negatives, i.e.
      ``int(negatives_in_bin * n_positive / n_negative)``.

    A strip of tick labels (``labelticks(X, feat)``) is appended underneath by
    ``addlabels``; note that ``addlabels`` reads the notebook-level
    ``rectwidth``/``rectheight`` globals rather than the arguments given here.

    Changes from the earlier version:

    * the canvas is as tall as the tallest stack of *either* class; it used
      to be sized from the negatives only, so a taller positive stack was
      drawn above the top edge and lost;
    * a ``labels`` Series with a different index than ``subset`` is aligned
      explicitly instead of relying on implicit boolean-mask re-alignment;
    * the bin edges are computed once instead of three times;
    * an empty class no longer produces a NaN canvas height.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose ``feat`` column defines bin edges and tick labels.
    labels : pandas.Series or array-like
        Outcome per row, compared against 1 and 0.  A Series must contain
        every index label of ``subset``.
    feat : column label
        Feature to plot.
    rectwidth, rectheight : int
        Pixel width and height of one square.
    pad : int
        Inset in pixels of the filled squares inside their cell.
    subset : pandas.DataFrame
        Rows to draw.

    Returns
    -------
    PIL image containing the histogram and its label strip.
    """
    # Select exactly the rows of `subset`, even if `labels` was captured from
    # a larger (e.g. unfiltered) frame.
    if isinstance(labels,pd.Series) and not labels.index.equals(subset.index):
        labels = labels.loc[subset.index]

    bins = binticks_label(X,feat)

    pos = pd.cut(subset[labels==1].loc[:,feat],bins=bins,labels=False,include_lowest=True)
    neg = pd.cut(subset[labels==0].loc[:,feat],bins=bins,labels=False,include_lowest=True)

    # handmade binticks
    #pos = pd.cut(subset[labels==1].loc[:,feat],bins=np.arange(22,69,1),labels=False,include_lowest=True)
    #neg = pd.cut(subset[labels==0].loc[:,feat],bins=np.arange(22,69,1),labels=False,include_lowest=True)

    pos_counts = pos.value_counts()
    neg_counts = neg.value_counts()

    # Tallest stack over both classes; keep at least one row so the canvas
    # never has zero or NaN height when a class has no binned rows.
    binmax = max([int(c.max()) for c in (pos_counts,neg_counts) if len(c)] + [1])

    px_w = len(bins) * rectwidth + 1
    px_h = binmax * rectheight

    canvas = Image.new('RGB',(px_w,px_h),(255,255,255))
    # 'RGBA' drawing mode so the translucent fills blend with what is below.
    draw = ImageDraw.Draw(canvas,'RGBA')

    bottom = px_h - rectheight  # top edge of the bottom-most square

    # Observed positives: translucent red squares.
    for binn, count in pos_counts.items():
        xpos = binn * rectwidth
        ypos = bottom
        for _ in range(int(count)):
            bbox = [(xpos+pad,ypos+pad-1),(xpos+rectwidth-pad,ypos+rectheight-pad-1)]
            draw.rectangle(bbox,fill=(220,101,113,127),outline=None)
            ypos = ypos - rectheight

    # Class sizes include rows that fell outside the bin edges (NaN bin).
    nneg = len(neg)
    npos = len(pos)

    for binn, negbinheight in neg_counts.items():
        xpos = binn * rectwidth
        negbinheight = int(negbinheight)
        overheight = int( negbinheight * npos / nneg )

        # Observed negatives: grey outlined squares.
        ypos = bottom
        for _ in range(negbinheight):
            bbox = [(xpos,ypos-1),(xpos+rectwidth,ypos+rectheight-1)]
            draw.rectangle(bbox,fill=None,outline=(189,189,189,255))
            ypos = ypos - rectheight

        # Expected positives given the negative distribution: blue squares.
        ypos = bottom
        for _ in range(overheight):
            bbox = [(xpos+pad,ypos+pad-1),(xpos+rectwidth-pad,ypos+rectheight-pad-1)]
            draw.rectangle(bbox,fill=(112,159,210,127),outline=None)
            ypos = ypos - rectheight

    # range labels
    canvas = addlabels(canvas,labelticks(X,feat))

    return canvas

In [23]:
# Inset, in pixels, of filled squares inside their cell; intended as the
# `pad` argument of overhist_label.
pad = 1

In [24]:
# Pixel size of one histogram square. addlabels() reads both names as
# globals (tick spacing from rectwidth, font size from rectheight), so they
# must be defined before any plot is drawn.
rectwidth = 28
rectheight = 28

In [25]:
# Disabled export loop: renders one histogram PNG per retained feature.
# NOTE: the output directory is an absolute, machine-specific path; point it
# at a configurable location before re-enabling.
#for cfeat in top_feats:
#    im = overhist_label(train_matrix,labels,cfeat,rectwidth,rectheight,pad,train_matrix)
#    im.save("/Users/damoncrockett/Desktop/tmpwintour/PILplatz/train_poster/"+cfeat+".png")