In [None]:
%matplotlib inline
import pickle
import pandas as pd
import operator
import psycopg2
import pylab
import numpy as np
import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.patches as mpatches
from scipy.stats.stats import pearsonr
import matplotlib.lines as mlines
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.tsa import stattools
import statsmodels.api as sm
import scipy
import random
import seaborn as sns
from matplotlib.font_manager import FontProperties
import matplotlib.mlab as mlab
import re
from collections import OrderedDict
import statsmodels.api as sm
from scipy import stats
import statsmodels
from statsmodels.graphics.api import qqplot
from sklearn import linear_model, datasets
from sklearn import metrics
import time

mpl.rcdefaults()
pd.options.display.mpl_style = 'default'

# Connection

In [None]:
# Read database parameters from default_profile.
# Every useful line is expected to look like "<word> KEY=VALUE": the second
# space-separated token is split into key and value.
# NOTE(review): presumably a shell profile of "export PGHOST=..." lines -- confirm.
dbitems = {}
with open('default_profile') as f:
    for line in f:
        tokens = line.split(" ")
        if len(tokens) < 2 or "=" not in tokens[1]:
            # Blank lines, comments and anything else that is not
            # "<word> KEY=VALUE" used to raise IndexError; skip them instead.
            continue
        # Split on the first "=" only, so values that contain "=" stay intact.
        key, value = tokens[1].split("=", 1)
        dbitems[key] = value.strip()

# Connect to database with psycopg2.
# Keyword arguments let the driver do the quoting, so a password containing
# quotes or spaces no longer corrupts a hand-built DSN string.
try:
    conn = psycopg2.connect(dbname=dbitems['PGDATABASE'],
                            user=dbitems['PGUSER'],
                            host=dbitems['PGHOST'],
                            password=dbitems['PGPASSWORD'])
except psycopg2.Error:
    print("Unable to connect to the database")
    # Re-raise: carrying on would only fail later with an unrelated
    # "name 'conn' is not defined" error.
    raise

# Connect to database with sqlalchemy
# NOTE(review): user/password are interpolated into the URL verbatim; values
# containing characters such as "@" or "/" would need URL-escaping first.
conn_sqlalch = create_engine('postgresql+psycopg2://%s:%s@%s/%s'%(dbitems['PGUSER'],dbitems['PGPASSWORD'],dbitems['PGHOST'],dbitems['PGDATABASE']))

# Functions

In [None]:
def get_date(s):
    """
    input: string whose first six fields (year, month, day, hour, minute,
           second) are integers separated by dashes, spaces or colons,
           e.g. '2015-03-04 05:06:07'
    output: datetime.datetime built from those six fields; any further
            fields are ignored
    """
    # Turn every separator into a dash so one split yields all the fields.
    dashed = s.replace(' ', '-').replace(':', '-')
    fields = list(map(int, dashed.split('-')))
    # Explicit positions 0..5: a string with fewer than six fields raises
    # IndexError.
    return datetime.datetime(*[fields[pos] for pos in range(6)])

In [None]:
def cases(s):
    """
    input: string
    output: the numeric code (as a string) for the text incident types
            'SUICF', 'DROWNF' and 'ACCIF'; any other value is returned
            unchanged
    """
    text_to_number = (('SUICF', '25'), ('DROWNF', '14'), ('ACCIF', '29'))
    for text_code, number_code in text_to_number:
        if s == text_code:
            return number_code
    return s

# Read Model Output

In [None]:
# Load the pickled model output (result_df) and the fitted model itself.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted run of this project.
# The context managers close the files, which the bare open() calls never did.
with open("e69191e313e930617ef52e9d8549f0d2.p", "rb") as result_file:
    result_df = pickle.load(result_file)
with open("model_e69191e313e930617ef52e9d8549f0d2.p", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
model

In [None]:
feature_df = pd.read_sql_query("SELECT * from features.master", conn)

In [None]:
dispatch_df = feature_df[['i_eventnumber', 'iti_typeid', 'iiu_tdispatch']]

In [None]:
# Keep only rows where event number, type id and dispatch time are all present.
dispatch_df = dispatch_df.dropna()
# 'code': the type id with everything from the first "capital letters followed
# by a digit" onwards removed, e.g. '29D02' -> '29'. A type id with no such
# run (e.g. 'ACCIF') is left unchanged.
dispatch_df['code'] = dispatch_df['iti_typeid'].apply(lambda x: re.sub(r'[A-Z]+[0-9]+.*', '', x))
# 'severity': for all-digit codes, the first character left after deleting the
# digits from the type id (e.g. '29D02' -> 'D'); 'NONE' for every other code.
# NOTE(review): a type id made only of digits would leave an empty string and
# raise IndexError on [0] -- confirm such ids cannot occur.
dispatch_df['severity'] = dispatch_df.apply(lambda row: re.sub(r'[0-9]+', '', row.iti_typeid)[0] if row.code.isdigit() else 'NONE', axis = 1)
# Parse the dispatch timestamp strings into datetime objects via get_date.
dispatch_df['iiu_tdispatch'] = dispatch_df['iiu_tdispatch'].apply(get_date)

In [None]:
dispatch_df = dispatch_df.sort('iiu_tdispatch')

In [None]:
dispatch_df['time_day'] = dispatch_df['iiu_tdispatch'].apply(lambda x: datetime.datetime(x.year, x.month, x.day))

In [None]:
dispatch_df.index = dispatch_df.iiu_tdispatch
dispatch_df = dispatch_df.drop('iiu_tdispatch' ,1)

In [None]:
gb_type = dispatch_df.groupby('code')

In [None]:
# Build, for every incident code, a 7-day rolling mean of daily incident counts.
#
# The counts are put on a dense daily calendar (days without incidents = 0)
# BEFORE smoothing. Previously the rolling mean ran over the observed days
# only, so a "7-day" window could span weeks, and the later zero-fill spliced
# hard zeros into the smoothed series.
df_dict = {}
for name, group in gb_type:
    # Number of events per calendar day for this code.
    daily_counts = group.groupby('time_day')['i_eventnumber'].count().sort_index()
    daily_counts.index = pd.DatetimeIndex(daily_counts.index)
    full_range = pd.date_range(daily_counts.index[0], daily_counts.index[-1])
    daily_counts = daily_counts.reindex(full_range, fill_value=0)
    # pd.rolling_mean is kept because the rest of the notebook uses the same
    # pandas generation; the first 6 days have no full window and are dropped.
    smoothed = pd.rolling_mean(daily_counts, window = 7).dropna()
    if len(smoothed) > 0:
        df_dict[name] = smoothed

In [None]:
# Pick two incident codes (keys of df_dict) whose smoothed daily-count series
# are compared. master_dict labels 29 as 'TRAFIC' and 17 as 'FALL'.
srs_key_1 = '29'
srs_key_2 = '17'

srs_1 = df_dict[srs_key_1]
srs_2 = df_dict[srs_key_2]

# Latest start and earliest end: the date span covered by both series.
max_start = max(srs_1.index[0], srs_2.index[0])
min_end = min(srs_1.index[-1], srs_2.index[-1])

# Strict inequalities: the two boundary dates themselves are excluded.
srs_1 = srs_1[(srs_1.index > max_start) & (srs_1.index < min_end)]
srs_2 = srs_2[(srs_2.index > max_start) & (srs_2.index < min_end)]

# Drop the date index; from here on the values are matched by position.
srs_1 = srs_1.values
srs_2 = srs_2.values

# One (srs_1, srs_2) pair per day, i.e. a two-column data set.
# NOTE(review): under Python 3 zip() returns an iterator, not a list -- wrap it
# in list()/np.column_stack if this notebook is ever ported.
srs_pair = zip(srs_1, srs_2)

# NOTE(review): per the statsmodels documentation this tests whether the second
# column (code '17') Granger-causes the first (code '29') -- confirm that is
# the intended direction. `item` is not used by any later cell shown here.
item = statsmodels.tsa.stattools.grangercausalitytests(srs_pair, maxlag = 5)

In [None]:
plt.plot(df_dict['ACCIF'].index, df_dict['ACCIF'])
plt.xticks(rotation='vertical')

In [None]:
result_df.index = result_df.incident
feature_df.index = feature_df.incident

In [None]:
full_df = result_df.join(feature_df, how = 'left', lsuffix = '_left')

In [None]:
full_df['code_type'] = full_df['code_type'].apply(cases)

In [None]:
full_df = full_df.drop('incident_left', 1)
full_df = full_df.drop('incident', 1)

In [None]:
def plot_precision_recall_n(y_true, y_prob, model_name):
    """
    input: real y's, y probabilities (any array-like of scores) and the model
           name, which is used as the plot title
    output: a plot of precision and recall at k -- precision (blue, left axis)
            and recall (red, right axis) against the fraction of the
            population scored at or above each threshold; returns None
    """
    from sklearn.metrics import precision_recall_curve
    # Coerce to an ndarray: the >= comparison below needs element-wise
    # semantics, which a plain Python list does not have.
    y_score = np.asarray(y_prob)
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    # The curves carry one more point than there are thresholds; drop the last
    # one so the three arrays line up.
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    number_scored = len(y_score)
    # Count the scores >= each threshold with one sort plus a binary search,
    # instead of re-scanning every score for every threshold.
    ascending_scores = np.sort(y_score)
    num_above_thresh = number_scored - np.searchsorted(ascending_scores, pr_thresholds, side='left')
    pct_above_per_thresh = num_above_thresh / float(number_scored)
    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    # Second y-axis sharing the same x-axis, so recall keeps its own scale.
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')

    plt.title(model_name)
    plt.show()

In [None]:
def metrics_at_k(y_true, y_scores, k):
    """
    input: true y values, scores from a model, and k -- the fraction (between
           0 and 1) of the score-ranked population at which to threshold
    output: precision at k, recall at k, auc
    raises: ValueError if k lies outside [0, 1]
    """
    if not 0 <= k <= 1:
        # A negative k used to wrap around to the wrong end of the ranking.
        raise ValueError("k must be a fraction between 0 and 1, got %r" % (k,))
    scores = np.asarray(y_scores)
    ranked = np.sort(scores)[::-1]
    # k == 1.0 used to index one past the end (IndexError); clamp to the
    # lowest-ranked score so "everything" is a valid cut.
    # NOTE(review): the cut is the score at position int(k*n), so int(k*n)+1
    # items (plus ties) are flagged; kept as-is to preserve earlier results.
    cut = min(int(k * len(ranked)), len(ranked) - 1)
    threshold = ranked[cut]
    y_pred = (scores >= threshold).astype(int)
    return (metrics.precision_score(y_true, y_pred), metrics.recall_score(y_true, y_pred), metrics.roc_auc_score(y_true, y_scores))


In [None]:
full_df

In [None]:
master_dict = {
    1:'ABDOM',
    2:'ALLERGIES',
    3:'ANIMAL BITES',
    4:'ASSAULT',
    5:'BACK',
    6:'BREATH',
    7:'BURNS',
    8:'CARBON',
    9:'CARDIAC',
    10:'CHEST',
    11:'CHOKE',
    12:'CONVUL',
    13:'DIAB',
    14:'DROWN',
    15:'ELECTRO',
    16:'EYE',
    17:'FALL',
    18:'HEAD',
    19:'HEART',
    20:'HEAT',
    21:'HEMORR',
    22:'ENTRAP',
    23:'OD',
    24:'PREG',
    25:'SUIC',
    26:'SICK',
    27:'STAB',
    28:'STROKE',
    29:'TRAFIC',
    30:'TRAUMA',
    31:'UNCONS',
    32:'UNKNOWN'}

sev_dict = {9: 2,19: 2, 28: 2, 10: 2, 27:2,  14:2, 12:1, 31:1, 13:1, 6:1,8:1, 23:1, 15:1, 7:1, 24:1, 25:1, 11:0, 2:0, 21:0, 3:0, 18:0, 16:0, 1:0, 20:0, 17:0, 26:0, 4:0, 29:0}

In [None]:
full_df.columns

In [None]:
def get_urgency(s):
    """
    input: string
    output: severity level associated with incident coded by that string
            (looked up in sev_dict), or '' when the value is not an integer
            code that has a severity assigned
    """
    try:
        return sev_dict[int(s)]
    except (TypeError, ValueError, OverflowError, KeyError):
        # TypeError / ValueError / OverflowError: s is None, NaN, infinite or
        # non-numeric text such as 'FADV'. KeyError: a numeric code that has no
        # entry in sev_dict. Anything else (for example sev_dict not being
        # defined yet) now surfaces instead of silently producing ''.
        return ''

In [None]:
full_df['urgency'] = full_df['code_type'].apply(get_urgency)

In [None]:
# Group the joined results for the per-group summary printed in the next cell.
# NOTE(review): the variable is named gb_urg and an 'urgency' column is
# computed right before this cell, yet the grouping key is 'time_day' --
# confirm whether groupby('urgency') was intended.
gb_urg = full_df.groupby('time_day')

In [None]:
k = 5
for name,group in gb_urg:
    print name, ' ', len(group)/float(len(full_df))
    print "actual pct pos:", np.mean(group.trns_to_hosp)
    print '-----'

In [None]:
metrics_at_k(full_df.trns_to_hosp, full_df.new_y_probs, 0.5)

In [None]:
k = [30,60,90]

In [None]:
gb_type = full_df.groupby('code_type')

In [None]:
master_dict = {
    1:'ABDOM',
    2:'ALLERGIES',
    3:'ANIMAL BITES',
    4:'ASSAULT',
    5:'BACK',
    6:'BREATH',
    7:'BURNS',
    8:'CARBON',
    9:'CARDIAC',
    10:'CHEST',
    11:'CHOKE',
    12:'CONVUL',
    13:'DIAB',
    14:'DROWN',
    15:'ELECTRO',
    16:'EYE',
    17:'FALL',
    18:'HEAD',
    19:'HEART',
    20:'HEAT',
    21:'HEMORR',
    22:'ENTRAP',
    23:'OD',
    24:'PREG',
    25:'SUIC',
    26:'SICK',
    27:'STAB',
    28:'STROKE',
    29:'TRAFIC',
    30:'TRAUMA',
    31:'UNCONS',
    32:'UNKNOWN'}

sev_dict = {9: 2,19: 2, 28: 2, 10: 2, 27:2,  14:2, 12:1, 31:1, 13:1, 6:1,8:1, 23:1, 15:1, 7:1, 24:1, 25:1, 11:0, 2:0, 21:0, 3:0, 18:0, 16:0, 1:0, 20:0, 17:0, 26:0, 4:0, 29:0}

In [None]:
ord_list = [9 ,19, 28, 10, 27,  14, 12, 31, 13, 6,8, 23, 15, 7, 24, 25, 11, 2, 21, 3, 18, 16, 1, 20, 17, 26, 4, 29]

In [None]:
# For every numeric incident type, flag the incidents whose score lies above
# the (100 - k)th percentile of that type's scores -- k is picked from the list
# `k` by the type's severity tier in sev_dict -- and record the relative gain
# over the base rate, both raw and scaled by the number of incidents.
dict_pct = {}
dict_recall = {}
dict_pop_prec = {}
dict_pop_rec = {}
for name, group in gb_type:
    #problematic missing incident type
    if name == '22':
        continue
    try:
        code = int(name)
        k_val = k[sev_dict[code]]
        label = master_dict[code]

        # Work on plain Series rather than assigning new columns onto the
        # groupby slice (which triggers SettingWithCopyWarning), and compare
        # column-wise instead of row by row.
        cutoff = np.percentile(group.y_pred_probs, 100 - k_val)
        predicted = (group.y_pred_probs > cutoff).astype(int)
        correct = (group.trns_to_hosp == predicted).astype(int)
        recalled = ((group.trns_to_hosp == 1) & (predicted == 1)).astype(int)

        base_rate = np.mean(group.trns_to_hosp)
        k_frac = k_val / 100.0
        # Same number as len(full_df[full_df['code_type'] == name]), because
        # gb_type groups full_df by code_type.
        n_incidents = len(group)

        # NOTE(review): `correct` is agreement between prediction and outcome
        # (accuracy), although the plots label this value 'Precision Gain'.
        pct_gain = (np.mean(correct) - base_rate) / base_rate
        recall_gain = (sum(recalled) / float(sum(group.trns_to_hosp)) - k_frac) / k_frac

        dict_pct[label] = pct_gain
        dict_recall[label] = recall_gain
        dict_pop_prec[label] = pct_gain * n_incidents
        dict_pop_rec[label] = recall_gain * n_incidents
    except (ValueError, KeyError):
        # ValueError: non-numeric type names; KeyError: codes without a
        # severity tier or label. Both are skipped.
        continue

In [None]:
ord_list = [9 ,19, 28, 10, 27,  14, 12, 31, 13, 6,8, 23, 15, 7, 24, 25, 11, 2, 21, 3, 18, 16, 1, 20, 17, 26, 4, 29]

In [None]:
name_ord_list = [master_dict[i] for i in ord_list]

In [None]:
tups = []

In [None]:
for item in name_ord_list:
    tups.append((item, dict_pct[item]))

In [None]:
tups_rec = []
for item in name_ord_list:
    tups_rec.append((item, dict_recall[item]))

In [None]:
# Bar chart of the precision gain per incident type, ordered as in ord_list.
mpl.rcdefaults()
# One colour per bar, taken from the type's severity tier. The old hard-coded
# 6 red / 11 yellow / 12 green list did not match the 6 / 10 / 12 tiers of the
# 28 plotted types, so the first tier-0 bar was drawn yellow.
severity_colors = {2: 'red', 1: 'yellow', 0: 'green'}
color_list = [severity_colors[sev_dict[code]] for code in ord_list]
fig, ax = plt.subplots(figsize = (12,8))
ax.bar(range(1,len(tups)+1), [gain for label, gain in tups], color = color_list, alpha = 0.9)
ax.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax.set_xticklabels([label for label, gain in tups], rotation = 90)
ax.set_ylabel('Precision Gain', fontsize = 16)
plt.tight_layout()
#plt.show()
plt.savefig('actual_prec.pdf')

In [None]:
# Evenly spaced illustrative values from 0.1 up to (not including) 0.5, one
# per bar. linspace guarantees exactly len(tups) values; arange with a float
# step can return one element too many because of rounding.
ideal_prec = np.linspace(0.1, 0.5, len(tups), endpoint=False)

# Plots of Precision and Recall gain by incident type

In [None]:
# Bar chart of the illustrative "ideal" precision gains (ideal_prec); the
# y tick labels are hidden.
mpl.rcdefaults()
# One colour per bar, taken from the type's severity tier. The old hard-coded
# 6 red / 11 yellow / 12 green list did not match the 6 / 10 / 12 tiers of the
# 28 plotted types, so the first tier-0 bar was drawn yellow.
severity_colors = {2: 'red', 1: 'yellow', 0: 'green'}
color_list = [severity_colors[sev_dict[code]] for code in ord_list]
fig, ax = plt.subplots(figsize = (12,8))
ax.bar(range(1,len(tups)+1), ideal_prec, color = color_list, alpha = 0.9)
ax.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax.set_xticklabels([label for label, gain in tups], rotation = 90)
ax.set_yticklabels([])
ax.set_ylabel('Precision Gain', fontsize = 16)
plt.tight_layout()
#plt.show()
plt.savefig('good_prec.pdf')

In [None]:
# Bar chart of the recall gain per incident type, ordered as in ord_list.
mpl.rcdefaults()
# One colour per bar, taken from the type's severity tier. The old hard-coded
# 6 red / 11 yellow / 12 green list did not match the 6 / 10 / 12 tiers of the
# 28 plotted types, so the first tier-0 bar was drawn yellow.
severity_colors = {2: 'red', 1: 'yellow', 0: 'green'}
color_list = [severity_colors[sev_dict[code]] for code in ord_list]
fig, ax = plt.subplots(figsize = (12,8))
ax.bar(range(1,len(tups)+1), [gain for label, gain in tups_rec], color = color_list, alpha = 0.9)
ax.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax.set_xticklabels([label for label, gain in tups], rotation = 90)
ax.set_ylabel('Recall Gain', fontsize = 16)
plt.tight_layout()
#plt.show()
plt.savefig('actual_rec.pdf')

In [None]:
# Evenly spaced illustrative values from 0.5 down to (not including) 0.1, one
# per bar. linspace guarantees exactly len(tups_rec) values; arange with a
# float step can return one element too many because of rounding.
ideal_rec = np.linspace(0.5, 0.1, len(tups_rec), endpoint=False)

In [None]:
# Bar chart of the illustrative "ideal" recall gains (ideal_rec); the y tick
# labels are hidden and the y-axis is fixed to 0 - 0.52.
mpl.rcdefaults()
# One colour per bar, taken from the type's severity tier. The old hard-coded
# 6 red / 11 yellow / 12 green list did not match the 6 / 10 / 12 tiers of the
# 28 plotted types, so the first tier-0 bar was drawn yellow.
severity_colors = {2: 'red', 1: 'yellow', 0: 'green'}
color_list = [severity_colors[sev_dict[code]] for code in ord_list]
fig, ax = plt.subplots(figsize = (12,8))
ax.bar(range(1,len(tups)+1), ideal_rec, color = color_list, alpha = 0.9)
ax.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax.set_xticklabels([label for label, gain in tups], rotation = 90)
ax.set_ylabel('Recall Gain', fontsize = 16)
ax.set_yticklabels([])
plt.ylim(0,.52)
plt.tight_layout()
#plt.show()
plt.savefig('good_rec.pdf')

In [None]:
x = len(full_df[full_df['code_type'] == '27'])/float(len(full_df))
y = len(full_df[full_df['code_type'] == '6'])/float(len(full_df))
y / x
# there are 300 times as many breathing as stabbings

In [None]:
tups_prec_pop = []
for item in name_ord_list:
    tups_prec_pop.append((item, dict_pop_prec[item]))

In [None]:
tups_rec_pop = []
for item in name_ord_list:
    tups_rec_pop.append((item, dict_pop_rec[item]))

In [None]:
# Two stacked bar charts: precision gain per incident type (top) and the same
# gain multiplied by the number of incidents of that type (bottom).
mpl.rcdefaults()
# One colour per bar, taken from the type's severity tier. The old hard-coded
# 6 red / 11 yellow / 12 green list did not match the 6 / 10 / 12 tiers of the
# 28 plotted types, so the first tier-0 bar was drawn yellow.
severity_colors = {2: 'red', 1: 'yellow', 0: 'green'}
color_list = [severity_colors[sev_dict[code]] for code in ord_list]
f, (ax1, ax2) = plt.subplots(2,1,figsize=(12,8))

ax1.bar(range(1,len(tups)+1), [gain for label, gain in tups], color = color_list, alpha = 0.9)
ax1.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax1.set_xticklabels([label for label, gain in tups], rotation = 90)
ax1.set_ylabel('Precision Gain', fontsize = 16)

ax2.bar(range(1,len(tups)+1), [gain for label, gain in tups_prec_pop], color = color_list, alpha = 0.9)
ax2.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax2.set_xticklabels([label for label, gain in tups], rotation = 90)
ax2.set_ylabel('Scaled Precision Gain', fontsize = 16)
plt.tight_layout()
plt.show()
#plt.savefig('scaled_prec.pdf')

In [None]:
# NOTE(review): this cell held only `tups_`, an undefined name that raises
# NameError on Restart & Run All (it looks like an unfinished tab-completion).
# Showing the scaled recall tuples used by the next plot is a guess at the
# intent -- confirm or delete the cell.
tups_rec_pop

In [None]:
# Two stacked bar charts: recall gain per incident type (top) and the same
# gain multiplied by the number of incidents of that type (bottom).
mpl.rcdefaults()
# One colour per bar, taken from the type's severity tier. The old hard-coded
# 6 red / 11 yellow / 12 green list did not match the 6 / 10 / 12 tiers of the
# 28 plotted types, so the first tier-0 bar was drawn yellow.
severity_colors = {2: 'red', 1: 'yellow', 0: 'green'}
color_list = [severity_colors[sev_dict[code]] for code in ord_list]
f, (ax1, ax2) = plt.subplots(2,1,figsize=(12,8))

ax1.bar(range(1,len(tups_rec)+1), [gain for label, gain in tups_rec], color = color_list, alpha = 0.9)
ax1.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax1.set_xticklabels([label for label, gain in tups], rotation = 90)
ax1.set_ylabel('Recall Gain', fontsize = 16)

ax2.bar(range(1,len(tups_rec_pop)+1), [gain for label, gain in tups_rec_pop], color = color_list, alpha = 0.9)
ax2.set_xticks(np.arange(1.45,len(tups)+1.45, 1))
ax2.set_xticklabels([label for label, gain in tups], rotation = 90)
ax2.set_ylabel('Scaled Recall Gain', fontsize = 16)
plt.tight_layout()
#plt.show()
plt.savefig('scaled_rec.pdf')

# Feature Importance

In [None]:
imp = model.feature_importances_
feats = [u'common_weather_awnd', u'common_weather_dapr', u'common_weather_fmtm', u'common_weather_mdpr', u'common_weather_pgtm', u'common_weather_prcp', u'common_weather_snow', u'common_weather_snwd', u'common_weather_tmax', u'common_weather_tmin', u'common_weather_tobs', u'common_weather_wdf2', u'common_weather_wdf5', u'common_weather_wesd', u'common_weather_wesf', u'common_weather_wsf2', u'common_weather_wsf5', u'time_of_day_is_1p-7p', u'time_of_day_is_7a-1p', u'time_of_day_is_7p-1a', u'time_year_is_2012.0', u'time_year_is_2013.0', u'time_year_is_2014.0', u'time_year_is_2015.0', u'time_year_is_2016.0', u'time_month_is_2.0', u'time_month_is_3.0', u'time_month_is_4.0', u'time_month_is_5.0', u'time_month_is_6.0', u'time_month_is_7.0', u'time_month_is_8.0', u'time_month_is_9.0', u'time_month_is_10.0', u'time_month_is_11.0', u'time_month_is_12.0', u'time_day_is_2.0', u'time_day_is_3.0', u'time_day_is_4.0', u'time_day_is_5.0', u'time_day_is_6.0', u'time_day_is_7.0', u'time_day_is_8.0', u'time_day_is_9.0', u'time_day_is_10.0', u'time_day_is_11.0', u'time_day_is_12.0', u'time_day_is_13.0', u'time_day_is_14.0', u'time_day_is_15.0', u'time_day_is_16.0', u'time_day_is_17.0', u'time_day_is_18.0', u'time_day_is_19.0', u'time_day_is_20.0', u'time_day_is_21.0', u'time_day_is_22.0', u'time_day_is_23.0', u'time_day_is_24.0', u'time_day_is_25.0', u'time_day_is_26.0', u'time_day_is_27.0', u'time_day_is_28.0', u'time_day_is_29.0', u'time_day_is_30.0', u'time_day_is_31.0', u'time_hour_is_1.0', u'time_hour_is_2.0', u'time_hour_is_3.0', u'time_hour_is_4.0', u'time_hour_is_5.0', u'time_hour_is_6.0', u'time_hour_is_7.0', u'time_hour_is_8.0', u'time_hour_is_9.0', u'time_hour_is_10.0', u'time_hour_is_11.0', u'time_hour_is_12.0', u'time_hour_is_13.0', u'time_hour_is_14.0', u'time_hour_is_15.0', u'time_hour_is_16.0', u'time_hour_is_17.0', u'time_hour_is_18.0', u'time_hour_is_19.0', u'time_hour_is_20.0', u'time_hour_is_21.0', u'time_hour_is_22.0', u'time_hour_is_23.0', u'code_type_is_10', 
u'code_type_is_11', u'code_type_is_12', u'code_type_is_13', u'code_type_is_14', u'code_type_is_15', u'code_type_is_16', u'code_type_is_17', u'code_type_is_18', u'code_type_is_19', u'code_type_is_2', u'code_type_is_20', u'code_type_is_21', u'code_type_is_22', u'code_type_is_23', u'code_type_is_24', u'code_type_is_25', u'code_type_is_26', u'code_type_is_27', u'code_type_is_28', u'code_type_is_29', u'code_type_is_3', u'code_type_is_30', u'code_type_is_31', u'code_type_is_32', u'code_type_is_4', u'code_type_is_5', u'code_type_is_6', u'code_type_is_7', u'code_type_is_8', u'code_type_is_9', u'code_type_is_ACCIF', u'code_type_is_AIRF', u'code_type_is_ASSLTF', u'code_type_is_BIOHZF', u'code_type_is_BLDGF', u'code_type_is_BOATF', u'code_type_is_BOMB', u'code_type_is_CALARM', u'code_type_is_CHEMF', u'code_type_is_CHEMI', u'code_type_is_COLAPS', u'code_type_is_COSICK', u'code_type_is_CUTF', u'code_type_is_DETAIL', u'code_type_is_DOMINF', u'code_type_is_DOMNF', u'code_type_is_DROWNF', u'code_type_is_EMS', u'code_type_is_FADV', u'code_type_is_FALARM', u'code_type_is_FDRILL', u'code_type_is_FHELPF', u'code_type_is_FRO', u'code_type_is_FSERV', u'code_type_is_FTEST', u'code_type_is_FTRACC', u'code_type_is_FUMES', u'code_type_is_GAS1', u'code_type_is_GAS2', u'code_type_is_HEROIF', u'code_type_is_HERONF', u'code_type_is_HIRISK', u'code_type_is_HYDR', u'code_type_is_INACTF', u'code_type_is_INFOF', u'code_type_is_INVEST', u'code_type_is_LOCK', u'code_type_is_MENTIF', u'code_type_is_MUTUAL', u'code_type_is_OUTDR', u'code_type_is_OUTLET', u'code_type_is_PDOAF', u'code_type_is_PERDWF', u'code_type_is_PHELPF', u'code_type_is_POSTAF', u'code_type_is_RAPEF', u'code_type_is_RIVERF', u'code_type_is_ROBBIF', u'code_type_is_SALV', u'code_type_is_SHOOTF', u'code_type_is_SIG500', u'code_type_is_STRUCT', u'code_type_is_STUCK', u'code_type_is_SUICF', u'code_type_is_SWAT', u'code_type_is_TASER', u'code_type_is_TESTC', u'code_type_is_TESTF', u'code_type_is_TRAP', u'code_type_is_TRAPF', 
u'code_type_is_TRK', u'code_type_is_VEH', u'code_type_is_WALKIN', u'code_type_is_WATERR', u'code_type_is_WIRES', u'code_level_is_B', u'code_level_is_C', u'code_level_is_CO', u'code_level_is_D', u'code_level_is_E', u'code_level_is_O']

In [None]:
d_imp = dict(zip(feats, imp))

In [None]:
code_feat_sum = sum([d_imp[i] for i in feats if 'code_' in i])
com_weather_feat_sum = sum([d_imp[i] for i in feats if 'common_' in i])
ext_weather_feat_sum = sum([d_imp[i] for i in feats if 'extreme_' in i])
code_type_sum = sum([d_imp[i] for i in feats if 'code_type' in i])
code_level_sum = sum([d_imp[i] for i in feats if 'code_level' in i])

code_imp = [d_imp[i] for i in feats if 'code_type' in i]
code_sev = [d_imp[i] for i in feats if 'code_level' in i]

In [None]:
# Express each code_type / code_level importance as a share of the total
# importance of its own family.
# The totals are computed once instead of once per key, and .items() replaces
# the Python-2-only .iteritems().
code_type_total = sum(code_imp)
code_level_total = sum(code_sev)
dict_code = {key: value / code_type_total
             for key, value in d_imp.items() if 'code_type' in key}
dict_code_level = {key: value / code_level_total
                   for key, value in d_imp.items() if 'code_level' in key}

In [None]:
import operator
sorted_x = sorted(dict_code.items(), key=operator.itemgetter(1))
sorted_x_lev = sorted(dict_code_level.items(), key=operator.itemgetter(1))

In [None]:
sorted_x = sorted_x[::-1]
sorted_x_lev = sorted_x_lev[::-1]

In [None]:
sorted_x = sorted_x[:5]
sorted_x_lev = sorted_x_lev[:2]

In [None]:
vals = [i[1] for i in sorted_x]
vals_lev = [i[1] for i in sorted_x_lev]

In [None]:
vals.append(1-sum(vals))
vals_lev.append(1-sum(vals_lev))

In [None]:
sorted_x

In [None]:
import matplotlib as mpl
mpl.rcParams['font.size'] = 9.0

pct_sizes = [14,0,0]
name_sizes = [0,0,0]

pct_sizes_2 = [10,10,10,10,10,0]
name_sizes_2 = [10,10,10,10,10,10]



labels = ['CODE', 'WEATHER', 'TIME']
labs_2 = ['FADV', 'FIRE ALARM', 'BREATH', 'CHEST', 'INFOF', 'OTHER']

sizes = [code_feat_sum, com_weather_feat_sum, 1-code_feat_sum-com_weather_feat_sum]
colors = ['cornflowerblue', 'gold','mediumseagreen']
colors_2 = ['indianred', 'violet', 'orangered', 'palegreen', 'darksalmon', 'darkgrey']

patches, texts, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors = colors,pctdistance=.85, labeldistance=1.05)
for item in texts:
    item.set_fontsize(14)
for i in range(len(pct_sizes)):
    autotexts[i].set_fontsize(pct_sizes[i])
for i in range(len(name_sizes)):
    texts[i].set_fontsize(name_sizes[i])

#centre_circle = plt.Circle((0,0),0.7,color='black', fc='white',linewidth=1.25)
#fig = plt.gcf()
#fig.gca().add_artist(centre_circle)



centre_circle = plt.Circle((0,0),0.6,color='black', fc='white',linewidth=1.25)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.text(.03,.9,'TIME', fontsize = 14 , rotation = 85)
plt.text(.11,.9,'WEATHER', fontsize = 14 , rotation = 77)
plt.text(.4,-.75,'CODE', fontsize = 14)



plt.axis('equal')
plt.tight_layout()
plt.savefig('pie_feat.pdf')


plt.show()



In [None]:
import matplotlib as mpl
mpl.rcParams['font.size'] = 9.0

plt.clf()

pct_sizes = [14,0,0]
name_sizes = [0,0,0]

pct_sizes_2 = [10,10,10,10,10,0]
name_sizes_2 = [10,10,10,10,10,10]



labels = ['CODE', 'WEATHER', 'TIME']
labs_2 = ['FADV', 'FIRE ALARM', 'BREATH', 'CHEST', 'INFOF', 'OTHER']

sizes = [code_feat_sum, com_weather_feat_sum, 1-code_feat_sum-com_weather_feat_sum]
colors = ['cornflowerblue', 'gold','mediumseagreen']
colors_2 = ['indianred', 'violet', 'orangered', 'hotpink', 'darksalmon', 'darkgrey']

patches, texts, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors = colors,pctdistance=.85, labeldistance=1.05)
for item in texts:
    item.set_fontsize(14)
for i in range(len(pct_sizes)):
    autotexts[i].set_fontsize(pct_sizes[i])
for i in range(len(name_sizes)):
    texts[i].set_fontsize(name_sizes[i])

#centre_circle = plt.Circle((0,0),0.7,color='black', fc='white',linewidth=1.25)
#fig = plt.gcf()
#fig.gca().add_artist(centre_circle)

patches2, texts2, autotexts2 = plt.pie(vals, labels=labs_2, autopct='%1.1f%%', startangle=90, colors = colors_2,pctdistance=.85, labeldistance=1.05, radius = 0.6)
fig = plt.gcf()
for item in texts2:
    item.set_fontsize(14)
for i in range(len(pct_sizes_2)):
    autotexts2[i].set_fontsize(pct_sizes_2[i])
for i in range(len(name_sizes_2)):
    texts2[i].set_fontsize(name_sizes_2[i])
#fig.gca().add_artist(centre_circle)

centre_circle = plt.Circle((0,0),0.4,color='black', fc='white',linewidth=1.25)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

#ax.annotate('CODE TYPE', xy=(2,1), xytext=(1,2))
plt.text(.03,.9,'TIME', fontsize = 14 , rotation = 85)
plt.text(.11,.9,'WEATHER', fontsize = 14 , rotation = 77)
plt.text(.4,-.75,'CODE', fontsize = 14)

plt.text(-.20,0,'CODE TYPE', fontsize = 14)

plt.axis('equal')
plt.tight_layout()
plt.savefig('pie_loc.pdf')



plt.show()



In [None]:
import matplotlib as mpl
mpl.rcParams['font.size'] = 9.0

pct_sizes = [14,0,0]
name_sizes = [0,0,0]

pct_sizes_2 = [10,10,10,10,10,0]
name_sizes_2 = [10,10,10,10,10,10]

pct_sizes_3 = [11,11,0]
name_sizes_3 = [14,14,0]



labels = ['CODE', 'WEATHER', 'TIME']
#labs_2 = ['FADV', 'FIRE ALARM', 'BREATH', 'CHEST', 'INFOF', 'OTHER']
labs_3 = ['D', 'C','OTHER']

sizes = [code_feat_sum, com_weather_feat_sum, 1-code_feat_sum-com_weather_feat_sum]
colors = ['cornflowerblue', 'gold','mediumseagreen']
colors_2 = ['indianred', 'violet', 'darkgrey']



patches, texts, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors = colors,pctdistance=.85, labeldistance=1.05)
for item in texts:
    item.set_fontsize(14)
for i in range(len(pct_sizes)):
    autotexts[i].set_fontsize(pct_sizes[i])
for i in range(len(name_sizes)):
    texts[i].set_fontsize(name_sizes[i])

#centre_circle = plt.Circle((0,0),0.7,color='black', fc='white',linewidth=1.25)
#fig = plt.gcf()
#fig.gca().add_artist(centre_circle)

patches2, texts2, autotexts2 = plt.pie(vals_lev, labels=labs_3, autopct='%1.1f%%', startangle=90, colors = colors_2,pctdistance=.85, labeldistance=1.05, radius = 0.6)
fig = plt.gcf()
for item in texts2:
    item.set_fontsize(14)
for i in range(len(pct_sizes_3)):
    autotexts2[i].set_fontsize(pct_sizes_3[i])
for i in range(len(name_sizes_3)):
    texts2[i].set_fontsize(name_sizes_3[i])
#fig.gca().add_artist(centre_circle)

centre_circle = plt.Circle((0,0),0.4,color='black', fc='white',linewidth=1.25)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.text(.03,.9,'TIME', fontsize = 14 , rotation = 85)
plt.text(.11,.9,'WEATHER', fontsize = 14 , rotation = 77)
plt.text(.4,-.75,'CODE', fontsize = 14)

plt.text(-.27,0,'CODE SEVERITY', fontsize = 14)

plt.axis('equal')
plt.tight_layout()

plt.savefig('pie_sev.pdf')




plt.show()



In [None]:
feats = ['common_weather_prcp', 'common_weather_wesf', 'common_weather_wsf2', 'common_weather_awnd', 'common_weather_pgtm', 'common_weather_tmin', 'common_weather_wdf5', 'common_weather_mdpr', 'common_weather_wesd', 'common_weather_fmtm', 'common_weather_wdf2', 'common_weather_snow', 'common_weather_tmax', 'common_weather_dapr', 'common_weather_snwd', 'common_weather_wsf5', 'common_weather_tobs', 'time_of_day_is_7p-1a', 'time_month_is_10.0', 'time_day_is_27.0', 'time_day_is_11.0', 'time_day_is_15.0', 'time_year_is_2016.0', 'time_year_is_2014.0', 'time_of_day_is_7a-1p', 'time_day_is_4.0', 'time_month_is_2.0', 'time_month_is_3.0', 'time_month_is_9.0', 'time_day_is_22.0', 'time_day_is_19.0', 'time_day_is_14.0', 'time_day_is_26.0', 'time_day_is_25.0', 'time_day_is_9.0', 'time_month_is_11.0', 'time_day_is_20.0', 'time_day_is_29.0', 'time_month_is_6.0', 'time_year_is_2012.0', 'time_day_is_10.0', 'time_month_is_8.0', 'time_day_is_2.0', 'time_year_is_2015.0', 'time_month_is_4.0', 'time_day_is_24.0', 'time_day_is_5.0', 'time_of_day_is_1p-7p', 'time_year_is_2013.0', 'time_month_is_7.0', 'time_month_is_12.0', 'time_day_is_23.0', 'time_day_is_18.0', 'time_day_is_31.0', 'time_day_is_13.0', 'time_day_is_8.0', 'time_day_is_12.0', 'time_day_is_7.0', 'time_day_is_21.0', 'time_day_is_17.0', 'time_day_is_30.0', 'time_day_is_6.0', 'time_day_is_28.0', 'time_month_is_5.0', 'time_day_is_3.0', 'time_day_is_16.0', 'code_type_is_GAS2', 'code_type_is_6', 'code_type_is_HEROIF', 'code_type_is_DROWNF', 'code_type_is_MUTUAL', 'code_type_is_COLAPS', 'code_type_is_BOATF', 'code_type_is_COSICK', 'code_type_is_23', 'code_type_is_FHELPF', 'code_type_is_EMS', 'code_level_is_C', 'code_type_is_12', 'code_type_is_PHELPF', 'code_type_is_GAS1', 'code_type_is_DOMNF', 'code_type_is_CHEMI', 'code_type_is_TESTC', 'code_type_is_24', 'code_type_is_TASER', 'code_type_is_FADV', 'code_type_is_32', 'code_level_is_D', 'code_type_is_15', 'code_type_is_ASSLTF', 'code_type_is_4', 'code_type_is_SUICF', 
'code_type_is_OUTDR', 'code_type_is_HERONF', 'code_type_is_25', 'code_type_is_WALKIN', 'code_type_is_ACCIF', 'code_type_is_FRO', 'code_type_is_STUCK', 'code_type_is_POSTAF', 'code_type_is_14', 'code_type_is_31', 'code_type_is_CUTF', 'code_type_is_PDOAF', 'code_level_is_O', 'code_type_is_5', 'code_type_is_AIRF', 'code_type_is_SHOOTF', 'code_type_is_TRAP', 'code_type_is_17', 'code_type_is_30', 'code_type_is_26', 'code_type_is_CALARM', 'code_type_is_INACTF', 'code_type_is_TESTF', 'code_type_is_28', 'code_type_is_VEH', 'code_type_is_2', 'code_type_is_INFOF', 'code_type_is_WIRES', 'code_type_is_SIG500', 'code_type_is_16', 'code_type_is_DETAIL', 'code_type_is_27', 'code_type_is_29', 'code_level_is_CO', 'code_type_is_TRK', 'code_type_is_BIOHZF', 'code_type_is_3', 'code_type_is_MENTIF', 'code_type_is_FUMES', 'code_type_is_HYDR', 'code_type_is_RIVERF', 'code_type_is_WATERR', 'code_type_is_INVEST', 'code_type_is_19', 'code_type_is_CHEMF', 'code_type_is_11', 'code_type_is_20', 'code_type_is_BOMB', 'code_type_is_ROBBIF', 'code_type_is_HIRISK', 'code_type_is_FTEST', 'code_type_is_RAPEF', 'code_type_is_FALARM', 'code_type_is_DOMINF', 'code_type_is_7', 'code_type_is_OUTLET', 'code_type_is_8', 'code_type_is_18', 'code_type_is_10', 'code_type_is_STRUCT', 'code_type_is_TRAPF', 'code_type_is_21', 'code_level_is_E', 'code_type_is_FDRILL', 'code_type_is_LOCK', 'code_type_is_BLDGF', 'code_type_is_PERDWF', 'code_type_is_SALV', 'code_level_is_B', 'code_type_is_9', 'code_type_is_SWAT', 'code_type_is_FTRACC', 'code_type_is_13', 'code_type_is_FSERV', 'code_type_is_22', 'weather_event_mist', 'weather_event_glaze', 'weather_event_rain', 'weather_event_unknown_precip', 'weather_event_freezing_drizzle', 'weather_event_ground_fog', 'weather_event_drizzle', 'weather_event_thunder', 'weather_event_ice_fog', 'weather_event_heavy_fog', 'weather_event_smoke', 'weather_event_snow', 'weather_event_freezing_rain', 'weather_event_hail', 'weather_event_high_winds', 'station_name_is_ST03', 
'station_name_is_ST34', 'station_name_is_ST31', 'station_name_is_ST51', 'station_name_is_ST35', 'station_name_is_ST21', 'station_name_is_ST50', 'station_name_is_ST18', 'station_name_is_ST46', 'station_name_is_ST32', 'station_name_is_ST05', 'station_name_is_ST20', 'station_name_is_ST23', 'station_name_is_ST07', 'station_name_is_ST19', 'station_name_is_ST29', 'station_name_is_ST17', 'station_name_is_ST12', 'station_name_is_ST02B', 'station_name_is_ST38', 'station_name_is_ST14', 'station_name_is_ST09', 'station_name_is_ST24', 'station_name_is_ST49', 'station_name_is_ST08', 'station_name_is_ST37', 'within_1day_full_moon', 'acs_edu', 'acs_no_insurance', 'acs_white', 'acs_income', 'acs_black', 'acs_age']

In [None]:
import pickle
# Pickles are binary: open with 'rb' (text mode 'r' fails on Python 3 and can
# corrupt the stream on Windows) and close the handle when done.
# NOTE(review): this is the same file that was loaded earlier as `model`;
# pickle.load can execute arbitrary code, so only load trusted files.
with open('model_e69191e313e930617ef52e9d8549f0d2.p', 'rb') as model_file:
    model1 = pickle.load(model_file)


In [None]:
model1

In [None]:
d_imp = dict(zip(feats, model1.coef_[0]))
imps = zip(feats, model1.coef_[0])

In [None]:
sorted(imps, key = lambda x:abs(x[1]))[::-1]

In [None]:
code_feat_sum = sum([abs(d_imp[i]) for i in feats if 'code_' in i])
com_weather_feat_sum = sum([abs(d_imp[i]) for i in feats if 'common_' in i])
ext_weather_feat_sum = sum([abs(d_imp[i]) for i in feats if 'weather_event_' in i])
acs_sum = sum([abs(d_imp[i]) for i in feats if 'acs_' in i])
station_sum = sum([abs(d_imp[i]) for i in feats if 'station_' in i])
time_sum = sum([abs(d_imp[i]) for i in feats if 'time_' in i])


In [None]:
total = float(code_feat_sum + com_weather_feat_sum + ext_weather_feat_sum + acs_sum + station_sum + time_sum)

In [None]:
print code_feat_sum / total
print com_weather_feat_sum / total
print ext_weather_feat_sum / total
print acs_sum / total
print station_sum / total
print time_sum / total

In [None]:
code_feat_sum


In [None]:
scores = full_df['score']

In [None]:
pd.isnull(full_df.score).sum()

In [None]:
metrics_at_k(full_df.trns_to_hosp, full_df.score, 0.99)

In [None]:
from sklearn.metrics import precision_recall_curve
l = precision_recall_curve(full_df.trns_to_hosp, full_df.score)

In [None]:
l[0][-10:]

In [None]:
plt.plot(l[2], l[0][:-1])

In [None]:
s = '-high: 3 -5 -1 1 \n -med: 3 -5 -1 1 \n -low: 3 -5 -1 1'

In [None]:
s = s.replace(' ', '|')

In [None]:
# Parse the "-label:|w1|w2|w3|w4" entries of the weight string `s` (built in
# the two cells above) into {label: [w1, w2, w3, w4]}.
# The pattern used to be matched against `self.weights`, which is undefined in
# a notebook -- presumably pasted from a class method.
weight_tups = []
weight_str = re.findall(r'-.*[0-9].*[0-9].*[0-9].*[0-9]', s)
for entry in weight_str:
    parts = entry.strip('-').split(':')
    # parts[1] starts with '|', so the first split token is empty and skipped.
    weights = [float(token) for token in parts[1].split('|')[1:]]
    weight_tups.append((parts[0], weights))

# A later entry with the same label overrides an earlier one.
weight_dict = dict(weight_tups)

In [None]:
tups = [(1,1), (1,2)]

In [None]:
sum([i[0] == i[1] for i in tups])

In [None]:
def type_of_pred(pred, obs):
    """
    input: prediction and observation
    output: whether this is a True Positive, False Pos, False Neg, or True Neg
    """
    if pred and obs:
        return 'TP'
    elif pred and not obs:
        return 'FP'
    elif not pred and obs:
        return 'FN'
    else:
        return 'TN'


# For every incident type, classify each incident twice -- under the existing
# rule (m_required) and under the model (score at or above the type's 55.5th
# percentile, i.e. roughly the top 44.5%) -- and print the types that have no
# observed positive (no TP and no FN) among rows with m_required known.
# new_df from the last processed type is inspected by the cells that follow.
for code in set(full_df['code_type']):
    try:
        type_df = full_df[full_df['code_type'] == code]

        curr_df = type_df[['m_required', 'trns_to_hosp']].dropna()
        curr_df['class'] = curr_df.apply(lambda x: type_of_pred(x.m_required, x.trns_to_hosp), axis =1)

        new_df = type_df[['score', 'trns_to_hosp']].dropna()
        v = np.percentile(new_df['score'], 100-44.5)
        new_df['pred'] = new_df['score'] >= v
        new_df['class'] = new_df.apply(lambda x: type_of_pred(x.pred, x.trns_to_hosp), axis =1)

        if not curr_df['class'].isin(['TP', 'FN']).any():
            print(code)
    except Exception as err:
        # Still best-effort (a type that cannot be processed is skipped), but
        # the reason is reported instead of being silently swallowed.
        print('skipping %s: %s' % (code, err))
        continue

In [None]:
new_df[['trns_to_hosp', 'pred']]

In [None]:
t = new_df[['trns_to_hosp', 'pred']]

In [None]:
t.reindex(np.random.permutation(t.index))

In [None]:
# Open the stored model description for reading. 'rw' is not a valid mode:
# Python 3 rejects it with ValueError. The file is only read in the next cell.
# NOTE(review): this rebinds model1, which held the unpickled model until now.
model1 = open('e69191e313e930617ef52e9d8549f0d2.jsonb' ,'r')


In [None]:
print model1.read()

In [None]:
model.coef_

In [None]:
x = feature_df[['trns_to_hosp', 'm_required', 'code_type', 'time_year']]

In [None]:
# Walk through the incident types and print each one that sets a new running
# maximum for the number of 2013-2015 incidents where m_required was True but
# trns_to_hosp was False.
max_val = 0
# The filters that do not depend on the type are applied once, up front.
candidates = x[(x['trns_to_hosp'] == False)&(x['m_required'] == True)&(x['time_year'] <= 2015)&(x['time_year'] >= 2013)]
for code in set(x.code_type):
    new = len(candidates[candidates['code_type'] == code])
    if new > max_val:
        max_val = new
        print('%s %s' % (code, new))

In [None]:
set(feature_df.time_year)