In [None]:
%matplotlib inline
import pandas as pd
import operator
import psycopg2
import pylab
import numpy as np
import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.patches as mpatches
from scipy.stats.stats import pearsonr
import matplotlib.lines as mlines
from sklearn import metrics
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.tsa import stattools
import statsmodels.api as sm
import scipy
import random
import seaborn as sns
from matplotlib.font_manager import FontProperties
import matplotlib.mlab as mlab
import re
from collections import OrderedDict
import statsmodels.api as sm
from scipy import stats
import statsmodels
from statsmodels.graphics.api import qqplot

# Reset matplotlib's rcParams to the library defaults before applying the pandas plot style.
mpl.rcdefaults()
# NOTE(review): the `display.mpl_style` option is only available in old pandas releases — confirm the pinned pandas version still accepts it.
pd.options.display.mpl_style = 'default'

In [None]:
# Read database parameters from default_profile.
# Each usable line is expected to look like "<word> KEY=value": the second
# space-separated token is split into the parameter name and its value.
dbitems = {}
with open('default_profile') as f:
    for line in f:
        tokens = line.split(" ")
        # Skip blank lines and anything without a KEY=value token instead of
        # failing with an IndexError.
        if len(tokens) < 2 or "=" not in tokens[1]:
            continue
        # Split on the first "=" only, so values that themselves contain "="
        # (e.g. passwords) are kept whole.
        key, value = tokens[1].split("=", 1)
        dbitems[key] = value.strip()

# Connect to database with psycopg2.
# Keyword arguments are used instead of an interpolated DSN string so that
# values containing quotes or spaces cannot break the connection string.
try:
    conn = psycopg2.connect(dbname=dbitems['PGDATABASE'], user=dbitems['PGUSER'],
                            host=dbitems['PGHOST'], password=dbitems['PGPASSWORD'])
except psycopg2.Error as exc:
    # Only database errors are reported here; anything else (e.g. a missing
    # profile key) propagates instead of being hidden by a bare except.
    # Resetting conn keeps a stale connection from an earlier run out of later cells.
    conn = None
    print("Unable to connect to the database: %s" % exc)

# Connect to database with sqlalchemy
# NOTE(review): user and password are interpolated into the URL unescaped;
# credentials containing "@", ":" or "/" would need URL-quoting.
conn_sqlalch = create_engine('postgresql+psycopg2://%s:%s@%s/%s'%(dbitems['PGUSER'],dbitems['PGPASSWORD'],dbitems['PGHOST'],dbitems['PGDATABASE']))

In [None]:
# Load the whole semantic_demand_R.master table into a DataFrame over the psycopg2 connection.
time_df = pd.read_sql_query("SELECT * FROM semantic_demand_R.master", conn)

In [None]:
# Name of the target column that the per-station models below are fitted on.
yvar = 'trns_to_hosp'

In [None]:
# Sum every remaining column per (year, month, day, station); the grouping keys become the index.
day_df = time_df.groupby(['time_year', 'time_month', 'time_day', 'station_name']).sum()

In [None]:
#convert hourly data to daily data
# Bring the grouping keys back as ordinary columns.
day_df = day_df.reset_index()

# Build each row's calendar date from its year / month / day parts.
day_df['time'] = [datetime.datetime(year, month, day)
                  for year, month, day in zip(day_df.time_year, day_df.time_month, day_df.time_day)]

# Index by date and discard the date parts and the summed hour column.
day_df = day_df.set_index('time').drop(['time_year', 'time_month', 'time_day', 'time_hour'], axis = 1)

In [None]:
def metrics_reg(y_true, y_pred, final_weight = 0.1, schedule = 'exp', overunder = (1,1)):
    """
    Score a forecast with plain and time-weighted error measures.

    input:
        y_true: observed values in time order (1-d sequence or Series)
        y_pred: predicted values, same length and order as y_true
        final_weight: time weight of the last observation; weights decay
            towards it from (about) 1 at the first observation
        schedule: 'exp' for exponentially, 'lin' for linearly decaying weights
        overunder: (penalty when y_true < y_pred, penalty otherwise), i.e.
            weights for over/undersending
    output: (Mean Absolute Percent Error, Root Mean Squared Error,
             Mean Absolute Error, Time Weighted Score)
        The second element is the square root of the mean squared error.
        Observations whose true value is 0 have no percent error and are
        left out of the MAPE and of the time weighted score; both are nan
        when every true value is 0.
    raises: ValueError on empty or unequal-length inputs or an unknown schedule
    """
    actual = np.asarray(y_true, dtype = float).ravel()
    predicted = np.asarray(y_pred, dtype = float).ravel()
    if len(actual) != len(predicted):
        raise ValueError("y_true and y_pred must have the same length")
    n_obs = len(actual)
    if n_obs == 0:
        raise ValueError("y_true and y_pred must not be empty")

    errors = predicted - actual
    # percent errors are undefined where the true value is 0
    has_demand = actual != 0
    pct_errors = np.abs(errors[has_demand])/actual[has_demand]

    #calculate mean absolute percent error
    mape = np.mean(pct_errors) if pct_errors.size else np.nan
    #calculate (root) mean squared error
    mse = np.sqrt(np.mean(errors**2))
    #calculate mean absolute error
    mae = np.mean(np.abs(errors))

    #create time decaying weights based on decay schedule
    steps = np.arange(n_obs)
    if schedule == 'exp':
        # a single observation gets weight 1 (avoids dividing by n_obs - 1 == 0)
        k = np.log(1/float(final_weight))/(n_obs-1) if n_obs > 1 else 0.0
        tweights = np.exp(-k*steps)
    elif schedule == 'lin':
        tweights = (steps+1)*(final_weight-1)/float(n_obs) + 1
    else:
        raise ValueError("schedule must be 'exp' or 'lin', got %r" % (schedule,))

    #generate time weighted score taking into account penalties for over and under sending
    # Uses the y_true argument; the weight of an observation depends on its
    # position in the full series, also when zero-demand rows are skipped.
    penalties = np.where(actual < predicted, overunder[0], overunder[1])
    weighted = penalties[has_demand]*pct_errors*tweights[has_demand]
    twscore = np.mean(weighted) if weighted.size else np.nan

    return (mape, mse, mae, twscore)

In [None]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression, Lars, ElasticNet, SGDRegressor, LassoLars, Perceptron, BayesianRidge 

In [None]:
#models to use
# One default-configured estimator per model family; the same instances are refitted for every station.
# NOTE(review): Perceptron is a classifier in scikit-learn, unlike the other estimators listed — confirm it belongs in this comparison.
models = [Lasso(), Ridge(), LinearRegression(), Lars(), ElasticNet(), LassoLars(), Perceptron(), BayesianRidge()]

In [None]:
#which results to expect
# Empty results table; the per-station loop below adds one row per (model, station) pair.
df_results = pd.DataFrame(columns = ['model', 'station', 'MAPE', 'MSE', 'MAE', 'TWSCORE'])

In [None]:
# Show the training rows with zero transports.
# NOTE(review): train_df is only assigned by cells further down, so this cell fails on a fresh top-to-bottom run.
train_df[train_df.trns_to_hosp == 0]

In [None]:
#for each station in the city, run all models and gather results
i = 0
for station in set(day_df.station_name):
    # The station's data does not depend on the model, so it is prepared once
    # per station rather than once per (station, model) pair.
    day_df_sta = day_df[day_df.station_name == station].drop('station_name', axis =1)

    # the only predictor: total_incidents of the previous row of this station
    day_df_sta['LagInc'] = day_df_sta.total_incidents.shift()

    # rows with a zero target are left out (the percent errors divide by the target)
    day_df_sta = day_df_sta[day_df_sta[yvar] != 0]

    # drops the first row (no lag available) and any other incomplete rows
    day_df_sta = day_df_sta.dropna()

    # fit on 2013-2014, evaluate on 2015
    train_df = day_df_sta[(day_df_sta.index.year <= 2014) & (day_df_sta.index.year >= 2013)]
    test_df = day_df_sta[day_df_sta.index.year ==2015]

    # .values gives the same 2-d array as the deprecated .as_matrix()
    X_train = train_df[['LagInc']].values
    X_test = test_df[['LagInc']].values
    y_train = train_df[yvar]
    y_test = test_df[yvar]

    for model in models:

        clf = model

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # under-prediction is penalised five times as hard as over-prediction
        mod_metrics = metrics_reg(y_test, y_pred, final_weight = 0.1, schedule = 'exp', overunder = (1,5))

        df_results.loc[i] = [model, station, mod_metrics[0], mod_metrics[1], mod_metrics[2], mod_metrics[3]]
        i+=1

In [None]:
# Display the collected metrics for every (model, station) pair.
df_results

In [None]:
# Coefficients of whichever estimator the loop above fitted last (clf still refers to it).
clf.coef_

In [None]:
# For every station, print the model with the lowest time weighted score.
for name, group in df_results.groupby(['station']):
    best_row = group.sort('TWSCORE').iloc[0]
    print(name)
    print(best_row[['model', 'TWSCORE']])
    print('-----')

In [None]:
# Replace time_df with the full features_demand.master table.
time_df = pd.read_sql_query("SELECT * FROM features_demand.master", conn)

In [None]:
#determine which aggregation function to use for each column
count_cols = ('m_required', 'trns_to_hosp')                        # daily totals
skip_cols = ('station_name', '_date', 'pk_demand', 'time_of_day')   # keys / identifiers, not aggregated
dict_func = dict((col, np.sum if col in count_cols else np.mean)
                 for col in time_df if col not in skip_cols)

# one row per (date, station), with the grouping keys back as ordinary columns
day_df = time_df.groupby(['_date', 'station_name']).agg(dict_func).reset_index()

# the date parts were averaged above, so they are cast back to int before building the date
day_df['time'] = [datetime.datetime(int(year), int(month), int(day))
                  for year, month, day in zip(day_df.time_year, day_df.time_month, day_df.time_day)]

# the date parts and the original date key are no longer needed
day_df = day_df.drop(['time_year', 'time_month', 'time_day'], axis = 1).drop('_date', axis=1)

In [None]:
import pickle

In [None]:
# Load the pickled model scores.
# NOTE: pickle.load can run arbitrary code stored in the file; only load pickles produced by this project.
# The with-block closes the file handle as soon as the scores are read.
with open('/mnt/data/cincinnati/model_pickle/c0666e89e1df717913a6d802f55af15a.p', 'rb') as pkl_file:
    data1 = pickle.load(pkl_file)

data1.set_index('incident', inplace = True)

feat_df = pd.read_sql_query("SELECT incident, trns_to_hosp, time_year, m_required FROM features.master_tmp", conn)

# only 2015 incidents are compared
feat_df = feat_df[feat_df.time_year == 2015]

feat_df.set_index('incident', inplace = True)

# attach the model score by incident id; incidents without a score are dropped
feat_df['score'] = data1.score

feat_df.dropna(inplace = True)

# confusion counts of the current m_required flag against the observed trns_to_hosp outcome
tp_curr = len(feat_df[(feat_df.trns_to_hosp == True)&(feat_df.m_required == True)])
fp_curr = len(feat_df[(feat_df.trns_to_hosp == False)&(feat_df.m_required == True)])
fn_curr = len(feat_df[(feat_df.trns_to_hosp == True)&(feat_df.m_required == False)])
tn_curr = len(feat_df[(feat_df.trns_to_hosp == False)&(feat_df.m_required == False)])

# percentage of incidents that are not current true negatives, reduced by 7.2 points
# NOTE(review): the 7.2 offset is not explained anywhere in this notebook — confirm its origin.
pct_const = 100*(1 - float(tn_curr)/(tp_curr + tn_curr + fn_curr + fp_curr)) - 7.2
# score cut-off above which roughly pct_const percent of the incidents fall
mod_lim = np.percentile(feat_df.score, 100 - pct_const)

# confusion counts when the model score cut-off replaces the m_required flag
tp_mod = len(feat_df[(feat_df.trns_to_hosp == True)&(feat_df.score >= mod_lim)])
fp_mod = len(feat_df[(feat_df.trns_to_hosp == False)&(feat_df.score >= mod_lim)])
fn_mod = len(feat_df[(feat_df.trns_to_hosp == True)&(feat_df.score < mod_lim)])
tn_mod = len(feat_df[(feat_df.trns_to_hosp == False)&(feat_df.score < mod_lim)])

sum_inc = float(sum([tp_curr, fp_curr, fn_curr, tn_curr]))

# shares [tp, fp, fn, tn] for the current flag, then for the model cut-off
print([tp_curr/sum_inc, fp_curr/sum_inc, fn_curr/sum_inc, tn_curr/sum_inc])

print([tp_mod/sum_inc, fp_mod/sum_inc, fn_mod/sum_inc, tn_mod/sum_inc])

# change in the number of true negatives (model minus current)
tn_mod - tn_curr

# Generate Lagged Features

In [None]:
import datetime

In [None]:
# Load features_demand.master into df for the lag-feature experiments below.
df = pd.read_sql_query("SELECT * FROM features_demand.master", conn)

In [None]:
# List the available columns.
df.columns

In [None]:
# For every row, look up trns_to_hosp of a row of the same station whose
# time_day is one less; 0 when there is no such row.
# NOTE(review): only time_day is compared (not month, year or hour), so the
# match is not necessarily the previous calendar day — confirm the intended lag.
hour_lag_demand = []
for i,row in df.iterrows():
    previous = df[(df.time_day == row.time_day - 1)&(df.station_name == row.station_name)]
    try:
        # index the filtered frame first, then append the value (one entry per row)
        hour_lag_demand.append(previous['trns_to_hosp'].iloc[0])
    except IndexError:
        # no matching row: .iloc[0] on an empty selection
        hour_lag_demand.append(0)

In [None]:
# Parse the text before the first "_" of pk_demand into a datetime.
# "-" and ":" are turned into spaces, and the resulting pieces are converted to
# int and unpacked, because datetime.datetime does not accept a list of strings.
# NOTE(review): assumes that text consists of integer date/time fields only — confirm against the data.
df.pk_demand.apply(lambda x: datetime.datetime(*[int(part) for part in x.replace('-',' ').replace(':', ' ').split('_')[0].split()]))

In [None]:
# Scratch check of the pk_demand splitting expression used in the previous cell.
# NOTE(review): x appears to exist only as the lambda argument of that cell, so this raises NameError on a clean run — confirm and remove.
x.replace('-',' ').replace(':', ' ').split('_')[0].split()

In [None]:
# Reload features_demand.master into time_df (same query as the earlier loads of this table).
time_df = pd.read_sql_query("SELECT * FROM features_demand.master", conn)

In [None]:
# Load the demand-model training table; this replaces the train_df left behind by the per-station loop.
train_df = pd.read_sql_query("SELECT * FROM model_demand.training", conn)

In [None]:
#create multiindex of station and times for faster access to the data
ind_zip = list(zip(train_df.time, train_df.station_name))

index_ts = pd.MultiIndex.from_tuples(ind_zip, names=['day', 'station'])

train_df.index = index_ts

# lag1 = trns_to_hosp of the same station one day earlier, 0 when that day is missing.
# A single dictionary keyed by (day, station) replaces one index lookup per row.
one_day = datetime.timedelta(1)
demand_by_key = dict(zip(ind_zip, train_df.trns_to_hosp))
l = [demand_by_key.get((day - one_day, station), 0) for day, station in ind_zip]
train_df['lag1'] = l

In [None]:
def plot_precision_recall_n(y_true, y_prob, model_name):
    """
    Plot precision and recall against the share of the population flagged.

    input: real y's, y probabilities (scores; list, array or Series) and the
           model name, which is used as the plot title
    output: None; draws precision (left axis, blue) and recall (right axis, red)
            for every score threshold on a new matplotlib figure
    """
    from sklearn.metrics import precision_recall_curve
    # lists and Series are accepted too: the threshold comparison needs an array
    y_score = np.asarray(y_prob)
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    # the curves have one more point than there are thresholds; drop the last one
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    number_scored = len(y_score)
    # share of scores >= each threshold: one sort plus a binary search per
    # threshold instead of rescanning every score for every threshold
    sorted_scores = np.sort(y_score)
    num_below_thresh = np.searchsorted(sorted_scores, pr_thresholds, side='left')
    pct_above_per_thresh = (number_scored - num_below_thresh) / float(number_scored)

    # plt.subplots() starts a fresh figure, so no other figure is cleared first
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax1.set_ylim(0,1)
    # recall shares the x axis but gets its own y axis on the right
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    ax2.set_ylim(0,1)

    name = model_name
    plt.title(name)
    plt.tight_layout()


# Train Set Learning

In [None]:
#bring in model scores 
model_scores = pd.read_csv('/mnt/data/cincinnati/mkiang/cincinnati_ems/yes_time_training_model_scores.csv')
# NOTE(review): unlike the test-set load further down, no column is set as the index first, so this only
# names the default row index 'incident' — confirm the rows are really keyed by incident id.
model_scores.index.rename('incident', inplace=True)

In [None]:
#populate dictionary with each model and which bucket it falls into
import random

dict_buckets = {'low': [], 'med': [], 'high': []}

# visit the columns in random order
l = model_scores.columns.tolist()
random.shuffle(l)
for col in l:
    # only columns carrying the 'm_' marker are model score columns
    if 'm_' not in col:
        continue

    # non-missing scores of this model inside each urgency bucket
    low_slice = model_scores.loc[model_scores.bucket == 'low', col].dropna()
    med_slice = model_scores.loc[model_scores.bucket == 'medium', col].dropna()
    high_slice = model_scores.loc[model_scores.bucket == 'high', col].dropna()

    # the model is filed under the first bucket (low, medium, high) it has scores for
    for bucket_key, bucket_scores in (('low', low_slice), ('med', med_slice), ('high', high_slice)):
        if len(bucket_scores) > 0:
            dict_buckets[bucket_key].append(col)
            break

In [None]:
#treat high urgency incidents different
# observed outcome of every incident in the high bucket
high_outcomes = model_scores.loc[model_scores.bucket == 'high', 'trns_to_hosp']
high_count_trns = high_outcomes.sum()                     # transported
high_count_not = len(high_outcomes) - high_count_trns     # not transported
high_indices = high_outcomes.index

In [None]:
def k_search(dfs, mods, curr_max, tot):
    """
    Grid-search the score cut-offs of the low and medium buckets.

    input: dataframes of predicted scores for each bucket (dfs[0] = low,
    dfs[1] = medium, each with 'score' and 'trns_to_hosp' columns)
    AND which models they are coming from (only printed)
    AND current best score AND total length of training set
    output: updated best score

    For every pair (k1, k2) the top k1 share of the low bucket and the top k2
    share of the medium bucket are flagged positive. A pair is accepted when
    the true negatives make up hit_goal +/- margin of tot and its weighted
    score beats curr_max; each improvement is printed. The search stops early
    once quit_iters evaluations in a row brought no improvement and the pair
    at hand misses the true negative window.
    """
    #set k ranges
    k1_range = (0.3, 0.8)
    k2_range = (0.6, 0.9)

    #set step
    step = 0.01

    #set timeout
    quit_iters = 200

    #share of true negatives (out of tot) to hit
    hit_goal = 0.4

    #accepted margin of error
    margin = 0.05

    #create weights, in the order [true pos, false neg, false pos, true neg]
    w_score_med = [3,-3,-1,1]
    w_score_low = [1,-1,-1,1.7]

    def _bucket_outcome(scores, truth, k, weights):
        """Return (cut-off, weighted score, true negatives) when the top k share of scores is flagged."""
        level = np.percentile(scores, 100*(1 - k))
        preds = scores >= level
        is_pos = truth == 1
        is_neg = truth == 0
        tp = (is_pos & preds).sum()
        fp = preds.sum() - tp
        fn = (is_pos & ~preds).sum()
        tn = (is_neg & ~preds).sum()
        wsc = tp*weights[0] + fp*weights[2] + fn*weights[1] + tn*weights[3]
        return level, wsc, tn

    # plain arrays: the counts are positional, exactly like zipping the columns
    low_scores = np.asarray(dfs[0].score)
    med_scores = np.asarray(dfs[1].score)
    low_true = np.asarray(dfs[0].trns_to_hosp)
    med_true = np.asarray(dfs[1].trns_to_hosp)

    # the medium bucket outcome depends on k2 only, so each k2 is evaluated
    # once up front instead of once per k1
    med_outcomes = [_bucket_outcome(med_scores, med_true, k2, w_score_med)
                    for k2 in np.arange(k2_range[0], k2_range[1]+step, step)]

    counter = 0.0

    #loop through all k values pairs updating best score at each iteration
    for k1 in np.arange(k1_range[0], k1_range[1]+step, step):

        k1_level, wsc_low, tn_low = _bucket_outcome(low_scores, low_true, k1, w_score_low)

        for k2_level, wsc_med, tn_med in med_outcomes:

            actual = (tn_low + tn_med)/float(tot)

            #only update if within margin of error and weighted score is better than before
            if hit_goal - margin <= actual <= hit_goal + margin:
                if wsc_low + wsc_med > curr_max:
                    k_best = (k1_level, k2_level)
                    curr_max = wsc_low + wsc_med
                    counter = 0
                    print('%s | %s | %s | %s' % (k_best, curr_max, actual, mods))
                    print('------------------------------')
                else:
                    counter += 1
            else:
                counter += 1
                if counter >= quit_iters:
                    return curr_max

    return curr_max

In [None]:
#loop over all combinations of models
curr_max = 0
l = dict_buckets['low']
random.shuffle(l)

# The bucket subsets do not depend on the models being tried, so they are built
# once; columns without any value inside the bucket are dropped.
low_slice = model_scores[model_scores.bucket == 'low'].dropna(axis=1, how='all')
med_slice = model_scores[model_scores.bucket == 'medium'].dropna(axis=1, how='all')

for low_mod in l:
    # outcome plus this model's score (as 'score'), limited to the rows the model scored
    model_slice_low = low_slice[['trns_to_hosp', low_mod]].dropna(axis=0).rename(columns={low_mod: 'score'})

    # try up to 10 distinct, randomly drawn medium-bucket models per low-bucket model
    # (sampling without replacement avoids evaluating the same pair twice)
    med_candidates = np.random.choice(dict_buckets['med'], min(10, len(dict_buckets['med'])), replace=False)
    for med_mod in med_candidates:
        model_slice_med = med_slice[['trns_to_hosp', med_mod]].dropna(axis=0).rename(columns={med_mod: 'score'})

        print([low_mod, med_mod])
        print('---')

        dfs = [model_slice_low, model_slice_med]

        # every incident covered by this pair of models, plus the whole high bucket
        valid_indices = model_slice_low.index.tolist() + model_slice_med.index.tolist() + high_indices.tolist()

        sub_df = model_scores.loc[valid_indices]

        #run k search for each combo
        curr_max = k_search(dfs, [low_mod, med_mod, 'NULL'], curr_max, len(sub_df))

            

In [None]:
# Share of all scored incidents in the low, medium and high bucket.
# Counted straight from the bucket column: high_slice is only assigned in the
# bucket-assignment cell, where it holds one model's scores, so len(high_slice)
# was not the size of the high bucket.
n_scored = float(len(model_scores))
tuple((model_scores.bucket == bucket_name).sum()/n_scored for bucket_name in ('low', 'medium', 'high'))

# Train Set Check

In [None]:
#same as above, sanity check on train data
curr_max = 0

for low_mod in ['m_f69bddd49b08947e4d1e7f64090f411c']:
    # low bucket, without the columns that are empty inside it
    low_slice = model_scores[model_scores.bucket == 'low'].dropna(axis=1, how='all')
    # outcome plus the chosen model's score, exposed under the name 'score'
    model_slice_low = low_slice[['trns_to_hosp', low_mod]].dropna(axis=0)
    model_slice_low['score'] = low_slice[low_mod]
    model_slice_low.drop(low_mod, 1, inplace = True)

    for med_mod in ['m_1786ecf969af47fbc1fbe8f26f959971']:
        # same preparation for the medium bucket
        med_slice = model_scores[model_scores.bucket == 'medium'].dropna(axis=1, how='all')
        model_slice_med = med_slice[['trns_to_hosp', med_mod]].dropna(axis=0)
        model_slice_med['score'] = med_slice[med_mod]
        model_slice_med.drop(med_mod, 1, inplace = True)

        print([low_mod, med_mod])
        print('---')

        dfs = [model_slice_low, model_slice_med]

        # incidents scored by either model plus the whole high bucket
        valid_indices = (model_slice_low.index.tolist()
                         + model_slice_med.index.tolist()
                         + high_indices.tolist())

        sub_df = model_scores.ix[valid_indices]

       

            

In [None]:
# Scores of the chosen low / medium bucket models (dfs comes from the previous cell).
low_scores = dfs[0].score
med_scores = dfs[1].score


# Observed outcomes for the same rows.
low_true = dfs[0].trns_to_hosp
med_true = dfs[1].trns_to_hosp



In [None]:

# Fixed score cut-offs for the low and medium bucket; the third entry is never read below.
k_set = [0.46250493856280006, 0.49599409868599997,0]

# not read anywhere in this cell
scaler = 1

tn_low = 0
tn_med = 0


tot = 1

# The loop body runs exactly once with a scale factor of 1, so k_vals equals k_set.
for item in [1]:
    
    k_vals = [i*item for i in k_set]
    
    # Stop once the true negative share is within 39-41%; with the zero-initialised
    # counters above this check cannot trigger on the single pass.
    if (tn_low + tn_med)/float(tot) <= 0.41 and (tn_low + tn_med)/float(tot) >= 0.39:
        break
        
    tot = len(low_true) + len(med_true) 

    # flag every incident whose score reaches the bucket's cut-off
    low_preds = low_scores >= k_vals[0]
    med_preds = med_scores >= k_vals[1]
   

    # confusion counts per bucket (truth and prediction are paired by position)
    tp_low = sum([i == 1 and j == 1 for i,j in zip(low_true, low_preds)])
    fp_low = low_preds.sum() - tp_low
    fn_low = sum([i == 1 and j == 0 for i,j in zip(low_true, low_preds)])
    tn_low = sum([i == 0 and j == 0 for i,j in zip(low_true, low_preds)])

    tp_med = sum([i == 1 and j == 1 for i,j in zip(med_true, med_preds)])
    fp_med = med_preds.sum() - tp_med
    fn_med = sum([i == 1 and j == 0 for i,j in zip(med_true, med_preds)])
    tn_med = sum([i == 0 and j == 0 for i,j in zip(med_true, med_preds)])
    
    # the total now also includes every high bucket incident
    tot = len(low_preds) + len(med_preds) + high_count_trns + high_count_not

# Shares of tp, fn, fp and tn out of all incidents; the high bucket is counted as
# always flagged (its transports add to tp, its non-transports to fp).
print (tp_low + tp_med + high_count_trns)/float(tot)
print (fn_low + fn_med)/float(tot)
print (fp_low + fp_med + high_count_not)/float(tot)
print (tn_low + tn_med)/float(tot)

In [None]:
# m_required flag of every incident in model.testing, keyed by incident
cur_df = pd.read_sql_query("SELECT incident, m_required FROM model.testing", conn).set_index('incident')

#sub_df.set_index('Unnamed: 0',inplace=True)

# observed outcome of the evaluated incidents next to their m_required flag
full = sub_df[['trns_to_hosp']].join(cur_df)

transported = full.trns_to_hosp == 1
not_transported = full.trns_to_hosp == 0
flagged = full.m_required == True
not_flagged = full.m_required == False

tp = len(full[transported & flagged])
fn = len(full[transported & not_flagged])
fp = len(full[not_transported & flagged])
tn = len(full[not_transported & not_flagged])
# only rows that fall into one of the four cells are counted
all_len = tp+fn+fp+tn

# shares of tp, fn, fp and tn, in that order
for cell_count in (tp, fn, fp, tn):
    print(float(cell_count)/all_len)

# Test Set Validation

In [None]:
#run same commands as above, now on test data using best models and best k values
model_scores_test = pd.read_csv('/mnt/data/cincinnati/mkiang/cincinnati_ems/yes_time_all_model_scores.csv')
# The unnamed first CSV column becomes the index and is then named 'incident'.
model_scores_test.set_index('Unnamed: 0', inplace=True, drop=True)
model_scores_test.index.rename('incident', inplace=True)

In [None]:
# observed outcome of every test incident in the high bucket
high_outcomes_test = model_scores_test.loc[model_scores_test.bucket == 'high', 'trns_to_hosp']
high_count_trns = high_outcomes_test.sum()                      # transported
high_count_not = len(high_outcomes_test) - high_count_trns     # not transported
high_indices = high_outcomes_test.index

In [None]:
curr_max = 0

for low_mod in ['m_94f214f6c2770dc60a4dff95de2b5a9b']:
    # low bucket of the test scores, without the columns that are empty inside it
    low_slice = model_scores_test[model_scores_test.bucket == 'low'].dropna(axis=1, how='all')
    # outcome plus the chosen model's score, exposed under the name 'score'
    model_slice_low = low_slice[['trns_to_hosp', low_mod]].dropna(axis=0)
    model_slice_low['score'] = low_slice[low_mod]
    model_slice_low.drop(low_mod, 1, inplace = True)

    for med_mod in ['m_ca6b4b8f069b5dd5c4e4bfd93de1e052']:
        # same preparation for the medium bucket
        med_slice = model_scores_test[model_scores_test.bucket == 'medium'].dropna(axis=1, how='all')
        model_slice_med = med_slice[['trns_to_hosp', med_mod]].dropna(axis=0)
        model_slice_med['score'] = med_slice[med_mod]
        model_slice_med.drop(med_mod, 1, inplace = True)

        print([low_mod, med_mod])
        print('---')

        dfs = [model_slice_low, model_slice_med]

        # incidents scored by either model plus the whole high bucket
        valid_indices = (model_slice_low.index.tolist()
                         + model_slice_med.index.tolist()
                         + high_indices.tolist())

        sub_df = model_scores_test.ix[valid_indices]

       

            

In [None]:
# Share of all test incidents in the low, medium and high bucket.
# Counted straight from the bucket column: high_slice is only assigned in the
# training bucket-assignment cell, where it holds one model's training scores,
# so len(high_slice) was not the size of the test high bucket.
n_scored_test = float(len(model_scores_test))
tuple((model_scores_test.bucket == bucket_name).sum()/n_scored_test for bucket_name in ('low', 'medium', 'high'))

In [None]:
# Scores of the chosen low / medium bucket models on the test data (dfs comes from the test-slice cell).
low_scores = dfs[0].score
med_scores = dfs[1].score


# Observed outcomes for the same rows.
low_true = dfs[0].trns_to_hosp
med_true = dfs[1].trns_to_hosp


In [None]:
#scale back k's fixing proportions until we reach correct true negative percentage
# NOTE(review): this initial k_set is overwritten on the first loop iteration and never used.
k_set = [0.45957257626522998, 0.48092488563031999,0]

# best total true positive share found so far
curr_max = 0

# Grid over base cut-offs l (low bucket) and m (medium bucket), each from 0.5 down towards 0.3.
for l in np.arange(0.5,0.3,-0.01):
    for m in np.arange(0.5,0.3,-0.01):
        k_set = [l,m,0]

        # not read anywhere in this cell
        scaler = 1

        tn_low = 0
        tn_med = 0


        tot = 1

        # Scale both cut-offs by the same factor, from 1.5 downwards.
        for item in np.arange(1.5,0,-0.05):

            k_vals = [i*item for i in k_set]

            # The check looks at the counts left by the previous scale factor: once those
            # put the true negative share within 39-41%, scaling stops and they are kept.
            if (tn_low + tn_med)/float(tot) <= 0.41 and (tn_low + tn_med)/float(tot) >= 0.39:
                break

            tot = len(low_true) + len(med_true) 

            # flag every incident whose score reaches the scaled cut-off
            low_preds = low_scores >= k_vals[0]
            med_preds = med_scores >= k_vals[1]


            # confusion counts per bucket (truth and prediction are paired by position)
            tp_low = sum([i == 1 and j == 1 for i,j in zip(low_true, low_preds)])
            fp_low = low_preds.sum() - tp_low
            fn_low = sum([i == 1 and j == 0 for i,j in zip(low_true, low_preds)])
            tn_low = sum([i == 0 and j == 0 for i,j in zip(low_true, low_preds)])

            tp_med = sum([i == 1 and j == 1 for i,j in zip(med_true, med_preds)])
            fp_med = med_preds.sum() - tp_med
            fn_med = sum([i == 1 and j == 0 for i,j in zip(med_true, med_preds)])
            tn_med = sum([i == 0 and j == 0 for i,j in zip(med_true, med_preds)])

            # the total now also includes every high bucket incident
            tot = len(low_preds) + len(med_preds) + high_count_trns + high_count_not

            print item, (tn_low + tn_med)/float(tot)
        
        # Keep the pair when it hits the true negative window with a higher true positive share.
        # NOTE(review): only the share is stored; the best l, m are printed but not kept.
        if (tp_low + tp_med + high_count_trns)/float(tot) > curr_max and (tn_low + tn_med)/float(tot) <= 0.41 and (tn_low + tn_med)/float(tot) >= 0.39:
            curr_max = (tp_low + tp_med + high_count_trns)/float(tot)
            print '---'
            print l,m,curr_max
            print '---'

# NOTE(review): these shares use the counts left by the last (l, m) pair evaluated,
# not those of the best pair found above — confirm which one is meant to be reported.
print (tp_low + tp_med + high_count_trns)/float(tot)
print (fn_low + fn_med)/float(tot)
print (fp_low + fp_med + high_count_not)/float(tot)
print (tn_low + tn_med)/float(tot)

In [None]:
# m_required flag of every incident in model.testing, keyed by incident
cur_df = pd.read_sql_query("SELECT incident, m_required FROM model.testing", conn).set_index('incident')

#sub_df.set_index('Unnamed: 0',inplace=True)

# observed outcome of the evaluated incidents next to their m_required flag
full = sub_df[['trns_to_hosp']].join(cur_df)

transported = full.trns_to_hosp == 1
not_transported = full.trns_to_hosp == 0
flagged = full.m_required == True
not_flagged = full.m_required == False

tp = len(full[transported & flagged])
fn = len(full[transported & not_flagged])
fp = len(full[not_transported & flagged])
tn = len(full[not_transported & not_flagged])
# every joined row counts towards the denominator here
all_len = len(full)

# shares of tp, fn, fp and tn, in that order
for cell_count in (tp, fn, fp, tn):
    print(float(cell_count)/all_len)

# Feature Importances

In [None]:
#read json file of model we care about
import json
# Pick the row whose PICKLE path matches the chosen model and parse its JSON column.
# NOTE(review): df was last assigned from features_demand.master in this notebook; the PICKLE / JSON
# columns suggest a different table is expected here — confirm where df should come from.
dict_info = json.loads(df[df.PICKLE == '/mnt/data/cincinnati/model_pickle/ca6b4b8f069b5dd5c4e4bfd93de1e052.p'].JSON.iloc[0])

In [None]:
# Order the model's feature entries by their second element (the importance), largest first.
sorted_featimps = sorted(dict_info['MODEL FEATS & IMPS'], key = lambda x: -x[1])

In [None]:
#generate list of importances for each feature class
acs_sum = []
weather_sum = []
time_sum = []
code_type_sum = []
code_sev_sum = []
call_source_sum = []
operator_name_sum = []
geography_sum = []
building_sum = []
past_sum = []

# (substrings that identify a class, list collecting that class's importances);
# the classes are checked in this order and the first match wins
feature_classes = [
    (('acs',), acs_sum),
    (('weather', 'relative'), weather_sum),
    (('time',), time_sum),
    (('code_type',), code_type_sum),
    (('code_level',), code_sev_sum),
    (('call_source',), call_source_sum),
    (('operator_name',), operator_name_sum),
    (('building',), building_sum),
    (('frac', 'total_', 'repeated_'), past_sum),
    (('station',), geography_sum),
]

for item in sorted_featimps:
    for markers, class_importances in feature_classes:
        if any(marker in item[0] for marker in markers):
            class_importances.append(item[1])
            break

In [None]:
#take median of each class
feat_dict = dict((label, np.median(class_importances)) for label, class_importances in [
    ('Demographics', acs_sum),
    ('Weather', weather_sum),
    ('Time', time_sum),
    ('Code Type', code_type_sum),
    ('Code Severity', code_sev_sum),
    ('Call Source', call_source_sum),
    ('Calltaker', operator_name_sum),
    ('Building Type', building_sum),
    ('Past History', past_sum),
    ('Geography', geography_sum),
])

In [None]:

# (class label, median importance) pairs.
feat_items = feat_dict.items()

In [None]:
# Keep the five classes with the largest median importance, largest first.
feat_items = sorted(feat_items, key = lambda x: -x[1])[:5]

In [None]:
#draw barchart of importances
plt.figure(figsize=(24,6))
# NOTE(review): rcParams are reset after the figure has been created — confirm this ordering is intended.
mpl.rcdefaults()
# spacing between neighbouring bars along the x axis
scaler = 1
# one bar per feature class, in the order of feat_items
plt.bar(np.arange(0,scaler*len(feat_items), scaler), [i[1] for i in feat_items])
# class labels as tick labels, offset by 0.4 from each bar position
plt.xticks(np.arange(0.4, scaler*(len(feat_items)+0.4),scaler),[i[0] for i in feat_items], fontsize = 40)
plt.yticks(fontsize = 20)
plt.ylabel('Importance', fontsize = 48)
#plt.xlabel('Predictor', fontsize = 30)

# y tick marks are removed here, after the font size was set above
plt.yticks([])

plt.xlim(-0.4, scaler*len(feat_items))
plt.tight_layout()

# NOTE(review): the output file name is spelled 'feature_importanes.pdf' — confirm before anything relies on it.
plt.savefig('feature_importanes.pdf')