# Descriptive stats for EMS codes

In [None]:
%load_ext autoreload 
%autoreload 2
%matplotlib inline
from __future__ import division
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils.pg_tools import PGWrangler
import pandas as pd
import matplotlib.pyplot as plt
# The pandas option `display.mpl_style` was deprecated and then removed, so
# assigning to it raises. A matplotlib style sheet is the supported replacement.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams.update({'font.size': 22})
plt.rcParams['legend.fancybox'] = True
import numpy as np
from matplotlib import gridspec

# Database handle from the project helper in utils.pg_tools.
# NOTE(review): `engine` is presumably a SQLAlchemy engine (it is handed to
# pandas' SQL reader when the table is loaded) -- confirm against PGWrangler.
pgw = PGWrangler()
engine = pgw.engine

## Load table and sanitize

In [None]:
# Load feature table from Karen.
# Pass the engine itself rather than engine.connect(): pandas then opens and
# closes its own connection instead of leaving an unclosed one behind.
df = pd.read_sql_table('master', engine, schema='semantic')

In [None]:
# Select only codes with year > 2012.
# Characters 3-4 of the event number are read as the two-digit year.
is_after_2012 = df['i_eventnumber'].apply(lambda event_id: int(event_id[3:5]) > 12)
df = df[is_after_2012]

In [None]:
# Convert number of units from float to string
#df['count'] = df['count'].apply(lambda x: "%d"%int(x) if not np.isnan(x) else 'NaN')

### Split eventnumber into class, severity, and specifiers

In [None]:
import re
def parse_id(ID):
    """Split a type id such as ``'26A9'`` into ``(class, severity, specifiers)``.

    The id is expected to look like ``<digits><letters A-F><anything>``:

    * ``class`` is the leading number, re-formatted without leading zeros
      (``'01'`` becomes ``'1'``);
    * ``severity`` is the run of letters A-F that follows;
    * ``specifiers`` is whatever remains (possibly the empty string).

    Parameters
    ----------
    ID : str
        Raw type id. Non-string values (e.g. None/NaN) are tolerated.

    Returns
    -------
    tuple
        ``(class, severity, specifiers)`` on a match, otherwise
        ``(ID, None, None)`` with the input passed through unchanged.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    re_string = r"^(\d+)([A-F]+)(.*)"
    try:
        m = re.search(re_string, ID)
    except TypeError:
        # Non-string input (missing value from the database): nothing to parse.
        return (ID, None, None)
    if m is None:
        # Does not follow the <digits><A-F> layout.
        return (ID, None, None)
    id_type = "%d" % int(m.group(1))
    return (id_type, m.group(2), m.group(3))

In [None]:
# Parse every type id once and fan the resulting tuple out into three columns,
# instead of re-running the regex three times per row.
parsed_ids = df['iti_typeid'].map(parse_id)
df['id_type'] = parsed_ids.map(lambda parts: parts[0])
df['id_severity'] = parsed_ids.map(lambda parts: parts[1])
df['id_specifiers'] = parsed_ids.map(lambda parts: parts[2])

### Split eventnumbers into year-month-day

In [None]:
# Select day, month, and year for lunar phase estimation.
# The date parts are fixed-position slices of the event number; the hour is
# taken from the creation timestamp instead.
event_ids = df['i_eventnumber']
df['day'] = event_ids.apply(lambda ev: int(ev[7:9]))
df['month'] = event_ids.apply(lambda ev: int(ev[5:7]))
df['year'] = event_ids.apply(lambda ev: int("20%s"%ev[3:5]))
df['hour'] = df['i_ttimecreate'].apply(lambda created: created.hour)

## Compute true and false positives and negatives

In [None]:
# Coerce to a strict boolean column: only values equal to True stay True,
# everything else (including missing values) becomes False.
df['trns_to_hosp'] = df['trns_to_hosp'].eq(True)

In [None]:
# Confusion-matrix flags per incident: the "prediction" is m_required and the
# "outcome" is trns_to_hosp.
required = df['m_required']
transported = df['trns_to_hosp']
df['fp'] = required & ~transported
df['fn'] = ~required & transported
df['tp'] = required & transported
df['tn'] = ~required & ~transported

In [None]:
# Share of all incidents that falls in each confusion-matrix cell.
n_incidents = len(df)
for fmt, flag in (("FP: %.2f", 'fp'),
                  ("FN: %.2f", 'fn'),
                  ("TP: %.2f", 'tp'),
                  ("TN: %.2f", 'tn')):
    print(fmt%(sum(df[flag])/n_incidents))

In [None]:
# Fraction of incidents on which each pair of boolean columns agrees.
n_incidents = len(df)
for fmt, left, right in (
        ("Match: required-sent %.2f", 'm_required', 'm_sent'),
        ("Match: required-trns %.2f", 'm_required', 'trns_to_hosp'),
        ("Match: sent-trns %.2f", 'trns_to_hosp', 'm_sent')):
    print(fmt%(sum(df[left]==df[right])/n_incidents))

### Recode class to classlabel by hand

In [None]:
# Hand-written mapping from the numeric class of a type id to a short label.
# The labels are upper-cased later and used as display names in the plots.
types = {0:'Case',
         1:'Abdominal',
         2:'Allergies',
         3:'Animal',
         4:'Assault',
         5:'Back',
         6:'Breathing',
         7:'Burns',
         8:'CO',
         9:'Cardiac',
         10:'Chest',
         11:'Choking',  # was misspelled 'Chocking'
         12:'Convulsions',
         13:'Diabetic',
         14:'Drowning',
         15:'Electrocution',
         16:'Eye',
         17:'Falls',
         18:'Headache',
         19:'Heart',
         20:'Heat',
         21:'Hemorrhage',
         22:'Inaccessible',
         23:'Overdose',
         24:'Pregnancy',
         25:'Psychiatric',
         26:'Sick',
         27:'Penetration',
         28:'Stroke',
         29:'Traffic',
         30:'Traumatic',
         31:'Unconscious',
         32:'Unknown',
         33:'Transfer'}

In [None]:
def type_to_str(x):
    """Return the label for type code ``x``, or ``x`` itself when there is none.

    The lookup uses the module-level ``types_str`` mapping. Only a failed
    lookup falls back to returning ``x``; if ``types_str`` has not been
    defined yet, the resulting NameError is raised instead of being hidden.

    Parameters
    ----------
    x : object
        Key to look up (the notebook passes the stringified class number).

    Returns
    -------
    object
        ``types_str[x]`` if present, otherwise ``x`` unchanged.
    """
    try:
        return types_str[x]
    except (KeyError, TypeError):
        # KeyError: no label for this code. TypeError: x is unhashable.
        return x
def id_dict_to_str(x):
    """Build a display string from a ``(class, severity, specifiers)`` triple.

    The class (``x[0]``) is translated with ``type_to_str``. When a severity
    (``x[1]``) is present, ``'-'`` followed by the severity and the
    specifiers (``x[2]``) is appended; otherwise the translated class is
    returned on its own.
    """
    label = type_to_str(x[0])
    severity = x[1]
    if not severity:
        return label
    suffix = '-'+severity+x[2]
    return label + suffix

# Upper-case labels keyed by the class number formatted as a string, which is
# the form parse_id produces for a matched id.
types_str = {"%d"%code: label.upper() for code, label in types.items()}
df['type'] = df['id_type'].apply(type_to_str)
df['id_full'] = df['iti_typeid'].map(lambda raw_id: id_dict_to_str(parse_id(raw_id)))

# Analysis

In [None]:
df.head(10)

## Code-specific stats

### 26A9: Sick person - transport only (code from site visit)

In [None]:
# Code: Sick person - transport only
code_26A9 = df.groupby('iti_typeid').get_group('26A9')
n_26A9 = len(code_26A9)
fp_share = code_26A9['fp'].sum()/n_26A9
tp_share = code_26A9['tp'].sum()/n_26A9
print("FP for sick/transport: %.2f"%(fp_share))
print("TP for sick/transport: %.2f"%(tp_share))
print("Fraction of sick/transport calls: %.4f"%(n_26A9/len(df)))

### Stats by day of month

In [None]:
# February and the 31st are left out, so every month that remains contains
# each of the days 1-30.
comparable_days = (df['month']!=2) & (df['day']<31)
df.loc[comparable_days, 'day'].hist(bins=30)
plt.xlabel('Day of the month')
plt.ylabel('Incident count 2013-2015')
_ = plt.xlim([1,30])

In [None]:
# Same day-of-month histogram, restricted to class '23' incidents.
# February and the 31st are left out, so every month that remains contains
# each of the days 1-30.
od_comparable_days = (df['month']!=2) & (df['day']<31) & (df['id_type']=='23')
df.loc[od_comparable_days, 'day'].hist(bins=30)
plt.xlabel('Day of the month')
plt.ylabel('OD count 2013-2015')
_ = plt.xlim([1,30])

## Summary Stats

In [None]:
def hist_by_column(col_name):
    """Histogram of how many events each value of ``col_name`` accounts for.

    Groups the module-level ``df`` by ``col_name``, counts the
    ``i_eventnumber`` entries in each group, draws a histogram of those
    counts on the current matplotlib figure, and prints the five largest
    groups in descending order.
    """
    events_per_value = df.groupby(col_name)['i_eventnumber'].count().sort_values()
    plt.hist(events_per_value)
    plt.title(col_name.replace('_',' '))
    plt.xlabel('Count')
    plt.ylabel('Frequency')
    print('Most frequent:')
    # Counts are sorted ascending, so the tail holds the largest groups.
    print(events_per_value.tail(5)[::-1])
def bar_by_column(col_name):
    """Bar chart of the number of rows of ``df`` per value of ``col_name``.

    Groups the module-level ``df`` by ``col_name`` and draws one bar per
    group on a new figure, with each tick labelled by the group's value.

    Parameters
    ----------
    col_name : str
        Column of ``df`` to group by; also used (underscores replaced by
        spaces) as the x-axis label.
    """
    sizes = df.groupby(col_name).size()
    positions = np.arange(len(sizes))
    plt.figure()
    # Centre the bars on their ticks explicitly. The previous `x - 0.4` offset
    # only centred them under the edge-aligned default of old matplotlib; with
    # the centre-aligned default it pushed every bar left of its label.
    plt.bar(positions, sizes.values, width=0.8, align='center')
    plt.xticks(positions, list(sizes.index))
    plt.xlabel(col_name.replace('_',' '))
    plt.ylabel('Count')

In [None]:
hist_by_column('id_type')

In [None]:
bar_by_column('id_severity')

# True/False Positive/Negative Analyses

In [None]:
def tfpn_highfreq(col_name,sort='count',N=10,plot_scatters=False,start=0):
    """Plot FP/FN/TP/TN fractions and event counts for the most frequent values of a column.

    Works on the module-level ``df``, which must contain the boolean columns
    ``fp``, ``fn``, ``tp``, ``tn`` and the column ``i_eventnumber``.

    The ``N`` values of ``col_name`` with the most events are selected. For
    each, the fraction of its rows flagged fp/fn/tp/tn is drawn as a stacked
    horizontal bar, next to a second panel showing the number of rows.

    Parameters
    ----------
    col_name : str
        Column of ``df`` to group by.
    sort : {'count', 'col'}
        'count' orders the rows by ascending event count; 'col' uses the
        reversed order of the group keys. Any other value prints a message
        and returns without plotting.
    N : int
        Number of most frequent values to keep.
    plot_scatters : bool
        If true, additionally draw a scatter plot (tn vs fn and tp vs fp,
        marker size scaled by count) and histograms of the tp and tn fractions.
    start : int
        Number of rows dropped from the end of the ordering before plotting
        (with sort='count' these are the most frequent values).

    Returns
    -------
    None
    """
    name_dict = {'type':'Incident type',
                 'id_full':'Incident type',
                 'id_severity':'Incident severity'}
    # Fall back to the column name itself when no display name is registered.
    name = name_dict.get(col_name, col_name.replace('_',' '))
    highfreq_ids = list(df.groupby(col_name)['i_eventnumber'].count().sort_values().tail(N).index)
    CALC_COLS = ['fp','fn','tp','tn']
    COUNT_COL = 'n_events'
    df_highfreq = df[df[col_name].isin(highfreq_ids)]
    grouped = df_highfreq.groupby(col_name)
    df_tfpn = grouped.agg({x:(lambda y: sum(y)/len(y)) for x in CALC_COLS})
    # size() counts rows per group without depending on any particular column,
    # so it also works when col_name happens to be the column formerly used
    # for counting.
    df_count = grouped.size().to_frame(COUNT_COL)
    if sort=='count':
        df_tfpn = df_tfpn.reindex(highfreq_ids)
        df_count = df_count.reindex(highfreq_ids)
    elif sort=='col':
        df_tfpn = df_tfpn[::-1]
        df_count = df_count[::-1]
    else:
        print('Sorting not implemented')
        return
    
    # Plot tfpn + count
    plt.figure(figsize=(10,1+(N-start)//2))
    gs = gridspec.GridSpec(1, 2, width_ratios=[3, 1]) 
    ax1 = plt.subplot(gs[0])
    ax2 = plt.subplot(gs[1])
    # reverse / slice / reverse drops the last `start` rows of the ordering
    df_tfpn[CALC_COLS][::-1][start:][::-1].plot.barh(stacked=True,ax=ax1)
    df_count[::-1][start:][::-1].plot.barh(ax=ax2,legend=False)
    ax2.set_xlim([0,df_count[COUNT_COL].max()])
    ax2.yaxis.set_major_locator(plt.NullLocator())
    ax2.xaxis.set_major_locator(plt.MaxNLocator(2))
    ax1.set_ylabel(name)
    ax1.set_xlabel('Performance')
    ax2.set_xlabel('Count')
    ax2.set_ylabel('')
    # Recolour patches: one hatch/colour per bar, repeated for every bar of a
    # column and laid out column after column in CALC_COLS order, then paired
    # with the axes' patches in order.
    bars = ax1.patches
    hatch_dict = {'tp':'\\','tn':' ','fp':'\\','fn':' '}
    col_dict = {'tp':'#467821','tn':'#467821','fp':'#A60628','fn':'#A60628'}
    n_bars = len(df_tfpn[start:])
    hatches = [hatch_dict[x] for x in CALC_COLS for _ in range(n_bars)]
    colors = [col_dict[x] for x in CALC_COLS for _ in range(n_bars)]
    for bar, hatch, color in zip(bars, hatches, colors):
        bar.set_facecolor(color)
        bar.set_hatch(hatch)
    legend = ax1.legend(loc='best')
    legend.set_visible(False)
    
    if plot_scatters:
        # Plot scatter
        plt.figure()
        ax=plt.gca()
        # Marker area proportional to the group's count (largest group -> 500)
        df_tfpn['count']=(df_count[COUNT_COL]/df_count[COUNT_COL].max())*500
        df_tfpn.plot(kind='scatter',x='tn',y='fn',s=df_tfpn['count'],color='#348ABD',label='Code: no MT',ax=ax)
        df_tfpn.plot(kind='scatter',x='tp',y='fp',s=df_tfpn['count'],color='#7A68A6',label='Code: MT',hatch='\\',ax=ax)
        plt.xlabel('Code correct [%]')
        plt.ylabel('Code wrong [%]')
        ax.set_xlim(xmin=0,xmax=1)
        ax.set_ylim(ymin=0,ymax=1)
        ax.set_aspect('equal', adjustable='box')

        plt.figure()
        ax=plt.gca()
        df_tfpn.hist('tp',color='#7A68A6',hatch='\\',alpha=0.6,range=(0,1),ax=ax,label='Code: MT')
        df_tfpn.hist('tn',color='#348ABD',alpha=0.6,range=(0,1),ax=ax,label='Code: no MT')
        plt.legend(loc='best')
        plt.title('')
        plt.xlabel('Fraction correct')
        plt.ylabel('Count')

In [None]:
tfpn_highfreq(col_name='type',sort='count',N=20)

In [None]:
#tfpn_highfreq(col_name='type',sort='count',N=20,start=10)

In [None]:
tfpn_highfreq(col_name='id_severity',sort='col',N=8)

In [None]:
tfpn_highfreq(col_name='id_full',sort='count',N=30)

In [None]:
tfpn_highfreq(col_name='id_full',sort='count',N=20,start=10)

In [None]:
tfpn_highfreq(col_name='count',sort='col',N=10)

In [None]:
tfpn_highfreq(col_name='iti_typeid',sort='count',N=8)