# Transport Accuracy Analysis

In [None]:
%matplotlib inline
import pandas as pd
import operator
import psycopg2
import pylab
import numpy as np
import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.patches as mpatches
from scipy.stats.stats import pearsonr
import matplotlib.lines as mlines
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.tsa import stattools
import statsmodels.api as sm
import scipy
import random
import seaborn as sns
from matplotlib.font_manager import FontProperties
import matplotlib.mlab as mlab
import re
from collections import OrderedDict
import statsmodels.api as sm
from scipy import stats
import statsmodels
from statsmodels.graphics.api import qqplot
from sklearn import linear_model, datasets

# Reset matplotlib's rcParams to their built-in defaults, then ask pandas
# to use its 'default' matplotlib style for plots.
# NOTE(review): the display.mpl_style option does not exist in newer pandas
# releases -- confirm the pandas version this notebook is pinned to.
mpl.rcdefaults()
pd.options.display.mpl_style = 'default'

In [None]:
# Read database parameters from default_profile.
# For each line the second whitespace-separated token is expected to be a
# KEY=value assignment (presumably lines such as "export PGHOST=localhost" --
# confirm against the actual profile file). Lines without such a token
# (blank lines, stray text) are skipped instead of raising IndexError.
dbitems = {}
with open('default_profile') as f:
    for line in f:
        tokens = line.split()
        if len(tokens) < 2 or '=' not in tokens[1]:
            continue
        # partition splits on the first '=' only, so a value that itself
        # contains '=' is kept whole
        key, sep, value = tokens[1].partition('=')
        dbitems[key] = value.strip()

# Connect to database with psycopg2.
# Keyword arguments are used instead of a hand-built DSN string so that
# values containing quotes or spaces cannot corrupt the connection string.
try:
    conn = psycopg2.connect(dbname=dbitems['PGDATABASE'],
                            user=dbitems['PGUSER'],
                            host=dbitems['PGHOST'],
                            password=dbitems['PGPASSWORD'])
except psycopg2.Error:
    # Only database errors are reported here; a missing profile key or a
    # typo in this cell now surfaces as its own exception instead of being
    # mislabelled as a connection failure.
    print("Unable to connect to the database")

# Connect to database with sqlalchemy
conn_sqlalch = create_engine('postgresql+psycopg2://%s:%s@%s/%s'%(dbitems['PGUSER'],dbitems['PGPASSWORD'],dbitems['PGHOST'],dbitems['PGDATABASE']))

# Functions

In [None]:
def is_NLN_format(s):
    """
    Check whether a value starts with a Number-Letter-Number code.

    input: s - value to test, normally a string such as '10D1'
    output: True if s begins with 1-2 digits, one uppercase letter A-Z and
            then either 1-2 digits or the letter 'O'; False otherwise.
            Non-string values (e.g. None or NaN coming from a dataframe
            column) give False instead of raising TypeError.

    Only the beginning of s is examined (re.match), so extra characters
    after the pattern do not prevent a match.
    """
    try:
        hit = re.match(r'\d{1,2}[A-Z](\d{1,2}|O)', s)
    except TypeError:
        # re.match rejects non-string input; such a value is not an NLN code
        return False
    return hit is not None

# Code

In [None]:
# Load every row and column of luigi_clean_cad.dbo_rfirehouseapparatus
# into a dataframe through the psycopg2 connection
feature_df = pd.read_sql_query("SELECT * from luigi_clean_cad.dbo_rfirehouseapparatus", conn)

In [None]:
# Second name for the same dataframe object (no copy is made), so columns
# added through full_df are also visible through feature_df
full_df = feature_df

In [None]:
# Dictionary mapping each numeric incident code (1-32) to its description.
# Spelling of the CARDIAC, PSYCHIATRIC and TRAFFIC/TRANSPORTATION labels
# has been corrected.
master_dict = {
    1:'ABDOMINAL PAIN',
    2:'ALLERGIES',
    3:'ANIMAL BITES',
    4:'ASSAULT',
    5:'BACK PAIN',
    6:'BREATHING',
    7:'BURNS',
    8:'CARBON MONOXIDE',
    9:'CARDIAC/RESP ARREST',
    10:'CHEST PAIN',
    11:'CHOKING',
    12:'CONVULSIONS/SEIZURES',
    13:'DIABETIC PROBS',
    14:'DROWNING',
    15:'ELECTROCUTION',
    16:'EYE PROBS',
    17:'FALLS',
    18:'HEADACHE',
    19:'HEART PROBLEMS',
    20:'HEAT/COLD EXPOSURE',
    21:'HEMORRHAGE/LACERATIONS',
    22:'ENTRAPMENT',
    23:'OVERDOSE/POISONING',
    24:'PREGNANCY',
    25:'PSYCHIATRIC/SUICIDE ATTEMPT',
    26:'SICK PERSON',
    27:'STAB/GUNSHOT WOUND',
    28:'STROKE',
    29:'TRAFFIC/TRANSPORTATION INC',
    30:'TRAUMATIC INJURIES',
    31:'UNCONSCIOUS/FAINTING',
    32:'UNKNOWN'}

In [None]:
# Sum the trns_to_hosp values over all rows sharing an i_eventnumber,
# giving one total per event number
trns_df = feature_df.groupby(['i_eventnumber'])['trns_to_hosp'].sum()

In [None]:
# Flag each row by whether its iti_typeid is in Number-Letter-Number
# format, as decided by is_NLN_format
full_df['is_NLN'] = full_df['iti_typeid'].apply(is_NLN_format)

In [None]:
# Group the rows of full_df by incident type id (iti_typeid)
gb_type = full_df.groupby('iti_typeid')

In [None]:
# For each incident type, record the fraction of its rows whose
# trns_to_hosp value is greater than zero
trns_acc = {}
for name, group in gb_type:
    was_transported = group['trns_to_hosp'] > 0
    trns_acc[name] = was_transported.sum() / float(len(group))

In [None]:
def prob_trns(typ, sev=''):
    """
    Look up the transport fraction stored for an incident type.

    Inputs typ which is numeric code of incident (int or string)
    and sev which is severity level: A,B,C,D,E... (optionally followed by
    further characters). The two are concatenated to form the lookup key,
    e.g. typ=10, sev='D1' looks up '10D1'.

    Returns the value stored under that key in the module-level trns_acc
    dictionary.

    If no such incident type is present in trns_acc, returns the string
    'Undefined Incident Type' (kept so existing callers see no change).
    """
    # str() lets the documented numeric code be passed as an int; the
    # original concatenation raised TypeError for int + str
    inc_type = str(typ) + str(sev)
    # Only a missing key means "undefined"; other errors are not hidden
    return trns_acc.get(inc_type, 'Undefined Incident Type')

In [None]:
# Write the first 100 rows of inc_df to table 'training' in schema 'model'
# through the sqlalchemy engine, replacing the table if it already exists.
# NOTE(review): inc_df is not defined anywhere in this part of the notebook
# -- confirm where it is created before running this cell.
inc_df[0:100].to_sql('training', conn_sqlalch, if_exists = 'replace', schema = 'model')

In [None]:
# Write a completion marker to 'workfile'.
# Mode 'w' replaces the invalid 'wr' mode string, and the with-block closes
# (and therefore flushes) the file so the marker is on disk before any
# later cell reads it back.
with open('workfile', 'w') as f:
    f.write("complete")

In [None]:
# Read 'workfile' back and split its contents into a list of lines
# (line endings removed)
with open('workfile', 'r') as infile:
    words = infile.read().splitlines()


In [None]:
# Notebook display: is the first line read from workfile the 'complete' marker?
words[0] == 'complete'

In [None]:
# Notebook display: first 1000 rows of full_df
full_df[0:1000]

# Logistic Regression

In [None]:
#first pass at logistic regression
def gen_logistic_results(feature_df):
    """
    Fit, per incident type, a logistic regression of transport on hour of day.

    input: dataframe of features for logistic regression. It must contain the
           columns 'i_ttimecreate' (values exposing an .hour attribute),
           'iti_typeid' and 'trns_to_hosp'. Rows whose 'trns_to_hosp' value
           is not a boolean are ignored.
    output: dataframe with one row per incident type for which a model could
            be fitted, with columns
              inc_type_id  - the iti_typeid value
              logistic_acc - accuracy of the model on the held-out rows
              base_acc     - accuracy of always predicting the more common
                             outcome, computed over all rows of the type
              pct_acc_gain - (logistic_acc - base_acc) / base_acc

    Side effect: an 'hour' column is added to the dataframe passed in.

    Within each type the first 90% of rows (in dataframe order) are used for
    training and the rest for testing. Types whose training rows contain only
    one outcome are left out, since no classifier can be fitted for them.
    """
    train_frac = 0.9
    result_cols = ['inc_type_id', 'logistic_acc', 'base_acc', 'pct_acc_gain']

    feature_df['hour'] = feature_df['i_ttimecreate'].apply(lambda x: x.hour)
    # Keep only rows with a real boolean label (drops None/NaN and the like);
    # astype(bool) keeps the mask a boolean indexer even when it is empty
    is_bool = feature_df['trns_to_hosp'].apply(
        lambda x: isinstance(x, (bool, np.bool_))).astype(bool)
    labelled = feature_df[is_bool]

    # The original "len(group) < thresh" guard compared a row count with 0.9
    # and so could never trigger; it has been removed.
    rows = []
    for name, group in labelled.groupby('iti_typeid'):
        labels = [int(flag) for flag in group['trns_to_hosp']]
        split = int(train_frac * len(labels))
        y_train, y_test = labels[:split], labels[split:]

        # Both outcomes must be present in the training labels to fit a model
        if len(set(y_train)) < 2:
            continue

        # .values gives a plain ndarray; 2-D indexing directly on a Series
        # is not supported by current pandas
        hours = group['hour'].values[:, np.newaxis]
        logreg = linear_model.LogisticRegression()
        logreg.fit(hours[:split], y_train)
        pred = logreg.predict(hours[split:])
        accuracy = sum(int(p == y) for p, y in zip(pred, y_test)) / float(len(y_test))

        # Baseline: share of the more common outcome among all rows of the type
        transported = sum(labels) / float(len(labels))
        baseline = max(transported, 1 - transported)

        rows.append((name, accuracy, baseline, (accuracy - baseline) / baseline))

    return pd.DataFrame(rows, columns=result_cols)
            
