# Drug Overdose Statistical Evaluation
by [Devon Bodey](https://www.linkedin.com/in/devonbodey/)

## Data Wrangling

## Training & Cross Validation

## Working Model

In [33]:
import os
try:
    inputFunc = raw_input
except NameError:
    inputFunc = input

import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
 
import seaborn as sns
from statsmodels.formula.api import ols

from sklearn import linear_model
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from patsy import dmatrices

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

import random



# Custom functions

def evaluate(pred, labels_test):
    acc = accuracy_score(pred, labels_test)
    print ("Accuracey: %s"%acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()

    recall = tp / (tp + fp)
    percision = tp / (tp + fn)
    f1 = (2 / ((1/recall)+(1/percision)))

    print ("")
    print ("True Negatives: %s"%tn)
    print ("False Positives: %s"%fp)
    print ("False Negatives: %s"%fn)
    print ("True Positives: %s"%tp)
    print ("Recall: %s"%recall)
    print ("Precision: %s"%percision)
    print ("F1 Score: %s"%f1)

def plot_bound(Z_val,data,col1,col2,binary):
    # Z-val equals "Yes" value. E.g., "Y" or "1". 
    # data equals df
    # col1 and col2 defines which colums to use from data
    # Plot binary decision boundary. 
    # For this, we will assign a color to each
    # point in the mesh [x_min, m_max]x[y_min, y_max].
    
    x_min = float(data.iloc[:,[col1]].min())-float(data.iloc[:,[col1]].min())*0.10 
    x_max = float(data.iloc[:,[col1]].max()+float(data.iloc[:,[col1]].min())*0.10)
    y_min = 0.0; 
    y_max = float(training.iloc[:,[col2]].max())+float(training.iloc[:,[col2]].max())*0.10
    h_x = (x_max-x_min)/100  # step size in the mesh
    h_y = (y_max-y_min)/100  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])   
        Z = np.where(Z=="Y",1,0)
    else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()

## Data Cleaning

Here we load the data we collected and get it all ready to feed to our statistical model(s). That is, we are trying to make a table with one **target** column and one or more **features**. Here I'm loading happiness.csv from: https://data.somervillema.gov/Happiness/Somerville-Happiness-Survey-responses-2011-2013-20/w898-3dfm Note: you can find information on the data elements at this link. 


In [34]:
# Load and peek at your data. Change the file name as needed. 
processed_data_df = pd.read_csv('overdose.csv') 
processed_data_df.head()

Unnamed: 0,Sex,Age,Heroin,Cocaine,Fentanyl,Oxycodone,Oxymorphone,EtOH,Hydrocodone,Benzodiazepine,Methadone,Amphet,Tramad,Morphine
0,Male,41,0,0,1,0,0,0,0,0,0,0,0,0
1,Female,48,1,0,1,1,0,0,0,1,0,0,0,0
2,Male,28,1,0,1,0,0,0,0,0,0,0,0,0
3,Male,40,0,0,1,1,0,0,0,1,0,0,0,0
4,Male,52,1,1,0,0,0,1,0,0,0,0,0,0


In [35]:
# You can replace values in a column based on logic like so
# Note: I used the unique values found above to inform my logic.
# That is, I took the unique text lables and translated them into numbers.
# It's clear that different surveys had different buckets. So I probably 
# sould limit myself to years using the same metrics, but for our purposes
# I'm just going to run with a quick and dirty translation. 

processed_data_df.loc[processed_data_df['Sex'] == 'Male', 'Sex'] = 1
processed_data_df.loc[processed_data_df['Sex'] == 'Female', 'Sex'] = 0
processed_data_df.head()

Unnamed: 0,Sex,Age,Heroin,Cocaine,Fentanyl,Oxycodone,Oxymorphone,EtOH,Hydrocodone,Benzodiazepine,Methadone,Amphet,Tramad,Morphine
0,1,41,0,0,1,0,0,0,0,0,0,0,0,0
1,0,48,1,0,1,1,0,0,0,1,0,0,0,0
2,1,28,1,0,1,0,0,0,0,0,0,0,0,0
3,1,40,0,0,1,1,0,0,0,1,0,0,0,0
4,1,52,1,1,0,0,0,1,0,0,0,0,0,0


In [36]:
# To make sure all of your columns are stored as numbers, use the pd.to_numeric method like so.
processed_data_df = processed_data_df.apply(pd.to_numeric, errors='coerce')
# errors='coerce' will set things that can't be converted to numbers to NaN
# so you'll want to drop these like so.
processed_data_df = processed_data_df.dropna()
processed_data_df.head()

Unnamed: 0,Sex,Age,Heroin,Cocaine,Fentanyl,Oxycodone,Oxymorphone,EtOH,Hydrocodone,Benzodiazepine,Methadone,Amphet,Tramad,Morphine
0,1,41,0.0,0,1,0,0,0,0,0,0,0,0,0.0
1,0,48,1.0,0,1,1,0,0,0,1,0,0,0,0.0
2,1,28,1.0,0,1,0,0,0,0,0,0,0,0,0.0
3,1,40,0.0,0,1,1,0,0,0,1,0,0,0,0.0
4,1,52,1.0,1,0,0,0,1,0,0,0,0,0,0.0


In [37]:
# The second set will be for classifiers where the target is a class.
# Happiness
data_df = processed_data_df[[
                               'Sex', 
                               'Age', 
                               'Heroin',
                               'Oxycodone',"EtOH",
                               'Fentanyl' 
                               ]].copy()
data_df.head()

Unnamed: 0,Sex,Age,Heroin,Oxycodone,EtOH,Fentanyl
0,1,41,0.0,0,0,1
1,0,48,1.0,1,0,1
2,1,28,1.0,0,0,1
3,1,40,0.0,1,0,1
4,1,52,1.0,0,1,0


In [38]:
data_df["OD"] = 1
data_df.head()

Unnamed: 0,Sex,Age,Heroin,Oxycodone,EtOH,Fentanyl,OD
0,1,41,0.0,0,0,1,1
1,0,48,1.0,1,0,1,1
2,1,28,1.0,0,0,1,1
3,1,40,0.0,1,0,1,1
4,1,52,1.0,0,1,0,1


In [39]:
# not reall!
data_1 = data_df.copy()
data_2 = data_1.copy()
data_2["OD"] = 0
for i in range(1,2):
    data_df = pd.concat([data_df, data_2],ignore_index=True)
    data_df.head()
    
#data_df = data_df.dropna()
data_df.head()

Unnamed: 0,Sex,Age,Heroin,Oxycodone,EtOH,Fentanyl,OD
0,1,41,0.0,0,0,1,1
1,0,48,1.0,1,0,1,1
2,1,28,1.0,0,0,1,1
3,1,40,0.0,1,0,1,1
4,1,52,1.0,0,1,0,1


In [40]:
data = data_df
holdout = data.sample(frac=0.2)
training = data.loc[~data.index.isin(holdout.index)]
print(len(training),len(holdout))

837 209


In [41]:

# Define the target (y) and feature(s) (X)
features_train = training.drop("OD", axis=1).as_matrix(columns=None)
labels_train = training["OD"].as_matrix(columns=None)

features_test = holdout.drop("OD", axis=1).as_matrix(columns=None)
labels_test = holdout["OD"].as_matrix(columns=None)

# What percentage of the time is target Y?
print("Percentage of 1s: %s\n"%(len(data[data["OD"]==1])/len(data)))


Percentage of 1s: 0.5



In [42]:
# Logistic Regression
model = LogisticRegression(fit_intercept = False, C = 1e9)
clf = model.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Logistic Regression")
evaluate(pred, labels_test) 
print (clf.coef_)

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("\nDecision Tree")
evaluate(pred, labels_test)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Random Forest")
evaluate(pred, labels_test)  

from sklearn.svm import SVC
clf = SVC(kernel="rbf",probability=True)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("SVM")
evaluate(pred, labels_test)  

Logistic Regression
Accuracey: 0.464114832536

True Negatives: 45
False Positives: 72
False Negatives: 40
True Positives: 52
Recall: 0.41935483871
Precision: 0.565217391304
F1 Score: 0.481481481481
[[-0.01903327  0.00186982 -0.01930588 -0.16068909  0.21797513 -0.05332343]]

Decision Tree
Accuracey: 0.239234449761

True Negatives: 30
False Positives: 87
False Negatives: 72
True Positives: 20
Recall: 0.18691588785
Precision: 0.217391304348
F1 Score: 0.201005025126
Random Forest
Accuracey: 0.167464114833

True Negatives: 19
False Positives: 98
False Negatives: 76
True Positives: 16
Recall: 0.140350877193
Precision: 0.173913043478
F1 Score: 0.155339805825
SVM
Accuracey: 0.320574162679

True Negatives: 18
False Positives: 99
False Negatives: 43
True Positives: 49
Recall: 0.331081081081
Precision: 0.532608695652
F1 Score: 0.408333333333
