In [2]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#--------  cost
# A function that computes the expected cost of the public health policy based on the 
# classifications generated by your model
# Input: 
#      y_true (true class labels: 0, 1, 2)
#      y_pred (predicted class labels: 0, 1, 2)
# Returns: 
#      total_cost (expected total cost)

def cost(y_true, y_pred):
    """Compute the expected total cost of acting on a set of predicted labels.

    Cost components (all taken from the constants defined below):
      * every prediction of 0 is charged ``cost_of_vaccine``;
      * every prediction of 1 or 2 is charged the matching intervention cost;
      * every true 1 predicted as 2 adds
        ``prob_complications_1 * cost_of_treatment_1``;
      * every true 2 predicted as 1 adds
        ``prob_complications_2 * cost_of_treatment_2``;
      * every true 1 or 2 predicted as 0 adds
        ``prob_complications_untreated`` times the treatment cost of its
        true class.

    The expectation is evaluated in closed form (count * probability * cost)
    instead of being estimated by random sampling, so the result is exact and
    identical on every call.

    Args:
        y_true: array-like of true class labels (0, 1, 2).
        y_pred: array-like of predicted class labels (0, 1, 2), same shape
            as ``y_true``.

    Returns:
        total_cost: the expected total cost as a float.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` do not have the same shape.
    """
    cost_of_treatment_1 = 29500
    cost_of_treatment_2 = 45000
    cost_of_intervention_1 = 4150
    cost_of_intervention_2 = 4250
    cost_of_vaccine = 15

    prob_complications_untreated = 0.65
    prob_complications_1 = 0.30
    prob_complications_2 = 0.15

    # Accept lists / Series as well as arrays; element-wise comparison and
    # boolean masking below only work reliably on ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got {} and {}'.format(
                y_true.shape, y_pred.shape))

    # Costs that depend only on what was predicted.
    intervention_cost = (cost_of_intervention_1 * (y_pred == 1).sum()
                         + cost_of_intervention_2 * (y_pred == 2).sum())
    vaccine_cost = cost_of_vaccine * (y_pred == 0).sum()

    # Confusions between the two non-zero classes.
    false_neg_1 = ((y_true == 1) & (y_pred == 2)).sum()
    false_neg_2 = ((y_true == 2) & (y_pred == 1)).sum()

    # Non-zero classes that were predicted as 0.
    untreated_1 = ((y_true == 1) & (y_pred == 0)).sum()
    untreated_2 = ((y_true == 2) & (y_pred == 0)).sum()

    # E[sum of n independent Bernoulli(p) draws * c] = n * p * c.
    expected_false_neg_1_cost = false_neg_1 * prob_complications_1 * cost_of_treatment_1
    expected_false_neg_2_cost = false_neg_2 * prob_complications_2 * cost_of_treatment_2
    expected_untreated_1_cost = untreated_1 * prob_complications_untreated * cost_of_treatment_1
    expected_untreated_2_cost = untreated_2 * prob_complications_untreated * cost_of_treatment_2

    total_cost = (vaccine_cost
                  + intervention_cost
                  + expected_false_neg_1_cost
                  + expected_false_neg_2_cost
                  + expected_untreated_1_cost
                  + expected_untreated_2_cost)

    return total_cost

In [4]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions at which y_true and y_pred agree.

    Both arguments must support element-wise subtraction (e.g. numpy arrays);
    agreement is defined as a difference of exactly zero.
    """
    matches = (y_true - y_pred) == 0
    # Multiply by 1. to force float division.
    return (matches.sum() * 1.) / len(y_true)

# function to check - did they beat our benchmarks?
# You can either pass it the flu_predict function the student wrote or the
# file name of the y-labels.
#
# flu_predict: function of type (array -> array)
# pred_y_file_name: file name of where their predicted y-labels live
# data_preprocessing: if False, the test features are fed to flu_predict with no processing; if True, string columns are label-encoded and NaN is filled in with 0
# cost: if True, computes the expected cost (NOTE: beat_benchmark does not currently accept this parameter)
# 
# return some string indicating result of comparison with benchmark

def beat_benchmark(flu_predict=None, pred_y_file_name=None, data_preprocessing=False,
                   test_file_name='flu_test.csv'):
    """Compare per-class accuracy on the test set against fixed thresholds.

    Predictions come either from calling ``flu_predict`` on the test features
    or from the last column of the CSV file ``pred_y_file_name``. Accuracy is
    computed separately on the rows whose true label is 0 and on the rows
    whose true label is 1, printed, and then compared with two pairs of
    thresholds: the stricter ``acc_bm_*`` pair and the looser ``acc_rm_*``
    pair.

    Args:
        flu_predict: function of type (array -> array) mapping the test
            feature matrix to predicted labels. Takes precedence over
            ``pred_y_file_name`` when both are given.
        pred_y_file_name: path of a CSV file whose last column holds the
            predicted labels.
        data_preprocessing: if True, object-dtype columns of the test frame
            are label-encoded and NaN is filled with 0 before the features
            are passed to ``flu_predict``; if False the raw values are passed.
        test_file_name: path of the test CSV. It must contain the columns
            'flu' and 'flutype'; the last two columns are excluded from the
            feature matrix and the second-to-last column is used as the true
            label (presumably 'flu' -- confirm against the data file).

    Returns:
        A string describing the outcome of the comparison, or
        'params ill-specified' when neither prediction source is given.
    """
    # Stricter per-class thresholds (class 0, class 1).
    acc_bm_0 = .69
    acc_bm_1 = .60

    # Looser per-class thresholds (class 0, class 1).
    acc_rm_0 = .5
    acc_rm_1 = .5

    # Reject an unusable call before doing any file I/O.
    if flu_predict is None and pred_y_file_name is None:
        return 'params ill-specified'

    df_test = pd.read_csv(test_file_name)
    # Drop rows with a missing 'flu' value; copy so that the assignments below
    # act on an independent frame rather than on a slice of the original.
    df_test = df_test[~np.isnan(df_test['flu'])].copy()
    df_test['flutype'] = df_test['flutype'] - 1

    # True labels are taken before any optional encoding / NaN filling.
    y_true = df_test.values[:, -2]

    if flu_predict is not None:
        if data_preprocessing:
            encode = preprocessing.LabelEncoder()
            for column in df_test.columns:
                # Builtin `object` instead of the removed `np.object` alias.
                if df_test[column].dtype == object:
                    df_test.loc[:, column] = encode.fit_transform(df_test[column])

            df_test = df_test.fillna(0)

        x = df_test.values[:, :-2]

        y_pred = flu_predict(x)

    else:
        df_y_pred = pd.read_csv(pred_y_file_name)
        y_pred = df_y_pred.values[:, -1]

    # Boolean-mask indexing below needs an ndarray, not a list.
    y_pred = np.asarray(y_pred)

    acc_0 = accuracy(y_true[y_true == 0], y_pred[y_true == 0])
    acc_1 = accuracy(y_true[y_true == 1], y_pred[y_true == 1])

    print('accuracies: {}, {}'.format(acc_0, acc_1))

    # Test the stricter thresholds first: passing them implies passing the
    # looser ones, so the opposite order would make this branch unreachable.
    if acc_0 > acc_bm_0 and acc_1 > acc_bm_1:
        return "accuracy: beats all benchmarks :)"
    elif acc_0 > acc_rm_0 and acc_1 > acc_rm_1:
        return "accuracy: beats only baseline models :/"
    else:
        return "accuracy: beats no benchmarks :("

In [5]:
def flu_predict(x):
    """Trivial baseline predictor: label every row of ``x`` as class 0.

    Args:
        x: any sized container of samples (only ``len(x)`` is used).

    Returns:
        Integer numpy array of zeros with one entry per sample. The dtype is
        integer even when ``x`` is empty.
    """
    return np.zeros(len(x), dtype=int)

# Sanity-check the benchmark harness with the trivial flu_predict baseline.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(beat_benchmark(flu_predict, data_preprocessing=False))

IOError: File flu_test.csv does not exist