In [125]:
import math
import pandas as pd

In [126]:
data = pd.read_csv('data/warfarin.csv')
data = data[data['Therapeutic Dose of Warfarin'].notnull()]

In [127]:
def assign_bucket(n):
    if n < 21:
        return 0
    elif n > 49:
        return 2
    else:
        return 1


In [128]:
doses = data['Therapeutic Dose of Warfarin'][:-1:] # remove last row, contains all nulls

## Baseline 1: Always administer 35 mg/wk

In [129]:
Y = doses.apply(lambda dose : assign_bucket(dose))
num_wrong = (Y != 1).sum()
print(f"Performance of baseline 1: {round(num_wrong / len(buckets), 4)}")

Performance of baseline 1: 0.3765


## Baseline 2: Linear model based on age, height, weight, race and medications

In [130]:
# get median age for imputing
age_buckets = data[data.Age.notnull()].Age.apply(lambda a : int(a[0]))
med_age = age_buckets.median()
med_age

6.0

In [131]:
def compute_dose(row):
    def get_age_bucket(row):
        age = row.Age
        if type(age) == float and math.isnan(age):
            # impute age
            return med_age
        return int(age[0])

    def race_weight(row):
        if row['Race'] == 'Asian':
            return -.6752
        elif row['Race'] == 'Black or African American':
            return .4060
        elif row['Race'] == 'Unknown':
            return .0443
        else:
            return 0
    
    def enzyme_inducer(row):
        if row['Carbamazepine (Tegretol)'] or row['Phenytoin (Dilantin)'] or row['Rifampin or Rifampicin']:
            return 1
        else:
            return 0
    
    def amidarone(row):
        if row['Amiodarone (Cordarone)']: 
            return 1
        else: 
            return 0
        

    total = 4.0376
    total -= 0.2546 * get_age_bucket(row) 
    total += 0.0118 * row['Height (cm)']
    total += 0.0134 * row['Weight (kg)']
    total += race_weight(row) 
    total += 1.2799 * enzyme_inducer(row)
    total -= 0.5695 * amidarone(row)
    
    return total * total


In [132]:
linear_doses = data.apply(lambda row : compute_dose(row), axis=1)
linear_buckets = linear_doses.apply(lambda dose : assign_bucket(dose))[:-1:] # remove last row, contains nulls
num_wrong = (Y != linear_buckets).sum()
print(f"Performance of baseline 2: {round(num_wrong / len(buckets), 4)}")

Performance of baseline 2: 0.3993
