# Who is heading for Diabetes?

This is the predictive part of the 2017 Melbourne Datathon.

The task is to predict the probability that a patient will be dispensed a drug related to Diabetes post 2015. This is quite important research as it will be an early warning system for doctors so intervention can potentially be made before it is too late.

Use the patients that we have provided all the records for to build your model, then see how it performs on these unseen people.

For patient IDs 279,201 to 558,352 you need to submit a file with 2 columns: the Patient_ID and the probability in the range [0-1]. The file will have 279,153 rows including the header row. An example submission file is provided for download.

In [1]:
import tqdm

import pandas as pd
import numpy as np

import sqlite3

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import normalize, StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Data retrieval functions

In [3]:
# Single shared connection to the project's SQLite database; every query
# in this notebook goes through it.
DB_PATH = "../../sql/datasci.db"
conn = sqlite3.connect(DB_PATH)

In [4]:
def patient_data(connection, patient_id):
    """
    Return every pre-2016 transaction row for one patient.

    Each transaction is left-joined to the chronic-illness lookup, the
    patient record, the ``classification`` table and the postcode-level
    ``social`` table. Rows are ordered by ``prescription_week``.

    Parameters
    ----------
    connection
        Open database connection passed to ``pandas.read_sql_query``. The
        query uses the ``?`` placeholder style, as ``sqlite3`` does.
    patient_id
        Patient identifier. Anything ``int()`` accepts is allowed (Python
        int, numpy integer, integral float or numeric string).

    Returns
    -------
    pandas.DataFrame
        One row per matching transaction; empty when nothing matches.

    Raises
    ------
    ValueError, TypeError
        If ``patient_id`` cannot be converted to an integer.
    """
    SQL = """
SELECT *
FROM transactions a
LEFT OUTER JOIN ChronicIllness_LookUp b 
    ON a.Drug_ID = b.MasterProductID 
LEFT OUTER JOIN patients c
    ON a.Patient_ID = c.Patient_ID
LEFT OUTER JOIN classification d
    ON a.Patient_ID = d.Patient_ID
LEFT OUTER JOIN social e
    ON c.postcode = e.postcode
WHERE a.Patient_ID = ?
AND a.prescription_week < '2016-01-01'
ORDER BY prescription_week
    """

    # Bind the id as a query parameter instead of formatting it into the SQL
    # text, so the value can never be interpreted as SQL. int() also turns
    # numpy integers into a type the sqlite3 driver is able to bind.
    return pd.read_sql_query(SQL, connection, params=(int(patient_id),))

## Feature extraction functions

TODO: Update the SQL to do the cleaning so that the functions don't need to.

In [70]:
# Numeric encoding of the gender codes. 'U' (presumably "unknown" - confirm
# against the data dictionary) sits halfway between 'F' and 'M'.
gender_map = {'F': 1, 'M': 0, 'U': 0.5}

def gender(patient_data):
    """
    Return the numeric gender encoding taken from the frame's first row.

    A code that is not a key of ``gender_map`` (for example the NULL produced
    by the LEFT JOIN when the patient record is missing) is encoded like
    'U' instead of raising ``KeyError``.

    Raises ``IndexError`` if the frame has no rows.
    """
    # .iloc is positional, so this does not depend on the index labels.
    code = patient_data.gender.iloc[0]
    return gender_map.get(code, gender_map['U'])

def age(patient_data):
    """
    Return the patient's age in 2016, from the first row's year of birth.

    An age above 100 is replaced by 50.
    """
    years = 2016 - patient_data.year_of_birth[0]
    return 50 if years > 100 else years

def socio_score(patient_data):
    """
    Return the disadvantage score from the frame's first row as a float.

    Missing values (None or NaN) and text that cannot be parsed as a number
    fall back to 1000. Numeric text such as ``'987'`` is parsed rather than
    discarded.

    Raises ``IndexError`` if the frame has no rows.
    """
    default = 1000

    # .iloc is positional, so this does not depend on the index labels.
    score = patient_data.disadvantage_score.iloc[0]
    try:
        value = float(score)
    except (TypeError, ValueError):
        # None, or a string that is not a number.
        return default

    # NaN would otherwise leak into the feature matrix.
    if np.isnan(value):
        return default
    return value

def _had_illness(patient_data, illness):
    """
    Return 1.0 if any row's ``ChronicIllness`` contains ``illness``, else 0.0.

    The name is matched as a literal substring (not a regular expression),
    and rows with no illness value count as "no match".
    """
    matches = patient_data.ChronicIllness.str.contains(
        illness, regex=False, na=False)
    return float(matches.any())

def had_diabetes(patient_data):
    """Return 1.0 if any row is flagged 'Diabetes', else 0.0."""
    return _had_illness(patient_data, 'Diabetes')

def had_lipids(patient_data):
    """Return 1.0 if any row is flagged 'Lipids', else 0.0."""
    return _had_illness(patient_data, 'Lipids')

def had_hypertension(patient_data):
    """Return 1.0 if any row is flagged 'Hypertension', else 0.0."""
    return _had_illness(patient_data, 'Hypertension')

def had_heart_failure(patient_data):
    """Return 1.0 if any row is flagged 'Heart Failure', else 0.0."""
    return _had_illness(patient_data, 'Heart Failure')

def had_osteoporosis(patient_data):
    """Return 1.0 if any row is flagged 'Osteoporosis', else 0.0."""
    return _had_illness(patient_data, 'Osteoporosis')

def had_depression(patient_data):
    """Return 1.0 if any row is flagged 'Depression', else 0.0."""
    return _had_illness(patient_data, 'Depression')

## Compute some basic features of the data 

In [71]:
def feature_extract(patient_frame):
    """
    Build the feature vector and the target label for one patient.

    The features are, in order: gender, age, socio score, then the
    diabetes, lipids, hypertension, heart-failure, osteoporosis and
    depression indicators.

    ** Make sure to partition out data from the 2016 period.

    Returns a tuple ``(x, y)``: ``x`` is the list of feature values and
    ``y`` is the ``Target`` value of the frame's first row.
    """
    extractors = (
        gender,
        age,
        socio_score,
        had_diabetes,
        had_lipids,
        had_hypertension,
        had_heart_failure,
        had_osteoporosis,
        had_depression,
    )
    features = [extract(patient_frame) for extract in extractors]
    target = patient_frame.Target[0]
    return features, target


## Create the matrix of features and vector of targets

In [72]:
n = 20000

# Draw n distinct patient ids. Sampling with replacement could pick the same
# patient twice, and the copies could then end up on both sides of the
# train/test split. The generator is seeded so the sample is reproducible.
# NOTE(review): ids are assumed to start at 1 (the task text gives 279,201 as
# the first held-out id) - confirm against the patients table.
rng = np.random.RandomState(0)
sampled_ids = rng.choice(np.arange(1, 279201), size=n, replace=False)

X, Y, indices = [], [], []
for i in tqdm.tqdm_notebook(sampled_ids):
    x, y = feature_extract(patient_data(conn, i))
    X.append(x)
    Y.append(y)
    indices.append(i)

# One row of features per sampled patient, plus the matching targets.
X = np.vstack(X)
y = np.array(Y)




## Feature selection

Note: this is an experiment with scikit-learn's model-based feature selection (`SelectFromModel` on an L1-penalised linear SVM); it may not be a sensible choice here.

In [111]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# Shape of the full feature matrix, before selection.
print(X.shape)

# Fit an L1-penalised linear SVM, then let SelectFromModel keep only the
# columns that the already-fitted model treats as important.
lsvc = LinearSVC(penalty="l1", C=0.01, dual=False)
lsvc.fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)

# Shape after selection; a smaller second dimension means columns were dropped.
print(X_new.shape)


(20000, 9)
(20000, 7)


## Try out a set of different classifiers

Note: the feature matrix is usually transformed to have zero mean and unit standard deviation.

In [118]:
# Keep the fitted scaler instead of discarding it, so that exactly the same
# mean/variance transform can be applied to any data that is scored later.
scaler = StandardScaler().fit(X)
X_transformed = scaler.transform(X)

Partition the data into test and train datasets.

In [119]:
# Hold out half of the sampled patients for evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.5,
    random_state=0,
)

In [120]:
# (display name, estimator) pairs, all with scikit-learn's default settings
# except that the SVM is asked to produce probability estimates.
classifiers = [
    ('Random Forrest', RandomForestClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Adaboost', AdaBoostClassifier()),
    ('SVM', SVC(probability=True)),
]

# Fit each classifier on the training half and report on the held-out half.
# After the loop `clf` stays bound to the last entry (the SVM); the later
# cells that call clf.predict / clf.predict_proba therefore use the SVM.
for name, clf in classifiers:
    print('Classifier: {}'.format(name))
    model = clf.fit(X_train, y_train)
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))


Classifier: Random Forrest
             precision    recall  f1-score   support

          0       0.94      0.95      0.94      8194
          1       0.76      0.72      0.74      1806

avg / total       0.91      0.91      0.91     10000

Classifier: Decision Tree
             precision    recall  f1-score   support

          0       0.93      0.95      0.94      8194
          1       0.73      0.65      0.69      1806

avg / total       0.89      0.89      0.89     10000

Classifier: Adaboost
             precision    recall  f1-score   support

          0       0.98      0.93      0.96      8194
          1       0.76      0.93      0.84      1806

avg / total       0.94      0.93      0.94     10000

Classifier: SVM
             precision    recall  f1-score   support

          0       0.98      0.93      0.96      8194
          1       0.76      0.92      0.83      1806

avg / total       0.94      0.93      0.93     10000



## Where do we perform well? where do we fail?

Note: this approach evaluates the model on the whole dataset, half of which it was trained on, so it will more than likely do well on the training portion.

In [99]:
# Predict for every sampled patient, including the half used for training.
y_pred = clf.predict(X_transformed)

# One row per patient: id, predicted label, actual label.
results = np.column_stack([indices, y_pred, y])
experiment = pd.DataFrame(results, columns=['Patient_ID', 'prediction', 'actual'])
experiment.head()

Unnamed: 0,Patient_ID,prediction,actual
0,246558,0,0
1,231815,0,0
2,31210,0,0
3,179483,0,0
4,89221,0,0


In [100]:
# Rows where the prediction disagrees with the actual label.
wrong = experiment.prediction != experiment.actual

# Missed positives (actual 1) and false alarms (actual 0).
false_negatives = experiment[wrong & (experiment.actual == 1)]
false_positives = experiment[wrong & (experiment.actual == 0)]
len(false_positives), len(false_negatives)

(1122, 256)

### Pull apart the types of illness that the false negatives contain

In [113]:
# Collect every ChronicIllness value found in the records of the
# false-negative patients (one query per patient).
fn_types_of_illness = set()
for patient_id in false_negatives.Patient_ID.values:
    records = patient_data(conn, patient_id)
    fn_types_of_illness.update(records.ChronicIllness.unique())
fn_types_of_illness

{'Lipids',
 'Epilepsy',
 'Urology',
 'Depression',
 'Osteoporosis',
 'Diabetes',
 'Hypertension',
 'Chronic Obstructive Pulmonary Disease (COPD)',
 'Heart Failure',
 'Anti-Coagulant',
 'Immunology',
 None}

### Pull apart the types of illness that the false positives contain

In [117]:
# Collect every ChronicIllness value found in the records of the
# false-positive patients (one query per patient).
fp_types_of_illness = set()
for patient_id in false_positives.Patient_ID.values:
    records = patient_data(conn, patient_id)
    fp_types_of_illness.update(records.ChronicIllness.unique())
fp_types_of_illness

{'Lipids',
 'Urology',
 'Depression',
 'Osteoporosis',
 'Hypertension',
 'Anti-Coagulant',
 None,
 'Epilepsy',
 'Diabetes',
 'Chronic Obstructive Pulmonary Disease (COPD)',
 'Heart Failure',
 'Immunology'}

## Form a submission

Perform the prediction in 1000-patient "chunks" to speed up the processing.

In [11]:
# Start from the provided example file; the cells below read its Patient_ID
# column and write predictions into its Diabetes column.
example_submission_path = '../../submissions/diabetes_submission_example.csv'
submission = pd.read_csv(example_submission_path)

In [12]:
# Give each row the number of the 1000-row chunk it falls in (0, 0, ..., 1, ...)
# and group the submission rows by that number.
chunk_labels = np.arange(len(submission)) // 1000
chunks = submission.groupby(chunk_labels)

In [33]:
# Scale the held-out patients with the statistics of the training feature
# matrix X - the same transform the classifier was trained behind - rather
# than refitting a new scaler on every 1000-row chunk.
scaler = StandardScaler().fit(X)

# Probabilities are floats; make sure the target column can hold them.
submission['Diabetes'] = submission['Diabetes'].astype(float)

for _, frame in tqdm.tqdm_notebook(chunks):

    # Extract the features (the target returned alongside them is not used).
    data = [feature_extract(patient_data(conn, x)) for x in frame.Patient_ID.values]

    # Construct prediction X matrix; force float so np.isnan can be applied.
    pred_x = np.vstack([x[0] for x in data]).astype(float)

    # Make sure we don't have nans in the data
    pred_x[np.isnan(pred_x)] = 0

    # Apply the training-set standardisation prior to predicting.
    pred_x = scaler.transform(pred_x)

    # predict_proba orders its columns like clf.classes_ (sorted labels), so
    # for the 0/1 target column 1 is the probability of class 1 (diabetes).
    # Column 0 would be the probability of NOT being dispensed the drug.
    # .loc writes into `submission` itself rather than a possible copy.
    submission.loc[frame.index, 'Diabetes'] = clf.predict_proba(pred_x)[:, 1]





In [34]:
# Write the completed submission, leaving out the DataFrame index column.
output_path = '../../submissions/kaggle.csv'
submission.to_csv(output_path, index=False)