In [1]:
import plotly.plotly as py
import plotly
# import plotly.graph_objs.Scatter as go
from plotly.offline import init_notebook_mode, plot, iplot, download_plotlyjs
from plotly.graph_objs import Scatter, Figure, Layout
import plotly.graph_objs as go

# Switch plotly into offline notebook mode so the `iplot` call further down
# can draw inside the notebook.
init_notebook_mode(connected=True)

In [2]:
import numpy
import os
import pandas as pd
import tables  # This will fail if you don't have 'pytables' installed to read the cache file

import dask.dataframe as dd


In [3]:
# Directory layout, relative to the notebook's working directory.
DATA_DIR = '../../data'
RAW_DATA_DIR = '/'.join([DATA_DIR, 'raw'])
LOOKUPS = '/'.join([RAW_DATA_DIR, 'Lookups'])        # lookup tables
TRANSACT = '/'.join([RAW_DATA_DIR, 'Transactions'])  # transaction files

# On-disk HDF caches written by the *_cached helpers.
SUBSET_FILE = '/'.join([DATA_DIR, 'subset.hdf'])
THINNED_FILE = '/'.join([DATA_DIR, 'thinned.hdf'])

In [4]:
# Names of the transaction files to sample from.
# `os.listdir` returns entries in arbitrary order and also lists hidden files
# and sub-directories, so sort the names (stable processing order between
# runs) and keep only regular, non-hidden files.
files = sorted(
    name for name in os.listdir(TRANSACT)
    if not name.startswith('.') and os.path.isfile(os.path.join(TRANSACT, name))
)

In [5]:
def _read_lookup(filename):
    """Load one tab-separated lookup table from the LOOKUPS directory."""
    return pd.read_csv(LOOKUPS + '/' + filename, sep='\t')


stores = _read_lookup('stores.txt')
drugs = _read_lookup('Drug_LookUp.txt')
illness = _read_lookup('ChronicIllness_LookUp.txt')
patients = _read_lookup('patients.txt')
atc = _read_lookup('ATC_LookUp.txt')

In [6]:
# Combine the chronic-illness and drug lookups on their shared product id.
# The outer join keeps products that appear in only one of the two tables.
t1 = pd.merge(illness, drugs, how='outer', on='MasterProductID')

In [7]:
def get_subset(frac=0.1, random_state=None):
    """Draw a random sample of rows from every transaction file.

    Each name in the module-level ``files`` list is read from ``TRANSACT`` as
    tab-separated text, a fraction of its rows is sampled, and the per-file
    samples are concatenated.

    Parameters
    ----------
    frac : float
        Fraction of rows to keep from each file.
    random_state : int or numpy.random.RandomState, optional
        Passed to ``DataFrame.sample``. Leave as ``None`` for a different
        sample on every call; pass a seed to make the subset reproducible.

    Returns
    -------
    pandas.DataFrame
        The concatenated samples. Row labels are the original per-file labels,
        so the index is not guaranteed to be unique.

    Raises
    ------
    ValueError
        If ``files`` is empty (there is nothing to concatenate).
    """
    samples = [
        pd.read_csv(TRANSACT + '/' + file, sep='\t')
          .sample(frac=frac, random_state=random_state)
        for file in files
    ]

    if not samples:
        # pd.concat would raise ValueError as well; say why explicitly.
        raise ValueError("No transaction files found in " + TRANSACT)

    return pd.concat(samples)

In [8]:
def expand_subset(subset):
    """Enrich sampled transactions with drug/illness and patient details.

    ``subset`` is outer-joined to the module-level ``t1`` table
    (``Drug_ID`` against ``MasterProductID``) and then to ``patients`` on
    ``Patient_ID``. Because both joins are outer joins, rows that exist on
    only one side are kept. A boolean ``target`` column is added that is True
    where ``ChronicIllness`` equals ``'Diabetes'``.

    The input frame is not modified; a new frame is returned.
    """
    with_drug_info = subset.merge(
        t1,
        how='outer',
        left_on='Drug_ID',
        right_on='MasterProductID',
        suffixes=('_illness', '_drug'),
    )
    with_patients = with_drug_info.merge(patients, how='outer', on='Patient_ID')

    with_patients['target'] = with_patients['ChronicIllness'] == 'Diabetes'
    return with_patients

In [9]:
def get_subset_cached():
    """Return the expanded transaction subset, building it only when needed.

    If ``SUBSET_FILE`` does not exist yet, a fresh subset is sampled with
    ``get_subset``, enriched with ``expand_subset`` and stored in that file
    under the key ``'/data'``. Otherwise the stored frame is loaded and a
    short notice is printed.
    """
    if not os.path.exists(SUBSET_FILE):
        expanded = expand_subset(get_subset())
        expanded.to_hdf(SUBSET_FILE, '/data')
        return expanded

    print("Using cached file")
    return pd.read_hdf(SUBSET_FILE, '/data')

In [10]:
def get_thinned_cached(subset=None):
    """Return the subset reduced to the columns of interest, with caching.

    When ``THINNED_FILE`` already exists its contents are returned and
    ``subset`` is ignored. Otherwise the selected columns are taken from
    ``subset`` (or from ``get_subset_cached()`` when ``subset`` is None),
    written to ``THINNED_FILE`` under the key ``'/data'`` and returned.
    """
    if os.path.exists(THINNED_FILE):
        return pd.read_hdf(THINNED_FILE, '/data')

    source = get_subset_cached() if subset is None else subset

    # Transaction identifiers and timing
    keep_columns = ['Patient_ID', 'Store_ID', 'Prescriber_ID', 'Drug_ID',
                    'Prescription_Week', 'Dispense_Week']
    # Drug codes and pricing
    keep_columns += ['Drug_Code', 'NHS_Code', 'PatientPrice_Amt',
                     'WholeSalePrice_Amt', 'GovernmentReclaim_Amt',
                     'StreamlinedApproval_Code', 'ChemistListPrice']
    # Patient attributes and the label
    keep_columns += ['gender', 'year_of_birth', 'postcode', 'target']

    thinned = source[keep_columns]
    thinned.to_hdf(THINNED_FILE, '/data')
    return thinned

In [11]:
thinned = get_thinned_cached()

# Model inputs, and the same columns with the label alongside them.
train_fields = ['Patient_ID', 'year_of_birth', 'postcode', 'gender']
include_fields = train_fields + ['target']

train = thinned[include_fields].copy()

# Encode gender as a boolean: True for 'M', False for everything else.
# NOTE(review): a missing gender also compares unequal to 'M' and so becomes
# False rather than staying NaN, which hides it from the later dropna().
train['gender'] = train['gender'].eq('M')

In [12]:
# Share of rows that have no missing value in any selected column.
complete_rows = train.dropna()
valid_ratio = len(complete_rows) / len(train)
print(str(valid_ratio))

# Discard incomplete rows only when more than 95% of the rows are complete;
# otherwise `train` is left untouched, missing values included.
if valid_ratio > .95:
    train = complete_rows

0.9986955865277415


In [15]:
import keras

Using TensorFlow backend.


In [28]:
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization

# Small feed-forward binary classifier for the boolean `target` column.
model = Sequential()
model.add(Dense(units=8, input_dim=4))  # 4 input features feed 8 hidden units
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dense(units=16))
# NOTE(review): softmax on a hidden layer is unusual (it forces the 16
# activations to sum to 1); kept as in the original — confirm it is intended.
model.add(Activation('softmax'))
model.add(Dense(units=1))
# Softmax over a single unit always outputs exactly 1.0, so the network could
# never predict the negative class. Sigmoid gives a probability in (0, 1).
model.add(Activation('sigmoid'))

# Binary cross-entropy is the matching loss for a sigmoid output trained on a
# 0/1 label.
model.compile(loss='binary_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

In [29]:
# Number of input features; must agree with `input_dim` of the first Dense layer.
train[train_fields].shape[1]

4

In [31]:
# Scale birth years by the latest year present, so the largest value becomes 1.
max_year = train['year_of_birth'].max()
train['year_of_birth'] = train['year_of_birth'].div(max_year)

In [None]:
# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` returns the same NumPy array and works on old and new versions.
model.fit(train[train_fields].values,  # X / inputs
          train['target'].values,      # y / targets
          epochs=5,             # number of training epochs
          batch_size=32)        # batch size (number-at-once)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [None]:
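# NOTE: stale snippet — `rf` is not defined in the cells shown here (the
# fitted model in this notebook is `model`), so this would fail if uncommented.
# import pickle
# pickle.dump(rf, open("model2.pkl", 'wb'))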

In [None]:
# base_rate = sum(train['target']) / len(train)
# print(str(base_rate))

In [None]:
# Predict on the same plain-array form of the features that was used for
# fitting. The network ends in a single-unit Dense layer, so the output is a
# column vector; flatten it to one value per row so it lines up with `truth`.
prediction = model.predict(train[train_fields].values).ravel()
truth = train['target']

In [None]:
# Row labels of the training frame, used as x coordinates when plotting.
indexes = [label for label in train.index]

In [None]:
# Plotly Plotting Bit!

# Create a trace
# Only the first 500 rows are drawn.
first_500 = slice(None, 500)

# Actual labels
trace_true = Scatter(x=indexes[first_500], y=truth[first_500], mode='markers')

# Model output for the same rows
trace_pred = Scatter(x=indexes[first_500], y=prediction[first_500], mode='markers')

data = [trace_true, trace_pred]

# Render both traces inside the notebook.
iplot(data)

In [None]:
# Per-row error (truth minus prediction). The boolean labels are cast to float
# and the prediction is flattened first: subtracting an (n, 1) array from a
# length-n Series does not line up element by element.
deltas = truth.astype(float) - numpy.ravel(prediction)

In [None]:
import matplotlib.pyplot as plt

# Distribution of the prediction errors in 10 bins.
# `normed` has been removed from matplotlib; `density=True` is its replacement.
fig, ax = plt.subplots()
ax.hist(deltas, 10, density=True)
ax.set(xlabel='truth - prediction', ylabel='density')
plt.show()

In [None]:
# Most negative error: deltas is truth - prediction, so this is the largest
# over-prediction.
deltas.min()