# Disease Analysis
Analyze data from the National Notifiable Diseases Surveillance System on [deaths from pneumonia and influenza](https://www.healthdata.gov/dataset/nndss-table-ii-invasive-pneumococcal-diseases-all-ages) in 122 cities.

In [13]:
import numpy as np
from scipy import stats
import cntk as C

In [14]:
# Read the whole CSV as text (dtype=str) so blank cells can later be
# detected as empty strings instead of being coerced to numbers.
dataPath = "source.csv"
data = np.genfromtxt(dataPath, dtype=str, delimiter=",")

In [62]:
# Column 6 is read as the death count; row 0 is skipped
# (presumably the CSV header -- confirm against source.csv).
deaths = data[1:, 6]
print(deaths)

# Positions of the rows whose death count is not blank.
non_blanks = [row for row, count in enumerate(deaths) if count != ""]

deaths = deaths[non_blanks].astype(np.float32)

# Column 0 (presumably the year), standardised with a z-score over the
# rows that were kept.
raw_years = data[1:, 0][non_blanks].astype(np.float32)
years = stats.zscore(raw_years)

# Column 1 is treated as the week number: weeks below 7 or above 47 are
# flagged as winter (1.0), everything else as 0.0.
weeks = data[1:, 1].astype(int)
winter = ((weeks < 7) | (weeks > 47)).astype(np.float32)[non_blanks]

['11' '11' '5' ..., '2' '12' '11']


In [63]:
def shuffle_arrays(*arrays):
    """Shuffle several equal-length arrays in place with one shared permutation.

    The global NumPy RNG state is captured once and restored before each
    shuffle, so every array sees the same random draws and element ``i``
    of each array still belongs to the same sample afterwards.

    Args:
        *arrays: Mutable sequences accepted by ``np.random.shuffle``.
            All of them must have the same length.

    Raises:
        ValueError: If the arrays do not all have the same length. The
            check runs before anything is shuffled, so the inputs are
            left untouched in that case.
    """
    # Replaying the same RNG state only yields the same permutation when
    # the lengths agree; otherwise the arrays would silently go out of sync.
    lengths = {len(array) for array in arrays}
    if len(lengths) > 1:
        raise ValueError(
            "shuffle_arrays requires arrays of equal length, got lengths "
            + ", ".join(str(len(array)) for array in arrays)
        )

    rand_state = np.random.get_state()
    for array in arrays:
        np.random.set_state(rand_state)
        np.random.shuffle(array)

# Shuffle the two feature arrays and the target in place, in unison, so
# that each index still refers to the same sample in all three.
shuffle_arrays(years, winter, deaths)

In [64]:
# Two thirds of the samples (rounded down) form the training set.
num_samples = len(years)
training_size = (2 * num_samples) // 3


In [65]:
# Two inputs per sample: the standardised year and the winter flag
# (the two columns stacked into the feature matrix).
num_features = 2
num_classes = 1 # Regression: a single output; reused as the layer's output width and the label shape.
# Network input placeholder carrying num_features float32 values per sample.
input_var = C.input_variable(num_features, np.float32)

In [66]:
# Module-level dict; nothing in the visible cells reads or writes it.
parm_dict = {}


def linear_layer(input_var, num_output_classes):
    """Build the affine map ``input_var @ W + b`` as a CNTK expression.

    Args:
        input_var: CNTK variable whose first shape entry is the number
            of input features.
        num_output_classes: Width of the layer output.

    Returns:
        The CNTK expression ``C.times(input_var, W) + b``, where ``W`` has
        shape ``(input_var.shape[0], num_output_classes)`` and ``b`` has
        shape ``num_output_classes``.
    """
    num_inputs = input_var.shape[0]
    w = C.parameter(shape=(num_inputs, num_output_classes))
    b = C.parameter(shape=(num_output_classes))
    return C.times(input_var, w) + b

In [67]:
# Model output: one linear layer mapping the input features to num_classes values.
z = linear_layer(input_var, num_classes)
# Placeholder for the target values. Note that (num_classes) is just a
# parenthesised int, not a one-element tuple.
label = C.input_variable((num_classes), np.float32)
# Training criterion: squared error between the prediction and the label.
loss = C.squared_error(z, label)
# No separate evaluation metric is defined; the disabled line below would
# simply duplicate the loss.
#eval_error = C.squared_error(z, label)

In [68]:
# Optimisation hyper-parameters.
learning_rate = 0.5
minibatch_size = 25
# Floor division: a trailing partial minibatch is never trained on.
num_training_minibatches = training_size // minibatch_size

# Learning-rate schedule built with UnitType.minibatch
# (presumably the rate applies per minibatch rather than per sample -- confirm in the CNTK docs).
lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
# Plain SGD over every parameter of the model z.
learner = C.sgd(z.parameters, lr_schedule)

# (loss) is a parenthesised expression, not a tuple: only the loss
# function is handed to the trainer, with no evaluation metric.
trainer = C.Trainer(z, (loss), [learner])

In [69]:
# One row per sample, columns = (standardised year, winter flag).
features = np.column_stack([years, winter])

# The first training_size samples are used for training; the labels are
# reshaped into a column so each row holds the single target for its sample.
training_features = features[:training_size]
training_labels = deaths[:training_size].reshape(training_size, 1)


In [70]:
def moving_average(array):
    """Return the cumulative (running) mean of a 1-D sequence.

    Element ``i`` of the result is the mean of ``array[0..i]``.

    The previous implementation iterated over ``len(array)`` (an int),
    which raises ``TypeError``, and divided the whole cumulative sum by
    each ``i + 1`` instead of pairing each partial sum with its own count.

    Args:
        array: Sequence or NumPy array of numbers. Multi-dimensional
            input is flattened, as ``np.cumsum`` does by default.

    Returns:
        list[float]: Running means, same length as the flattened input.
        An empty input yields an empty list.
    """
    running_sum = np.cumsum(np.asarray(array))
    # counts[i] is the number of elements contributing to running_sum[i].
    counts = np.arange(1, running_sum.size + 1)
    return (running_sum / counts).tolist()

def print_progress(trainer, mb_num, frequency):
    """Report the trainer's latest minibatch loss every ``frequency`` batches.

    Args:
        trainer: Object exposing ``previous_minibatch_loss_average``.
        mb_num: Index of the minibatch that was just trained.
        frequency: Report only when ``mb_num`` is a multiple of this value.

    Returns:
        tuple: ``(mb_num, loss)`` where ``loss`` is the trainer's average
        loss for the previous minibatch on reporting batches, and the
        string ``"NA"`` on all other batches.
    """
    # Non-reporting batches: leave the trainer alone and return the sentinel.
    if mb_num % frequency != 0:
        return mb_num, "NA"

    average_loss = trainer.previous_minibatch_loss_average
    print("Minibatch {0}: Loss = {1:4f}".format(mb_num, average_loss))
    return mb_num, average_loss

In [71]:
from collections import defaultdict

# Print and record the loss once every this many minibatches.
progress_output_freq = 500
# Collected (batch number, loss) pairs from the reporting batches.
plotdata = defaultdict(list)

# Single pass over the training set in consecutive, non-overlapping
# minibatches of minibatch_size samples.
for i in range(num_training_minibatches):
    start_index = i * minibatch_size
    end_index = start_index + minibatch_size
    feature_batch = training_features[start_index:end_index]
    label_batch = training_labels[start_index:end_index]
    trainer.train_minibatch({input_var: feature_batch, label: label_batch})

    # The reported value gets its own name: binding it to `loss` would
    # overwrite the module-level CNTK squared-error function of that name
    # with a float (or the "NA" sentinel) after the first iteration.
    mb_num, mb_loss = print_progress(trainer, i, progress_output_freq)
    if mb_loss != "NA":
        plotdata["Batch Number"].append(mb_num)
        plotdata["Loss"].append(mb_loss)


Minibatch 0: Loss = 197.080000
Minibatch 500: Loss = 57.642559
Minibatch 1000: Loss = 73.261353
Minibatch 1500: Loss = 17.741641
Minibatch 2000: Loss = 64.167891
Minibatch 2500: Loss = 78.782280
Minibatch 3000: Loss = 64.366938
Minibatch 3500: Loss = 18.921396
Minibatch 4000: Loss = 18.993368
Minibatch 4500: Loss = 44.087241
Minibatch 5000: Loss = 14.683348
Minibatch 5500: Loss = 94.861162
Minibatch 6000: Loss = 28.850076
Minibatch 6500: Loss = 53.171563
Minibatch 7000: Loss = 59.159995
Minibatch 7500: Loss = 236.844531
Minibatch 8000: Loss = 31.763760
Minibatch 8500: Loss = 109.542910
Minibatch 9000: Loss = 70.104346
