# Using TensorFlow to Predict Drug Usage

### 1. DNNLinearCombinedClassifier

In [None]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import constants

from tensorflow.contrib.learn.python.learn.datasets import base

# Less verbose output: set the TF C++ log-level env var and restrict the
# Python-side TensorFlow logger to errors only.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.logging.set_verbosity(tf.logging.ERROR)

# Every CSV used by this notebook lives under DATA_DIR.
DATA_DIR = "Dataset/"

# Inputs for train() / evaluate() / predict() respectively.
DRUG_TRAINING = os.path.join(DATA_DIR, "re_drug_consumption_data.csv")
DRUG_TEST = os.path.join(DATA_DIR, "test_drug_consumption_data.csv")
DRUG_PREDICT = os.path.join(DATA_DIR, "predict_drug_consumption_data.csv")

# File that predict() writes the predicted class ids to.
PREDICT_OUTPUT = os.path.join(DATA_DIR, "predictions.csv")

def _numeric_and_buckets(key, boundaries):
    """Return the numeric column for `key` plus its bucketized counterpart."""
    numeric = tf.feature_column.numeric_column(key)
    bucketized = tf.feature_column.bucketized_column(
        numeric, boundaries=boundaries)
    return numeric, bucketized


def _crossed(*keys):
    """Cross the named features; every cross here uses 500 hash buckets."""
    return tf.feature_column.crossed_column(list(keys), hash_bucket_size=500)


# Demographic features are bucketized in case their relationship to usage is
# nonlinear; the boundaries come from the project's `constants` module.
age, age_buckets = _numeric_and_buckets("age", constants.AGE_BOUNDARIES)
gender, gender_buckets = _numeric_and_buckets(
    "gender", constants.GENDER_BOUNDARIES)
country, country_buckets = _numeric_and_buckets(
    "country", constants.COUNTRY_BOUNDARIES)
ethnicity, ethnicity_buckets = _numeric_and_buckets(
    "ethnicity", constants.ETHNICITY_BOUNDARIES)
education, education_buckets = _numeric_and_buckets(
    "education", constants.EDUCATION_BOUNDARIES)

# Personality scores are left as plain numeric columns (they could be
# bucketized, but are guessed to be close to linear).
nscore, escore, oscore, ascore, cscore, impulsive, ss = (
    tf.feature_column.numeric_column(key)
    for key in ("nscore", "escore", "oscore", "ascore", "cscore",
                "impulsive", "ss"))

# Crosses of the neuroticism / agreeableness / conscientiousness scores.
# NOTE(review): these cross the raw features by name; the crossed_column docs
# describe string keys as referring to string-typed features, so confirm the
# score columns behave as intended here (bucketizing them first may be needed).
nscore_ascore = _crossed("nscore", "ascore")
nscore_cscore = _crossed("nscore", "cscore")
ascore_cscore = _crossed("ascore", "cscore")
nscore_ascore_cscore = _crossed("nscore", "ascore", "cscore")

In [None]:
def input_fn(data_file, target, num_epochs, batch_size=30, shuffle=False, num_threads=1):
    """Build a pandas-backed Estimator input function for one drug target.

    Reads the feature columns listed in `constants.FEATURE_COLUMNS` plus the
    `target` column from `data_file`, drops incomplete rows, and converts the
    target into a binary label (1 if the raw value is in `constants.USER`,
    otherwise 0).

    Args:
        data_file: Path of the CSV file to read (opened through `tf.gfile`).
        target: Name of the CSV column holding the usage category to predict.
        num_epochs: Forwarded to `pandas_input_fn` (the training cells pass
            None and bound training with `steps` instead).
        batch_size: Number of rows per batch.
        shuffle: Whether the input queue shuffles rows.
        num_threads: Number of reader threads forwarded to `pandas_input_fn`.
            Keep the default of 1 for evaluation/prediction so row order is
            repeatable.

    Returns:
        The callable produced by `tf.estimator.inputs.pandas_input_fn`.
    """
    # Close the file handle as soon as the CSV has been parsed.
    with tf.gfile.Open(data_file) as csv_file:
        dataset = pd.read_csv(
            csv_file,
            header=0,
            usecols=constants.FEATURE_COLUMNS + [target],
            skipinitialspace=True,
            engine="python")

    # Drop NaN entries. dropna() returns a new frame rather than modifying in
    # place, so the result has to be kept for the rows to actually go away.
    dataset = dataset.dropna(how="any", axis=0)

    # This assigns a different number to each usage category
    # labels[constants.TARGET] = dataset[constants.TARGET].apply(lambda x: constants.MAPPED_CODES[x]).astype(int)

    # This classifies usage as binary (USER/NON-USER) to make prediction easier.
    # The labels keep the (filtered) index of `dataset`, so features and labels
    # stay aligned row-for-row.
    labels = dataset[target].apply(
        lambda x: x in constants.USER).astype(int).to_frame(target)

    return tf.estimator.inputs.pandas_input_fn(
        x=dataset,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads)

In [None]:
# --- Alcohol model -----------------------------------------------------------
# Only the bucketized demographic columns are fed to the network for this
# target; the personality columns are deliberately left out (kept below as
# comments so they are easy to re-enable).
base_columns = [
    age_buckets, gender_buckets, country_buckets, ethnicity_buckets,
    education_buckets,
    # For alcohol, purposefully removing personality features to reduce noise
    # nscore, escore,
    # oscore, ascore, cscore, impulsive,
    # ss
]

# Empty for this target, so the linear half of the combined model receives no
# feature columns in this cell.
crossed_columns = [
    # See comment above
    # nscore_ascore, nscore_cscore, ascore_cscore,
    # nscore_ascore_cscore
]

# Combined list; not referenced by the classifier constructed below.
feature_columns = base_columns + crossed_columns

# Binary classifier with a single 36-unit hidden layer; checkpoints are written
# under constants.MODEL_DIR.
classifier = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=constants.MODEL_DIR,
    n_classes=2,
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=base_columns,
    dnn_hidden_units=[36],
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.002,
        l2_regularization_strength=0.01))

# num_epochs=None puts no epoch limit on the input, so the length of training
# is set by `steps`.
classifier.train(input_fn=input_fn(DRUG_TRAINING, target="alcohol", num_epochs=None, shuffle=True),
    steps=60000)

# One unshuffled pass over the held-out test file.
results = classifier.evaluate(input_fn=input_fn(DRUG_TEST, target="alcohol", num_epochs=1,
    shuffle=False), steps=None)

print("Accuracy: %s" % results['accuracy'])

def predict(classifier, target):
    """Run `classifier` on the prediction CSV and write the class ids to disk.

    Predictions are generated from DRUG_PREDICT (one unshuffled epoch) and
    written to PREDICT_OUTPUT, overwriting any previous contents. The file
    starts with a placeholder header line, followed by one line per example
    containing that example's `class_ids` joined by commas.

    Args:
        classifier: An estimator whose `predict()` yields dicts containing a
            `class_ids` entry.
        target: Name of the target column, forwarded to `input_fn`.
    """
    predictions = classifier.predict(input_fn=input_fn(DRUG_PREDICT, target=target, num_epochs=1,
        shuffle=False))
    # The context manager guarantees the file is closed even if prediction
    # raises part-way through.
    with open(PREDICT_OUTPUT, "w") as predict_writer:
        # Placeholder header row; presumably the downstream comparison script
        # skips the first line -- verify against compare_results.
        predict_writer.write("Fake header\n")
        # Stream predictions straight to disk instead of materializing a list.
        for prediction in predictions:
            class_ids = ','.join(str(class_id) for class_id in prediction['class_ids'])
            predict_writer.write(class_ids + '\n')

# Write the alcohol model's predictions for the prediction CSV to PREDICT_OUTPUT.
predict(classifier, "alcohol")

Well, it's nowhere near the accuracy I saw with MNIST, but I think it's respectable.  I also separated out 200 cases on which I used `classifier.predict()` instead of `evaluate()`.  The cell below calculates accuracy, sensitivity, and specificity on this sample of 200 using a separate [python script](https://github.com/emdoyle/tensorflow_stuff/tree/master/RealLearning/compare_results.py).

In [None]:
# compare_results is a separate project script (see the link in the text above).
from compare_results import compare_results
# Report the prediction metrics for the alcohol target.
compare_results(["alcohol"])

It's worth noting (if only for a chance at interpretation) that the specificity in this sample is far below the sensitivity. This means that it was much more difficult for the model to predict that someone _didn't_ drink alcohol than that they did.  I suspect that this is because the model has learned a strong positive bias, since guessing positively is usually correct.  Later it may be possible to correct this, but for now I will move on to other usages, since the optimal hyperparameters are likely to be different for different targets.

In [None]:
# --- Cannabis model ----------------------------------------------------------
# Bucketized demographic columns go to the DNN half of the model.
base_columns = [
    age_buckets, gender_buckets, country_buckets, ethnicity_buckets,
    education_buckets,
]

# Although the individual personality traits aren't crossed columns,
# I think they will do better as input to the linear part of the model
# (this list is what gets passed as linear_feature_columns below).
crossed_columns = [
    nscore_ascore_cscore, nscore, escore, oscore, ascore, cscore, impulsive,
    ss
]

# Combined list; not referenced by the classifier constructed below.
feature_columns = base_columns + crossed_columns

# Four hidden layers, with checkpoints kept in a cannabis-specific directory so
# they stay separate from the alcohol model's.
classifier = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=constants.MODEL_DIR + "_cannabis",
    n_classes=2,
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=base_columns,
    dnn_hidden_units=[144, 72, 36, 18],
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.002,
        l2_regularization_strength=0.005))

# num_epochs=None puts no epoch limit on the input, so the length of training
# is set by `steps`.
classifier.train(input_fn=input_fn(DRUG_TRAINING, target="cannabis", num_epochs=None, shuffle=True),
    steps=60000)

# One unshuffled pass over the held-out test file.
results = classifier.evaluate(input_fn=input_fn(DRUG_TEST, target="cannabis", num_epochs=1,
    shuffle=False), steps=None)

print("Accuracy: %s" % results['accuracy'])

In [None]:
# Overwrite PREDICT_OUTPUT with the cannabis predictions, then report metrics
# for that target (compare_results was imported in an earlier cell).
predict(classifier, "cannabis")
compare_results(["cannabis"])

I'm very surprised at the disparity between the `evaluate()` reported accuracy and the `predict()` reported accuracy.  It is possible that, since the 200 samples are a small portion (~10%) of the total, they contain a significant number of outliers in terms of cannabis usage, or perhaps the model is overfitted to the data it saw.  If the problem is overfitting, I believe the solution is a higher regularization strength; if it is poor data, I will need to tweak my `constants` file to increase the portion of the data used for prediction.

In [None]:
# --- Cannabis model, stronger regularization ---------------------------------
# Same feature columns as the previous cannabis cell, but with the first hidden
# layer removed and much larger L1/L2 regularization strengths.
#
# This model gets its own model_dir. An Estimator resumes from the latest
# checkpoint it finds in model_dir, and constants.MODEL_DIR + "_cannabis"
# already holds checkpoints of the [144, 72, 36, 18] network trained above.
# Reusing that directory would try to restore those weights into this
# differently-shaped network instead of training it from scratch.
classifier = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=constants.MODEL_DIR + "_cannabis_regularized",
    n_classes=2,
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=base_columns,
    dnn_hidden_units=[72, 36, 18],
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.05,
        l2_regularization_strength=0.15))

# num_epochs=None puts no epoch limit on the input, so the length of training
# is set by `steps`.
classifier.train(input_fn=input_fn(DRUG_TRAINING, target="cannabis", num_epochs=None, shuffle=True),
    steps=60000)

# One unshuffled pass over the held-out test file.
results = classifier.evaluate(input_fn=input_fn(DRUG_TEST, target="cannabis", num_epochs=1,
    shuffle=False), steps=None)

print("Accuracy: %s" % results['accuracy'])

In [None]:
# Overwrite PREDICT_OUTPUT with the regularized cannabis model's predictions,
# then report metrics for that target.
predict(classifier, "cannabis")
compare_results(["cannabis"])

The regularization _does_ seem to have helped with the prediction metrics without much of an impact on evaluation accuracy.  Also I did remove the first hidden layer since I felt four hidden layers might be too much for a relatively simple binary classification.

### 2. Homemade Estimator

While the `DNNLinearCombinedClassifier` certainly performed well, to really have full control over the training model I will need to build an Estimator using `tf.estimator`.  Since I won't be using a pre-built classifier, I will need to create my own model function, which involves defining the layers of the network, loss, the optimizer, the learning rate, and whatever other parameters I choose to modify.  Below is the skeleton of a model function, taken from [here](https://www.tensorflow.org/extend/estimators).

In [None]:
# def model_fn(features, labels, mode, params):
    # Logic to do the following:
    # 1. Configure the model via TensorFlow operations
    # 2. Define the loss function for training/evaluation
    # 3. Define the training operation/optimizer
    # 4. Generate predictions
    # 5. Return predictions/loss/train_op/eval_metric_ops in EstimatorSpec object
#   return EstimatorSpec(mode, predictions, loss, train_op, eval_metric_ops)

In [None]:
def input_fn(data_file, target, num_epochs, batch_size=30, shuffle=False, num_threads=1):
    """Build a numpy-backed Estimator input function for one drug target.

    Reads the feature columns listed in `constants.FEATURE_COLUMNS` plus the
    `target` column from `data_file`, drops incomplete rows, and converts the
    target into a binary label (1 if the raw value is in `constants.USER`,
    otherwise 0). All remaining columns are packed into a single 2-D array
    exposed to the model under the feature key "x".

    Args:
        data_file: Path of the CSV file to read (opened through `tf.gfile`).
        target: Name of the CSV column holding the usage category to predict.
        num_epochs: Forwarded to `numpy_input_fn` (the training cell passes
            None and bounds training with `steps` instead).
        batch_size: Number of rows per batch.
        shuffle: Whether the input queue shuffles rows.
        num_threads: Number of reader threads forwarded to `numpy_input_fn`.
            Keep the default of 1 for evaluation/prediction so row order is
            repeatable.

    Returns:
        The callable produced by `tf.estimator.inputs.numpy_input_fn`.
    """
    # Close the file handle as soon as the CSV has been parsed.
    with tf.gfile.Open(data_file) as csv_file:
        dataset = pd.read_csv(
            csv_file,
            header=0,
            usecols=constants.FEATURE_COLUMNS + [target],
            skipinitialspace=True,
            engine="python")

    # Drop NaN entries. dropna() returns a new frame rather than modifying in
    # place; without keeping the result, NaNs would flow straight into the
    # feature matrix handed to the network.
    dataset = dataset.dropna(how="any", axis=0)

    # This assigns a different number to each usage category
    # labels[constants.TARGET] = dataset[constants.TARGET].apply(lambda x: constants.MAPPED_CODES[x]).astype(int)

    # This classifies usage as binary (USER/NON-USER) to make prediction easier.
    # pop() also removes the target column, so only features remain in `dataset`.
    labels = dataset.pop(target).apply(lambda x: x in constants.USER).astype(int)

    return tf.estimator.inputs.numpy_input_fn(
        x={"x": np.array(dataset)},
        y=np.array(labels),
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads)

In [None]:
def model_fn(features, labels, mode, params):
    """Model function for the hand-built Estimator.

    The network is a stack of sigmoid dense layers followed by a single linear
    output unit. It is trained with mean squared error against the labels, and
    the reported accuracy treats an output of at least 0.5 as class 1.

    Args:
        features: Dict of input tensors; only `features["x"]` is read.
        labels: Label tensor (None in PREDICT mode).
        mode: One of the `tf.estimator.ModeKeys` values.
        params: Dict providing `hidden_layers` (iterable of layer widths),
            `start_learn` and `end_learn` (initial and final learning rates of
            the decay schedule).

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode.
    """
    # 1. Configure the model via TensorFlow operations
    net = tf.cast(features["x"], tf.float32)
    for width in params["hidden_layers"]:
        net = tf.layers.dense(net, width, activation=tf.nn.sigmoid)
    raw_output = tf.layers.dense(net, 1)

    # 4. Generate predictions: flatten the single-unit output to one value per example
    predictions = tf.reshape(raw_output, [-1])

    # Prediction needs neither loss nor optimizer, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"usage": predictions})

    # 2. Define the loss function for training/evaluation
    loss = tf.losses.mean_squared_error(labels, predictions)

    # Hard 0/1 decisions for the accuracy metric: outputs below 0.5 map to 0,
    # everything else to 1.
    below_half = tf.less(predictions, tf.constant(0.5, tf.float32))
    prediction_shape = tf.shape(predictions)
    thresh_predictions = tf.where(
        below_half, tf.zeros(prediction_shape), tf.ones(prediction_shape))

    labels_f64 = tf.cast(labels, tf.float64)
    eval_metric_ops = {
        "rmse": tf.metrics.root_mean_squared_error(
            labels_f64, tf.cast(predictions, tf.float64)),
        "accuracy": tf.metrics.accuracy(
            labels_f64, tf.cast(thresh_predictions, tf.float64)),
    }

    # 3. Define the training operation/optimizer: plain gradient descent whose
    # learning rate decays polynomially (power 0.5) from start_learn to
    # end_learn over 50000 steps.
    global_step = tf.train.get_global_step()
    learning_rate = tf.train.polynomial_decay(
        learning_rate=params["start_learn"],
        global_step=global_step,
        decay_steps=50000,
        end_learning_rate=params["end_learn"],
        power=0.5)
    train_op = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(loss=loss, global_step=global_step)

    # 5. Return predictions/loss/train_op/eval_metric_ops in EstimatorSpec object
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops)

In [None]:
# --- Hand-built Estimator setup ----------------------------------------------
# Bucketized demographic columns, as in the earlier cells.
base_columns = [
    age_buckets, gender_buckets, country_buckets, ethnicity_buckets,
    education_buckets,
]

# The name `crossed_columns` is carried over from the earlier cells; here the
# list holds only the plain numeric personality columns. The crossed column is
# left out (an embedding of it was considered):
# tf.feature_column.embedding_column(nscore_ascore_cscore, 1), 
crossed_columns = [
    nscore, escore, oscore, ascore, cscore, impulsive,
    ss
]

feature_columns = base_columns + crossed_columns

# Hyperparameters handed to model_fn through `params`.
# NOTE(review): model_fn as defined in this file reads only "hidden_layers",
# "start_learn" and "end_learn"; "feature_columns" is passed along but never
# used, so the bucketization above has no effect on this model.
model_params = {
    "feature_columns": feature_columns,
    "hidden_layers": [30, 10],
    "start_learn": 0.1,
    "end_learn": 0.01
}

# No model_dir is given for this estimator.
nn = tf.estimator.Estimator(model_fn, params=model_params)

In [None]:
# Train on the cannabis target one example at a time (batch_size=1);
# num_epochs=None puts no epoch limit on the input, so `steps` sets the length
# of training.
nn.train(input_fn=input_fn(DRUG_TRAINING, target="cannabis", batch_size=1, num_epochs=None, shuffle=True),
    steps=60000)

# One unshuffled pass over the held-out test file.
results = nn.evaluate(input_fn=input_fn(DRUG_TEST, target="cannabis", batch_size=1, num_epochs=1,
    shuffle=False), steps=None)

# `results` carries the metrics defined in model_fn ("rmse", "accuracy").
print(results)
print("Accuracy: %s" % results['accuracy'])

In [None]:
# Predict on the prediction CSV using input_fn's default batch size. The target
# column must still be present in the file because input_fn always reads it.
predictions = nn.predict(input_fn=input_fn(DRUG_PREDICT, target="cannabis", num_epochs=1,
    shuffle=False))
# In PREDICT mode model_fn returns its output under the "usage" key; the values
# are the raw network outputs, not thresholded class ids.
print(list(predictions))