# Using TensorFlow to Predict Drug Usage

### 1. DNNLinearCombinedClassifier

In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import constants

from tensorflow.contrib.learn.python.learn.datasets import base

# Less Verbose Output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.logging.set_verbosity(tf.logging.ERROR)

DATA_DIR = "Dataset/"
DRUG_TRAINING = DATA_DIR + "re_drug_consumption_data.csv"
DRUG_TEST = DATA_DIR + "test_drug_consumption_data.csv"
DRUG_PREDICT = DATA_DIR + "predict_drug_consumption_data.csv"

PREDICT_OUTPUT = DATA_DIR + "predictions.csv"

# Bucketization for possible nonlinear relationship
age = tf.feature_column.numeric_column("age")
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=constants.AGE_BOUNDARIES)
gender = tf.feature_column.numeric_column("gender")
gender_buckets = tf.feature_column.bucketized_column(
    gender, boundaries=constants.GENDER_BOUNDARIES)
country = tf.feature_column.numeric_column("country")
country_buckets = tf.feature_column.bucketized_column(
    country, boundaries=constants.COUNTRY_BOUNDARIES)
ethnicity = tf.feature_column.numeric_column("ethnicity")
ethnicity_buckets = tf.feature_column.bucketized_column(
    ethnicity, boundaries=constants.ETHNICITY_BOUNDARIES)
education = tf.feature_column.numeric_column("education")
education_buckets = tf.feature_column.bucketized_column(
    education, boundaries=constants.EDUCATION_BOUNDARIES)

# Could bucketize but guessing these are close to linear
nscore = tf.feature_column.numeric_column("nscore")
escore = tf.feature_column.numeric_column("escore")
oscore = tf.feature_column.numeric_column("oscore")
ascore = tf.feature_column.numeric_column("ascore")
cscore = tf.feature_column.numeric_column("cscore")
impulsive = tf.feature_column.numeric_column("impulsive")
ss = tf.feature_column.numeric_column("ss")

nscore_ascore = tf.feature_column.crossed_column(
    ["nscore", "ascore"], hash_bucket_size=500)
nscore_cscore = tf.feature_column.crossed_column(
    ["nscore", "cscore"], hash_bucket_size=500)
ascore_cscore = tf.feature_column.crossed_column(
    ["ascore", "cscore"], hash_bucket_size=500)
nscore_ascore_cscore = tf.feature_column.crossed_column(
    ["nscore", "ascore", "cscore"], hash_bucket_size=500)

base_columns = [
    age_buckets, gender_buckets, country_buckets, ethnicity_buckets,
    education_buckets, 
    # nscore, escore, oscore, ascore, cscore, impulsive,
    # ss
]

crossed_columns = [
    
    # nscore_ascore, nscore_cscore, ascore_cscore,
    # nscore_ascore_cscore
]

feature_columns = base_columns + crossed_columns

classifier = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=constants.MODEL_DIR,
    n_classes=2,
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=base_columns,
    dnn_hidden_units=[100, 50, 10],
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.002,
        l2_regularization_strength=0.005))

def input_fn(data_file, num_epochs, shuffle):
    dataset = pd.read_csv(
        tf.gfile.Open(data_file),
        header=0,
        usecols=constants.FEATURE_COLUMNS + constants.TARGET,
        skipinitialspace=True,
        engine="python")
    # Drop NaN entries
    dataset.dropna(how="any", axis=0)

    # Init empty dataframe, add column for each of targets
    labels = pd.DataFrame(columns=constants.TARGET)
    
    # This assigns a different number to each usage category
    # labels[constants.TARGET] = dataset[constants.TARGET].apply(lambda x: constants.MAPPED_CODES[x]).astype(int)

    # This classifies usage as binary (USER/NON-USER) to make prediction easier
    labels["alcohol"] = dataset["alcohol"].apply(lambda x: x in constants.USER).astype(int)

    return tf.estimator.inputs.pandas_input_fn(
        x=dataset,
        y=labels,
        batch_size=100,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=1)

classifier.train(input_fn=input_fn(DRUG_TRAINING, num_epochs=None, shuffle=True),
    steps=60000)

results = classifier.evaluate(input_fn=input_fn(DRUG_TEST, num_epochs=1,
    shuffle=False), steps=None)

# Only printing accuracy for now
# for key in sorted(results):
#   print("%s: %s" % (key, results[key]))
print("Accuracy: %s" % results['accuracy'])

predictions = classifier.predict(input_fn=input_fn(DRUG_PREDICT, num_epochs=1,
    shuffle=False))
predict_writer = open(PREDICT_OUTPUT, "w")
predict_writer.write("Fake header\n")
for prediction in list(predictions):
    curr_line = ""
    for class_id in prediction['class_ids']:
        curr_line += (str(class_id) + ',')
    predict_writer.write(curr_line[:-1] + '\n')

predict_writer.close()

Accuracy: 0.81


In [3]:
print(results)
print("Specificity: %s" % results['specificity'])
print("Sensitivity: %s" % results['sensitivity'])

{'accuracy': 0.81, 'accuracy_baseline': 0.82666665, 'auc': 0.47937346, 'auc_precision_recall': 0.83383834, 'average_loss': 0.81419468, 'label/mean': 0.82666665, 'loss': 81.419472, 'prediction/mean': 0.84004128, 'global_step': 60001}


KeyError: 'specificity'