In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    return dataset.shuffle(1000).repeat().batch(batch_size)


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset

def merge_data(data_1, data_2):
    data = []
    for i in range(min(len(data_1), len(data_2))):
        data.append(data_1[i])
        data.append(data_2[i])

    if len(data_1)>len(data_2):
        data.append(data_1[i+1:])
    else:
        data+=data_2[i+1:]
    return data

In [3]:
with np.load('new_positive_original.npz') as fp:
    HAA_train_positive = fp['HAA']
    HJC_train_positive = fp['HJC']
    HRA_train_positive = fp['HRA']
    SD_train_positive = fp['SD']

In [4]:
with np.load('new_negative_original.npz') as fn:
    HAA_train_negative = fn['HAA']
    HJC_train_negative = fn['HJC']
    HRA_train_negative = fn['HRA']
    SD_train_negative = fn['SD']

In [5]:
print('the size of positive set: ', len(HAA_train_positive))
print('the size of negative set: ', len(HAA_train_negative))

the size of positive set:  200000
the size of negative set:  200000


In [6]:
feature_HAA = merge_data(list(HAA_train_positive)[0:140000], list(HAA_train_negative)[0:140000])
feature_HJC = merge_data(list(HJC_train_positive)[0:140000], list(HJC_train_negative)[0:140000])
feature_HRA = merge_data(list(HRA_train_positive)[0:140000], list(HJC_train_negative)[0:140000])
feature_SD = merge_data(list(SD_train_positive)[0:140000], list(SD_train_negative)[0:140000])

In [7]:
print(len(feature_HAA), len(feature_HJC), len(feature_HRA), len(feature_SD))

280000 280000 280000 280000


In [9]:
train_features = {
                  'HAA': np.array(feature_HAA),
                  'HJC': np.array(feature_HJC),
                  'HRA': np.array(feature_HRA),
                  'SD': np.array(feature_SD)
                  }

print("the size of training set",len(feature_HAA))

train_labels = merge_data([1 for x in range(len(HAA_train_positive[0:140000]))],
                          [0 for x in range(len(HAA_train_negative[0:140000]))])
print('the size of the label set', len(train_labels))

the size of training set 280000
the size of the label set 280000


In [10]:
with np.load('new_test_original.npz') as tft:
    HAA_test = tft['HAA']
    HJC_test = tft['HJC']
    HRA_test = tft['HRA']
    SD_test = tft['SD']

test_features = {
                 'HAA': np.array(HAA_test),
                 'HJC': np.array(HJC_test),
                 'HRA': np.array(HRA_test),
                 'SD': np.array(SD_test)
                 }
print('the size of test set', len(HAA_test))

the size of test set 2000


In [17]:
# load the eval data
eval_features = {
                 'HAA': np.array(feature_HAA[140000:200000]),
                 'HJC': np.array(feature_HJC[140000:200000]),
                 'HRA': np.array(feature_HRA[140000:200000]),
                 'SD': np.array(feature_SD[140000:200000])
                 }
eval_labels = merge_data([1 for x in range(30000)],
                         [0 for x in range(30000)])

print('@@@@@@')
print(np.var(feature_HAA))
print(np.var(feature_HJC))
print(np.var(feature_HRA))
print(np.var(feature_SD))


# Define the feature column (describe how to use the features)
HAA = tf.feature_column.numeric_column('HAA')
HJC = tf.feature_column.numeric_column('HJC')
HRA = tf.feature_column.numeric_column('HRA')
SD = tf.feature_column.numeric_column('SD')

# Instantiate an estimator(2 hidden layer DNN with 10, 10 units respectively), passing the feature columns.
estimator = tf.estimator.DNNClassifier(
    feature_columns=[HAA],
    # Two hidden layers of 10 nodes each.
    hidden_units=[16,4],
    # The model must choose between 3 classes.
    n_classes=2
)

# Train the Model.
estimator.train(
    input_fn=lambda: train_input_fn(train_features, train_labels, 128),
    steps=1000)

eval_result = estimator.evaluate(
    input_fn=lambda: eval_input_fn(eval_features, eval_labels, 128))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

# Making Predictions
predictions = estimator.predict(input_fn=lambda:eval_input_fn(test_features, None, 128))
i = 0
id_list = []
prediction_list = []
#print('id ', 'probability')
for p in predictions:
    i += 1
    class_id = p['class_ids'][0]
    probability = p['probabilities'][1]
    id_list.append(i)
    prediction_list.append(probability)
    #print(i, " " , probability)

dataframe = pd.DataFrame({'Id':id_list,'Predicted':prediction_list})
dataframe.to_csv("overfitting.csv",index=False,sep=',')


@@@@@@
50.77324435243052
1.859731205871518e-05
1.859731205871518e-05
46595.72067274444
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/79/26vt9yzn6w15mqnqw7x2l9fw0000gn/T/tmpaor0sh9b', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_