# 預測蘑菇是否有毒

In [6]:
#整理資料
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#整理資料後分別存成train.csv, test.csv
def prepare_data(data_file_name):
    # http://archive.ics.uci.edu/ml/datasets/Mushroom
    header = ['class', 'cap_shape', 'cap_surface',
              'cap_color', 'bruises', 'odor', 'gill_attachment',
              'gill_spacing', 'gill_size', 'gill_color', 'stalk_shape',
              'stalk_root', 'stalk_surface_above_ring',
              'stalk_surface_below_ring', 'stalk_color_above_ring',
              'stalk_color_below_ring', 'veil_type', 'veil_color',
              'ring_number', 'ring_type', 'spore_print_color',
              'population', 'habitat']
    df = pd.read_csv(data_file_name, sep=',', names=header)

    #移除缺失值
    df.replace('?', np.nan, inplace=True)
    df.dropna(inplace=True)

    #0:有毒，1:可食用
    df['class'].replace('p', 0, inplace=True)
    df['class'].replace('e', 1, inplace=True)

    #類別屬性都拆成one hot
    cols_to_transform = header[1:]
    df = pd.get_dummies(df, columns=cols_to_transform)                       

    #拆成training/test(9:1)
    df_train, df_test = train_test_split(df, test_size=0.1)

    #5079筆資料
    #99個屬性
    print (df_train.shape)
    print (df_test.shape)    
    num_train_entries = df_train.shape[0]                                    
    num_train_features = df_train.shape[1] - 1                               

    num_test_entries = df_test.shape[0]
    num_test_features = df_test.shape[1] - 1
    
    #分別存成train.csv, test.csv
    df_train.to_csv('mushroom_train.csv', index=False)
    df_test.to_csv('mushroom_test.csv', index=False)
    
MUSHROOM_DATA_FILE = "agaricus-lepiota.data"
prepare_data(MUSHROOM_DATA_FILE)



(5079, 99)
(565, 99)


In [7]:
df_train = pd.read_csv('mushroom_train.csv')
train_label = np.array(df_train['class'])
train_data =  np.array(df_train.drop('class', axis=1))
train_label


array([0, 1, 1, ..., 1, 1, 1])

In [41]:
df_test = pd.read_csv('mushroom_test.csv')
test_label = np.array(df_test['class'])
test_data =  np.array(df_test.drop('class', axis=1))


(5079, 98)


In [42]:
#label做one-got encoding
def one_hot(values):
    n_values = np.max(values) + 1
    return np.eye(n_values)[values]
train_label = one_hot(train_label.astype(int))
test_label = one_hot(test_label.astype(int))

print (train_label)

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [43]:
#建DNN，training
import tensorflow as tf
tf.reset_default_graph()

batch_size = 100
INPUT_NODE = 98
LAYER1_NODE = 32
LAYER2_NODE = 2

x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='x')
y = tf.placeholder(tf.float32, [None, LAYER2_NODE], name='y')

W1 = tf.Variable(tf.truncated_normal([INPUT_NODE, LAYER1_NODE], stddev=0.1))
b1 = tf.Variable(tf.truncated_normal([LAYER1_NODE], stddev=0.1))
W2 = tf.Variable(tf.truncated_normal([LAYER1_NODE, LAYER2_NODE], stddev=0.1))
b2 = tf.Variable(tf.truncated_normal([LAYER2_NODE], stddev=0.1))

layer_1 = tf.matmul(x, W1) + b1
out1 = tf.nn.leaky_relu(layer_1, alpha=0.2)
layer_2 = tf.matmul(out1, W2) + b2
out2 = tf.nn.leaky_relu(layer_2, alpha=0.2)

y_predict = out2

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_predict))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

correct_prediction = tf.equal(tf.argmax(y_predict, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),name="accuracy")


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for step in range(1001):                                     
        i = step
        if i+batch_size >= len(train_data):
                i = i+batch_size % len(train_data)
        batch_xs, batch_ys = train_data[i:i+batch_size], train_label[i:i+batch_size]
        
        train_step_, cross_entropy_ =sess.run([train_step, cross_entropy], feed_dict={x: batch_xs, y: batch_ys})
        if step % 50 == 0:
            print("step {}: cross_entropy is {}".format(step, cross_entropy_))
            
    accuracy_ = sess.run(accuracy, feed_dict={x: test_data, y: test_label})
    print('Testing...... accuracy is {}'.format(accuracy_))

step 0: cross_entropy is 0.6594077348709106
step 50: cross_entropy is 0.6101139187812805
step 100: cross_entropy is 0.5749460458755493
step 150: cross_entropy is 0.4900497496128082
step 200: cross_entropy is 0.4006577730178833
step 250: cross_entropy is 0.36826613545417786
step 300: cross_entropy is 0.35131990909576416
step 350: cross_entropy is 0.336247980594635
step 400: cross_entropy is 0.25668227672576904
step 450: cross_entropy is 0.20671872794628143
step 500: cross_entropy is 0.19451303780078888
step 550: cross_entropy is 0.17854303121566772
step 600: cross_entropy is 0.1629645675420761
step 650: cross_entropy is 0.15859925746917725
step 700: cross_entropy is 0.12463214248418808
step 750: cross_entropy is 0.11447204649448395
step 800: cross_entropy is 0.11282957345247269
step 850: cross_entropy is 0.08379519730806351
step 900: cross_entropy is 0.09613721072673798
step 950: cross_entropy is 0.08819933235645294
step 1000: cross_entropy is 0.05937248840928078
Testing...... accuracy 