# Logistic regression with Tensorflow

This is an example of logistic regression with Tensorflow. You can find all the data set information in the file info.txt in the same repository

## Load and prepare the data

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('winequality-red.csv', sep=';')

In [4]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5
6,7.9,0.600,0.06,1.6,0.069,15.0,59.0,0.99640,3.30,0.46,9.4,5
7,7.3,0.650,0.00,1.2,0.065,15.0,21.0,0.99460,3.39,0.47,10.0,7
8,7.8,0.580,0.02,2.0,0.073,9.0,18.0,0.99680,3.36,0.57,9.5,7
9,7.5,0.500,0.36,6.1,0.071,17.0,102.0,0.99780,3.35,0.80,10.5,5


### Scaling Variables

In [5]:
quant_features = ['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides', 
                  'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates',
                  'alcohol']
scaled_features = {}
for each in quant_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std

### Splitting the data into training and testing sets

In [7]:
train_set = data.sample(frac=0.8,random_state=200)
test_set = data.drop(train_set.index)

In [8]:
data.shape

(1599, 12)

In [9]:
train_set.shape

(1279, 12)

Now we have to encode the labels

In [113]:
from sklearn import preprocessing

labels = data['quality']

#create the encoder
lb = preprocessing.LabelBinarizer()
lb.fit(labels)
labels_bi = lb.transform(labels)

In [114]:
labels_bi

array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       ..., 
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0]])

Now separate data into features and targets

In [11]:
# Set an empty np.zeros array with the shape of the train and test labels
train_labels = np.zeros((train_set.shape[0],labels_bi.shape[1]), dtype=float)
test_labels = np.zeros((test_set.shape[0],labels_bi.shape[1]), dtype=float)

# train_features
label_field = ['quality']

train_features = train_set.drop(label_field, axis=1)
test_features = test_set.drop(label_field, axis=1)

# train_labels
count = 0
for ii in train_set.index:
    train_labels[count] = labels_bi[ii]
    count += 1

count = 0
for ii in test_set.index:
    test_labels[count] = labels_bi[ii]
    count += 1

In [12]:
# Hold out the last 60 days or so of the remaining data as a validation set
train_features, train_labels = train_features[:-128*2], train_labels[:-128*2]
val_features, val_labels = train_features[-128*2:], train_labels[-128*2:]

In [140]:
train_features.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
366,2.5733,0.486874,0.66,2.313022,-0.094901,-0.84845,-0.135818,2.889431,-1.561751,0.423883,-0.396918
1325,-0.93024,-0.378759,0.24,-0.594928,-0.222383,0.203159,-0.379014,-1.031444,0.51097,-0.343047,0.166109
133,-0.987675,-0.15537,0.01,-0.736779,-0.583584,0.107558,-0.62221,-0.819505,0.575742,-0.461036,-0.584594
1418,-0.298454,0.012172,0.01,-0.665853,-0.222383,-1.230854,-0.835006,-0.925474,-0.978798,-1.168972,-0.584594
1258,-0.872805,0.626492,0.0,0.114328,0.754982,-0.083643,-0.409414,-0.724132,0.834832,-0.166063,0.822974


In [141]:
test_features.shape

(320, 11)

In [143]:
# data assertion

assert_dict = {
    3.0: [ 1., 0., 0., 0., 0., 0.],
    4.0: [ 0., 1., 0., 0., 0., 0.],
    5.0: [ 0., 0., 1., 0., 0., 0.],
    6.0: [ 0., 0., 0., 1., 0., 0.],
    7.0: [ 0., 0., 0., 0., 1., 0.],
    8.0: [ 0., 0., 0., 0., 0., 1.]
}

# check if features data match with labels data
# the number of count bellow must be equal to features shape above
summ = 0
count = 0
for ii in test_features.index:
    if np.all(assert_dict[data.iloc[ii]['quality']] == test_labels[summ]):
        count += 1
    summ += 1
print(count)

320


## Building the Network

### Parameters

In [80]:
epochs = 10000
batch_size = 128
display_step = 100
learning_rate = 0.001

# Network parameters
n_hidden1 = 40
n_hidden2 = 40
n_classes = train_labels.shape[1]
n_inputs = train_features.shape[1]

### tf Graph inputs and variables

In [81]:
tf.reset_default_graph()

inputs = tf.placeholder("float", [None, n_inputs])
labels = tf.placeholder("float", [None, n_classes])

# store layers, weights and bias
weights = {
    'h1': tf.Variable(tf.truncated_normal([n_inputs, n_hidden1])),
    'h2': tf.Variable(tf.truncated_normal([n_hidden1, n_hidden2])),
    'out': tf.Variable(tf.truncated_normal([n_hidden2, n_classes]))
}
biases = {
    'h1': tf.Variable(tf.truncated_normal([n_hidden1])),
    'h2': tf.Variable(tf.truncated_normal([n_hidden2])),
    'out': tf.Variable(tf.truncated_normal([n_classes]))
}

### Create the model

In [82]:
def neural_net(x):
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['h1'])
    layer_1 = tf.nn.relu(layer_1)
    
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['h2'])
    layer_2 = tf.nn.relu(layer_2)
    
    logits = tf.add(tf.matmul(layer_2, weights['out']), biases['out'])
    return logits

In [83]:
# Define cost and optimizer
logits = neural_net(inputs)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

In [84]:
# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

## Batching

In [85]:
def batching(batch_size, train_features, train_labels):
    #reset the index of train_features data frame for matching with train_labels 
    train_features = train_features.reset_index(drop=True)
    #choose the random units of data
    batch = np.random.choice(train_features.index, size=batch_size)
    
    batch_X = train_features.loc[batch].values
    batch_y = np.take(train_labels, batch, axis=0)
    
    return batch_X, batch_y

In [86]:
# Validation and test data performance for evaluation

val_X, test_X= val_features.values, test_features.values
val_y, test_y = val_labels, test_labels

## Start training

In [87]:
with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(1, epochs+1):
        
        batch_X, batch_y = batching(batch_size, train_features, train_labels)
        
        sess.run(optimizer, feed_dict={inputs: batch_X, labels: batch_y})
        
        if epoch % display_step == 0 or epoch == 1:
            # Calculate batch loss and accuracy
            #train_loss, train_acc = sess.run([loss, accuracy], feed_dict={inputs: batch_X,
                                                                 #labels: batch_y})
            val_loss, val_acc = sess.run([loss, accuracy], feed_dict={inputs: val_X,
                                                                     labels: val_y})
            
            #print("Step " + str(epoch) + ", Train_loss= " + \
             #     "{:.4f}".format(train_loss) + ", val_loss= " + \
              #    "{:.3f}".format(val_loss) + "....." + " Train_acc= " + \
               #   "{:.4f}".format(train_acc) + ", Val_acc= " + \
                #  "{:.3f}".format(val_acc))
            print("Step " + str(epoch) + ", val_loss= " + \
                  "{:.3f}".format(val_loss) + ", Val_acc= " + \
                  "{:.3f}".format(val_acc))
            
    print("Optimization Finished!")

    # Calculate accuracy for MNIST test images
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={inputs: test_X,
                                      labels: test_y}))
    logos = sess.run(logits, feed_dict={inputs: test_X})
    print("test_features 1:", logos[10])
    print("test_label: ", test_y[10])

Step 1, val_loss= 29.809, Val_acc= 0.297
Step 100, val_loss= 10.839, Val_acc= 0.441
Step 200, val_loss= 6.650, Val_acc= 0.500
Step 300, val_loss= 4.822, Val_acc= 0.559
Step 400, val_loss= 3.707, Val_acc= 0.566
Step 500, val_loss= 2.949, Val_acc= 0.598
Step 600, val_loss= 2.543, Val_acc= 0.629
Step 700, val_loss= 2.131, Val_acc= 0.633
Step 800, val_loss= 1.894, Val_acc= 0.676
Step 900, val_loss= 1.599, Val_acc= 0.672
Step 1000, val_loss= 1.428, Val_acc= 0.715
Step 1100, val_loss= 1.280, Val_acc= 0.742
Step 1200, val_loss= 1.147, Val_acc= 0.770
Step 1300, val_loss= 1.000, Val_acc= 0.773
Step 1400, val_loss= 0.924, Val_acc= 0.762
Step 1500, val_loss= 0.838, Val_acc= 0.773
Step 1600, val_loss= 0.778, Val_acc= 0.805
Step 1700, val_loss= 0.730, Val_acc= 0.789
Step 1800, val_loss= 0.679, Val_acc= 0.840
Step 1900, val_loss= 0.622, Val_acc= 0.840
Step 2000, val_loss= 0.592, Val_acc= 0.859
Step 2100, val_loss= 0.569, Val_acc= 0.848
Step 2200, val_loss= 0.542, Val_acc= 0.863
Step 2300, val_loss= 