## Unconstrained optimization using Cyclic co-ordinate descent on Logistic Loss function to perform Multinomial classification

### Import statements

In [None]:
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

### Loading dataset into dataframes

In [None]:
# Load the wine data set. The file has no header row, so pandas labels the
# columns with integers starting at 0.
data = pd.read_csv('wine.data', header=None)
# Every column except one (the class label) is treated as a feature.
num_features = data.shape[1] - 1
num_examples = data.shape[0]
# Single-argument, parenthesised print is valid on Python 2 and Python 3;
# the "%s %s" form reproduces the separator the print statement emitted.
print("%s %s" % ("Number of features: ", num_features))
print("%s %s" % ("Number of examples: ", num_examples))
data.head()

### Separating class labels

In [None]:
# Column 0 of the raw data holds the class label of each example.
Y = data[0]
# Parenthesised so the line is valid on both Python 2 and Python 3.
print(Y.shape)
Y.head()

### Separating features, adding offset, and scaling the features

In [None]:
# Select every column except the label column (column 0). An explicit list
# is used so the selector is the same kind of object on Python 2 and 3.
X = data[list(range(1, num_features + 1))]
# Constant column of ones that acts as the offset (bias) input; sized from
# the data instead of a hard-coded row count.
offset = np.ones(num_examples)
# Scale the features first so the offset column is left untouched.
X = pd.DataFrame(preprocessing.scale(X))
# Append the offset under the label num_features + 1 (14 for the 13-feature
# wine data, matching the label used previously).
X[num_features + 1] = offset

### Perform train test split of the data

In [None]:
from sklearn.model_selection import train_test_split

# 128 examples go to the training set and the remainder to the test set;
# the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=128, random_state=42)
num_train_examples = X_train.shape[0]
# Prints are written so they run on Python 2 and 3 with identical output.
print("%s %s" % ("Number of training examples: ", num_train_examples))
num_test_examples = X_test.shape[0]
print("%s %s" % ("Number of test examples: ", num_test_examples))
print(X_train.shape)
print(Y_train.shape)

In [None]:
# Convert the pandas objects to plain NumPy arrays for the linear algebra
# below. `.values` is used instead of `.as_matrix()`, which was deprecated
# and later removed from pandas; `.values` is available in both old and
# new releases.
X_train_matrix = X_train.values
Y_train_matrix = Y_train.values
X_test_matrix = X_test.values
Y_test_matrix = Y_test.values

### Define an indicator array that stores I(Y(i), j)

In [None]:
# One-hot encoding of the training labels: indicator[i, j] is 1 when
# training example i has label j + 1 (labels are 1-based) and 0 otherwise.
# The row count comes from the label array instead of a hard-coded 128, so
# the block stays correct if the split size changes.
indicator = np.zeros((Y_train_matrix.shape[0], 3))
indicator[np.arange(Y_train_matrix.shape[0]), Y_train_matrix - 1] = 1

### Function to compute the softmax class probabilities used in the logistic loss function

In [None]:
def dot_product(X, w):
    """Return the softmax class probabilities for every row of ``X``.

    The score of row ``i`` for class ``j`` is the dot product of ``X[i]``
    with ``w[j]``; the scores of each row are exponentiated and divided by
    their row sum so that every output row sums to 1.

    Parameters
    ----------
    X : array-like, shape (n_rows, n_inputs)
        Input rows. A 1-D input is treated as a single row.
    w : array-like, shape (n_classes, n_inputs)
        One weight row per class. A 1-D input is treated as a single class.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_classes)
        Row-normalised probabilities.
    """
    features = np.atleast_2d(np.asarray(X, dtype=float))
    weights = np.atleast_2d(np.asarray(w, dtype=float))
    scores = features.dot(weights.T)
    # Subtracting each row's maximum does not change the normalised result
    # (the common factor cancels) but keeps np.exp from overflowing to inf,
    # which would otherwise turn the probabilities into NaN.
    scores -= scores.max(axis=1, keepdims=True)
    exponent = np.exp(scores)
    return exponent / exponent.sum(axis=1, keepdims=True)

### Initializing w as an all-zero weight matrix and recording the baseline test accuracy

In [None]:
# One weight row per class (3 classes) and one weight per input column of
# the training matrix, so the width follows the data instead of a fixed 14.
w = np.zeros([3, X_train_matrix.shape[1]])
# Baseline accuracy before any training: score each test row against each
# class and pick the highest-scoring class (labels are 1-based).
predict = np.asarray(X_test_matrix).dot(w.T)
Y_test_pred = predict.argmax(axis=1) + 1
# acc_array collects the test accuracy over the course of training.
acc_array = []
acc_array.append(accuracy_score(Y_test_matrix, Y_test_pred))
# Parenthesised so the line is valid on both Python 2 and Python 3.
print(acc_array)

### Cyclic co-ordinate descent algorithm

In [None]:
from sklearn.metrics import log_loss

NUM_SWEEPS = 10000     # full passes over every coordinate of w
LEARNING_RATE = 0.01   # step size applied to each coordinate update
REPORT_EVERY = 100     # record loss and test accuracy every this many sweeps

# Training log loss recorded every REPORT_EVERY sweeps.
logLoss = []
num_classes, num_weights = w.shape

for i in range(NUM_SWEEPS):
    # Cycle through the coordinates w[j, k] one at a time.
    for j in range(num_classes):
        for k in range(num_weights):
            # The probabilities are recomputed for every coordinate because
            # the previous update already changed w.
            probabilities = np.asarray(dot_product(X_train_matrix, w))
            # Only class j's residual is needed for the update of w[j, k].
            residual = indicator[:, j] - probabilities[:, j]
            w[j, k] += LEARNING_RATE * np.dot(X_train_matrix[:, k], residual)
    if i % REPORT_EVERY == 0:
        predict_prob = dot_product(X_train_matrix, w)
        l = log_loss(Y_train_matrix, predict_prob)
        # Test-set prediction: highest-scoring class, converted to the
        # 1-based labels used in the data.
        predict = np.asarray(X_test_matrix).dot(w.T)
        Y_test_pred = predict.argmax(axis=1) + 1
        acc_array.append(accuracy_score(Y_test_matrix, Y_test_pred))
        logLoss.append(l)

In [None]:
# Show the learned weights. Single-argument, parenthesised print is valid
# on both Python 2 and Python 3 and prints the same text.
print("Final weight vector:")
print(w)

### Computing minimized logistic loss value

In [None]:
from sklearn.metrics import log_loss

# Log loss of the training set under the final weights.
predict_prob = dot_product(X_train_matrix, w)
# Written so the line runs on Python 2 and 3; the "%s %s" form reproduces
# the separator the print statement emitted.
print("%s %s" % ("Final logistic loss: ", log_loss(Y_train_matrix, predict_prob)))

### Perform classification using the final weight vector

In [None]:
from sklearn.metrics import accuracy_score

# Score every test row against every class with the final weights and pick
# the highest-scoring class; +1 converts the 0-based column index to the
# 1-based labels used in the data.
predict = np.asarray(X_test_matrix).dot(w.T)
Y_test_pred = predict.argmax(axis=1) + 1
# Parenthesised so the line is valid on both Python 2 and Python 3.
print(accuracy_score(Y_test_matrix, Y_test_pred))