In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from process import get_data


from sklearn.utils import shuffle
%matplotlib inline

In [29]:
# make predictions
def softmax(a):
    expA = np.exp(a)
    return expA / expA.sum(axis=1, keepdims=True)

def forward(X, W, b):
    return softmax(X.dot(W) + b)

def predict(P_Y_given_X):
    return np.argmax(P_Y_given_X, axis=1)

# calculate the accuracy
def classification_rate(Y, P):
    return np.mean(Y == P)

def cross_entropy(T, pY):
    return -np.mean(T*np.log(pY))

def y2indicator(y, K):
    N = len(y)
    ind = np.zeros((N, K))
    for i in range(N):
        ind[i, y[i]] = 1
    return ind

In [31]:
W = np.random.randn(D, K)
b = np.zeros(K)

Xtrain, Ytrain, Xtest, Ytest = get_data()
D = Xtrain.shape[1]
K = len(set(Ytrain) | set(Ytest))

# convert to indicator
Ytrain_ind = y2indicator(Ytrain, K)
Ytest_ind = y2indicator(Ytest, K)

train_costs = []
test_costs = []
learning_rate = 0.001
for i in range(10000):
    pYtrain = forward(Xtrain, W, b)
    pYtest = forward(Xtest, W, b)

    ctrain = cross_entropy(Ytrain_ind, pYtrain)
    ctest = cross_entropy(Ytest_ind, pYtest)
    train_costs.append(ctrain)
    test_costs.append(ctest)

    # gradient descent
    W -= learning_rate*Xtrain.T.dot(pYtrain - Ytrain_ind)
    b -= learning_rate*(pYtrain - Ytrain_ind).sum(axis=0)
    if i % 1000 == 0:
        print(i, ctrain, ctest)

print("Final train classification_rate:", classification_rate(Ytrain, predict(pYtrain)))
print("Final test classification_rate:", classification_rate(Ytest, predict(pYtest)))

0 0.598971269012886 0.6873872831701937
1000 0.08339104446512106 0.112233053587592
2000 0.07964342150795396 0.10640267754307384
3000 0.07819091405856265 0.10439145098912775
4000 0.07745713098200474 0.10343170597517705
5000 0.07703179446269952 0.1028849732528845
6000 0.07676302055579166 0.10253598857851372
7000 0.07658274718928196 0.10229515364621283
8000 0.07645642260049638 0.10211945502210419
9000 0.07636487604121932 0.10198594000305038
Final train classification_rate: 0.915
Final test classification_rate: 0.91


In [15]:
def one_hot_encoder(data):
    # One-hot encoding
    unique_time = np.unique(data)
    #print(unique_time)
    one_hot = np.zeros((data.shape[0], len(unique_time)))
    for t in unique_time:
        one_hot[:,int(t)] = np.where(data==t, 1, 0)
        
    return one_hot

In [16]:
def get_data_ant():
    df = pd.read_csv('ecommerce_data.csv')
    data = df.as_matrix()
    
    X = data[:,:-1]
    Y = data[:,-1].astype(np.int32)
    
    X, Y = shuffle(X, Y, random_state=42)
    
    N, D = X.shape
    
    
    # One-hot encoding
    X2 = np.zeros((N,D+3))
    X2[:,:D-1] = X[:,:D-1]
    X2[:,D-1:D+3] = one_hot_encoder(X[:,D-1])
    X = X2

    X_train = X[:-100,:]
    Y_train = Y[:-100]
    X_test = X[-100:,:]
    Y_test = Y[-100:]
    
    
    # normalize the data
    for i in (1,2):
        X_train[:,i] = (X_train[:,i] - X_train[:,i].mean())/X_train[:,i].std()
        X_test[:,i] = (X_test[:,i] - X_test[:,i].mean())/X_test[:,i].std()
        
    return X_train, Y_train, X_test, Y_test
    

In [17]:
def softmax_ant(a):
    exp_a = np.exp(a)
    return exp_a/exp_a.sum(axis=1, keepdims=True)

In [18]:
def classification_rate_ant(Y, Y_hat_class):
    return 100*np.mean(Y==Y_hat_class)

In [19]:
def forward_ant(X, W, b):
    return softmax(X.dot(W) + b)

In [20]:
# General multi-class cross-entropy
def cross_entropy_ant(Y, Y_hat):
    return -np.mean(Y*np.log(Y_hat))

In [21]:
def predict_ant(Y_hat):
    return np.argmax(Y_hat, axis=1)

In [22]:
def dJ_dw(Y, Y_hat, X):
    return X.T.dot(Y-Y_hat)

In [23]:
def derivative_b(Y, Y_hat):
    return (Y - Y_hat).sum(axis=0)

In [24]:
X_train, Y_train, X_test, Y_test = get_data()
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(400, 8) (400,) (100, 8) (100,)


In [25]:
D = X_train.shape[1] # number of features
K = len(set(Y_train)) # number of classes

T_train = one_hot_encoder(Y_train)
T_test = one_hot_encoder(Y_test)

W = np.random.randn(D, K)
b = np.random.randn(K)

# Check
# P_Y_given_x = forward(X_train, W, b)
# Y_hat = predict(P_Y_given_x)
# print('Classification rate:', np.round(classification_rate(Y_train, Y_hat),4), '%')

In [26]:
learning_rate = 1e-3
costs_train = []
costs_test = []
for epoch in range(10000):
    Y_hat_train = forward_ant(X_train, W, b)
    Y_hat_test = forward_ant(X_test, W, b)
    
    ctrain = cross_entropy_ant(T_train, Y_hat_train)
    ctest = cross_entropy_ant(T_test, Y_hat_test)
    
    costs_train.append(ctrain)
    costs_test.append(ctest)
    
    W -= learning_rate * dJ_dw(T_train, Y_hat_train, X_train)
    b -= learning_rate * derivative_b(T_train, Y_hat_train)
    
    if epoch % 1000 == 0:
        print(epoch, ctrain, ctest)


    
print("classification_rate:", classification_rate(Y_train, predict_ant(Y_hat_train)))    
# plt.plot(costs_train)
# plt.plot(costs_test)

0 0.8087971278692129 0.8658409464369052


  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


1000 nan nan
2000 nan nan
3000 nan nan
4000 nan nan
5000 nan nan
6000 nan nan
7000 nan nan
8000 nan nan
9000 nan nan
classification_rate: 0.5075


In [66]:
costs_train
costs_test

[0.49017228482615977, nan, nan, nan, nan, nan, nan, nan, nan, nan]

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [3]:
def y2indicator(y, K):
    N = len(y)
    ind = np.zeros((N, K))
    for i in range(N):
        ind[i, y[i]] = 1
    return ind

In [4]:
Xtrain, Ytrain, Xtest, Ytest = get_data()
D = Xtrain.shape[1]
K = len(set(Ytrain) | set(Ytest))

# convert to indicator
Ytrain_ind = y2indicator(Ytrain, K)
Ytest_ind = y2indicator(Ytest, K)

print(Ytrain_ind.shape)

(400, 4)


8 5 2


In [74]:
# initialize weights


0.36432160804020103
