In [1]:
import numpy as np
import pandas as pd

In [2]:
# read data

path = "train.csv"
train_data = pd.read_csv(path, index_col="S.No")

path = "test.csv"
test_data = pd.read_csv(path, index_col="S.No")

print(train_data.shape, test_data.shape)
print("data points per class in original data\n", train_data['LABELS'].value_counts())

(47760, 20) (7320, 19)
data points per class in original data
 0    37535
2     8223
1     2002
Name: LABELS, dtype: int64


In [3]:
# normalise dependant variable and test data
x = train_data.iloc[:,0:19]
X = (x - x.mean(axis=0)) / x.std(axis=0)

Y = train_data.iloc[:,-1]

test_data = (test_data - test_data.mean(axis=0)) / x.std(axis=0)
test_data = test_data.to_numpy()

In [4]:
# source: https://stackoverflow.com/a/35932368
def shuffle_split_data(X, y):
    arr_rand = np.random.rand(X.shape[0])
    split = arr_rand < np.percentile(arr_rand, 80)

    X_train = X[split].to_numpy()
    y_train = y[split].to_numpy()
    X_test =  X[~split].to_numpy()
    y_test = y[~split].to_numpy()

    print(len(X_train), len(y_train), len(X_test), len(y_test))
    return X_train, y_train, X_test, y_test

In [5]:
X_train, y_train, X_test, y_test = shuffle_split_data(X, Y)

38208 38208 9552 9552


In [6]:
def one_hot(y, c):
    y_hot = np.zeros((len(y), c))
    
    # Putting 1 for column where the label is,
    # Using multidimensional indexing.
    y_hot[np.arange(len(y)), y] = 1
    
    return y_hot

In [7]:
def softmax(z):
    # subtracting the max of z for numerical stability.
    exp = np.exp(z - np.max(z))
    
    # Calculating softmax for all examples.
    for i in range(len(z)):
        exp[i] /= np.sum(exp[i])
        
    return exp

In [8]:
def fit(X, y, lr, c, epochs):
    m, n = X.shape
    
    # Initializing weights and bias randomly.
    w = np.random.random((n, c))
    b = np.random.random(c)
    losses = []
    
    # Training loop.
    for epoch in range(epochs):
        
        z = X@w + b
        y_hat = softmax(z)
        
        y_hot = one_hot(y, c)
        
        # Calculating the gradient of loss w.r.t w and b.
        w_grad = (1/m)*np.dot(X.T, (y_hat - y_hot)) 
        b_grad = (1/m)*np.sum(y_hat - y_hot)
        
        # Updating the parameters.
        w = w - lr*w_grad
        b = b - lr*b_grad
        
        # Calculating loss and appending it in the list.
        loss = -np.mean(np.log(y_hat[np.arange(len(y)), y]))
        losses.append(loss)

        if epoch%100==0:
            print('Epoch {epoch}==> Loss = {loss}'
                  .format(epoch=epoch, loss=loss))
    return w, b, losses

In [9]:
def predict(X, w, b):
    z = X@w + b
    y_hat = softmax(z)
    
    # Returning the class with highest probability.
    return np.argmax(y_hat, axis=1)

In [10]:
def accuracy(y, y_hat):
    return np.sum(y==y_hat)/len(y)

In [11]:
# fit the model
w, b, l = fit(X_train, y_train, lr=0.01, c=3, epochs=15000)

Epoch 0==> Loss = 1.0511686323925014
Epoch 100==> Loss = 0.8986473260917506
Epoch 200==> Loss = 0.8407929688397437
Epoch 300==> Loss = 0.8039866062050548
Epoch 400==> Loss = 0.7793290411090309
Epoch 500==> Loss = 0.7620807993181944
Epoch 600==> Loss = 0.7495579156662484
Epoch 700==> Loss = 0.740185484260134
Epoch 800==> Loss = 0.7330069932712779
Epoch 900==> Loss = 0.727429564944412
Epoch 1000==> Loss = 0.7230865641044802
Epoch 1100==> Loss = 0.7197160605035634
Epoch 1200==> Loss = 0.7170857811329642
Epoch 1300==> Loss = 0.7150113984992589
Epoch 1400==> Loss = 0.713348641604404
Epoch 1500==> Loss = 0.7119862821898421
Epoch 1600==> Loss = 0.7108456346297435
Epoch 1700==> Loss = 0.7098735745667245
Epoch 1800==> Loss = 0.7090335317634837
Epoch 1900==> Loss = 0.7082992066812979
Epoch 2000==> Loss = 0.7076509479480729
Epoch 2100==> Loss = 0.7070736618991569
Epoch 2200==> Loss = 0.7065555286334976
Epoch 2300==> Loss = 0.706087158900437
Epoch 2400==> Loss = 0.7056610094067644
Epoch 2500==> Lo

In [12]:
# print training accuracy
train_preds = predict(X_train, w, b)
accuracy(y_train, train_preds)

0.7946765075376885

In [13]:
# print validation accuracy
test_preds = predict(X_test, w, b)
accuracy(y_test, test_preds)

0.7906197654941374

In [15]:
# get predictions on test data and write it in the csv

test_preds_ = predict(test_data, w, b)

pred = []
for i in test_preds_:
    pred.append(int(i))
print(pred)

sr = [i for i in range(0, 7320)]
df = pd.DataFrame(data={"S.No": sr, "LABELS": pred})
df.to_csv("./my_lr.csv", sep=',',index=False)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 