In [8]:
from IPython.display import IFrame
import pylab as plt
%matplotlib inline
import numpy as np

# Read the training features and targets; the first row of each CSV is a
# header and is skipped. Only column 1 of y_train.csv is loaded.
X_train = np.loadtxt("X_train.csv", dtype="int", delimiter=',', skiprows=1)
y_train = np.loadtxt("y_train.csv", dtype="int", delimiter=',', skiprows=1, usecols=1)
# Number of training rows.
N = X_train.shape[0]
# Targets reshaped into an (N, 1) column vector.
actual = y_train[:, np.newaxis]

def calculate_loss(inputs, responses, weights):
    """Return the root-mean-square error of a linear model on a data set.

    The prediction for row ``i`` is ``weights.T . inputs[i]``; the result is
    ``sqrt(sum_i (responses[i] - prediction_i)**2 / N)``.

    Parameters
    ----------
    inputs : array_like, shape (N, D)
        Design matrix, one sample per row.
    responses : array_like, shape (N,) or (N, 1)
        Observed target for every row of ``inputs``.
    weights : array_like, shape (D,) or (D, 1)
        Parameter vector of the linear model.

    Returns
    -------
    numpy scalar or ndarray
        A scalar when both ``responses`` and ``weights`` are 1-D; otherwise
        an array with one entry per trailing column (e.g. shape ``(1,)``
        when ``weights`` is a ``(D, 1)`` column vector).

    Raises
    ------
    ValueError
        If ``responses`` does not have exactly one entry per row of
        ``inputs``.
    ZeroDivisionError
        If ``inputs`` has no rows.
    """
    X = np.asarray(inputs)
    y = np.asarray(responses)
    w = np.asarray(weights)

    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            "responses has %d rows but inputs has %d" % (y.shape[0], n_samples)
        )

    # One matrix product yields every per-row prediction at once.
    predictions = X @ w

    # Give both operands the same rank so the subtraction pairs row i with
    # row i instead of broadcasting an (N,) vector against an (N, 1) column
    # into an (N, N) matrix.
    if y.ndim < predictions.ndim:
        y = y.reshape(y.shape + (1,) * (predictions.ndim - y.ndim))
    elif predictions.ndim < y.ndim:
        predictions = predictions.reshape(
            predictions.shape + (1,) * (y.ndim - predictions.ndim)
        )

    squared_error = np.sum((y - predictions) ** 2, axis=0)
    return np.sqrt(squared_error * (1 / n_samples))

def mse(actual, prediction, N):
    """Return ``sqrt(sum((actual - prediction)**2) / N)``.

    NOTE: despite the name, the square root is applied, so the value is a
    root-mean-square error when ``N`` is the number of compared elements.

    Parameters
    ----------
    actual, prediction : numpy arrays (or scalars) that can be subtracted
    N : number to divide the summed squared error by
    """
    residual = actual - prediction
    total_squared_error = np.sum(residual ** 2)
    return np.sqrt(total_squared_error / N)

def calc_weights(inputs, responses):
    """Return the least-squares weights from the normal equation.

    Solves ``(X^T X) w = X^T y`` for ``w``. The linear system is solved
    directly rather than forming ``(X^T X)^-1`` explicitly, which is more
    accurate and cheaper than inverting and then multiplying.

    Parameters
    ----------
    inputs : array_like, shape (N, D)
        Design matrix ``X``, one sample per row.
    responses : array_like, shape (N,) or (N, K)
        Targets ``y``.

    Returns
    -------
    ndarray
        Weights of shape ``(D,)`` for 1-D ``responses`` or ``(D, K)`` for
        2-D ``responses``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    X = np.asarray(inputs)
    y = np.asarray(responses)

    gram = X.T @ X    # X^T X, shape (D, D)
    moment = X.T @ y  # X^T y
    return np.linalg.solve(gram, moment)
    
def calc_LOOCV_loss(inputs, responses):
    """Return the leave-one-out cross-validation loss of the linear model.

    For every row, the weights are fitted with ``calc_weights`` on all the
    other rows and used to predict the held-out row. The result is the
    square root of the mean squared held-out error.

    Parameters
    ----------
    inputs : numpy array whose first axis indexes samples
    responses : targets, one per row of ``inputs``
    """
    n_samples = inputs.shape[0]
    squared_errors = []
    for held_out in range(n_samples):
        # Training set for this fold: everything except the held-out row.
        train_X = np.delete(inputs, held_out, axis=0)
        train_y = np.delete(responses, held_out, axis=0)
        fold_weights = calc_weights(train_X, train_y)

        # Squared error of the prediction for the row that was left out.
        prediction = np.dot(fold_weights.T, inputs[held_out])
        residual = responses[held_out] - prediction
        squared_errors.append(residual ** 2)
    return np.sqrt(sum(squared_errors) * (1 / n_samples))

#X_remove = (i==0)?(inputs.splice(i+1;N)):((i==N)?(inputs.splice(0;i)):(numpy.concatenate(inputs.splice(0;i),inputs.splice(i+1;N))))

def plot_feature(X, y, log=True):
    """Plot ``y`` against ``X`` as green circles in a new figure.

    Parameters
    ----------
    X, y : values passed straight to ``plt.plot``
    log : when true (the default), both axes are switched to a log scale
    """
    plt.figure()
    if log:
        for set_scale in (plt.xscale, plt.yscale):
            set_scale('log')
    plt.plot(X, y, 'go')

# Read the training data (the first row of each CSV is a header and is skipped).
X_train = np.loadtxt("X_train.csv", dtype="int", delimiter=',', skiprows=1)
y_train = np.loadtxt("y_train.csv", dtype="int", delimiter=',', skiprows=1, usecols=1)

# Layout of X_train.csv:
#   MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX
#   125,256,6000,256,16,128
#   29,8000,32000,32,8,32
#   ...
#
# Layout of y_train.csv (only column 1, PRP, is loaded):
#   Id,PRP
#   0,198
#   1,269
#   ...

# Split the six feature columns into (N, 1) column vectors.
MYCT, MMIN, MMAX, CACH, CHMIN, CHMAX = (
    X_train[:, col][:, None] for col in range(6)
)
PRP = y_train[:, None]

# Small offset added to CACH, CHMIN and CHMAX before taking the log
# (presumably because those columns can contain zeros -- confirm against the data).
e = 0.0001

# Design matrix: a column of ones followed by the log of every feature.
X = np.hstack(
    [np.ones_like(MYCT)]
    + [np.log(column) for column in (MYCT, MMIN, MMAX)]
    + [np.log(column + e) for column in (CACH, CHMIN, CHMAX)]
)

# Weights from the normal equation w = (X^T X)^-1 X^T y.
XX = X.T.dot(X)
invXX = np.linalg.inv(XX)
Xt = X.T.dot(PRP)
w = invXX.dot(Xt)

# To inspect a single feature against the target, call for example
# plot_feature(MYCT, PRP).

# calculate_loss returns a one-element array here because w is a column
# vector, hence the [0].
lossSum = calculate_loss(X, y_train, w)[0]
print("Training loss: " + str(lossSum))

loocv = calc_LOOCV_loss(X, y_train)
print("Validation loss: " + str(loocv))


Training loss: 127.1199671952179
Validation loss: 135.09184181136618
MSE:
2597.8512841094516
