In [45]:
from IPython.display import IFrame
import pylab as plt
%matplotlib inline
import numpy as np
import copy

# Load data (every CSV starts with one header row, which is skipped)
X_train = np.loadtxt("X_train.csv", delimiter=',', dtype="int", skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=',', dtype="int", skiprows=1, usecols=1)
X_test = np.loadtxt("X_test.csv", delimiter=',', dtype="int", skiprows=1)
N = X_train.shape[0]

# Targets reshaped into a column; `PRP` and `actual` hold the same values.
PRP = y_train[:, None]
actual = y_train[:, None]

# The first six training columns, each kept as a column vector so they can be
# stacked side by side below.
MYCT, MMIN, MMAX, CACH, CHMIN, CHMAX = (X_train[:, col, None] for col in range(6))

# Model: bias column followed by hand-crafted features.
# NOTE(review): the inputs are loaded with an integer dtype, so CACH**CHMIN and
# CHMAX * MMAX * CACH are evaluated in integer arithmetic, which NumPy does not
# overflow-check -- confirm the value ranges in the data keep these in range.
X = np.concatenate((
    np.ones_like(MYCT),
    MYCT/np.log(MYCT),
    np.log(MMIN),
    np.log(MMAX),
    MMAX,
    MMIN,
    MMAX*MYCT / MMIN,
    CACH**CHMIN,
    CHMAX * MMAX * CACH
), axis=1)









def plot_feature(X, y, log=True):
    """Plot ``y`` against ``X`` as green circle markers on a new figure.

    Parameters
    ----------
    X, y :
        Data handed unchanged to ``plt.plot`` as the x and y values.
    log : bool, default True
        When true, both the x and y axes are set to a logarithmic scale
        before the points are drawn.
    """
    plt.figure()
    if log:
        # Same 'log' scale on the x axis first, then the y axis.
        for set_scale in (plt.xscale, plt.yscale):
            set_scale('log')
    plt.plot(X, y, 'go')

def get_weights(X, t):
    """Return the ordinary least-squares weights for the model ``X w = t``.

    The normal equations ``(X.T X) w = X.T t`` are solved directly with
    ``np.linalg.solve`` instead of multiplying by an explicit inverse of
    ``X.T X``; this matches ``get_regularised_weights`` and is numerically
    more stable when the columns of ``X`` differ greatly in scale.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.
    t : ndarray, shape (n_samples,) or (n_samples, k)
        Targets.

    Returns
    -------
    ndarray
        Weights with shape ``(n_features,)`` or ``(n_features, k)``,
        following the shape of ``t``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X.T X`` is singular.
    """
    gram = np.dot(X.T, X)
    moment = np.dot(X.T, t)
    return np.linalg.solve(gram, moment)

def get_regularised_weights(X, t, lamb=0):
    """Return the weights solving ``(X.T X + lamb * I) w = X.T t``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.
    t : ndarray
        Targets, with one entry (or row) per sample.
    lamb : float, default 0
        Multiplier of the identity matrix added to ``X.T X``. The penalty is
        applied to every column of ``X``; with ``lamb=0`` the plain normal
        equations are solved.

    Returns
    -------
    ndarray
        The solution ``w`` returned by ``np.linalg.solve``.
    """
    n_features = X.shape[1]
    penalised_gram = X.T.dot(X) + lamb * np.identity(n_features)
    return np.linalg.solve(penalised_gram, X.T.dot(t))

def get_mse(actual, prediction):
    """Return the root-mean-squared error between targets and predictions.

    Despite the name, the square root is applied, so the value is an RMSE:
    ``sqrt(sum((actual - prediction)**2) / N)`` with ``N = actual.shape[0]``.

    Parameters
    ----------
    actual : array-like with at least one dimension
        True target values.
    prediction : array-like
        Predicted values.

    Returns
    -------
    float
        The root-mean-squared error.

    Notes
    -----
    If the two inputs hold the same number of elements but have different
    shapes (for example ``(N, 1)`` targets and ``(N,)`` predictions), the
    prediction is reshaped to the target's shape first. Without this, NumPy
    would broadcast the subtraction to an ``(N, N)`` matrix and the result
    would silently be wrong.
    """
    actual = np.asarray(actual)
    prediction = np.asarray(prediction)
    N = actual.shape[0]
    if prediction.shape != actual.shape and prediction.size == actual.size:
        prediction = prediction.reshape(actual.shape)
    return np.sqrt(np.sum((actual - prediction) ** 2) / N)

def loocv(X, T, alpha=0):
    """Leave-one-out cross-validation error of the regularised linear model.

    Each sample in turn is removed, weights are fitted on the remaining
    samples with ``get_regularised_weights`` and the squared error of the
    prediction for the removed sample is accumulated.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.
    T : ndarray
        Targets, one entry (or row) per sample.
    alpha : float, default 0
        Regularisation strength forwarded to ``get_regularised_weights``.

    Returns
    -------
    The square root of the mean accumulated squared error. Its shape follows
    ``T[n]``: a column-vector ``T`` yields a one-element array.
    """
    N = X.shape[0]
    loss = 0

    for held_out in range(N):
        # Training set with the held-out row dropped.
        X_rest = np.delete(X, held_out, axis=0)
        T_rest = np.delete(T, held_out, axis=0)

        w = get_regularised_weights(X_rest, T_rest, alpha)

        residual = np.dot(X[held_out], w) - T[held_out]
        loss += residual ** 2

    return np.sqrt(loss / N)

def save_predictions(predictions, filename="predictions_regression.csv"):
    """Write predictions to a two-column CSV with the header ``Id,PRP``.

    Parameters
    ----------
    predictions : ndarray
        Predicted values; flattened, and must contain ``predictions.shape[0]``
        elements in total.
    filename : str, default "predictions_regression.csv"
        Destination path.

    Notes
    -----
    Rows are ``<row index>,<prediction>``. Values are written with the
    ``%d`` format, so fractional parts of the predictions are dropped.
    """
    N = predictions.shape[0]
    output = np.column_stack((np.arange(N), predictions.ravel()))
    # comments="" keeps np.savetxt from prefixing the header line with "# ",
    # so the first line of the file is exactly "Id,PRP".
    np.savetxt(filename, output, fmt='%d', delimiter=",", header="Id,PRP", comments="")
    print("Predictions saved")



# Fit on the full training set. `lamb` is left at its default of 0, so this
# solves the plain normal equations; the earlier get_weights() call was
# dropped because its result was overwritten immediately.
w = get_regularised_weights(X, PRP)
predictions = np.dot(X,w)

print("Training loss:", get_mse(actual, predictions))
print("Validation loss:", loocv(X, PRP))

# Test out regularisation
# alpha is lambda
# for alpha in [0, 0.01, 0.1, 1, 10]:
#     w = get_regularised_weights(X, PRP, alpha)
#     prediction = np.dot(X,w)
#     print("Training loss at alpha:", alpha, get_mse(actual, prediction))


Training loss: 31.39924540114201
Validation loss: [33.91194453]


In [44]:
# Load the test inputs (one header row is skipped) and split the first six
# columns into column vectors, mirroring the training cell.
X_test = np.loadtxt("X_test.csv", delimiter=',', dtype="int", skiprows=1)
MYCT = X_test[:,0][:,None]
MMIN = X_test[:,1][:,None]
MMAX = X_test[:,2][:,None]
CACH = X_test[:,3][:,None]
CHMIN = X_test[:,4][:,None]
CHMAX = X_test[:,5][:,None]
N = X_test.shape[0]

# Model
# The columns must be built exactly as in the training cell, because `w` was
# fitted against those features. The second column previously used
# np.log(MYCT*0.000001), which did not match the training feature.
X = np.hstack((
    np.ones_like(MYCT),
    MYCT/np.log(MYCT),
    np.log(MMIN),
    np.log(MMAX),
    MMAX,
    MMIN,
    MMAX*MYCT / MMIN,
    CACH**CHMIN,
    CHMAX * MMAX * CACH
))

# `w` comes from the training cell, which has to be run first.
predictions = np.dot(X, w).flatten()

# Two columns: row index and prediction. comments="" keeps the header line
# free of the default "# " prefix.
pred = np.ones((N, 2))
pred[:,0] = range(N)
pred[:,1] = predictions
np.savetxt('predictions_regression.csv', pred, fmt='%d', delimiter=",", header="Id,PRP", comments="")