In [8]:
import numpy as np
import scipy
from scipy import stats

import pylab as plt
%matplotlib inline

trainX = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
traint = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)
traint = traint[:,1][:,None]-1
testX = np.loadtxt('X_test.csv', delimiter=',', skiprows=1)
    
def to_binary(x):
    if x < 0.5:
        return 0
    return 1

def h(x, w):
    return sigmoid(np.dot(x, w))

def sigmoid(x):
    val = scipy.special.expit(x)
    return val

def cv(trainX, traint, folds=20, regularFactor=0):
    sample_count = trainX.shape[0]
    feature_count = trainX.shape[1]
    fold_size = sample_count//folds
    
    average_accuracy = 0
    
    for i in range(folds):
        split_index1 = i*fold_size
        split_index2 = (i+1)*fold_size
        
        cv_testX = trainX[split_index1:split_index2]
        cv_testt = traint[split_index1:split_index2]
        cv_trainX = np.concatenate((trainX[:split_index1], trainX[split_index2:]))
        cv_traint = np.concatenate((traint[:split_index1], traint[split_index2:]))
              
        w = get_gd_weights(cv_trainX, cv_traint, 0.9, regularFactor)
        predictions = np.array(list(map(to_binary, h(cv_testX, w))))[:,None]
        
        #
        # Sklearn models
        #
        #model = get_sk_model(cv_trainX, cv_traint)
        #predictions = model.predict(cv_testX)
        #print(predictions[0:5])
        #print(cv_testt[0:5])
        
        acc = get_accuracy(predictions, cv_testt)
        average_accuracy += acc
        #print("Iteration and accuracy:", i, acc)
    
    #print("Total accuracy:", average_accuracy/folds)
    return average_accuracy/folds
    
def get_accuracy(predictions, actual):
    N = predictions.shape[0]
    temp = predictions-actual
    misses = np.count_nonzero(temp)
    hits = N-misses
    return hits/N
        
def make_predictions(X, w):
    preds = np.zeros((X.shape[0], 1))
    for i in range(X.shape[0]):
        x = X[i]
        preds[i] = to_binary(h(x, w))
    return preds

def filter_by_variance(trainX, keep_count=10):
    sample_count = trainX.shape[0]
    feature_count = trainX.shape[1]
    variances = np.zeros(feature_count)
    
    # Find variance of each feature
    for i in range(feature_count):
        feature = trainX.T[i]
        variance = np.var(feature)
        variances[i] = variance    
    
    features_by_variance = variances.argsort()[:keep_count]

    newX = np.zeros((sample_count, keep_count))
    for i in range(keep_count):
        newX[:,i] = trainX[:,features_by_variance[i]]

    return newX, features_by_variance



# Newton Raphson
"""def get_nr_weights(trainx, traint, iterations=20):
    n_weights = trainx.shape[1]
    
    w = np.zeros((n_weights,1))
    dx = np.diag(np.dot(trainx, trainx.T))[:,None]
    
    # allw - for debugging
    #allw = np.zeros((n_weights,iterations))

    for i in range(iterations):
        #allw[:,i] = w.flatten()
        P = 1.0/(1.0 + np.exp(-np.dot(trainx, w)))
        gw = -w + np.sum(trainx*np.tile(traint-P,(1,n_weights)), axis=0)[:,None]
        temp = trainx*np.tile(P*(1-P), (1,n_weights))
        hw = -np.eye(n_weights) - np.dot(temp.T, trainx)
        w = w - np.dot(np.linalg.inv(hw), gw)
    
    return w"""

# Gradient descent
def get_gd_weights(X, t, iterations=5000, alpha=0.9, regularFactor=0):
    sample_count = X.shape[0]
    weight_count = X.shape[1]
    w = np.zeros(weight_count)[:,None]
    
    for i in range(iterations):
        w = w*(1-(alpha*regularFactor)/sample_count) - alpha/sample_count * np.dot(X.T, h(X,w)-t)
        
        # Check if gradient descent is running
        if i%10000 == 0: print(w[0:4].ravel(), i)

    return w

# 22 is chosen as a heuristic
trainX, top_feature_indices = filter_by_variance(trainX, 22)

#w = get_nr_weights(trainX, traint)
w = get_gd_weights(trainX, traint, 300000, 0.9)

# Single CV
#cv(trainX, traint)

# Loop CV to find subset of features
# for feature_count in range(1, 112):
#     newX, _ = filter_by_ttest(trainX, traint, feature_count)
#     #newX, _ = filter_by_variance(trainX, feature_count)
#     cv(newX, traint)







def save_predictions(predictions, filename="predictions_logit.csv"):
    N = predictions.shape[0]
    output = np.ones((N, 2))
    output[:,0] = range(N)
    output[:,1] = predictions.ravel()
    np.savetxt(filename, output, fmt='%d', delimiter=",", header="Id,EpiOrStroma")
    print("Predictions saved")

# Select only top features if filtered
testX = testX[:,top_feature_indices]
preds = make_predictions(testX, w)
save_predictions(preds+1)

[0.00021787 0.04819778 0.10811003 0.01479095] 0
[-0.15435782  3.3444323   6.05047574  2.27144526] 10000
[-0.31994812  4.81570632  8.64652651  3.48190252] 20000
[-0.48390063  5.83542163 10.5562751   4.39316969] 30000
[-0.64689634  6.57307754 12.1027191   5.10490868] 40000
[-0.80951603  7.10266592 13.40074479  5.67233209] 50000
[-0.97221191  7.46873164 14.50966785  6.12839601] 60000
[-1.13525673  7.70262355 15.4682123   6.49528543] 70000
[-1.29878461  7.82816098 16.30498178  6.78924957] 80000
[-1.46284005  7.8641594  17.04236976  7.02282214] 90000
[-1.62741434  7.82579159 17.69837116  7.20598033] 100000
[-1.79246944  7.72543601 18.28761221  7.34682669] 110000
[-1.95795265  7.57326276 18.82203888  7.45203103] 120000
[-2.12380545  7.37766653 19.31142718  7.52713787] 130000
[-2.28996862  7.14560027 19.76378325  7.57679122] 140000
[-2.45638509  6.88283864 20.18566559  7.60490464] 150000
[-2.6230014   6.59418942 20.58244754  7.61479293] 160000
[-2.78976844  6.28366448 20.95853193  7.60927585]