In [None]:
import numpy as np
import scipy
from scipy import stats
import pylab as plt
%matplotlib inline

#EPSILON=0.5

# Load the data sets; the first line of every CSV file is skipped.
trainX = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
# Take the second column of y_train.csv as a column vector and shift it
# down by one (presumably mapping labels 1/2 to 0/1 -- confirm against the data).
traint = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)[:, 1].reshape(-1, 1) - 1
testX = np.loadtxt('X_test.csv', delimiter=',', skiprows=1)

"""def plot_pair(X, t, f1, f2):
    plt.figure()
    pos0 = np.where(t==0)[0]
    pos1 = np.where(t==1)[0]
    plt.plot(X[pos1,f1], X[pos1,f2],'bo')
    plt.plot(X[pos0,f1], X[pos0,f2],'ro')"""

def save_predictions(predictions, filename="class_logit.csv"):
    """Write predictions to a two-column CSV file.

    Each row holds the zero-based row index followed by the prediction,
    both formatted as integers, below the header line ``Id,EpiOrStroma``.

    NOTE: a second ``save_predictions`` is defined later at module level
    and rebinds this name before the function is called.

    Args:
        predictions: Array-like with one prediction per sample; it is
            flattened before writing.
        filename: Path of the CSV file to write.
    """
    labels = np.asarray(predictions).ravel()

    output = np.empty((labels.size, 2))
    output[:, 0] = np.arange(labels.size)
    output[:, 1] = labels
    # comments="" stops np.savetxt from prefixing the header with "# ",
    # which would turn the first column name into "# Id".
    np.savetxt(filename, output, fmt='%d', delimiter=",",
               header="Id,EpiOrStroma", comments="")
    
def to_binary(x):
    """Threshold a score at 0.5: return 0 when ``x < 0.5``, otherwise 1."""
    return 0 if x < 0.5 else 1

def h(x, w):
    """Hypothesis function: the sigmoid of the dot product of ``x`` and ``w``."""
    linear_score = np.dot(x, w)
    return sigmoid(linear_score)

def sigmoid(x):
    """Return the logistic sigmoid of ``x``, computed by ``scipy.special.expit``."""
    return scipy.special.expit(x)

def filter_by_ttest(trainX, traint, n=10):
    """Keep the ``n`` features whose values differ most between the classes.

    For every feature (column of ``trainX``) a two-sample t-test
    (``scipy.stats.ttest_ind`` with its default settings) compares the
    rows labelled 1 with all remaining rows. Features are ranked by
    ascending p-value and the first ``n`` are kept.

    Args:
        trainX: 2-D array of samples (rows) by features (columns).
        traint: Labels, one per row of ``trainX``; flattened before use.
            Rows whose label equals 1 form one group, all others the
            second group.
        n: Number of features to keep.

    Returns:
        A tuple ``(newX, features_by_distinction)`` where ``newX`` is a
        float array holding the selected columns of ``trainX`` in ranking
        order and ``features_by_distinction`` holds their column indices.
    """
    X = np.asarray(trainX, dtype=float)
    is_class_one = np.asarray(traint).ravel() == 1

    # Split the rows with a label mask so each feature value stays paired
    # with its own label, then test every column at once (axis=0).
    p_values = stats.ttest_ind(X[is_class_one], X[~is_class_one], axis=0)[1]

    # argsort places NaN p-values (e.g. from constant features) last, so
    # they are only selected when nothing better is left.
    features_by_distinction = np.argsort(p_values)[:n]

    return X[:, features_by_distinction], features_by_distinction

# Results in worse performance
"""def filter_by_variance(trainX, keep_count=10):
    sample_count = trainX.shape[0]
    feature_count = trainX.shape[1]
    variances = np.zeros(feature_count)
    
    # Find variance of each feature
    for i in range(feature_count):
        feature = trainX.T[i]
        variance = np.var(feature)
        variances[i] = variance    
    
    features_by_variance = variances.argsort()[feature_count-keep_count:]

    newX = np.zeros((sample_count, keep_count))
    for i in range(keep_count):
        newX[:,i] = trainX[:,features_by_variance[i]]

    return newX, features_by_variance"""

def get_optimal_weights(w, X, t, iterations=300, alpha=0.0001, verbose=True):
    """Fit logistic-regression weights with batch gradient descent.

    Every iteration computes the sigmoid probability of each sample under
    the current weights and moves all weights simultaneously along the
    negative gradient of the logistic loss, ``X.T @ (sigmoid(X @ w) - t)``.

    Args:
        w: Float NumPy array of initial weights, one per column of ``X``
            (shape ``(D,)`` or ``(D, 1)``). It is updated in place.
        X: 2-D array of samples (rows) by features (columns).
        t: Targets, one per row of ``X``; flattened before use.
        iterations: Number of gradient steps to take.
        alpha: Learning rate applied to the summed (not averaged) gradient.
        verbose: When true, print the first three weights after every
            iteration.

    Returns:
        The same array object ``w``, holding the fitted weights.
    """
    # Local import so the block does not depend on how scipy was imported
    # at module level; expit is the logistic sigmoid.
    from scipy.special import expit

    X = np.asarray(X, dtype=float)
    targets = np.asarray(t, dtype=float).reshape(-1)

    for _ in range(iterations):
        # The gradient uses the probabilities themselves: thresholding
        # them to 0/1 first discards the magnitude of the error and makes
        # the weights oscillate instead of converging.
        probabilities = expit(X.dot(w.reshape(-1)))
        gradient = X.T.dot(probabilities - targets)

        # In-place update of every weight at once, so callers holding a
        # reference to ``w`` see the new values.
        w -= alpha * gradient.reshape(w.shape)

        if verbose:
            print(w[0:3].T)

    return w

# Feature selection: keep the 10 features ranked best by the t-test.
# (Variance filtering gives rubbish at the moment, so it stays disabled.)
trainX, top_feature_indices = filter_by_ttest(trainX, traint, n=10)
#trainX, top_feature_indices = filter_by_variance(trainX, 60)

# Train, starting from an all-zero column vector with one weight per feature.
w = get_optimal_weights(np.zeros((trainX.shape[1], 1)), trainX, traint)

# Restrict the test set to the same selected feature columns.
testX = testX[:, top_feature_indices]

def save_predictions(predictions, filename="predictions_sklearn.csv"):
    """Write predictions to a two-column CSV file and report completion.

    Each row holds the zero-based row index followed by the prediction,
    both formatted as integers, below the header line ``Id,EpiOrStroma``.
    Prints ``Predictions saved`` once the file has been written.

    Args:
        predictions: Array-like with one prediction per sample; it is
            flattened before writing.
        filename: Path of the CSV file to write.
    """
    labels = np.asarray(predictions).ravel()

    output = np.empty((labels.size, 2))
    output[:, 0] = np.arange(labels.size)
    output[:, 1] = labels
    # comments="" stops np.savetxt from prefixing the header with "# ",
    # which would turn the first column name into "# Id".
    np.savetxt(filename, output, fmt='%d', delimiter=",",
               header="Id,EpiOrStroma", comments="")
    print("Predictions saved")

# Classify every test row as 0/1 and collect the results as a column vector.
preds = np.array(
    [[to_binary(h(sample, w))] for sample in testX], dtype=float
).reshape(-1, 1)
# Shift the 0/1 outputs up by one (undoing the -1 applied to the training
# labels) before writing them out.
save_predictions(preds + 1)

[[-0.01071876  0.00779731  0.00382983]]
[[-0.00527911  0.01703129  0.01807587]]
[[-0.01571354  0.01140752  0.01151097]]
[[-0.0264323   0.0036244   0.00171995]]
[[-0.02533097  0.00258822  0.00165868]]
[[-0.02348141  0.00257601  0.00258297]]
[[-0.02136136  0.00282812  0.00386132]]
[[-0.01736551  0.00702811  0.00980904]]
[[-0.01521939  0.01443428  0.02132136]]
[[-0.02575261  0.00782329  0.01334688]]
[[-3.64713741e-02  4.01763000e-05  3.55586830e-03]]
[[-0.03533708 -0.00093292  0.00357121]]
[[-0.03391232 -0.00143447  0.00399034]]
[[-0.03207168 -0.00146456  0.00485982]]
[[-0.02979346 -0.00082853  0.00648995]]
[[-0.02606049  0.00382506  0.01335159]]
[[-0.02422725  0.0110447   0.02463849]]
[[-0.03476047  0.00442794  0.01662889]]
[[-0.04547923 -0.00335517  0.00683787]]
[[-0.04405448 -0.00385673  0.007257  ]]
[[-0.04250623 -0.00419326  0.00783619]]
[[-0.04063908 -0.00417169  0.0087822 ]]
[[-0.03851561 -0.00291841  0.01092252]]
[[-0.03517455  0.00251455  0.01957859]]
[[-0.03469816  0.00898457  0