In [6]:
import numpy as np
import scipy
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import pylab as plt
%matplotlib inline

#EPSILON=0.5

# Datum
trainX = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
traint = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)
traint = traint[:,1][:,None]-1

testX = np.loadtxt('X_test.csv', delimiter=',', skiprows=1)

"""def plot_pair(X, t, f1, f2):
    plt.figure()
    pos0 = np.where(t==0)[0]
    pos1 = np.where(t==1)[0]
    plt.plot(X[pos1,f1], X[pos1,f2],'bo')
    plt.plot(X[pos0,f1], X[pos0,f2],'ro')"""

def save_predictions(predictions, filename="class_logit.csv"):
    N = predictions.shape[0]
    output = np.ones((N, 2))
    output[:,0] = range(N)
    output[:,1] = predictions.ravel()
    np.savetxt(filename, output, fmt='%d', delimiter=",", header="Id,EpiOrStroma")
    
def to_binary(x):
    if x < 0.5:
        return 0
    return 1

def h(x, w):
    return sigmoid(np.dot(x, w))

def sigmoid(x):
    val = scipy.special.expit(x)
    return val

def cv(trainX, traint, folds=20, regularFactor=0):
    sample_count = trainX.shape[0]
    feature_count = trainX.shape[1]
    fold_size = sample_count//folds
    
    average_accuracy = 0
    
    for i in range(folds):
        split_index1 = i*fold_size
        split_index2 = (i+1)*fold_size
        
        cv_testX = trainX[split_index1:split_index2]
        cv_testt = traint[split_index1:split_index2]
        cv_trainX = np.concatenate((trainX[:split_index1], trainX[split_index2:]))
        cv_traint = np.concatenate((traint[:split_index1], traint[split_index2:]))
        
        #
        # Homemade models
        #        
        #w = get_nr_weights(cv_trainX, cv_traint)
        w = get_gd_weights(cv_trainX, cv_traint, 0.9, regularFactor)
        
        # Regularise
        #w = np.identity(feature_count)
        
        predictions = np.array(list(map(to_binary, h(cv_testX, w))))[:,None]
        #print(predictions)
        
        #
        # Sklearn models
        #
        #model = get_sk_model(cv_trainX, cv_traint)
        #predictions = model.predict(cv_testX)
        #print(predictions[0:5])
        #print(cv_testt[0:5])
        
        acc = get_accuracy(predictions, cv_testt)
        average_accuracy += acc
        #print("Iteration and accuracy:", i, acc)
    
    print("Total accuracy:", average_accuracy/folds)
    return average_accuracy/folds
    
def get_accuracy(predictions, actual):
    N = predictions.shape[0]
    temp = predictions-actual
    misses = np.count_nonzero(temp)
    hits = N-misses
    return hits/N
        
def make_predictions(X, w):
    preds = np.zeros((X.shape[0], 1))
    for i in range(X.shape[0]):
        x = X[i]
        preds[i] = to_binary(h(x, w))
    return preds

def filter_by_variance(trainX, keep_count=10):
    sample_count = trainX.shape[0]
    feature_count = trainX.shape[1]
    variances = np.zeros(feature_count)
    
    # Find variance of each feature
    for i in range(feature_count):
        feature = trainX.T[i]
        variance = np.var(feature)
        variances[i] = variance    
    
    features_by_variance = variances.argsort()[:keep_count]

    newX = np.zeros((sample_count, keep_count))
    for i in range(keep_count):
        newX[:,i] = trainX[:,features_by_variance[i]]

    return newX, features_by_variance

def get_sk_model(trainX, traint):
    clf = LogisticRegression()
    model = clf.fit(trainX, traint.ravel())
    return model

"""def predict_sklearn(model, testX):
    predictions = model.predict(testX)
    return predictions"""

# Newton Raphson
"""def get_nr_weights(trainx, traint, iterations=20):
    n_weights = trainx.shape[1]
    
    w = np.zeros((n_weights,1))
    dx = np.diag(np.dot(trainx, trainx.T))[:,None]
    
    # allw - for debugging
    #allw = np.zeros((n_weights,iterations))

    for i in range(iterations):
        #allw[:,i] = w.flatten()
        P = 1.0/(1.0 + np.exp(-np.dot(trainx, w)))
        gw = -w + np.sum(trainx*np.tile(traint-P,(1,n_weights)), axis=0)[:,None]
        temp = trainx*np.tile(P*(1-P), (1,n_weights))
        hw = -np.eye(n_weights) - np.dot(temp.T, trainx)
        w = w - np.dot(np.linalg.inv(hw), gw)
    
    return w"""

# Gradient descent
def get_gd_weights(X, t, iterations=5000, alpha=0.9, regularFactor=0):
    sample_count = X.shape[0]
    weight_count = X.shape[1]
    w = np.zeros(weight_count)[:,None]
    
    # Plot
    #allw = np.zeros((weight_count,iterations))
    
    for i in range(iterations):
        #allw[:,i] = w.flatten()
        w = w*(1-(alpha*regularFactor)/sample_count) - alpha/sample_count * np.dot(X.T, h(X,w)-t)
        
        # Debug
        #if i%10000 == 0: print(w[0:5].T, i)
            
        #print("Updated weights after iteration", iters)

#     [plt.plot(allw[w,:]) for w in range(weight_count)]
#     plt.xlabel('Iteration')
#     plt.ylabel('w')

    return w

# Filter by ttest
#trainX, top_feature_indices = filter_by_ttest(trainX, traint, 25)
trainX, top_feature_indices = filter_by_variance(trainX, 22)

#w = get_nr_weights(trainX, traint)
#w = get_gd_weights(trainX, traint, 300000, 0.9)



# Single CV
#cv(trainX, traint)

# Loop CV to find subset of features
# for feature_count in range(1, 112):
#     newX, _ = filter_by_ttest(trainX, traint, feature_count)
#     #newX, _ = filter_by_variance(trainX, feature_count)
#     cv(newX, traint)

def test_regularisation(X, PRP):
    for regularFactor in [0, 0.01, 0.1, 1, 10, 100, 1000]:
        print("Validation loss at alpha:", cv(X, PRP, 20, regularFactor), alpha)

test_regularisation(trainX, traint)







def save_predictions(predictions, filename="classification_predictions.csv"):
    N = predictions.shape[0]
    
    output = np.ones((N, 2))
    output[:,0] = range(N)
    output[:,1] = predictions.ravel()
    np.savetxt(filename, output, fmt='%d', delimiter=",", header="Id,EpiOrStroma")
    print("Predictions saved")

# Select only top features if filtered
testX = testX[:,top_feature_indices]

# Uncoment line below to predic w with logistical
#preds = make_predictions(testX, w)

# Uncomment block below to predict w with sklearn
model = get_sk_model(trainX, traint)
preds = model.predict(testX)

# Save
#save_predictions(preds+1)

TypeError: 'float' object cannot be interpreted as an integer