In [9]:
import numpy as np
import scipy
from scipy import stats
from sklearn.ensemble import RandomForestClassifier

# Load the comma-separated data files; skiprows=1 skips the first line of each.
trainX = np.loadtxt('X_train.csv', skiprows=1, delimiter=',')
# Take column 1 of the label file, shape it as an (N, 1) column vector and
# shift every value down by one.
traint = np.loadtxt('y_train.csv', skiprows=1, delimiter=',')[:, 1].reshape(-1, 1) - 1
testX = np.loadtxt('X_test.csv', skiprows=1, delimiter=',')

def cv(trainX, traint, folds=20):
    """Estimate accuracy with contiguous k-fold cross-validation.

    The samples are split, in their existing order (no shuffling), into
    ``folds`` contiguous validation slices.  For every slice a fresh model is
    fitted with ``get_sk_model`` on the remaining samples and scored with
    ``get_accuracy`` on the slice.  Per-fold shapes and accuracies are
    printed as the loop runs.

    Args:
        trainX: 2-D feature array, one row per sample.
        traint: Targets aligned with the rows of ``trainX``.
        folds: Number of folds; must satisfy ``2 <= folds <= n_samples``.

    Returns:
        The unweighted mean of the per-fold accuracies.

    Raises:
        ValueError: If ``folds`` is outside the valid range.
    """
    sample_count = trainX.shape[0]
    if not 2 <= folds <= sample_count:
        raise ValueError(
            "folds must be between 2 and the number of samples "
            "(%d), got %r" % (sample_count, folds)
        )

    # Integer boundaries that spread any remainder over the folds, so every
    # sample lands in exactly one validation slice.  When sample_count is a
    # multiple of folds this is identical to fixed-size slices.
    bounds = [(i * sample_count) // folds for i in range(folds + 1)]

    average_accuracy = 0

    for i in range(folds):
        start, stop = bounds[i], bounds[i + 1]

        cv_testX = trainX[start:stop]
        cv_testt = traint[start:stop]
        cv_trainX = np.concatenate((trainX[:start], trainX[stop:]))
        cv_traint = np.concatenate((traint[:start], traint[stop:]))

        model = get_sk_model(cv_trainX, cv_traint)
        predictions = model.predict(cv_testX)

        print(predictions.shape)
        print(cv_testt.shape)
        # Flatten the targets so they line up element-wise with the 1-D
        # predictions.
        acc = get_accuracy(predictions, cv_testt.ravel())
        average_accuracy += acc
        print("Iteration and accuracy:", i, acc)

    return average_accuracy/folds

def get_accuracy(predictions, actual):
    """Return the fraction of predictions that equal the actual labels.

    Both inputs are flattened before comparison, so a ``(N,)`` prediction
    vector and a ``(N, 1)`` label column are compared element by element
    instead of being broadcast against each other into an ``(N, N)`` matrix.

    Args:
        predictions: Array-like of predicted labels.
        actual: Array-like of true labels with the same number of elements.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If the inputs hold a different number of elements.
        ZeroDivisionError: If the inputs are empty.
    """
    predictions = np.asarray(predictions).ravel()
    actual = np.asarray(actual).ravel()
    if predictions.shape != actual.shape:
        raise ValueError(
            "predictions and actual must have the same number of elements, "
            "got %d and %d" % (predictions.size, actual.size)
        )

    N = predictions.shape[0]
    misses = int(np.count_nonzero(predictions != actual))
    hits = N - misses
    return hits/N

def get_sk_model(trainX, traint):
    """Fit a RandomForestClassifier with default settings and return it.

    Args:
        trainX: Feature array passed to ``fit`` unchanged.
        traint: Target array; it is flattened with ``ravel()`` before fitting.

    Returns:
        The fitted classifier.
    """
    flat_targets = traint.ravel()
    forest = RandomForestClassifier()
    # fit() returns the estimator itself, so the fitted forest is the model.
    return forest.fit(trainX, flat_targets)

def filter_by_variance(trainX, keep_count=10):
    """Select ``keep_count`` feature columns of ``trainX`` ranked by variance.

    Columns are ordered by ascending variance and the first ``keep_count`` of
    them are kept.  If ``keep_count`` exceeds the number of columns, all
    columns are returned (in ascending-variance order) instead of failing.

    Args:
        trainX: 2-D array, one row per sample and one column per feature.
        keep_count: Maximum number of feature columns to keep.

    Returns:
        A tuple ``(newX, features_by_variance)``: the float array of selected
        columns and the indices of those columns in ``trainX``.

    Raises:
        ValueError: If ``keep_count`` is negative.
    """
    if keep_count < 0:
        raise ValueError("keep_count must be non-negative, got %r" % (keep_count,))

    # Variance of every feature column in a single vectorized call.
    variances = np.var(trainX, axis=0)

    # NOTE(review): argsort is ascending, so this keeps the features with the
    # SMALLEST variance.  The call site refers to them as "top" features --
    # confirm whether the highest-variance features were intended.
    features_by_variance = variances.argsort()[:keep_count]

    # Fancy indexing copies the selected columns; the result is always float.
    newX = np.asarray(trainX[:, features_by_variance], dtype=float)

    return newX, features_by_variance

def save_predictions(predictions, filename="predictions_forest.csv"):
    """Write predictions to a two-column CSV file with an ``Id`` column.

    Each row holds the zero-based position of a prediction followed by the
    prediction itself, both formatted as integers.  The first line of the
    file is the plain header ``Id,EpiOrStroma``.

    Args:
        predictions: Array of predicted labels; it is flattened before
            writing.
        filename: Path of the CSV file to create or overwrite.
    """
    flat = np.asarray(predictions).ravel()
    output = np.column_stack((np.arange(flat.shape[0]), flat))
    # comments="" stops savetxt from prefixing the header with "# ", which
    # would otherwise corrupt the first column name.
    np.savetxt(filename, output, fmt='%d', delimiter=",",
               header="Id,EpiOrStroma", comments="")
    print("Predictions saved")

# Reduce the training features to 22 columns with filter_by_variance; 22 is
# chosen as a heuristic.  The selected column indices are kept so the same
# selection can be applied to the test set below.
trainX, top_feature_indices = filter_by_variance(trainX, 22)
    
# Fit the final model on the whole filtered training set.
model = get_sk_model(trainX, traint)

# Run cross-validation for its printed per-fold accuracies; the mean accuracy
# it returns is not stored here.
cv(trainX, traint)

# Select only top features if filtered: index the test set with the same
# columns that were kept for training.
testX = testX[:,top_feature_indices]
preds = model.predict(testX)
# Saving is currently disabled.  The +1 would undo the -1 shift applied to the
# training labels when they were loaded.
#save_predictions(preds+1)

(30,)
(30, 1)
Iteration and accuracy: 0 0.9666666666666667
(30,)
(30, 1)
Iteration and accuracy: 1 0.9
(30,)
(30, 1)
Iteration and accuracy: 2 0.7
(30,)
(30, 1)
Iteration and accuracy: 3 0.8333333333333334
(30,)
(30, 1)
Iteration and accuracy: 4 0.8333333333333334
(30,)
(30, 1)
Iteration and accuracy: 5 0.9333333333333333
(30,)
(30, 1)
Iteration and accuracy: 6 0.9
(30,)
(30, 1)
Iteration and accuracy: 7 0.9666666666666667
(30,)
(30, 1)
Iteration and accuracy: 8 0.9
(30,)
(30, 1)
Iteration and accuracy: 9 1.0
(30,)
(30, 1)
Iteration and accuracy: 10 0.9333333333333333
(30,)
(30, 1)
Iteration and accuracy: 11 0.8666666666666667
(30,)
(30, 1)
Iteration and accuracy: 12 0.9666666666666667
(30,)
(30, 1)
Iteration and accuracy: 13 0.9333333333333333
(30,)
(30, 1)
Iteration and accuracy: 14 0.9666666666666667
(30,)
(30, 1)
Iteration and accuracy: 15 0.9666666666666667
(30,)
(30, 1)
Iteration and accuracy: 16 0.8666666666666667
(30,)
(30, 1)
Iteration and accuracy: 17 0.9666666666666667
(30,)