In [None]:
import numpy as np

from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
import xgboost as xgb
import pandas as pd
import numpy as np
from glob import glob
from skimage import measure

In [1]:
def getRegionFromMask(mask):
    """Measure the labelled regions of an inverted, mean-thresholded mask.

    The mask is binarised against its own mean: entries strictly greater
    than the mean become 0.0 and every other entry becomes 1.0.  The binary
    image is labelled with ``measure.label`` and the integer label image is
    handed to ``measure.regionprops``.

    Parameters
    ----------
    mask : numpy.ndarray
        Array to threshold and label.

    Returns
    -------
    list
        The region-property objects returned by ``measure.regionprops``.
    """
    mean_value = np.mean(mask)
    # Inverted threshold: above-mean entries are zeroed, the rest are set to 1.
    binary = np.where(mask > mean_value, 0., 1.0)
    labelled = measure.label(binary).astype(int)
    return measure.regionprops(labelled)

def getFeatureVector(file_name):
    """Build the two-element feature vector for one saved mask file.

    Only two features are computed.  The other per-region statistics that
    were once sketched here in commented-out code (eccentricity, equivalent
    diameter, area-weighted centroid, max/average area) were never enabled
    and have been removed together with their unused accumulators.

    Parameters
    ----------
    file_name : str
        Path of a ``.npy`` file readable by ``np.load``.

    Returns
    -------
    numpy.ndarray
        Float array ``[num_nodule, total_area]`` where ``num_nodule`` is the
        number of regions returned by ``getRegionFromMask`` and
        ``total_area`` is the sum of their ``area`` attributes.  When no
        region is found the result is ``[0., 0.]``.
    """
    masks = np.load(file_name)
    # Slice kept as in the original call: a no-op for arrays with two or
    # more dimensions.
    regions = getRegionFromMask(masks[:, :])

    num_nodule = float(len(regions))
    total_area = 0.
    for region in regions:
        total_area += region.area

    # An empty region list naturally yields [0., 0.], so no special case
    # is needed.
    return np.array([num_nodule, total_area])


def createFeatureSet(nodfiles=None):
    """Extract features and labels for a set of mask files and save them.

    For every file the patient id is derived from the file name, its
    ``cancer`` label is looked up in ``data/stage1_labels.csv`` and its
    feature vector is computed with ``getFeatureVector``.  The results are
    written to ``dataX.npy`` (features) and ``dataY.npy`` (labels) in the
    current working directory.

    Parameters
    ----------
    nodfiles : sequence of str, optional
        Paths of the ``.npy`` mask files.  When omitted, every file matching
        ``training_set/*npy`` is used.

    Raises
    ------
    ValueError
        If a patient id does not match exactly one row of the labels file.
    """
    # `is None` rather than `== None`: the latter is evaluated element-wise
    # when an array is passed in.
    if nodfiles is None:
        noddir = "training_set/"
        nodfiles = glob(noddir + "*npy")
    truthdata = pd.read_csv('data/stage1_labels.csv')

    numfeatures = 2
    feature_array = np.zeros((len(nodfiles), numfeatures))
    truth_metric = np.zeros((len(nodfiles)))

    for i, nodfile in enumerate(nodfiles):
        # Patient id = last path component up to its first dot.  Using the
        # last component (instead of index 1) works for any directory depth.
        patID = nodfile.split("/")[-1].split(".")[0]
        labels = truthdata.loc[truthdata.id == patID, "cancer"]
        if len(labels) != 1:
            raise ValueError(
                "expected exactly one label for patient %r (from %r), found %d"
                % (patID, nodfile, len(labels)))
        # Store the scalar label, not a one-element Series.
        truth_metric[i] = labels.iloc[0]
        feature_array[i] = getFeatureVector(nodfile)

    np.save("dataY.npy", truth_metric)
    np.save("dataX.npy", feature_array)


def calculatelLoss(actual, predict):
    """Return the mean binary cross-entropy (log loss) of ``predict``.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` so that values of
    exactly 0 or 1 give a large but finite loss instead of ``inf``/``nan``.

    Parameters
    ----------
    actual : array-like
        True labels, 0 or 1.
    predict : array-like
        Predicted probabilities of the positive class (hard 0/1 predictions
        are accepted and clipped).

    Returns
    -------
    float
        ``-mean(actual*log(p) + (1 - actual)*log(1 - p))``.

    Raises
    ------
    ValueError
        If ``actual`` is empty.
    """
    epsilon = 1e-15
    actual = np.asarray(actual, dtype=float)
    if actual.size == 0:
        raise ValueError("calculatelLoss requires at least one sample")
    # Element-wise clipping.  np.max/np.min reduce an array and would treat
    # their second positional argument as an axis.
    predict = np.clip(np.asarray(predict, dtype=float), epsilon, 1 - epsilon)
    per_sample = actual * np.log(predict) + (1 - actual) * np.log(1 - predict)
    return -np.sum(per_sample) / len(actual)


def _crossValPredict(makeClassifier, X, Y, n_folds=3):
    """Collect out-of-fold predictions from a stratified k-fold split.

    A fresh classifier from ``makeClassifier()`` is fitted on each training
    fold.  Returns ``(y_label, y_prob)``: the hard predictions and the
    predicted probability of class 1 for every sample.
    """
    y_label = np.zeros_like(Y)
    y_prob = np.zeros(Y.shape, dtype=float)
    for train, test in KFold(Y, n_folds=n_folds):
        clf = makeClassifier()
        clf.fit(X[train, :], Y[train])
        y_label[test] = clf.predict(X[test, :])
        classes = list(clf.classes_)
        # If a training fold contained no positive sample there is no
        # column for class 1; the probability then stays at 0.
        if 1 in classes:
            y_prob[test] = clf.predict_proba(X[test, :])[:, classes.index(1)]
    return y_label, y_prob


def _printScores(Y, y_label, y_prob):
    """Print the classification report for ``y_label`` and the log loss of ``y_prob``."""
    print(classification_report(Y, y_label, target_names=["No Cancer", "Cancer"]))
    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("logloss: %s" % calculatelLoss(Y, y_prob))


def classify():
    """Rebuild the feature set and report scores for four predictors.

    Calls ``createFeatureSet()``, reloads ``dataX.npy``/``dataY.npy`` and
    prints a classification report plus log loss for: a random forest and an
    XGBoost classifier (3-fold stratified cross-validation), and the two
    constant baselines "all positive" and "all negative".

    For the two classifiers the log loss is computed from predicted
    probabilities, not from hard 0/1 predictions; the classification report
    still uses the hard predictions.
    """
    createFeatureSet()

    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")

    print("--Random Forest--")
    y_label, y_prob = _crossValPredict(
        lambda: RF(n_estimators=100, n_jobs=3), X, Y)
    _printScores(Y, y_label, y_prob)

    print("--Predicting all positive--")
    all_positive = np.ones(Y.shape)
    _printScores(Y, all_positive, all_positive)

    print("--Predicting all negative--")
    all_negative = np.zeros_like(Y)
    _printScores(Y, all_negative, all_negative)

    print("--XGBoost--")
    y_label, y_prob = _crossValPredict(
        lambda: xgb.XGBClassifier(objective="binary:logistic"), X, Y)
    _printScores(Y, y_label, y_prob)

In [None]:
# Run the pipeline: classify() rebuilds the feature files via
# createFeatureSet() and then prints the evaluation reports.
classify()