In [3]:
import pandas as pd
import numpy as np
import inspect
import time
from itertools import combinations
from sklearn import model_selection,svm,metrics,linear_model
from sklearn.preprocessing import MinMaxScaler

from collections import defaultdict

In [5]:
def get_csv(ptnum,tube,markers):
    file_name = str(8*(ptnum-1)+tube)
    fcs = 'C:\\python27\\FCSlog10\\' + file_name + '.csv'
    return np.genfromtxt(fcs, delimiter=',',usecols=markers).astype(int)

def histo(data):
    """
    Input array of cells x markers. Marker levels must already be converted to bin numbers, 
    i.e. signal of 815 of max 1000 is bin 8.
    For each cell, the bin numbers are combined into a combination specific to the n-dimensional
    hypercube in the n-dimensional data space. 
    i.e. Cell #415 has signal 457 for SSC and 180 for CD15. If we are only considering those 2
    markers, the 'index' for the 2-d histogram bin is '4,1'. 
    Defaultdict functions as a counter, incrementing for the appropriate bin index for every 
    cell in the array.    
    """
    
    d = defaultdict(int) 
    for cell in data:
        d[cell] += 1
    return d

def scale_data(data):
    """
    MinMaxScaler normalizes data to 0-1. -1 to 1 may also be used interchangeably.
    """
    
    scale = MinMaxScaler()
    #scale = MaxAbsScaler()
    data = scale.fit_transform(data)
    #data = data[:,data.sum(axis=0)!=0] # drop 100% empty columns
    return data

def get_combo(tube,markers):
    """
    Input a given tube and set of markers to consider. 
    Calls histo() for every patient and adds the result as a new row in a df. 
    Returns 359 row df. Columns represent bin indexes. Data is counts of cells falling in those bins.
    """
    
    start = time.time()
    combo_hist = pd.DataFrame(histo(get_csv(1,tube,markers)),index=[0])
    
    for ptnum in range(2,360):
        combo_hist.ix[ptnum-1,:] = histo(get_csv(ptnum,tube,markers))

    print('combo complete - ' + str((time.time()-start)/60) + 'min')
    return combo_hist

def all_combos(tube):
    """
    Input tube to consider.
    For every combination of markers (however many markers at a time we are considering),
    call tube_data() to get histo data for all cells for every patient for that combo.
    Iterate for every combo of markers and concat.
    Output is 359 patients x nCr dfs concat'ed horizontally, where m_columns in those 
    dfs depends on how many bins actually had cells (no cells in the bin for all 
    patients = not added to df).
    """
    
    start = time.time()
    firstcombo = True
    tube_data = pd.DataFrame()
    for markers in range(1,8):
        if firstcombo == True:
            tube_data = get_combo(tube,markers)
            firstcombo = False
        else:
            tube_data = pd.concat([tube_data, get_combo(tube,markers)],axis = 1)
    tube_data = scale_data(tube_data.fillna(0))
    
    print('tube complete - ' + str((time.time()-start)/60) + 'min')
    return tube_data

def gridsearch(X,Y,algorithm,params,cv,scoring):
    """
    Grid search iterator for finding C.
    Input data, labels, algo (SVM), range of C to test, CV method, scoring method)
    Output result grid
    """
    
    #start = time.time()
    grid = model_selection.GridSearchCV(algorithm, param_grid=params, cv=cv, scoring = scoring)
    grid.fit(X,Y)
    #print 'gridsearch: ' + str((time.time()-start)/60) + 'min'
    return pd.DataFrame(grid.cv_results_)

def split_and_fit(data):
    """
    Create preds from tube data generated from all_tubes().
    Input data from all_tubes(). 
    Create test folds of 72 (5 folds total). 
    For training 80% of data, run gridsearch and select C that produces highest test score.
    Use that C to predict on test fold.
    Repeat for remaining folds.  Add predictions to end of list of previous fold(s).
    Return 359 predictions.    
    """
    
    start = time.time()
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    #Y = Y.drop(339,axis = 0).reset_index(drop = True)
    #data = np.delete(data,339,0)
    
    predictions = list()
    
    step = 72
    ptlist = range(0,Y.shape[0])[::step]
    
    for pt in ptlist:

        if pt == 180:
            print('pt 180 complete')
        
        try:
            XTRAIN, XTEST = np.delete(data, range(pt,pt+step), 0), data[pt:pt+step]
            YTRAIN = Y.drop(range(pt,pt+step),axis = 0).as_matrix().reshape(Y.shape[0]-step,)
        except ValueError: 
            print pt
            XTRAIN, XTEST = np.delete(data, range(pt,Y.shape[0]), 0), data[pt:Y.shape[0]]
            YTRAIN = Y.drop(range(pt,Y.shape[0]),axis = 0).as_matrix().reshape(pt,)
        
        hiC = 6
        lowC = 1
        
        f1_scorer = metrics.make_scorer(metrics.f1_score, labels=['aml','normal'], pos_label='aml')
        algorithm, params, scoring = svm.LinearSVC(), {'C': np.logspace(lowC, hiC, hiC-lowC+1, endpoint=True)}, f1_scorer
        cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
        results = gridsearch(XTRAIN, YTRAIN, algorithm, params, cv, scoring)
        
        #gamma = results.ix[results['mean_test_score'].idxmax(),5]
        C = results.ix[results['mean_test_score'].idxmax(),4]
        #if (C==10**hiC)|(C==10**lowC):
        print C
                
        linSVC = svm.LinearSVC(C = C)
        linSVC.fit(XTRAIN,YTRAIN)
        predictions.extend(linSVC.predict(XTEST))
        #predictions.extend(list(logREG.predict_proba(XTEST)[:,0]))
    print 'split_and_fit: ' + str((time.time()-start)/60) + 'min'
    return predictions

def pred_map():
    """
    Create 359 x 8 map. First column is true label for all patients (gold standard). 
    Remaining columns are preds corresponding to 1 tube per column.
    Also return a separate master list of preds based on this map where if any one tube was 
    called positive, the master pred was positive.
    """
    
    master = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    for tube in range(1,8):
        data = all_combos(tube)
        master[tube] = split_and_fit(data)
        print('Tube ' + str(tube) + ' complete')
    
    predictions = []
    for i in range(0,359):
        if 'aml' in master[range(1,8)].loc[i].values.tolist():
            predictions.extend(['aml'])
        else: 
            predictions.extend(['normal'])
    return master,predictions

def f1(YPRED):
    """
    f1 function = 2 * harmonic mean of recall and precision.
    Input list of preds.
    Compare with gold standard labels.
    Output f1, recall, precision, TP, FP, FN
    """
    
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    YTEST = Y['Condition'].tolist()
    TP, FN, FP = 0., 0., 0.
    for i in range(0,len(YTEST)):
        if (YTEST[i] == 'aml') and (YPRED[i]=='aml'):
            TP += 1
        if (YTEST[i] == 'aml') and (YPRED[i]=='normal'):    
            FN += 1
        if (YTEST[i] == 'normal') and (YPRED[i]=='aml'):
            FP += 1
    try: 
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        f1 = 2 * (recall * precision) / (recall + precision)
    except ZeroDivisionError:
        return 0
        
    return f1, recall, precision, TP, FP, FN

In [10]:
Y, predictions = pred_map()
Y.to_csv("C:\\python27\\1dhist.csv")
Y.insert(1,"Ypred",predictions)
pd.options.display.max_rows = 360
f1(predictions)

combo complete - 0.491983334223min
combo complete - 0.49048333168min
combo complete - 0.489449997743min
combo complete - 0.490466666222min
combo complete - 0.500499999523min
combo complete - 0.524916668733min
combo complete - 0.523766668638min
tube complete - 3.51181666454min
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
pt 180 complete
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
350
10.0
split_and_fit: 0.3875min
Tube 1 complete
combo complete - 0.50693333149min
combo complete - 0.507033336163min
combo complete - 0.498099998633min
combo complete - 0.488083334764min
combo complete - 0.495800002416min
combo complete - 0.489283335209min
combo complete - 0.49678333203min
tube complete - 3.48213333289min
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
pt 180 complete
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
350
10.0
split_

(0.8118811881188119, 0.9534883720930233, 0.7068965517241379, 41.0, 17.0, 2.0)

In [11]:
Y

Unnamed: 0,Condition,Ypred,1,2,3,4,5,6,7
0,normal,normal,normal,normal,normal,normal,normal,normal,normal
1,normal,normal,normal,normal,normal,normal,normal,normal,normal
2,normal,normal,normal,normal,normal,normal,normal,normal,normal
3,normal,normal,normal,normal,normal,normal,normal,normal,normal
4,aml,aml,aml,aml,aml,aml,aml,aml,aml
5,normal,normal,normal,normal,normal,normal,normal,normal,normal
6,aml,normal,normal,normal,normal,normal,normal,normal,normal
7,normal,normal,normal,normal,normal,normal,normal,normal,normal
8,aml,aml,aml,aml,aml,aml,aml,aml,aml
9,normal,normal,normal,normal,normal,normal,normal,normal,normal


5 folds

In [4]:
Y, predictions = pred_map()
Y.to_csv("C:\\python27\\1dhist5fold.csv")
Y.insert(1,"Ypred",predictions)
pd.options.display.max_rows = 360
f1(predictions)
Y

combo complete - 0.582899999619min
combo complete - 0.550933333238min
combo complete - 0.5397666653min
combo complete - 0.541449999809min
combo complete - 0.544099998474min
combo complete - 0.551499998569min
combo complete - 0.544283334414min
tube complete - 3.8551333348min
10.0
10.0
10.0
10.0
288
10.0
split_and_fit: 0.0432500004768min
Tube 1 complete
combo complete - 0.574533331394min
combo complete - 0.557083332539min
combo complete - 0.551983332634min
combo complete - 0.551850001017min
combo complete - 0.553483335177min
combo complete - 0.569033332666min
combo complete - 0.570933334033min
tube complete - 3.92901666562min
10.0
10.0
100000.0
10.0
288
10.0
split_and_fit: 0.0417999982834min
Tube 2 complete
combo complete - 0.587299998601min
combo complete - 0.547300000985min
combo complete - 0.532616666953min
combo complete - 0.530716665586min
combo complete - 0.526400001844min
combo complete - 0.527383331458min
combo complete - 0.526333332062min
tube complete - 3.7783833305min
10.0
10.

Unnamed: 0,Condition,Ypred,1,2,3,4,5,6,7
0,normal,aml,normal,normal,normal,normal,normal,normal,aml
1,normal,normal,normal,normal,normal,normal,normal,normal,normal
2,normal,normal,normal,normal,normal,normal,normal,normal,normal
3,normal,normal,normal,normal,normal,normal,normal,normal,normal
4,aml,aml,aml,aml,aml,aml,aml,aml,aml
5,normal,normal,normal,normal,normal,normal,normal,normal,normal
6,aml,aml,normal,normal,normal,aml,normal,normal,normal
7,normal,normal,normal,normal,normal,normal,normal,normal,normal
8,aml,aml,aml,aml,aml,aml,aml,aml,aml
9,normal,normal,normal,normal,normal,normal,normal,normal,normal


In [6]:
f1(predictions)

(0.7766990291262136, 0.9302325581395349, 0.6666666666666666, 40.0, 20.0, 3.0)