In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import describe
from collections import defaultdict
from sklearn import model_selection,svm,metrics
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations

In [2]:
def get_csv(ptnum,tube,markers,data_dir='C:\\python27\\FCSlog10\\'):
    """Load the selected marker columns of one tube file as an integer array.

    The file read is ``<data_dir>/<8*(ptnum-1)+tube>.csv`` (the numbering
    presumably reflects eight tube files per patient -- confirm against the
    data layout).

    Parameters
    ----------
    ptnum : int
        1-based patient number.
    tube : int
        Tube number added to the per-patient file offset.
    markers : sequence of int
        Column indices handed to ``np.genfromtxt(usecols=...)``.
    data_dir : str, optional
        Directory holding the numbered CSV files. Defaults to the location
        the notebook was originally run against.

    Returns
    -------
    numpy.ndarray
        2-D integer array with one column per entry of ``markers``. The first
        parsed row is discarded (presumably the header line, which
        ``genfromtxt`` cannot parse as numbers -- confirm).
    """
    import os  # local import: the notebook's import cell does not bring in os

    fcs = os.path.join(data_dir, str(8*(ptnum-1)+tube) + '.csv')
    parsed = np.genfromtxt(fcs, delimiter=',', usecols=markers)
    # Drop the first row before casting so it never reaches astype(int).
    return parsed[1:,:].astype(int)

def histo(data):
    """Count how many times each row of ``data`` occurs.

    Every row is converted to a tuple so it can serve as a dictionary key.

    Returns
    -------
    collections.defaultdict
        Maps ``tuple(row)`` to its number of occurrences; unseen keys read
        as 0.
    """
    counts = defaultdict(int)
    for key in map(tuple, data):
        counts[key] = counts[key] + 1
    return counts

def scale_data(data):
    """Return ``data`` rescaled column-wise by a freshly fitted MinMaxScaler.

    The scaler is created with its default settings, fitted on ``data`` and
    applied to the same ``data`` in one step.
    """
    return MinMaxScaler().fit_transform(data)

def get_combo(tube,markers):
    """Build the per-patient histogram table for one marker combination.

    Row ``ptnum-1`` holds the histogram of patient ``ptnum`` (patients 1
    through 359) for the given tube, computed by ``histo`` over the columns
    ``markers`` of that patient's file.

    Parameters
    ----------
    tube : int
        Tube number forwarded to ``get_csv``.
    markers : sequence of int
        Column indices forwarded to ``get_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per patient. The columns are created from patient 1's
        histogram keys.
        NOTE(review): bins that never occur for patient 1 presumably do not
        get a column, and bins missing for a later patient are left as NaN
        -- confirm this is the intended feature set.
    """
    start = time.time()
    combo_hist = pd.DataFrame(histo(get_csv(1,tube,markers)),index=[0])

    for ptnum in range(2,360):
        # `.ix` was removed in pandas 1.0. The row index here is integer
        # typed, where `.ix` was label-based, so `.loc` is the equivalent
        # label-based enlargement.
        combo_hist.loc[ptnum-1,:] = histo(get_csv(ptnum,tube,markers))

    print('combo complete - ' + str((time.time()-start)/60) + 'min')
    return combo_hist

def all_combos(tube):
    """Assemble and scale the feature matrix of one tube.

    A histogram table is built with ``get_combo`` for every 4-element
    combination of the marker columns 1..7; the tables are placed side by
    side, missing values are replaced by 0 and the result is passed through
    ``scale_data``.

    Parameters
    ----------
    tube : int
        Tube number forwarded to ``get_combo``.

    Returns
    -------
    Whatever ``scale_data`` returns for the combined table.
    """
    start = time.time()

    # Collect every per-combination table first and concatenate once;
    # concatenating inside the loop re-copies the growing frame each time.
    combo_frames = [get_combo(tube,markers) for markers in combinations(range(1,8),4)]
    tube_data = scale_data(pd.concat(combo_frames,axis = 1).fillna(0))

    print('tube complete - ' + str((time.time()-start)/60) + 'min')
    return tube_data

def gridsearch(X,Y,algorithm,params,cv,scoring):
    """Run a cross-validated grid search and return its results as a table.

    Parameters
    ----------
    X, Y
        Training features and labels passed to ``GridSearchCV.fit``.
    algorithm
        Estimator to tune.
    params
        Parameter grid (``param_grid`` of ``GridSearchCV``).
    cv
        Cross-validation splitter or fold specification.
    scoring
        Scorer used to rank the parameter settings.

    Returns
    -------
    pandas.DataFrame
        The fitted search's ``cv_results_`` converted to a DataFrame.
    """
    search = model_selection.GridSearchCV(
        algorithm,
        param_grid=params,
        cv=cv,
        scoring=scoring,
    )
    search.fit(X, Y)
    cv_table = pd.DataFrame(search.cv_results_)
    return cv_table

def split_and_fit(data):
    """Predict a label for every row of ``data`` with held-out LinearSVC fits.

    The rows are processed in consecutive blocks of 10. For each block, the
    remaining rows are used to (1) grid-search ``C`` over 10**-8 .. 10**4
    with a stratified shuffle split scored by F1 on the 'aml' class and
    (2) fit a ``LinearSVC`` with the best ``C``, which then predicts the
    held-out block.

    Parameters
    ----------
    data : numpy.ndarray
        Feature matrix whose rows line up with the rows of ``Y.csv``.

    Returns
    -------
    list
        Predicted labels, in row order, one per row of ``Y.csv``.
    """
    start = time.time()
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    n_rows = Y.shape[0]

    step = 10
    hiC = 4
    lowC = -8

    # Scorer, grid and splitter do not depend on the fold, so build them once.
    f1_scorer = metrics.make_scorer(metrics.f1_score, labels=['aml','normal'], pos_label='aml')
    params = {'C': np.logspace(lowC, hiC, hiC-lowC+1, endpoint=True)}
    cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

    predictions = list()

    for pt in range(0, n_rows, step):

        if pt == 180:
            print('pt 180 complete')

        # Clamp the block to the table instead of relying on an exception
        # raised by out-of-range labels to detect the short final block.
        stop = min(pt + step, n_rows)
        if stop - pt < step:
            print(pt)  # same notice the short final block produced before
        held_out = list(range(pt, stop))

        XTRAIN, XTEST = np.delete(data, held_out, 0), data[pt:stop]
        # `.values` replaces the removed `.as_matrix()`; the explicit length
        # keeps the check that Y carries exactly one label column.
        YTRAIN = Y.drop(held_out, axis = 0).values.reshape(n_rows - len(held_out),)

        results = gridsearch(XTRAIN, YTRAIN, svm.LinearSVC(), params, cv, f1_scorer)

        # Select the tuned C by column name rather than by column position.
        C = results.loc[results['mean_test_score'].idxmax(), 'param_C']
        print(C)

        linSVC = svm.LinearSVC(C = C)
        linSVC.fit(XTRAIN,YTRAIN)
        predictions.extend(linSVC.predict(XTEST))
    print('split_and_fit: ' + str((time.time()-start)/60) + 'min')
    return predictions

def pred_map():
    """Predict every tube and combine the per-tube calls per patient.

    For tubes 1..7 the feature matrix from ``all_combos`` is passed to
    ``split_and_fit`` and the resulting predictions are stored in a column
    named after the tube. A patient is finally called 'aml' when at least
    one tube column says 'aml', otherwise 'normal'.

    Returns
    -------
    master : pandas.DataFrame
        Contents of ``Y.csv`` plus one prediction column per tube.
    predictions : list of str
        Combined call for every row of ``master``, in row order.
    """
    master = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    tubes = list(range(1,8))

    for tube in tubes:
        data = all_combos(tube)
        master[tube] = split_and_fit(data)
        print('Tube ' + str(tube) + ' complete')

    # Cover every row of the label table rather than a hard-coded count, so
    # the combined calls always line up with `master`.
    any_aml = (master[tubes] == 'aml').any(axis=1)
    predictions = ['aml' if flagged else 'normal' for flagged in any_aml]
    return master,predictions

def f1(YPRED, YTEST=None):
    """Score predictions against the true labels, treating 'aml' as positive.

    Parameters
    ----------
    YPRED : sequence of str
        Predicted labels ('aml' / 'normal'), indexable by position.
    YTEST : sequence of str, optional
        True labels. When omitted they are read from the 'Condition' column
        of ``Y.csv``.

    Returns
    -------
    tuple or int
        ``(f1, recall, precision, TP, FP, FN)`` with the counts as floats.
        When a denominator is zero (no true or predicted positives, or both
        recall and precision are 0) the bare integer ``0`` is returned
        instead of a tuple; callers must handle both shapes.
    """
    if YTEST is None:
        YTEST = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')['Condition'].tolist()

    TP, FN, FP = 0., 0., 0.
    for i, truth in enumerate(YTEST):
        called = YPRED[i]
        # The three cases are mutually exclusive; true negatives are not
        # needed for F1 and are therefore not counted.
        if truth == 'aml' and called == 'aml':
            TP += 1
        elif truth == 'aml' and called == 'normal':
            FN += 1
        elif truth == 'normal' and called == 'aml':
            FP += 1

    try:
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        score = 2 * (recall * precision) / (recall + precision)
    except ZeroDivisionError:
        return 0

    return score, recall, precision, TP, FP, FN

In [3]:
# Run the whole pipeline: Y receives the label table with one prediction
# column per tube, predictions the combined per-patient call.
# Long-running: the log below reports over a minute per marker combination.
Y, predictions = pred_map()

combo complete - 1.31823333104min
combo complete - 1.26620000203min
combo complete - 1.19566666683min
combo complete - 1.1721666654min
combo complete - 1.19566666683min
combo complete - 1.18505000273min
combo complete - 1.18176666896min
combo complete - 1.21041666667min
combo complete - 1.19519999822min
combo complete - 1.17719999949min
combo complete - 1.18463333448min
combo complete - 1.16616666714min
combo complete - 1.19670000076min
combo complete - 1.26890000105min
combo complete - 1.26793333292min
combo complete - 1.24181666772min
combo complete - 1.27074999809min
combo complete - 1.27871666749min
combo complete - 1.2453666687min
combo complete - 1.25140000184min
combo complete - 1.25654999812min
combo complete - 1.24150000016min
combo complete - 1.18471666574min
combo complete - 1.18621666829min
combo complete - 1.18056666851min
combo complete - 1.17206666867min
combo complete - 1.16639999946min
combo complete - 1.1785833319min
combo complete - 1.17315000296min
combo complete - 

  'precision', 'predicted', average, warn_for)


0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.1
0.01
0.01
0.01
0.01
0.01
0.01
0.01
pt 180 complete
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
350
0.01
split_and_fit: 9.85091666381min
Tube 1 complete
combo complete - 1.25828333298min
combo complete - 1.21593333483min
combo complete - 1.23870000045min
combo complete - 1.20143333276min
combo complete - 1.21955000162min
combo complete - 1.21816666524min
combo complete - 1.2046333313min
combo complete - 1.18914999962min
combo complete - 1.26541666587min
combo complete - 1.27773333391min
combo complete - 1.29145000378min
combo complete - 1.27998333375min
combo complete - 1.28450000286min
combo complete - 1.2987833341min
combo complete - 1.28811666568min
combo complete - 1.29371666908min
combo complete - 1.2987833341min
combo complete - 1.26048333645min
combo complete - 1.21480000019min
combo complete - 1.19821666876min
combo complete - 1.19341666698min
combo complete - 1.20389999946min
combo co

In [4]:
# Persist the per-tube prediction table so the long run above need not be repeated.
Y.to_csv('C:\\python27\\4dhist.csv')

In [7]:
# Place the combined call at column position 1 as 'Ypred'.
# NOTE(review): this mutates Y in place, so the cell is not idempotent --
# re-running it with 'Ypred' already present is expected to fail.
Y.insert(1,'Ypred',predictions)


(0.9647058823529412, 0.9534883720930233, 41.0, 1.0, 2.0)

In [11]:
# Score the combined calls against the labels in Y.csv; the displayed tuple
# is (f1, recall, precision, TP, FP, FN).
f1(predictions)

(0.9647058823529412, 0.9534883720930233, 0.9761904761904762, 41.0, 1.0, 2.0)

In [8]:
# Raise the display limit to 360 rows so the whole table is rendered, then show it.
pd.options.display.max_rows = 360
Y

Unnamed: 0,Condition,Ypred,1,2,3,4,5,6,7
0,normal,normal,normal,normal,normal,normal,normal,normal,normal
1,normal,normal,normal,normal,normal,normal,normal,normal,normal
2,normal,normal,normal,normal,normal,normal,normal,normal,normal
3,normal,normal,normal,normal,normal,normal,normal,normal,normal
4,aml,aml,aml,aml,aml,aml,aml,aml,aml
5,normal,normal,normal,normal,normal,normal,normal,normal,normal
6,aml,normal,normal,normal,normal,normal,normal,normal,normal
7,normal,normal,normal,normal,normal,normal,normal,normal,normal
8,aml,aml,aml,aml,aml,aml,aml,aml,aml
9,normal,normal,normal,normal,normal,normal,normal,normal,normal


In [6]:
def split_and_fit(data):
    """Estimate a class probability for every row of ``data`` with held-out SVC fits.

    NOTE: this redefines the ``split_and_fit`` from the earlier cell; it uses
    a linear-kernel ``SVC`` with probability estimates instead of
    ``LinearSVC`` and returns probabilities instead of labels.

    The rows are processed in consecutive blocks of 10. For each block, the
    remaining rows are used to (1) grid-search ``C`` over 10**1 .. 10**6
    with a stratified shuffle split scored by F1 on the 'aml' class and
    (2) fit an ``SVC(kernel='linear', probability=True)`` with the best
    ``C``, which then scores the held-out block.

    Parameters
    ----------
    data : numpy.ndarray
        Feature matrix whose rows line up with the rows of ``Y.csv``.

    Returns
    -------
    list of float
        Column 0 of ``predict_proba`` for every row, in row order
        (presumably the 'aml' probability, since 'aml' sorts before
        'normal' -- confirm against ``classes_``).
    """
    start = time.time()
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    n_rows = Y.shape[0]

    step = 10
    hiC = 6
    lowC = 1

    # Scorer, grid and splitter do not depend on the fold, so build them once.
    f1_scorer = metrics.make_scorer(metrics.f1_score, labels=['aml','normal'], pos_label='aml')
    params = {'kernel': ['linear'], 'C': np.logspace(lowC, hiC, hiC-lowC+1, endpoint=True)}
    cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

    predictions = list()

    for pt in range(0, n_rows, step):

        if pt == 180:
            print('pt 180 complete')

        # Clamp the block to the table instead of relying on an exception
        # raised by out-of-range labels to detect the short final block.
        stop = min(pt + step, n_rows)
        if stop - pt < step:
            print(pt)  # same notice the short final block produced before
        held_out = list(range(pt, stop))

        XTRAIN, XTEST = np.delete(data, held_out, 0), data[pt:stop]
        # `.values` replaces the removed `.as_matrix()`; the explicit length
        # keeps the check that Y carries exactly one label column.
        YTRAIN = Y.drop(held_out, axis = 0).values.reshape(n_rows - len(held_out),)

        results = gridsearch(XTRAIN, YTRAIN, svm.SVC(), params, cv, f1_scorer)

        # Select the tuned C by column name rather than by column position.
        C = results.loc[results['mean_test_score'].idxmax(), 'param_C']
        print(C)

        linSVC = svm.SVC(kernel = 'linear', C = C, probability = True)
        linSVC.fit(XTRAIN,YTRAIN)
        predictions.extend(list(linSVC.predict_proba(XTEST)[:,0]))
    print('split_and_fit: ' + str((time.time()-start)/60) + 'min')
    return predictions

def pred_map():
    """Collect the ``split_and_fit`` output of every tube next to the labels.

    Starting from the contents of ``Y.csv``, one column per tube (1..7) is
    added, filled with what ``split_and_fit`` returns for that tube's
    ``all_combos`` feature matrix.

    Returns
    -------
    pandas.DataFrame
        The label table extended by the seven tube columns.
    """
    label_table = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')

    tube = 1
    while tube < 8:
        tube_features = all_combos(tube)
        label_table[tube] = split_and_fit(tube_features)
        print('Tube ' + str(tube) + ' complete')
        tube += 1

    return label_table

In [7]:
# Re-run with the redefined pred_map, which returns only the table.
# Rebinds Y, replacing the table produced by the earlier run.
Y = pred_map() 

combo complete - 1.18029999733min
combo complete - 1.16838333209min
combo complete - 1.17293333213min
combo complete - 1.1449666659min
combo complete - 1.16141666571min
combo complete - 1.13923333089min
combo complete - 1.13018333117min
combo complete - 1.16159999768min
combo complete - 1.18495000203min
combo complete - 1.14271666606min
combo complete - 1.14703333378min
combo complete - 1.07989999851min
combo complete - 1.07166666587min
combo complete - 1.08096666733min
combo complete - 1.07343333165min
combo complete - 1.07781666517min
combo complete - 1.16011666457min
combo complete - 1.1735833327min
combo complete - 1.12858333588min
combo complete - 1.10071666638min
combo complete - 1.10803333521min
combo complete - 1.09495000045min
combo complete - 1.14126666784min
combo complete - 1.1426333348min
combo complete - 1.14856666724min
combo complete - 1.1412833333min
combo complete - 1.13071666559min
combo complete - 1.12543333371min
combo complete - 1.09325000048min
combo complete - 1

In [8]:
# Display the per-tube values returned by the redefined split_and_fit.
Y

Unnamed: 0,Condition,1,2,3,4,5,6,7
0,normal,6.499880e-03,3.644383e-03,1.177276e-02,1.936144e-02,3.849198e-06,9.620666e-03,6.496179e-02
1,normal,1.371382e-06,3.670196e-03,3.774823e-07,3.289183e-03,4.169385e-06,2.563294e-03,2.735515e-03
2,normal,3.904726e-06,7.787156e-06,3.037031e-03,1.578893e-06,5.183742e-07,2.275450e-07,2.541850e-06
3,normal,2.691297e-03,4.271411e-03,5.983152e-03,4.747873e-07,2.203093e-02,1.244340e-01,2.567162e-02
4,aml,6.883395e-01,9.561998e-01,7.461582e-01,9.607898e-01,9.793576e-01,9.801510e-01,9.750465e-01
5,normal,2.383588e-06,8.312305e-03,3.266942e-06,9.311565e-07,4.648099e-03,2.178380e-06,9.145742e-07
6,aml,9.282207e-02,2.025373e-02,4.650753e-02,1.376758e-02,1.923277e-01,7.542855e-02,2.367138e-01
7,normal,2.859984e-03,3.595736e-02,6.939889e-03,7.139750e-03,9.029801e-06,1.847367e-05,6.759003e-03
8,aml,9.999994e-01,9.999877e-01,9.998034e-01,9.943091e-01,9.999999e-01,9.999913e-01,9.999995e-01
9,normal,6.387191e-03,1.220448e-02,3.791902e-03,3.751573e-03,2.911484e-03,1.382882e-05,6.489389e-06


In [10]:
# Show the whole table (display limit raised to 360 rows), rounded to three
# decimals for readability; Y itself is left unrounded.
pd.options.display.max_rows = 360
Y.round(3)

Unnamed: 0,Condition,1,2,3,4,5,6,7
0,normal,0.006,0.004,0.012,0.019,0.0,0.01,0.065
1,normal,0.0,0.004,0.0,0.003,0.0,0.003,0.003
2,normal,0.0,0.0,0.003,0.0,0.0,0.0,0.0
3,normal,0.003,0.004,0.006,0.0,0.022,0.124,0.026
4,aml,0.688,0.956,0.746,0.961,0.979,0.98,0.975
5,normal,0.0,0.008,0.0,0.0,0.005,0.0,0.0
6,aml,0.093,0.02,0.047,0.014,0.192,0.075,0.237
7,normal,0.003,0.036,0.007,0.007,0.0,0.0,0.007
8,aml,1.0,1.0,1.0,0.994,1.0,1.0,1.0
9,normal,0.006,0.012,0.004,0.004,0.003,0.0,0.0
