In [1]:
import numpy as np
import pandas as pd
import time

from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
from sklearn.svm import LinearSVC

from sklearn import model_selection, metrics, svm
from scipy.stats import describe
from itertools import combinations

In [2]:
def get_csv(ptnum, tube):
    fcs = 'C:\\python27\\FCSlog10\\' + str((ptnum-1)*8+tube) + '.csv' 
    return np.genfromtxt(fcs, delimiter=',',usecols = range(1,8))[1:,:]

def get_histo(data):
    line = [0]*1000
    for cell in data:
        line[int(cell[0] + cell[1]*10 + cell[2]*10**2)] += 1
    return line

def get_tube(data):
    pt_histo = []
    n = 0
    for combo in combinations(range(7),3):
        if n == 0:
            pt_histo = get_histo(data[:,combo])
        else:
            pt_histo = pt_histo + get_histo(data[:,combo])
        n += 1  
    return pt_histo

def all_pts(tubenum):
    start = time.time()
    all_histo = np.zeros((359,35000))
    for ptnum in range(1,360):
        if ptnum == 180:
            print 'gathering pt 180 data'
        data = get_csv(ptnum,tubenum)
        all_histo[ptnum-1,:] = get_tube(data)
    all_histo = all_histo[:,all_histo.sum(axis=0)!=0]
    print 'all_pts - ' + str((time.time()-start)/60) + 'min'
    
    scale = MinMaxScaler()
    all_histo = scale.fit_transform(all_histo)
    return all_histo

def all_tubes():
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    for tubenum in range(1,8):
        data = all_pts(tubenum)
        preds = split_and_fit(data)
        Y[tubenum]=preds
    return Y

In [3]:
def gridsearch(X,Y,algorithm,params,cv,scoring):
    #start = time.time()
    grid = model_selection.GridSearchCV(algorithm, param_grid=params, cv=cv, scoring = scoring)
    grid.fit(X,Y)
    #print 'gridsearch: ' + str((time.time()-start)/60) + 'min'
    return pd.DataFrame(grid.cv_results_)

def split_and_fit(data):
    start = time.time()
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    #Y = Y.drop(115,axis = 0).reset_index(drop = True)
    #data = np.delete(data,115,0)
    
    predictions = list()
    
    step = 10
    ptlist = range(0,Y.shape[0])[::step]
    
    for pt in ptlist:

        if pt == 180:
            print('pt 180 complete')
        
        try:
            XTRAIN, XTEST = np.delete(data, range(pt,pt+step), 0), data[pt:pt+step]
            YTRAIN = Y.drop(range(pt,pt+step),axis = 0).as_matrix().reshape(Y.shape[0]-step,)
        except ValueError: 
            print pt
            XTRAIN, XTEST = np.delete(data, range(pt,Y.shape[0]), 0), data[pt:Y.shape[0]]
            YTRAIN = Y.drop(range(pt,Y.shape[0]),axis = 0).as_matrix().reshape(pt,)
        
        f1_scorer = metrics.make_scorer(metrics.f1_score, labels=['aml','normal'], pos_label='aml')
        algorithm, params, scoring = svm.SVC(), {'C': np.logspace(1, 5, 5, endpoint=True)}, f1_scorer
        cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
        results = gridsearch(XTRAIN, YTRAIN, algorithm, params, cv, scoring)
        
        #gamma = results.ix[results['mean_test_score'].idxmax(),5]
        C = results.ix[results['mean_test_score'].idxmax(),4]
        print C
        
        linSVC = svm.SVC(C = C,probability=False)
        linSVC.fit(XTRAIN,YTRAIN)
        predictions.extend(linSVC.predict(XTEST))
        #predictions.extend(list(linSVC.predict_proba(XTEST)[:,0]))
    print 'split_and_fit: ' + str((time.time()-start)/60) + 'min'
    return predictions

def f1(YPRED):
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    YTEST = Y['Condition'].tolist()
    TP, FN, FP = 0., 0., 0.
    for i in range(0,len(YTEST)):
        if (YTEST[i] == 'aml') and (YPRED[i]=='aml'):
            TP += 1
        if (YTEST[i] == 'aml') and (YPRED[i]=='normal'):    
            FN += 1
        if (YTEST[i] == 'normal') and (YPRED[i]=='aml'):
            FP += 1
    try: 
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        f1 = 2 * (recall * precision) / (recall + precision)
    except ZeroDivisionError:
        return 0
        
    return f1, recall, TP, FP, FN



In [4]:
preds = all_tubes()

gathering pt 180 data
all_pts - 7.04088333448min


  'precision', 'predicted', average, warn_for)


1000.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
pt 180 complete
100.0
100.0
100.0
1000.0
100.0
100.0
100.0
100.0
1000.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
350
100.0
split_and_fit: 41.4499499997min
gathering pt 180 data
all_pts - 7.00849999984min
1000.0
1000.0
100.0
1000.0
100.0
1000.0
100.0
1000.0
1000.0
100.0
100.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
pt 180 complete
1000.0
1000.0
1000.0
1000.0
100.0
100.0
1000.0
1000.0
1000.0
1000.0
100.0
100.0
1000.0
1000.0
1000.0
1000.0
1000.0
350
1000.0
split_and_fit: 42.7357833306min
gathering pt 180 data
all_pts - 7.04453333219min
1000.0
100.0
1000.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
pt 180 complete
100.0
100.0
100.0
100.0
100.0
1000.0
100.0
100.0
1000.0
100.0
100.0
100.0
100.0
1000.0
100.0
100.0
100.0
350
100.0
split_and_fit: 39.8372666677min
gathering pt 180 data
all_pts - 7.02591666778min
1000.0
1000

In [11]:
preds.loc[0].values.tolist()

['normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal']

In [29]:
predictions = []
for row in range(0,359):
    if 'aml' in preds.iloc[row,2:9].values.tolist():
        predictions.extend(['aml'])
    else:
        predictions.extend(['normal'])
#preds.insert(1,'Ypred',predictions)
preds['Ypred'] = predictions
preds

Unnamed: 0,Condition,Ypred,1,2,3,4,5,6,7
0,normal,normal,normal,normal,normal,normal,normal,normal,normal
1,normal,normal,normal,normal,normal,normal,normal,normal,normal
2,normal,normal,normal,normal,normal,normal,normal,normal,normal
3,normal,normal,normal,normal,normal,normal,normal,normal,normal
4,aml,aml,aml,aml,normal,aml,aml,aml,aml
5,normal,normal,normal,normal,normal,normal,normal,normal,normal
6,aml,aml,normal,normal,normal,aml,normal,normal,normal
7,normal,normal,normal,normal,normal,normal,normal,normal,normal
8,aml,aml,aml,aml,aml,aml,aml,aml,aml
9,normal,normal,normal,normal,normal,normal,normal,normal,normal


In [30]:
f1(predictions)

(0.9411764705882352, 0.9302325581395349, 40.0, 2.0, 3.0)

In [5]:
from collections import defaultdict

def get_csv(ptnum,tube,markers):
    fcs = 'C:\\python27\\FCSlog10\\' + str(8*(ptnum-1)+tube) + '.csv'
    return np.genfromtxt(fcs, delimiter=',',usecols=markers)[1:,:].astype(int)

def histo(data):
    d = defaultdict(int) 
    for cell in data:
        d[tuple(cell)] += 1
    return d

def scale_data(data):
    scale = MinMaxScaler()
    #scale = MaxAbsScaler()
    data = scale.fit_transform(data)
    #data = data[:,data.sum(axis=0)!=0] # drop 100% empty columns
    return data

def get_combo(tube,markers):
    start = time.time()
    combo_hist = pd.DataFrame(histo(get_csv(1,tube,markers)),index=[0])
    
    for ptnum in range(2,360):
        combo_hist.ix[ptnum-1,:] = histo(get_csv(ptnum,tube,markers))

    print('combo complete - ' + str((time.time()-start)/60) + 'min')
    return combo_hist

def all_combos(tube):
    start = time.time()
    firstcombo = True
    tube_data = pd.DataFrame()
    for markers in combinations(range(1,8),3):
        if firstcombo == True:
            tube_data = get_combo(tube,markers)
            firstcombo = False
        else:
            tube_data = pd.concat([tube_data, get_combo(tube,markers)],axis = 1)
    tube_data = scale_data(tube_data.fillna(0))
    
    print('tube complete - ' + str((time.time()-start)/60) + 'min')
    return tube_data

def gridsearch(X,Y,algorithm,params,cv,scoring):
    #start = time.time()
    grid = model_selection.GridSearchCV(algorithm, param_grid=params, cv=cv, scoring = scoring)
    grid.fit(X,Y)
    #print 'gridsearch: ' + str((time.time()-start)/60) + 'min'
    return pd.DataFrame(grid.cv_results_)

def split_and_fit(data):
    start = time.time()
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    #Y = Y.drop(339,axis = 0).reset_index(drop = True)
    #data = np.delete(data,339,0)
    
    predictions = list()
    
    step = 10
    ptlist = range(0,Y.shape[0])[::step]
    
    for pt in ptlist:

        if pt == 180:
            print('pt 180 complete')
        
        try:
            XTRAIN, XTEST = np.delete(data, range(pt,pt+step), 0), data[pt:pt+step]
            YTRAIN = Y.drop(range(pt,pt+step),axis = 0).as_matrix().reshape(Y.shape[0]-step,)
        except ValueError: 
            print pt
            XTRAIN, XTEST = np.delete(data, range(pt,Y.shape[0]), 0), data[pt:Y.shape[0]]
            YTRAIN = Y.drop(range(pt,Y.shape[0]),axis = 0).as_matrix().reshape(pt,)
        
        hiC = 6
        lowC = 1
        
        f1_scorer = metrics.make_scorer(metrics.f1_score, labels=['aml','normal'], pos_label='aml')
        algorithm, params, scoring = svm.LinearSVC(), {'C': np.logspace(lowC, hiC, hiC-lowC+1, endpoint=True)}, f1_scorer
        cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
        results = gridsearch(XTRAIN, YTRAIN, algorithm, params, cv, scoring)
        
        #gamma = results.ix[results['mean_test_score'].idxmax(),5]
        C = results.ix[results['mean_test_score'].idxmax(),4]
        #if (C==10**hiC)|(C==10**lowC):
        print C
                
        linSVC = svm.LinearSVC(C = C)
        linSVC.fit(XTRAIN,YTRAIN)
        predictions.extend(linSVC.predict(XTEST))
        #predictions.extend(list(logREG.predict_proba(XTEST)[:,0]))
    print 'split_and_fit: ' + str((time.time()-start)/60) + 'min'
    return predictions

def pred_map():
    master = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    for tube in range(1,8):
        data = all_combos(tube)
        master[tube] = split_and_fit(data)
        print('Tube ' + str(tube) + ' complete')
    
    predictions = []
    for i in range(0,359):
        if 'aml' in master[range(1,8)].loc[i].values.tolist():
            predictions.extend(['aml'])
        else: 
            predictions.extend(['normal'])
    return master,predictions

def f1(YPRED):
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    YTEST = Y['Condition'].tolist()
    TP, FN, FP = 0., 0., 0.
    for i in range(0,len(YTEST)):
        if (YTEST[i] == 'aml') and (YPRED[i]=='aml'):
            TP += 1
        if (YTEST[i] == 'aml') and (YPRED[i]=='normal'):    
            FN += 1
        if (YTEST[i] == 'normal') and (YPRED[i]=='aml'):
            FP += 1
    try: 
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        f1 = 2 * (recall * precision) / (recall + precision)
    except ZeroDivisionError:
        return 0
        
    return f1, recall, precision, TP, FP, FN

In [4]:
Y, predictions = pred_map()
Y.to_csv('C:\\python27\\3dhist.csv')
Y.insert(1,"Ypred",predictions)
pd.options.display.max_rows = 360

combo complete - 0.984083334605min
combo complete - 0.974666666985min
combo complete - 0.959033334255min
combo complete - 0.969900000095min
combo complete - 0.957233333588min
combo complete - 0.94076666832min
combo complete - 0.973283330599min
combo complete - 1.00434999863min
combo complete - 0.963433333238min
combo complete - 0.966166667144min
combo complete - 0.968566668034min
combo complete - 0.963616665204min
combo complete - 0.95783333381min
combo complete - 0.945666666826min
combo complete - 0.94701666832min
combo complete - 0.994050002098min
combo complete - 0.99293333292min
combo complete - 0.973566667239min
combo complete - 0.956283334891min
combo complete - 0.988650000095min
combo complete - 0.991666666667min
combo complete - 0.975700000922min
combo complete - 0.978933334351min
combo complete - 0.991550000509min
combo complete - 1.00681666533min
combo complete - 1.00958333413min
combo complete - 1.00975000064min
combo complete - 1.01883333524min
combo complete - 1.0168166637

(0.8913043478260869, 0.9534883720930233, 41.0, 8.0, 2.0)

In [6]:
f1(predictions)

(0.8913043478260869, 0.9534883720930233, 0.8367346938775511, 41.0, 8.0, 2.0)

In [7]:
Y

Unnamed: 0,Condition,Ypred,1,2,3,4,5,6,7
0,normal,normal,normal,normal,normal,normal,normal,normal,normal
1,normal,normal,normal,normal,normal,normal,normal,normal,normal
2,normal,normal,normal,normal,normal,normal,normal,normal,normal
3,normal,normal,normal,normal,normal,normal,normal,normal,normal
4,aml,aml,normal,aml,normal,aml,aml,aml,aml
5,normal,normal,normal,normal,normal,normal,normal,normal,normal
6,aml,normal,normal,normal,normal,normal,normal,normal,normal
7,normal,normal,normal,normal,normal,normal,normal,normal,normal
8,aml,aml,aml,aml,aml,aml,aml,aml,aml
9,normal,normal,normal,normal,normal,normal,normal,normal,normal
