In [36]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import describe
from collections import defaultdict
from sklearn import model_selection,svm,metrics
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations

In [45]:
def get_csv(ptnum,tube,markers):
    fcs = 'C:\\python27\\FCSlog10\\' + str(8*(ptnum-1)+tube) + '.csv'
    return np.genfromtxt(fcs, delimiter=',',usecols=markers)[1:,:].astype(int)

def histo(data):
    d = defaultdict(int) 
    for cell in data:
        d[tuple(cell)] += 1
    return d

def scale_data(data):
    scale = MinMaxScaler()
    #scale = MaxAbsScaler()
    data = scale.fit_transform(data)
    #data = data[:,data.sum(axis=0)!=0] # drop 100% empty columns
    return data

def get_combo(tube,markers):
    start = time.time()
    combo_hist = pd.DataFrame(histo(get_csv(1,tube,markers)),index=[0])
    
    for ptnum in range(2,360):
        combo_hist.ix[ptnum-1,:] = histo(get_csv(ptnum,tube,markers))

    print('combo complete - ' + str((time.time()-start)/60) + 'min')
    return combo_hist

def all_combos(tube):
    start = time.time()
    n = 0
    tube_data = pd.DataFrame()
    for markers in combinations(range(1,8),6):
        if n == 0:
            tube_data = get_combo(tube,markers)
        else:
            tube_data = pd.concat([tube_data, get_combo(tube,markers)],axis = 1)
        n += 1
    tube_data = scale_data(tube_data.fillna(0))
    
    print('tube complete - ' + str((time.time()-start)/60) + 'min')
    return tube_data

def gridsearch(X,Y,algorithm,params,cv,scoring):
    #start = time.time()
    grid = model_selection.GridSearchCV(algorithm, param_grid=params, cv=cv, scoring = scoring)
    grid.fit(X,Y)
    #print 'gridsearch: ' + str((time.time()-start)/60) + 'min'
    return pd.DataFrame(grid.cv_results_)

def split_and_fit(data):
    start = time.time()
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    #Y = Y.drop(339,axis = 0).reset_index(drop = True)
    #data = np.delete(data,339,0)
    
    predictions = list()
    
    step = 10
    ptlist = range(0,Y.shape[0])[::step]
    
    for pt in ptlist:

        if pt == 180:
            print('pt 180 complete')
        
        try:
            XTRAIN, XTEST = np.delete(data, range(pt,pt+step), 0), data[pt:pt+step]
            YTRAIN = Y.drop(range(pt,pt+step),axis = 0).as_matrix().reshape(Y.shape[0]-step,)
        except ValueError: 
            print pt
            XTRAIN, XTEST = np.delete(data, range(pt,Y.shape[0]), 0), data[pt:Y.shape[0]]
            YTRAIN = Y.drop(range(pt,Y.shape[0]),axis = 0).as_matrix().reshape(pt,)
        
        hiC = 4
        lowC = -8
        
        f1_scorer = metrics.make_scorer(metrics.f1_score, labels=['aml','normal'], pos_label='aml')
        algorithm, params, scoring = svm.LinearSVC(), {'C': np.logspace(lowC, hiC, hiC-lowC+1, endpoint=True)}, f1_scorer
        cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
        results = gridsearch(XTRAIN, YTRAIN, algorithm, params, cv, scoring)
        
        #gamma = results.ix[results['mean_test_score'].idxmax(),5]
        C = results.ix[results['mean_test_score'].idxmax(),4]
        #if (C==10**hiC)|(C==10**lowC):
        print C
                
        linSVC = svm.LinearSVC(C = C)
        linSVC.fit(XTRAIN,YTRAIN)
        predictions.extend(linSVC.predict(XTEST))
        #predictions.extend(list(logREG.predict_proba(XTEST)[:,0]))
    print 'split_and_fit: ' + str((time.time()-start)/60) + 'min'
    return predictions

def pred_map():
    master = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    
    for tube in range(1,8):
        data = all_combos(tube)
        master[tube] = split_and_fit(data)
        print('Tube ' + str(tube) + ' complete')
    
    predictions = []
    for i in range(0,359):
        if 'aml' in master[range(1,8)].loc[i].values.tolist():
            predictions.extend(['aml'])
        else: 
            predictions.extend(['normal'])
    return master,predictions

def f1(YPRED):
    Y = pd.read_csv('C:\\python27\\CAPSTONE\\Y.csv')
    YTEST = Y['Condition'].tolist()
    TP, FN, FP = 0., 0., 0.
    for i in range(0,len(YTEST)):
        if (YTEST[i] == 'aml') and (YPRED[i]=='aml'):
            TP += 1
        if (YTEST[i] == 'aml') and (YPRED[i]=='normal'):    
            FN += 1
        if (YTEST[i] == 'normal') and (YPRED[i]=='aml'):
            FP += 1
    try: 
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        f1 = 2 * (recall * precision) / (recall + precision)
    except ZeroDivisionError:
        return 0
        
    return f1, recall, precision, TP, FP, FN

In [40]:
Y, predictions = pred_map()

combo complete - 1.66761666536min
combo complete - 1.66106666724min
combo complete - 1.55036666791min
combo complete - 1.70305000146min
combo complete - 1.68521666527min
combo complete - 1.62181666692min
combo complete - 1.69220000108min
tube complete - 11.5870333314min
0.01
0.01
0.1
0.1
0.01
0.1
0.1
0.01
0.1
0.01
0.01
0.1
0.01
0.1
0.01
0.1
0.1
0.01
pt 180 complete
0.01
0.01
0.1
0.1
0.1
0.1
0.01
0.01
0.1
0.01
0.1
0.1
0.01
0.1
0.1
0.01
0.1
350
0.01
split_and_fit: 6.54185000261min
Tube 1 complete
combo complete - 1.74593333403min
combo complete - 1.74881666899min
combo complete - 1.60833333333min
combo complete - 1.7658833305min
combo complete - 1.72353333632min
combo complete - 1.87728333473min
combo complete - 1.71951666673min
tube complete - 12.1973166704min
0.01
0.1
0.1
0.1
0.01
0.1
0.01
0.1
0.1
0.01
0.01
0.1
0.1
0.01
0.01
0.01
0.01
0.1
pt 180 complete
0.01
0.1
0.1
1.0
1.0
0.01
0.1
0.01
0.1
0.01
0.01
0.01
0.01
0.01
0.1
0.01
0.01
350
0.1
split_and_fit: 9.42971666654min
Tube 2 complete

In [41]:
Y.to_csv('C:\\python27\\6dhist.csv')

In [42]:
f1(predictions)

(0.9534883720930233, 0.9534883720930233, 41.0, 2.0, 2.0)

In [44]:
pd.options.display.max_rows = 360
Y

Unnamed: 0,Condition,1,2,3,4,5,6,7
0,normal,normal,normal,normal,normal,normal,normal,normal
1,normal,normal,normal,normal,normal,normal,normal,normal
2,normal,normal,normal,normal,normal,normal,normal,normal
3,normal,normal,normal,normal,normal,normal,normal,normal
4,aml,aml,aml,aml,aml,aml,aml,aml
5,normal,normal,normal,normal,normal,normal,normal,normal
6,aml,normal,normal,normal,normal,normal,normal,normal
7,normal,normal,normal,normal,normal,normal,normal,normal
8,aml,aml,aml,aml,aml,aml,aml,aml
9,normal,normal,normal,normal,normal,normal,normal,normal


In [46]:
Y = pd.read_csv('C:\\python27\\6dhist.csv')

In [52]:
predictions = []
for i in range(0,359):
    if 'aml' in Y[range(2,9)].loc[i].values.tolist():
        predictions.extend(['aml'])
    else: 
        predictions.extend(['normal'])

In [53]:
f1(predictions)

(0.9534883720930233, 0.9534883720930233, 0.9534883720930233, 41.0, 2.0, 2.0)

In [54]:
Y

Unnamed: 0.1,Unnamed: 0,Condition,1,2,3,4,5,6,7
0,0,normal,normal,normal,normal,normal,normal,normal,normal
1,1,normal,normal,normal,normal,normal,normal,normal,normal
2,2,normal,normal,normal,normal,normal,normal,normal,normal
3,3,normal,normal,normal,normal,normal,normal,normal,normal
4,4,aml,aml,aml,aml,aml,aml,aml,aml
5,5,normal,normal,normal,normal,normal,normal,normal,normal
6,6,aml,normal,normal,normal,normal,normal,normal,normal
7,7,normal,normal,normal,normal,normal,normal,normal,normal
8,8,aml,aml,aml,aml,aml,aml,aml,aml
9,9,normal,normal,normal,normal,normal,normal,normal,normal
