In [1]:
# Comments from John

# This workflow is very much in an alpha state
# The current code is provided primarily as a reference
# A beta build (see below) will be provided ASAP for testing and general use

# # alpha -> software works but may have bugs and is under development
# # beta -> most bugs have been fixed, software is ready for wider testing
# # release -> all problems have been fixed and software is ready for widespread distribution

# The end goal is to create a python notebook that wil run the Machine Learning classifier defined here
# but will not require knowledge of Machine Learning by the researcher.

In [2]:
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import time

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.base import clone

In [3]:

def lambdaFunc(v):
    return int(v)

def openAndParse(f):
    listValue = []
    headerLine = ''
    start = True
    with open(f) as fOpen:
        for i in fOpen:
            if start:
                iLine = i.split(',')
                iLine = iLine.pop(0)
                headerLine = ','.join(iLine)
                start = False
            else:
                i = i.rstrip('\n')
                iSplit = i.split(',')
                iSplitInt = list(map(lambdaFunc, iSplit[1:]))
                # listValue.append((iSplit[0], iSplit[1:]))
                listValue.append((iSplit[0], iSplitInt))
    return (headerLine, listValue)

In [4]:
# insert dataset csv filenames.  They MUST be formatted as defined by Experiment 21324
# Future builds will take care of formatting, and will accept pivot tables and MiSeq output

trainingValues = openAndParse('allele_df-trainingSet-HapA.csv')
testingValues = openAndParse('allele_df-testingSet-HapA.csv')

In [17]:
X_train = np.empty([len(trainingValues[1]), len(trainingValues[1][1][1])], dtype=int)
Y_train = np.empty([len(trainingValues[1])], dtype=int)
X_test = np.empty([len(testingValues[1]), len(testingValues[1][1][1])], dtype=int)
Y_test = np.empty([len(testingValues[1])], dtype=int)
print(X_test)
print(X_test.shape)
print('\n\n')
print(Y_test)
print(Y_test.shape)
trainingValueFromTuple = trainingValues[1]
testingValueFromTuple = testingValues[1]
print(testingValueFromTuple)
print(len(testingValueFromTuple))
lookupTableBecauseNumpy = []
lookupTableBecauseNumpyTesting = []
stopRange = len(trainingValueFromTuple)
for x in range(0, stopRange):
    aValue_x = trainingValueFromTuple[x][1]
    aValue_y = trainingValueFromTuple[x][0]
    X_train[x] = aValue_x
    # X_train[x] = aValue_xList
    # np.insert(X_train[x], aValue_xList)
    lookupTableBecauseNumpy.append(aValue_y)
    Y_train[x] = np.array(x, dtype=int)
stopRange = len(testingValueFromTuple)
for x in range(0, stopRange):
    aValue_x = testingValueFromTuple[x][1]
    aValue_y = testingValueFromTuple[x][0]
    X_test[x] = aValue_x
    lookupTableBecauseNumpyTesting.append(aValue_y)
    Y_test[x] = np.array(x, dtype=int)

[[                  0 8101242554838556677                   0 ...,
  7308339944545415026 2915076257986120233 8391162080290301292]
 [7310868735423114857 7813874358961533510 7741528794290792805 ...,
  8656044306584971621 7453001576143022368 8031170617525938277]
 [4190992423152865904 7013900260281950218 2323048684161693036 ...,
           4294967297                   0         42949672962]
 ..., 
 [    140600934913568                   1         64424509458 ...,
      140600934914200          8589934620          4436819784]
 [                  1         12884927744         17179869204 ...,
                 6400         17179869207                   0]
 [              25600                   0                   0 ...,
                    0                   0                   0]]
(21, 61)



[ 1152921504606846976 -5764598735475447064                   15
                    0           4294967296  7235419174270214779
  4771073664489765410  3762815969819509301  3473174958971236407
  499116

In [18]:
# Commentary from John:
'''
One thing I kept scratching my head over--and then subsequently face-palming over--was why
the linear model for the Classifier simply refused to work.

As a cautionary tale to anyone else, what was happening was I had correctly set up the X values as a one-hot vector, 
but then when I set the labels for Y, I assigned each one to a unique integer and then mapped them back to the corresponding string values.
This meant the linear model was being set up as 
X          Y 
[vector]   0
[vector]   1
[vector]   2
[vector]   3

Where each vector was something like:
[0,0,0,1,0,0]

And each Y integer was unique, and mapped back to a string, like "A001".  
 
The vectors were created from the allele names, where a mapping table was created that defined an allele that was present or not for a given haplotype (order was maintained).
Each vector represents an instance in an experiment where alleles for a sample mapped back to a haplotype as allele frequencies.  
Initially, replicated instances of Haplotypes were flattened and merged, however, I'll leave it as an exercise to the reader to see just how poorly this implementaion fit any linear model. 

What I should have been doing (and subsequently did) was:
X          Y 
[vector]   0
[vector]   0
[vector]   0
[vector]   1
[vector]   1

Where again, each vector was something like:
[0,0,0,1,0,0]

Each Y integer was each instance of the corresponding haplotype, like "A001", that mapped to it
Subsequently, all allele frequencies were included, regardless if the same allele pairing was repeated.
This created a usable linear model, and multiple n values that the library could use for calculations.


'''
# For these purposes, the labels were manually reformatted (see above) by visually scanning the list .  This will be automated.

def formatTestingValues(tList, tupleList):
    r = []
    validationInt = len(tList)
    for v in tList:
        for vInTuple in tupleList:
            if v == vInTuple[0]:
                r.append(vInTuple[1])
                break
    if len(r) != len(tList):
        print('WARNING!')
        print(r)
        print(tList)
        return None
    else:
        return r
def valuesToTestList(valList, npArrayTupleList):
    r = []
    matched = False
    for v in valList:
        for vTuple in npArrayTupleList:
            if v == vTuple[0]:
                matched = True
                r.append(vTuple[1])
                break
        if matched:
            matched = False
            continue
        else:
            print('WARNING!')
            print(valList)
            print(npArrayTupleList)
            return None
    return r
        
    
def valuesToIntList(valList):
    listedValues = []
    ct = 0
    l_strings = []
    l_ints = []
    l_strings_r = []
    l_ints_r = []
    npArrayList = []
    for i in valList:
        if i not in l_strings:
            l_strings_r.append(i)
            l_ints_r.append(ct)
            l_strings.append(i)
            l_ints.append(ct)
            npArrayList.append(ct)
            ct += 1
        else:
            v_i = l_strings.index(i)
            v_i_string = l_strings[v_i]
            v_ct = l_ints[v_i]
            l_strings_r.append(v_i_string)
            l_ints_r.append(v_ct)
            npArrayList.append(v_ct)
    npArrayTupleList = list(zip(l_strings_r,l_ints_r))
    return (npArrayList, npArrayTupleList)
'''
listedValues = []
ct = 0
l_strings = []
l_ints = []
l_strings_r = []
l_ints_r = []
print('[')
npArrayList = []
for i in lookupTableBecauseNumpy:
    if i not in l_strings:
        print(str(i) + ',' + str(ct))
        l_strings_r.append(i)
        l_ints_r.append(ct)
        # print(str(ct) + ',')
        l_strings.append(i)
        l_ints.append(ct)
        npArrayList.append(ct)
        ct += 1
    else:
        v_i = l_strings.index(i)
        v_i_string = l_strings[v_i]
        v_ct = l_ints[v_i]
        print(str(v_i_string) + ',' + str(v_ct))
        l_strings_r.append(v_i_string)
        l_ints_r.append(v_ct)
        npArrayList.append(v_ct)
        # print(str(v_ct) + ',')
# print(']')
# print(npArrayList)
'''

parsedTrainingResults = valuesToIntList(lookupTableBecauseNumpy)
Y_train = np.array(parsedTrainingResults[0])
parsedTestingResults = valuesToIntList(lookupTableBecauseNumpyTesting)
Y_testList = valuesToTestList(lookupTableBecauseNumpyTesting, parsedTestingResults[1])
print(Y_train)
print(parsedTrainingResults[1])
Y_test = np.array(Y_testList)

[ 0  0  0  0  1  2  2  3  4  5  4  6  1  0  2  4  7  8  8  9  3  7  7  0  0
  0  4 10  0 10  0 11  0  8  5  5  3  6  7  5  7 11  7  0  4  4  9  2  0 10
  7  7  0  2  9  0  9  2  0  4  0 10  4  0  0  9  2  7  0  5  5  1  0  9  7
  3  0  0  0  0  7  5  9  0  9  2  0]
[('A004', 0), ('A004', 0), ('A004', 0), ('A004', 0), ('A004-A025', 1), ('A002a-A004', 2), ('A002a-A004', 2), ('A004-A011', 3), ('A004-A023', 4), ('A001-A008', 5), ('A004-A023', 4), ('A002a-A006', 6), ('A004-A025', 1), ('A004', 0), ('A002a-A004', 2), ('A004-A023', 4), ('A004-A008', 7), ('A001', 8), ('A001', 8), ('A001-A004', 9), ('A004-A011', 3), ('A004-A008', 7), ('A004-A008', 7), ('A004', 0), ('A004', 0), ('A004', 0), ('A004-A023', 4), ('A001-A002a', 10), ('A004', 0), ('A001-A002a', 10), ('A004', 0), ('A002a-A008', 11), ('A004', 0), ('A001', 8), ('A001-A008', 5), ('A001-A008', 5), ('A004-A011', 3), ('A002a-A006', 6), ('A004-A008', 7), ('A001-A008', 5), ('A004-A008', 7), ('A002a-A008', 11), ('A004-A008', 7), ('A004', 0), ('A

In [19]:
import itertools
x = [0,1,2,3,4,4,5,6,7,7,8,9,8,10,11,12,13,14,14,15,15,16,17,17,18,11,2,11,0,0,19,19,20,20,21,22,0,0,18,2,22,3,23,23,9,24,24,2,7,25,26,27,22,28,28,18,11,11,11,11,7,9,5,29,29,10,30,31,10,11,27,7,11,11,9,16,10,10,13,32,10,7,27,10,10,33,7,7,9,34,10,32,21,10,1,11,35,35,18,22,7,16,16,16,30,36,36,36,37,37,11,31,38,38,2,11,7,31,39,9,40,40,31,13,7,7,33,41,41,11,42,42,18,11,43,8,44,44,6,21,11,11,0,8,2,11,8,8,8,45,11,8,11,11,2,43,43,11,39,39,46,46,0,22,11,11,22,39,47,47,22,45,48,49,49,26,11,22,0,11,22,50,50,11,22,11,42,48,39,11,22,11,11,26,31,51,51,16,7,9,12,52,52,27,53,53,54,54,55,55,16,2,22,56,11,11,34,11,32,9,57,57,58,58,25,22,0,56,59,59,9,1,45,60,60,11,22,2,9,61,61,11,11,62,62,11,26]
y = [0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 8, 10, 11, 12, 13, 14, 14, 15, 15, 16, 17, 17, 18, 11, 2, 11, 0, 0, 19, 19, 20, 20, 21, 22, 0, 0, 18, 2, 22, 3, 23, 23, 9, 24, 24, 2, 7, 25, 26, 27, 22, 28, 28, 18, 11, 11, 11, 11, 7, 9, 5, 29, 29, 10, 30, 31, 10, 11, 27, 7, 11, 11, 9, 16, 10, 10, 13, 32, 10, 7, 27, 10, 10, 33, 7, 7, 9, 34, 10, 32, 21, 10, 1, 11, 35, 35, 18, 22, 7, 16, 16, 16, 30, 36, 36, 36, 37, 37, 11, 31, 38, 38, 2, 11, 7, 31, 39, 9, 40, 40, 31, 13, 7, 7, 33, 41, 41, 11, 42, 42, 18, 11, 43, 8, 44, 44, 6, 21, 11, 11, 0, 8, 2, 11, 8, 8, 8, 45, 11, 8, 11, 11, 2, 43, 43, 11, 39, 39, 46, 46, 0, 22, 11, 11, 22, 39, 47, 47, 22, 45, 48, 49, 49, 26, 11, 22, 0, 11, 22, 50, 50, 11, 22, 11, 42, 48, 39, 11, 22, 11, 11, 26, 31, 51, 51, 16, 7, 9, 12, 52, 52, 27, 53, 53, 54, 54, 55, 55, 16, 2, 22, 56, 11, 11, 34, 11, 32, 9, 57, 57, 58, 58, 25, 22, 0, 56, 59, 59, 9, 1, 45, 60, 60, 11, 22, 2, 9, 61, 61, 11, 11, 62, 62, 11, 26]
# xcmp = map(lambda (a, b): a == b, itertools.product(x,y))
# xcmp = [a == b for (a,b) in itertools.product(x,y)]
xLen = len(x)
yLen = len(y)
if xLen == yLen:
    print('list lengths are equal, proceeding...')
valuesEqual = True
for a in range(0, len(x)):
    if x[a] == y[a]:
        continue
    else:
        valuesEqual = False
if not valuesEqual:
    print('not all list values Match!!')
else:
    print('All values match!')
# print(xcmp)
'''
print(X_test.shape)
print(X_train.shape)
print(Y_train.shape)
print(Y_test.shape)
'''

list lengths are equal, proceeding...
All values match!


'\nprint(X_test.shape)\nprint(X_train.shape)\nprint(Y_train.shape)\nprint(Y_test.shape)\n'

In [20]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
print(X_train)
print(Y_train)
print(X_test)
print(Y_test)

(87, 61)
(87,)
(21, 61)
(21,)
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[ 0  0  0  0  1  2  2  3  4  5  4  6  1  0  2  4  7  8  8  9  3  7  7  0  0
  0  4 10  0 10  0 11  0  8  5  5  3  6  7  5  7 11  7  0  4  4  9  2  0 10
  7  7  0  2  9  0  9  2  0  4  0 10  4  0  0  9  2  7  0  5  5  1  0  9  7
  3  0  0  0  0  7  5  9  0  9  2  0]
[[1 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[0 1 0 2 2 0 3 3 4 5 3 3 5 6 4 4 7 8 3 8 3]


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = [{'weights': ["uniform", "distance"]}]

# KNearestNeighbors was used, but a different library may be applied later
knn_clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=4)
knn_clf.fit(X_train, Y_train)

y_knn_pred = knn_clf.predict(X_test)

# forest_clf_pred = forest_clf.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, y_knn_pred)
# This accuracy score strictly tests the model, and may be subject to overfitting

0.0

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, Y_train)
forest_clf_pred = forest_clf.predict(X_test)

accuracy_score(Y_test, forest_clf_pred)
print(forest_clf_pred)
print(Y_test)


[10  8 10  9  9 10  0  0  7  4  0  0  4  5  7  7  6  2  0  2  0]
[0 1 0 2 2 0 3 3 4 5 3 3 5 6 4 4 7 8 3 8 3]


In [26]:
# Commentary from John
# 1) Cross-validation does exactly what it sounds like: it attempts to validate the results without worrying about overfitting, for a "truly" accurate score
# 2) This is a bit of a "hacked" approach, because it's switching to Stochastic Gradient Descent (SGD) from KNN for the cross validation, but I plan on moving that direction anyway
# 3) For the suspicious/worried, you can replace "sgd_clf" with "knn_clf" to also review the cross-validation result.
# 4) Ignore the warning.  I'm using an outdated module from SciKit-Learn, if I ultimately use TensorFlow or Caffe it won't appear, and if I use SciKit-Learn I'll use an upadated module.
sgd_clf = SGDClassifier(random_state=42)
cross_val_score(sgd_clf, X_train, Y_train, cv=2, scoring="accuracy")



array([ 0.84782609,  0.92682927])