In [1]:
import pandas as pd
import numpy as np
import functools as ft
import itertools as it
import multiprocessing as mp
import matplotlib.pyplot as plt
from collections import Counter
import sklearn
import json
%matplotlib inline

In [54]:
dat = pd.read_csv('./kinase_go_annotations.csv', header=None)
dat.columns = ['Kinase', 'GO Labels']
dat.set_index('Kinase',inplace=True)
print(dat.head(5))
dat = dat['GO Labels']

                                                GO Labels
Kinase                                                   
EPHB1   ['peptidyl-tyrosine phosphorylation', 'axon gu...
K6PP    ['identical protein binding', 'AMP binding', '...
MRCKG   ['magnesium ion binding', 'microtubule organiz...
HCK     ['regulation of cell shape', 'respiratory burs...
ROCK1   ['negative regulation of angiogenesis', 'negat...


In [55]:
def csv_writes_lists_weird_helper(i):
    return list(map(lambda x: x.lstrip(), i.replace('[', '').replace(']', '').replace('\'', '').split(',')))

dat = dat.apply(csv_writes_lists_weird_helper)    
dat.head(5)

Kinase
EPHB1    [peptidyl-tyrosine phosphorylation, axon guida...
K6PP     [identical protein binding, AMP binding, fruct...
MRCKG    [magnesium ion binding, microtubule organizing...
HCK      [regulation of cell shape, respiratory burst a...
ROCK1    [negative regulation of angiogenesis, negative...
Name: GO Labels, dtype: object

In [56]:
from sklearn.preprocessing import LabelBinarizer as LB

lb = LB()

In [63]:
lb.fit(list(set(filter(lambda x: x is not '', [token for sentence in dat for token in sentence]))))

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [67]:
df = dat.apply(lb.transform).apply(sum)

In [71]:
if(len(dat.iloc[0]) == sum(df.iloc[0])):
    print('Converted lists of tokens to one-hot vectors!')
else:
    print('Something went wrong!')

Converted lists of tokens to one-hot vectors!


In [73]:
len(df.iloc[0])

3700

In [75]:
from sklearn.svm import SVC

clf = SVC()

In [91]:
consensus = pd.read_csv('../reproduced/consensusclusters_spinglass_greaterthan90percent.txt', header=0, sep = '\t')
consensus = consensus.set_index('names')
consensus.head(5)

Unnamed: 0_level_0,cluster
names,Unnamed: 1_level_1
PAK1,1
ERBB2,2
RON,3
YES,3
TYRO3,3


In [92]:
consensus = consensus.join(other=df)
consensus.head(5)

Unnamed: 0_level_0,cluster,GO Labels
names,Unnamed: 1_level_1,Unnamed: 2_level_1
PAK1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ERBB2,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
RON,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
YES,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
TYRO3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [2]:
print(**{})




In [94]:
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test = tts(consensus['GO Labels'], consensus['cluster'], test_size = 0.1, random_state =1911)

In [106]:
clf.fit(X_train.values.tolist(), y_train.values.tolist())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [107]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test.values.tolist(), clf.predict(X_test.values.tolist()))

0.25

In [108]:
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.naive_bayes import GaussianNB as GNB

mnb = MNB()
gnb = GNB()

mnb.fit(X_train.values.tolist(), y_train.values.tolist())
gnb.fit(X_train.values.tolist(), y_train.values.tolist())

print(accuracy_score(y_test.values.tolist(), mnb.predict(X_test.values.tolist())))
print(accuracy_score(y_test.values.tolist(), gnb.predict(X_test.values.tolist())))


0.5833333333333334
0.625
