/
utils.py
92 lines (68 loc) · 2.56 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Importable helper methods for the multiple myeloma multiclass classifier
Gregory Way 2018
Usage:
    from utils import apply_classifier
    from utils import shuffle_columns
    from utils import get_confusion_matrix
"""
import numpy as np
import pandas as pd
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.metrics import confusion_matrix
import seaborn as sns
def apply_classifier(x, w, b, proba=True, dropna=True):
    """
    Apply the classifier to additional data.

    See https://github.com/scikit-learn/scikit-learn/blob/ac1b04875331fc291a437025a18ddfefd4051d7c/sklearn/linear_model/base.py
    for more details

    Arguments:
    x - pandas dataframe: the new data trying to predict. Its columns are
        aligned to the index of `w`, and its index labels the output rows
    w - pandas dataframe: the classifier coefficients (gene weight by class)
    b - pandas dataframe: the multiclass classifier intercepts (b by class)
    proba - output probability or decision scores
    dropna - decision to drop missing values. If not dropped, fill with zero.

    Output:
    pandas dataframe indexed like `x` with the columns of `b`, holding either
    the row-normalized probabilities (proba=True) or the raw decision scores

    Import only
    """
    # Align matrices
    # NOTE: This will drop coefficients not present in X
    if dropna:
        x = x.reindex(w.index, axis='columns').dropna(axis='columns')
        w = w.reindex(x.columns).dropna()
        # Rows of w holding missing coefficients were just dropped; keep the
        # columns of x in step so the matrix product below stays well-formed
        x = x.loc[:, w.index]
    else:
        # `fill_value` only covers labels introduced by the reindex, so
        # missing values that were already present are filled explicitly
        x = x.reindex(w.index, axis='columns', fill_value=0).fillna(0)
        w = w.reindex(x.columns, fill_value=0).fillna(0)

    # Work on plain float arrays: a DataFrame product would come back as a
    # DataFrame, which does not support the in-place array math used below,
    # and integer input could not hold the exponentials
    scores = np.dot(np.asarray(x, dtype=float), np.asarray(w, dtype=float))
    scores = scores + np.asarray(b, dtype=float)

    if proba:
        # Logistic transform of every class score, 1 / (1 + exp(-score)) ...
        scores *= -1
        np.exp(scores, out=scores)
        scores += 1
        np.reciprocal(scores, out=scores)
        # ... then rescale so the class probabilities of each row sum to one
        scores /= scores.sum(axis=1, keepdims=True)

    return pd.DataFrame(scores, index=x.index, columns=b.columns)
def shuffle_columns(gene):
    """
    Return the values of `gene` in a random order

    Intended to be passed to a pandas `apply` call so that the values of
    each row/column of a dataframe are permuted independently. The input is
    left untouched; the result is a new numpy array.

    Import only
    """
    values = gene.tolist()
    shuffled = np.random.permutation(values)
    return shuffled
def get_confusion_matrix(y_true, y_pred):
    """
    Build a labelled confusion matrix and draw it as a heatmap (import only)

    Arguments:
    y_true - numpy array of class assignments
    y_pred - numpy array of class predictions

    Output:
    Confusion matrix table (raw counts) and the heatmap axes. The heatmap
    shows each count divided by the total of its predicted-class column.
    """
    true_labels = ['wildtype_true', 'KRAS_true', 'NRAS_true']
    pred_labels = ['wildtype_pred', 'KRAS_pred', 'NRAS_pred']

    counts = pd.DataFrame(
        confusion_matrix(y_true, y_pred),
        index=true_labels,
        columns=pred_labels,
    )

    # Scale every predicted-class column by its own total before plotting
    predicted_totals = counts.sum(axis=0)
    fractions = counts.divide(predicted_totals, axis=1)

    heatmap_ax = sns.heatmap(fractions, annot=True, fmt='.1%')
    return counts, heatmap_ax