## Comparing training files
Martin Kircher provided two training-data files: one is human-readable and the other is one-hot-encoded.
We need to check that the information in both files is actually in the same order.

In [1]:
import dask
import dask.dataframe as dd
import dask.array as da
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import sklearn
import sklearn.model_selection as sklearn_sel
from sklearn.model_selection import ShuffleSplit

## 1. Read in dataset with Dask

In [2]:
# Locations of the two training files being compared: the tab-separated file and the
# imputed CSV file.
training = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/training_data.tsv"
training_imputed = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/training_data.imputed.csv"

# Columns whose dtype is pinned to float64 instead of being inferred by dask.
float_columns = ['PolyPhenVal', 'SIFTval', 'mirSVR-E', 'mirSVR-Score', 'relCDSpos', 'relProtPos']
training_dd = dd.read_csv(
    training,
    sep='\t',
    dtype={name: 'float64' for name in float_columns},
)

training_imp_dd = dd.read_csv(training_imputed, assume_missing=True)

# Preview the first rows of the tab-separated file.
training_dd.head()

Unnamed: 0,y,Chrom,Pos,Ref,Alt,Type,Length,isTv,Consequence,GC,...,SIFTcat,SIFTval,mirSVR-Score.na,targetScan.na,cDNApos.na,CDSpos.na,protPos.na,Grantham.na,PolyPhenVal.na,SIFTval.na
0,0,1,379177,T,G,SNV,0,1,NC,0.48,...,UD,0.0,1,1,0,1,1,1,1,1
1,0,1,379274,C,G,SNV,0,1,NC,0.54,...,UD,0.0,1,1,0,1,1,1,1,1
2,0,1,379476,A,T,SNV,0,1,UP,0.63,...,UD,0.0,1,1,1,1,1,1,1,1
3,0,1,379631,C,G,SNV,0,1,UP,0.34,...,UD,0.0,1,1,1,1,1,1,1,1
4,0,1,379724,A,G,SNV,0,0,UP,0.28,...,UD,0.0,1,1,1,1,1,1,1,1


### 1.a. Process with Dask

In [3]:
"""
y = training_imp_dd.y
X = training_imp_dd.drop('y', axis=1)
X_small = X.loc[:1000,:]
y_small = y.loc[:1000]

X_small = X_small.compute(n_workers)
y_small = y_small.compute()]
"""

"\ny = training_imp_dd.y\nX = training_imp_dd.drop('y', axis=1)\nX_small = X.loc[:1000,:]\ny_small = y.loc[:1000]\n\nX_small = X_small.compute(n_workers)\ny_small = y_small.compute()]\n"

### 1.b. Read in some rows of the dataframe with pandas and then convert to dask
Since I'm not yet fluent with Dask, I need a trial-and-error approach. For this purpose, I will load a subset of the rows with pandas and then convert it to a Dask dataframe.

In [4]:
# Load only the first 10,000 rows of each file so the pipeline can be prototyped quickly.
X_pandas = pd.read_csv(training_imputed, nrows=10000)

# Only the label column is needed from the TSV; `usecols` avoids parsing and
# type-converting every other column just to throw it away.
y_pandas = pd.read_csv(training, sep='\t', nrows=10000, usecols=["y"])["y"]

# Inserting some artificial positive examples
# (roughly 20% of the rows are relabelled as 1; the seed keeps the mask reproducible).
np.random.seed(10)
msk = np.random.rand(len(y_pandas)) < 0.2
# Explicit .loc assignment instead of `y_pandas[msk] = 1` makes the boolean-mask write unambiguous.
y_pandas.loc[msk] = 1

In [5]:
# Hold out 20% of the pandas subset for testing, using scikit-learn's splitter.
X_train, X_test, y_train, y_test = sklearn_sel.train_test_split(
    X_pandas,
    y_pandas,
    train_size=0.8,
    test_size=0.2,
)

In [6]:
# Drop constant columns
constant_cols = set()
for dataset in [X_pandas]:
    for col in dataset:
        # print(dataset.shape, type(dataset))
        # print(len(np.unique(dataset[col])))
        if len(np.unique(dataset[col])) < 20 and col != 'y':
            constant_cols.add(col)
print("Droped", len(constant_cols), "constant cols.")

for dataset in [X_train, X_test, y_train, y_test]:
    dataset.drop(columns=list(constant_cols), inplace=True, errors='ignore')

X_pandas.drop(columns=list(constant_cols), inplace=True, errors='ignore')

Droped 824 constant cols.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [7]:
# Wrap the pandas subset as dask collections (chunksize=100) and remove the label
# column 'y' from the feature frame.
X = dd.from_pandas(X_pandas, chunksize=100).drop('y', axis=1)
y = dd.from_pandas(y_pandas, chunksize=100)

## 2. Split dataset into train and test

In [8]:
# 80/20 train/test split with dask-ml's splitter, applied to the dask collections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
)

In [9]:
# Check which kind of object the dask-ml split returned for the training features.
type(X_train)

dask.dataframe.core.DataFrame

In [15]:
# L2-penalised logistic regression from dask-ml, using the L-BFGS solver and capped at
# 10 iterations.
lr = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    n_jobs=64,
    max_iter=10,
)

# Pass `.values` rather than the dataframes themselves: fitting on the dataframe throws
# an error saying "This estimator does not support dask dataframes."
lr.fit(X_train.values, y_train.values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1.0, max_iter=10, multiclass='ovr', n_jobs=64,
          penalty='l2', random_state=None, solver='lbfgs',
          solver_kwargs=None, tol=0.0001, verbose=0, warm_start=False)

In [16]:
# Predicted labels for the training split (`.compute()` is called on this result later
# in the notebook).
y_train_pred = lr.predict(X_train.values)

# Class probabilities for the test split; named so the result is not silently discarded.
y_test_proba = lr.predict_proba(X_test.values)

# Score against the TRUE training labels. Passing `y_train_pred` here compared the
# model's predictions with themselves, which always reports a perfect score.
scores_train = lr.score(X_train.values, y_train.values)

In [18]:
def confusion_matrix_dask(truth, predictions, labels_list=None):
    """Count confusion-matrix entries by boolean masking.

    Parameters
    ----------
    truth : array-like
        True labels. Must support boolean-mask indexing, element-wise comparison
        and ``.sum()`` (e.g. a dask or numpy array).
    predictions : array-like
        Predicted labels, aligned element-wise with ``truth``.
    labels_list : list, optional
        Labels to treat as the "positive" class, one at a time. The one-vs-rest
        counts of every label are summed. When empty or omitted, the single
        label ``1`` is used.

    Returns
    -------
    tuple
        ``(TN, FP, FN, TP)`` after evaluation with ``dask.compute``.
    """
    # An empty / missing list means "binary problem with positive label 1".
    if not labels_list:
        labels_list = [1]

    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for label in labels_list:
        predicted_pos = predictions == label
        predicted_neg = predictions != label
        # Each counter accumulates into ITSELF; the previous version added FP into FN
        # and FN into FP, which corrupted both counts whenever labels were given.
        TP = TP + (truth[predicted_pos] == label).sum()
        FN = FN + (truth[predicted_neg] == label).sum()
        TN = TN + (truth[predicted_neg] != label).sum()
        FP = FP + (truth[predicted_pos] != label).sum()

    # Evaluate all four counts in a single dask.compute call.
    TN, FP, FN, TP = dask.compute(TN, FP, FN, TP)
    return TN, FP, FN, TP

# Confusion matrix of the model's predictions on the training split.
TN, FP, FN, TP = confusion_matrix_dask(y_train.values, y_train_pred)

print("Read like \n[[TN, FP], \n[FN, TP]]\n")
train_matrix = np.array([[TN, FP], [FN, TP]])
print(train_matrix)

# Total of the four cells, shown as the cell output.
sum((TP, FP, TN, FN))

Read like 
[[TN, FP], 
[FN, TP]]

[[6221   16]
 [1680   29]]


7946

In [21]:
"""
y_train_pred_computed = y_train_pred.compute()
scores_train_computed = scores_train.compute()
"""
# saving model as pickle
import pickle
save_model = "/s/project/kipoi-cadd/data/processed/kipoi_cadd_models/lr.pickle"
pickle.dump(lr, open(save_model, "wb"))

In [24]:
# Reload the pickled model and evaluate it on the test split.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# The context manager closes the file handle, which the bare `open(...)` never did.
with open(save_model, "rb") as model_file:
    my_model = pickle.load(model_file)

y_test_pred = my_model.predict(X_test.values)
TN_test, FP_test, FN_test, TP_test = confusion_matrix_dask(y_test.values, y_test_pred)
print("Read like \n[[TN, FP], \n[FN, TP]]\n")
print(np.array([[TN_test, FP_test], [FN_test ,TP_test]]))
# The four cells should add up to the number of test rows printed on the next line.
print(sum([TP_test, FP_test, TN_test, FN_test]))
print(len(y_test))

Read like 
[[TN, FP], 
[FN, TP]]

[[1638   13]
 [ 400    3]]
2054
2054


In [19]:
from sklearn.metrics import confusion_matrix

# Evaluate the dask collections, then build the confusion matrix with scikit-learn
# as a cross-check of the counts from confusion_matrix_dask.
y_train_computed = y_train.compute()
y_train_pred_computed = y_train_pred.compute()

print("Read like \n tn, fp, \n fn, tp \n")
train_cm = confusion_matrix(y_train_computed, y_train_pred_computed)
tn, fp, fn, tp = train_cm.ravel()
train_cm

Read like 
 tn, fp, 
 fn, tp 



array([[6221,   16],
       [1680,   29]])