# Train logistic regression with Scikit Learn
Martin Kircher provided two training_data files: one is human readable and the other one is one-hot-encoded.
Now we need to check that the information is actually in the same order.

In [103]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

## 1. Read in dataset with Dask

In [104]:
training = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/training_data.tsv"
training_imputed = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/training_data.imputed.csv"

training_df = pd.read_csv(training, sep='\t', nrows=10000)
training_imp_df = pd.read_csv(training_imputed, nrows=10000)
training_df.head()

Unnamed: 0,y,Chrom,Pos,Ref,Alt,Type,Length,isTv,Consequence,GC,...,SIFTcat,SIFTval,mirSVR-Score.na,targetScan.na,cDNApos.na,CDSpos.na,protPos.na,Grantham.na,PolyPhenVal.na,SIFTval.na
0,0,1,379177,T,G,SNV,0,1,NC,0.48,...,UD,0.0,1,1,0,1,1,1,1,1
1,0,1,379274,C,G,SNV,0,1,NC,0.54,...,UD,0.0,1,1,0,1,1,1,1,1
2,0,1,379476,A,T,SNV,0,1,UP,0.63,...,UD,0.0,1,1,1,1,1,1,1,1
3,0,1,379631,C,G,SNV,0,1,UP,0.34,...,UD,0.0,1,1,1,1,1,1,1,1
4,0,1,379724,A,G,SNV,0,0,UP,0.28,...,UD,0.0,1,1,1,1,1,1,1,1


In [105]:
y = training_df.y
X = training_imp_df.drop(columns="y")

# Inserting some artificial positive examples
np.random.seed(10)
msk = np.random.rand(len(y)) < 0.2
y[msk] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [107]:
# Drop constant columns
constant_cols = set()
dataset = X
for col in dataset:
    # print(dataset.shape, type(dataset))
    # print(len(np.unique(dataset[col])))
    if len(np.unique(dataset[col])) < 20 and col != 'y':
        constant_cols.add(col)
print("Droped", len(constant_cols), "constant cols.")

for dataset in [X_train, X_test, y_train, y_test]:
    dataset.drop(columns=list(constant_cols), inplace=True, errors='ignore')

X.drop(columns=list(constant_cols), inplace=True, errors='ignore')

Droped 824 constant cols.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


## 2. Split dataset into train and test

In [108]:
lr = LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=10)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=10,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [109]:
y_train_pred = lr.predict(X_train)

In [110]:
print("Read like \n tn, fp, \n fn, tp \n")
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
confusion_matrix(y_train, y_train_pred)

Read like 
 tn, fp, 
 fn, tp 



array([[6304,    0],
       [1695,    1]])