# Train logistic regression with Keras
Martin Kircher provided two training_data files: one is human readable and the other one is one-hot-encoded.
Now we need to check that the information is actually in the same order.

In [117]:
import pandas as pd
import numpy as np
import pickle
import os
import csv
import time
import datetime
from keras.models import Model, Sequential
from keras.layers import Input, Dense

## 0. How many lines does the dataset have?
Let's create a shuffled index list and store it in our system, to be able to create batches of data to pass to the fit_generator

In [104]:
%%time
training_imputed = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/training_data.imputed.csv"
shuffled_index_file = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/shuffled_index.pickle"
batches_index = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/batches_index.pickle"
training = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/training_data.tsv"
training_batches_folder = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/training/"
testing_batches_folder = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/testing/"
validation_batches_folder = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/validation/"
with open(shuffled_index_file, 'rb') as f:
    shuffled_index = pickle.load(f)
with open(batches_index, 'rb') as f:
    batches_index = pickle.load(f)

CPU times: user 40 s, sys: 8.62 s, total: 48.7 s
Wall time: 48.5 s


## 1. Select idex list for batch and extract

In [87]:
def load_batch(directory, batch_size, sep=','):
    # Add for loop because the generator has to be able to start itself again
    sub_batch = None
    for file in os.listdir(directory):
        filename = directory + str(os.fsdecode(file))
        rows_df = pd.read_csv(filename, sep=sep)
        rows_df.y = [0 if r == -1 else r for r in rows_df.y]
        sub = (rows_df.shape[0] // batch_size) + 1
        for i in range(sub):
            start = (i) * batch_size
            if sub_batch is None:
                end = min(rows_df.shape[0], start + batch_size)
                sub_batch = rows_df.iloc[start:end,:]
            else:
                end = batch_size - sub_batch.shape[0]
                sub_batch = sub_batch.append(rows_df.iloc[start:end,:])
            if sub_batch.shape[0] == batch_size:
                yield (sub_batch.iloc[:,1:], sub_batch.iloc[:,0])
                sub_batch = None

In [81]:
i = 1
this_value = None
for batch in load_batch(training_batches_folder, 3200):
    this_value = batch
    print(this_value[0].shape, this_value[1].shape)
    if i ==4: break
    i += 1

/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/training/1.pickle
(3200, 1063) (3200,)
(3200, 1063) (3200,)
(3200, 1063) (3200,)
/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/training/2.pickle
(3200, 1063) (3200,)


In [54]:
this_value[0].shape

(9999, 1063)

## 1. Use single threaded generator and feed it to keras.models.fit_generator()
Previous tests have "demonstrated" that multithreading only makes things slower. Now, we use a generator that yields a single line, as required by the `fit_generator` method from `keras.models`. Right now, I'm taking inspiration from this tutorial in Medium: [Simple Logistic Regression using Keras](https://medium.com/@the1ju/simple-logistic-regression-using-keras-249e0cc9a970).

In [122]:
%%time
# Build the model
output_dim = 1 # One binary class
input_dim = 1063 # number of features of the input (102 for training, and 1063 for training_imputed)
model = Sequential() 
model.add(Dense(output_dim, input_dim=input_dim, activation='softmax'))
batch_size = 64
nb_epoch = 1
nb_steps_training = 100000 # 34693009 / batch_size = 542078.265625
nb_steps_prediction = 50000 # 350051 / batch_size = 5469.546875

training_generator = load_batch(training_batches_folder, batch_size)
testing_generator = load_batch(testing_batches_folder, batch_size)
validation_generator = load_batch(validation_batches_folder, batch_size)

"""
steps_per_epoch: Integer or None. Total number of steps (batches of samples) before declaring one epoch finished
and starting the next epoch. When training with input tensors such as TensorFlow data tensors, the default None
is equal to the number of samples in your dataset divided by the batch size, or 1 if that cannot be determined.
"""

# Compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy']) 
# history = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,verbose=1, validation_data=(X_test, Y_test)) 
history = model.fit_generator(training_generator, steps_per_epoch=nb_steps_training, epochs=nb_epoch, shuffle=False, verbose=1)
score = model.evaluate_generator(testing_generator, steps=nb_steps_prediction, max_queue_size=10)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/1
Test score: 7.979681264748594
Test accuracy: 0.4994675
CPU times: user 37min 50s, sys: 2min 55s, total: 40min 46s
Wall time: 36min 32s


## 2. Predict on a validation set
See if the results are close to CADD

In [123]:
y_pred = model.predict_generator(validation_generator, steps=nb_steps_prediction)

In [118]:
model_file = ("/s/project/kipoi-cadd/data/models/model" +
                  datetime.datetime.now().strftime("%Y.%m.%d_%H:%M:%S") +
                  ".pickle")
with open(model_file, 'wb') as f:
        pickle.dump(model, f)

## 3. Check if batch splitting worked
Lines match.

In [97]:
# Check indexes
len(batches_index)
sum_lines = 0
for b in batches_index:
    lines = len(b.get('index_list'))
    if lines != 10000:
        print("lines =", lines)
    sum_lines += lines
print(sum_lines)

lines = 3060
35043060


In [126]:
# Check batches
total_num_examples = 35043061 - 1
current = 35046565 - 3505
diff = total_num_examples - current
diff

0

In [189]:
# Check the sizes of train and test
#! zcat train.csv.gz | wc -l
train_size = 34693009
#! zcat test.csv.gz | wc -l
test_size = 350051
(test_size / train_size)*100

1.008995789324587

In [103]:
# Check testing batches
output = ("/s/project/kipoi-cadd/data/raw/v1.3/training_data/" +
              "shuffle_splits/tests/")
batch = pd.read_csv(output+'3.csv')
batch.shape

(6619, 1064)

In [132]:
# Split last batch to get exact amount
path = ("/s/project/kipoi-cadd/data/raw/v1.3/training_data/" +
              "shuffle_splits/")
split_batch = pd.read_csv(path + "tests/3470.csv", index_col=0)
train = split_batch.iloc[:3009,:]
test = split_batch.iloc[3009:,:]
train.to_csv(path + "training/3469.csv", mode='a')
test.to_csv(path + "tests/3470.csv")
"""
Testing = 350087-36=350051
Training = 34696478-3469=34693009
"""