In [41]:
from kipoi_cadd.data_utils import load_csv_to_sparse_matrix
from kipoi_cadd.data import cadd_train_valid_data, train_test_split_indexes, CaddDataset
from kipoi_cadd.utils import dump_to_pickle, load_pickle
import pandas as pd
import time
from tqdm import tqdm, trange
import numpy as np
from kipoi.data_utils import numpy_collate_concat
from scipy.sparse import csr_matrix, save_npz, load_npz
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [9]:
training_dir = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/"
lmdb_dir = training_dir + "lmdb"
csv_file = training_dir + "training_data.imputed.csv"
valid_id_file = training_dir + "valid_idx.pkl"
all_ids_file = training_dir + "variant_ids.pkl"
ids_10k_file = training_dir + "sample_indices/ids_10k.pkl"
std_scaler_file = training_dir + "stats/standard_scaler_first10k.pkl"
short_csv = training_dir + "last10k.csv"

## Use read_csv with file buffer and nrows
Here we realize that `nrows` is probably not meant to be used together with buffers. We obtain weird results.

In [32]:
f = open(csv_file, "rb")

In [33]:
df = pd.read_csv(f, nrows=1000)
header = df.columns.values
dtypes = df.dtypes.to_dict()
df.head()

Unnamed: 0,y,RefxA,RefxC,RefxG,RefxT,RefxN,AltxA,AltxC,AltxG,AltxT,...,YxM,YxN,YxP,YxQ,YxR,YxS,YxT,YxV,YxW,YxY
0,-1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,-1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df2 = pd.read_csv(f, nrows=1000, names=header)
df2.head()

Unnamed: 0,y,RefxA,RefxC,RefxG,RefxT,RefxN,AltxA,AltxC,AltxG,AltxT,...,YxM,YxN,YxP,YxQ,YxR,YxS,YxT,YxV,YxW,YxY
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1,-1,0,1,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1,0,1,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1,0,0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1,0,0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
f.close()

In [37]:
df3 = pd.read_csv(csv_file, nrows=200)
df3.loc[99:105,:]

Unnamed: 0,y,RefxA,RefxC,RefxG,RefxT,RefxN,AltxA,AltxC,AltxG,AltxT,...,YxM,YxN,YxP,YxQ,YxR,YxS,YxT,YxV,YxW,YxY
99,-1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
100,-1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
101,-1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,-1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
103,-1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
104,-1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
105,-1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Load entire sparse csv directly into a sparse matrix using Dask
I saw this approach here https://stackoverflow.com/questions/31888856/read-a-large-csv-into-a-sparse-pandas-dataframe-in-a-memory-efficient-way.
After loading the sparse matrix, we want to **shuffle** it and then **persist** it. The dataset size in disk should be considerably reduced.

In [28]:
colnames = pd.read_csv(short_csv, nrows=1).columns.values
dic = {}
for i, col in enumerate(colnames):
    dic[col] = i

In [12]:
%%time
# Load sparse matrix directly from csv file
res = load_csv_to_sparse_matrix(short_csv)

Started dask task.
[########################################] | 100% Completed | 15.6s
Finished dask task.
Finished transforming to csr_matrix.
CPU times: user 23.2 s, sys: 4.14 s, total: 27.3 s
Wall time: 18.7 s


In [40]:
# Shuffle a sparse matrix
index = load_pickle(training_dir + "shuffle_splits/shuffled_index_10k.pkl")
print(index)
shuff_res = res[index, :]
shuff_res

[3299 3730 9371 ... 7391 8441 3889]


<10000x1064 sparse matrix of type '<class 'numpy.float32'>'
	with 665816 stored elements in Compressed Sparse Row format>

In [46]:
print(shuff_res[:,0].shape)
print(shuff_res[:,1:].shape)

(10000, 1)
(10000, 1063)


In [47]:
# Persist csr_matrix
save_npz(training_dir + "example_sparse.npz", shuff_res)

## Extract training and validation set from sparse matrix in disk

In [55]:
from sklearn.model_selection import train_test_split
def sparse_cadd_datset(sparse_matrix_file, split=0.3, random_state=42):
    sparse_matrix = load_npz(sparse_matrix_file)
    y = sparse_matrix[:,0]
    X = sparse_matrix[:,1:]
    del sparse_matrix

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=split, random_state=random_state)
    return (X_train, y_train), (X_valid, y_valid)

In [56]:
(X_train, y_train), (X_valid, y_valid) = sparse_cadd_datset(training_dir + "example_sparse.npz")

<3000x1 sparse matrix of type '<class 'numpy.float32'>'
	with 3000 stored elements in Compressed Sparse Row format>

In [59]:
print("Finished loading training dataset. Shape: ", X_train.shape, "True values:", y_train.sum()/y_train.shape[0])

Finished loading training dataset. Shape:  (7000, 1063) True values:   (0, 0)	1.0


In [62]:
y_train.shape[0]

7000

## Sparse matrices transformations with scikit scalers

In [9]:
X, y = CaddDataset(lmdb_dir, ids_10k_file).load_all(drop_last=False)
X_sparse = csr_matrix(X, shape=None, dtype=np.float32, copy=False)

100%|██████████| 157/157 [00:00<00:00, 502.63it/s]


In [10]:
scaler = load_pickle(std_scaler_file)

In [11]:
X_sp_tr = scaler.transform(X_sparse)

In [14]:
X_sp_tr

<10000x1063 sparse matrix of type '<class 'numpy.float32'>'
	with 647432 stored elements in Compressed Sparse Row format>

In [None]:
save_npz(training_dir + "sparse_matrix_10percent.npz", X_sparse)
# save_npz("/s/project/kipoi-cadd/data/raw/v1.3/training_data/sparse_all.npz", res)