In [11]:
import scipy as sp
import h5py
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import gc
from utils import SPARSE_DATA_DIR, DATA_DIR
from sklearn.decomposition import TruncatedSVD
from joblib import dump, load

In [2]:
train_multi_inputs_values = sp.sparse.load_npz(
    f'{SPARSE_DATA_DIR}/train_multi_inputs_values.sparse.npz'
)
train_multi_targets_values = sp.sparse.load_npz(
    f'{SPARSE_DATA_DIR}/train_multi_targets_values.sparse.npz'
)

In [3]:
train_multi_inputs_idx = np.load(
    f'{SPARSE_DATA_DIR}/train_multi_inputs_idxcol.npz'
)
train_multi_targets_idx = np.load(
    f'{SPARSE_DATA_DIR}/train_multi_targets_idxcol.npz'
)

pretty long and very wide data

In [4]:
train_multi_inputs_values.shape

(105942, 228942)

In [5]:
train_multi_targets_values.shape

(105942, 23418)

We already know the input data is very sparse. 

- What about output?

	For CITE we know it's about 75% sparse, and this checks out:
	```
	2.3 GB   train_cite_inputs.h5
	711 MB  train_cite_inputs_values.sparse.npz
	```
	Outputs aren't really sparse at all
	```
	36 MB  train_cite_targets_values.sparse.npz
	37 MB   train_cite_targets.h5
	```
	For multi, I read that inputs were about 95% sparse. Hmmm.
	```
	11 GB   train_multi_inputs.h5
	2.8 GB  train_multi_inputs_values.sparse.npz
	```
	Regardless, targets are about as sparse as CITE inputs, which makes sense.
	```
	3.0 GB   train_multi_targets.h5
	824 MB  train_multi_targets_values.sparse.npz
	```



- How do columns change in availability across input/output and technologies?

In [16]:
# Slow to even load just 1 row of multi, so commented out for another method
# train_multi_target_cols = pd.read_hdf(f'{DATA_DIR}/train_multi_targets.h5', start=0, end=1)

# very difficult to get this working...
import hdf5plugin
def extract_cols(filename):
    f = h5py.File(f'{DATA_DIR}/{filename}')
    key = list(f.keys())[0]
    group = f[key]
    axis = group['axis0']  # type: ignore
    return pd.Series(axis[()])  # type: ignore

In [17]:
train_multi_targets_cols = extract_cols("train_multi_targets.h5")
train_multi_inputs_cols = extract_cols("train_multi_inputs.h5")
train_cite_inputs_cols = extract_cols("train_cite_inputs.h5")
train_cite_targets_cols = extract_cols("train_cite_targets.h5")

In [8]:
train_cite_targets_cols # type: ignore

0       b'CD86'
1      b'CD274'
2      b'CD270'
3      b'CD155'
4      b'CD112'
         ...   
135    b'HLA-E'
136     b'CD82'
137    b'CD101'
138     b'CD88'
139    b'CD224'
Length: 140, dtype: bytes88

In [9]:
train_cite_inputs_cols # type: ignore

0            b'ENSG00000121410_A1BG'
1        b'ENSG00000268895_A1BG-AS1'
2             b'ENSG00000175899_A2M'
3         b'ENSG00000245105_A2M-AS1'
4           b'ENSG00000166535_A2ML1'
                    ...             
22045        b'ENSG00000198455_ZXDB'
22046        b'ENSG00000070476_ZXDC'
22047      b'ENSG00000162378_ZYG11B'
22048         b'ENSG00000159840_ZYX'
22049       b'ENSG00000074755_ZZEF1'
Length: 22050, dtype: bytes328

In [10]:
train_multi_targets_cols # type: ignore

0        b'ENSG00000121410'
1        b'ENSG00000268895'
2        b'ENSG00000175899'
3        b'ENSG00000245105'
4        b'ENSG00000166535'
                ...        
23413    b'ENSG00000070476'
23414    b'ENSG00000203995'
23415    b'ENSG00000162378'
23416    b'ENSG00000159840'
23417    b'ENSG00000074755'
Length: 23418, dtype: bytes120

In [11]:
train_multi_inputs_cols # type: ignore

0         b'GL000194.1:114519-115365'
1           b'GL000194.1:55758-56597'
2           b'GL000194.1:58217-58957'
3           b'GL000194.1:59535-60431'
4         b'GL000195.1:119766-120427'
                     ...             
228937        b'chrY:7814107-7815018'
228938        b'chrY:7818751-7819626'
228939        b'chrY:7836768-7837671'
228940        b'chrY:7869454-7870371'
228941        b'chrY:7873814-7874709'
Length: 228942, dtype: bytes208

# Simple linear model. 

- Need to do PCA on input because extremely wide
- TruncatedSVD time scales with `n_components`, so need to keep sorta small. 1000 was taking forever (nothing after 30min), but 1 components took ~2min. 16 took ~4 min. 32 = 6min. 64 = 9min.

In [8]:
N_COMPONENTS = 64

In [None]:
pca = TruncatedSVD(n_components = N_COMPONENTS, random_state=1)
reduced_train_multi_inputs_values = pca.fit_transform(train_multi_inputs_values)
X = pd.DataFrame(reduced_train_multi_inputs_values, columns = [str(i) for i in range(N_COMPONENTS)])
X.to_feather(f'{DATA_DIR}/../reduced/{N_COMPONENTS}_reduced_train_multi_inputs_values.feather')

In [22]:
Y = pd.read_hdf('../data/original/train_multi_targets.h5')

In [35]:
model = LinearRegression()

# Took 16min to train on full data, 64 components
# though only ~20seconds to train 10k rows (I blame swapping)
model.fit(X, Y)
dump(model, '64_reduced_full_multi_LinearRegression.joblib') 

In [4]:
X_test = train_multi_targets_values = sp.sparse.load_npz(
    f'{SPARSE_DATA_DIR}/test_multi_inputs_values.sparse.npz'
)

In [1]:
# TODO: probably want to add this to the training data to do PCA
# on all available inputs. Might overfit on unseen inputs, but that's 
# not part of this competition 😈
pca = TruncatedSVD(n_components = N_COMPONENTS, random_state=1)
reduced_test_multi_inputs_values = pca.fit_transform(X_test)

NameError: name 'TruncatedSVD' is not defined

In [42]:
X_test = pd.DataFrame(
    reduced_test_multi_inputs_values, 
    columns = [str(i) for i in range(N_COMPONENTS)],
    )
X_test.to_feather(f'{DATA_DIR}/../reduced/{N_COMPONENTS}_reduced_test_multi_inputs_values.feather')

In [12]:
reg = load('../data/models/64_reduced_full_multi_LinearRegression.joblib') 

In [13]:
Y_hat = reg.predict(X_test)

In [None]:
# TODO: below code should be reusable across technologies

In [None]:
test_index = np.load("../data/sparse/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]
y_columns = np.load("../data/sparse/train_multi_targets_idxcol.npz",
                    allow_pickle=True)["columns"]

# Maps from row number to cell_id
cell_dict = dict((k,i) for i, k in enumerate(test_index))
assert( len(cell_dict) == len(test_index))
gene_dict = dict((k,i) for i, k in enumerate(y_columns))
assert( len(gene_dict) == len(y_columns))

In [None]:
assert(len(train_multi_targets_cols) == Y_hat.shape[1])
assert(len(test_index) == Y_hat.shape[0])

In [40]:
#  The below approach was too slow
    # Y_hat_indexed = pd.DataFrame(
    #     Y_hat, 
    #     columns = train_multi_targets_cols.values,   # type: ignore
    #     index = test_index
    # )
    # Y_hat_indexed.stack()

eval_ids = pd.read_parquet('../data/sparse/evaluation.parquet')

# Create two arrays of indices, so that for every row in long `eval_ids` 
# list we have the coordinates of the corresponding value in the 
# model's rectangular output matrix. 
eval_ids_cell_num = eval_ids.cell_id.apply(
    lambda x: cell_dict.get(x, -1)
)
eval_ids_gene_num = eval_ids.gene_id.apply(
    lambda x: gene_dict.get(x, -1)
)

# Eval_id rows that have both and "x" and "y" index are valid
# TODO: should check that nothing has just one (x or y) index
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

In [None]:
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)

In [138]:
# Neat numpy trick to make a 1d array from 2d based on two arrays: 
# one of "x" coordinates and one of "y" coordinates of the 2d array.
submission.iloc[valid_multi_rows] = Y_hat[  # type: ignore
    eval_ids_cell_num[valid_multi_rows].to_numpy(),  # type: ignore
    eval_ids_gene_num[valid_multi_rows].to_numpy()  # type: ignore
]

In [None]:
cite_submission = pd.read_csv(
    '../data/submissions/cite_linreg.csv',
    index_col=0
)

In [204]:
submission[submission.isna()] = cite_submission.head(6812820)['target']
submission.reset_index(drop=True, inplace=True)
submission.index.name = 'row_id'
assert(submission.isna().sum() == 0)

In [215]:
submission.to_csv("../data/submissions/full_64_reduced_linreg.csv")