# Process screening data

In [1]:
import sys
sys.path.append("../src/")

import numpy as np

import matplotlib.pyplot as plt

from mask import simulate_mask
from plotting import plot_profile, plot_hmap
from splitting import train_val_split
from screening_data_generator.from_file import matlab_to_ndarray

%matplotlib inline

ModuleNotFoundError: No module named 'h5py'

In [None]:
# Directory that receives the data matrix and the masks written at the end
# of the notebook.
PATH_TO_DATA = "/Users/sela/Desktop/recsys/data/screening_data/"

# Seed handed to the train/validation split so the split is reproducible.
SEED = 42

# Load screening data from file

In [None]:
# Read the variable named "X" from the MATLAB file and bind the result to X.
# NOTE(review): the helper name suggests a numpy ndarray is returned -- confirm
# against screening_data_generator.from_file.
X = matlab_to_ndarray(
    "/Users/sela/phd/data/real/data_matrix_3m.mat",
    "X",
)

# Training and validation sets

In [None]:
# Second positional argument of train_val_split.
# NOTE(review): presumably the length of the prediction horizon in columns --
# confirm against splitting.train_val_split.
prediction_window = 4

# Build the training and validation masks together with the selection of rows
# that the splitter reports as valid.
O_train, O_val, valid_rows = train_val_split(
    X,
    prediction_window,
    method='last_observed',
    return_valid_rows=True,
    seed=SEED,
)

# Keep only the valid rows in the data matrix and in both masks.
X, O_train, O_val = X[valid_rows, :], O_train[valid_rows, :], O_val[valid_rows, :]

# Element-wise masking yields the training and validation views of the data.
X_train, X_val = X * O_train, X * O_val

# NB: Density is measured per profile (row), over the columns from the start
# of the row up to and including its last training observation -- not over
# the complete matrix.
train_density = []
for row in X_train:
    # The cumulative sum stops growing after the last non-zero entry, and
    # argmax returns the first position of the maximum, so this is the index
    # of that last entry (assumes non-negative scores -- TODO confirm).
    last_train_idx = np.argmax(np.cumsum(row))

    # Slice through last_train_idx inclusive. Stopping one short dropped the
    # last observation from the count and left an empty window (division by
    # zero) whenever the index was 0.
    observed_span = row[:last_train_idx + 1]
    train_density.append(np.count_nonzero(observed_span) / observed_span.size)

print(np.mean(train_density))

# Inspect

In [None]:
# 3x3 grid of axes; panel k overlays row k of the training and validation
# matrices, i.e. the first nine profiles.
_, axes = plt.subplots(figsize=(10, 10), nrows=3, ncols=3)
for idx, ax in enumerate(axes.flat):

    # Training is drawn first, then validation, on the same axis.
    for matrix, name in ((X_train, 'Training'), (X_val, 'Validation')):
        plot_profile(matrix[idx], ax, show=False, label=name)

    ax.legend()

In [None]:
# Heatmap of the validation matrix.
fig = plt.figure(figsize=(8, 8))
plt.title("Validation scores", fontsize=20)
plot_hmap(fig, X_val)

# Distinct non-zero values in the validation matrix and how often each occurs.
vals, cnts = np.unique(X_val[X_val != 0], return_counts=True)
print('States:', vals)
print('Counts:', cnts)

# Sanity check: number of rows whose entries sum to zero. The boolean mask is
# counted directly; len(np.squeeze(np.where(...))) raised TypeError when
# exactly one row matched, because squeeze collapsed the single index to a
# 0-d array.
print('Number of all-zero profiles:', np.count_nonzero(np.sum(X_val, axis=1) == 0))

In [None]:
# Heatmap of the training matrix.
fig = plt.figure(figsize=(8, 8))
plt.title("Training scores", fontsize=20)
plot_hmap(fig, X_train)

# Distinct non-zero values in the training matrix and how often each occurs.
vals, cnts = np.unique(X_train[X_train != 0], return_counts=True)
print('States:', vals)
print('Counts:', cnts)

# Sanity check: number of rows whose entries sum to zero. The boolean mask is
# counted directly; len(np.squeeze(np.where(...))) raised TypeError when
# exactly one row matched, because squeeze collapsed the single index to a
# 0-d array.
print('Number of all-zero profiles:', np.count_nonzero(np.sum(X_train, axis=1) == 0))

# Save data to disk

In [None]:
# Write the row-filtered data matrix and both masks under PATH_TO_DATA,
# in the same order as before: X, O_val, O_train.
for target, array in (
    (f'{PATH_TO_DATA}/X.npy', X),
    (f'{PATH_TO_DATA}/O_val.npy', O_val),
    (f'{PATH_TO_DATA}/O_train.npy', O_train),
):
    np.save(target, array)