In [1]:
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# Skip this cell if you are running from a local environment.
# !pip install -q --no-warn-conflicts git+https://github.com/google-deepmind/nuclease_design.git
# After installing, restart the kernel,
# then run the remaining cells as normal.

# Imports

In [3]:
from multiprocessing import pool as pool_lib
import pathlib

from nuclease_design import utils
from nuclease_design import preprocessing
from nuclease_design import constants


# Constants

In [4]:
# Per-generation keyword arguments, keyed by generation name.
# `preprocess_wrapper` unpacks the selected entry into
# `preprocessing.preprocess_generation(...)` and also reads its 'fiducials'
# entry, so every *_INFO value must be a mapping that contains 'fiducials'.
kwargs = {
    'g4': constants.G4_INFO,
    'g3': constants.G3_INFO,
    'g2': constants.G2_INFO,
    'g1': constants.G1_INFO,
}

In [5]:
# Directory passed as `data_dir` to `preprocessing.preprocess_generation`.
INPUT_DATA_DIR = constants.DATA_DIR

# Write to local directory: outputs go under ./data, resolved relative to the
# kernel's current working directory. Create it up front; this is a no-op if
# it already exists.
LOCAL_OUTPUT_DATA_DIR = pathlib.Path('data')
LOCAL_OUTPUT_DATA_DIR.mkdir(exist_ok=True, parents=True)

# Library functions

In [6]:
def _get_pvalue_df(dfs, generation):
  return dfs[f'{generation}_pvalue_df']

def _get_fiducial_efdf(dfs, generation, fiducial):
  return dfs[f'{generation}_fiducials_to_efdf'][fiducial]

In [7]:
def preprocess_wrapper(generation):
  """Preprocesses one generation and writes the results out as CSV files.

  Calls `preprocessing.preprocess_generation` with `INPUT_DATA_DIR` and the
  keyword arguments registered for `generation` in the module-level `kwargs`
  dict. It then writes one CSV per fiducial listed under the 'fiducials'
  entry, followed by one CSV for the p-value frame. Parent directories are
  created under `LOCAL_OUTPUT_DATA_DIR` before each write.

  Args:
    generation: Key into the module-level `kwargs` dict, e.g. 'g1'.

  Returns:
    A dict with keys 'pvalue_df', 'fiducials_to_efdf' and 'generation'.
  """
  generation_kwargs = kwargs[generation]
  pvalue_df, fiducials_to_efdf = preprocessing.preprocess_generation(
      data_dir=INPUT_DATA_DIR, **generation_kwargs
  )

  # Per-fiducial outputs.
  for fiducial in generation_kwargs['fiducials']:
    fiducial_filename = constants.get_fiducial_filename(fiducial, generation)
    fiducial_path = LOCAL_OUTPUT_DATA_DIR / fiducial_filename
    fiducial_path.parent.mkdir(exist_ok=True, parents=True)
    print(f'writing {generation} {fiducial_filename}')
    # LOCAL_OUTPUT_DATA_DIR is handed to utils.open_file separately from the
    # filename (presumably as the base directory -- confirm in utils).
    with utils.open_file(fiducial_filename, 'w', LOCAL_OUTPUT_DATA_DIR) as out:
      encoded_efdf = utils.encode_df(fiducials_to_efdf[fiducial])
      encoded_efdf.to_csv(out, index=False)

  # Generation-level p-value output.
  pvalue_filename = constants.get_processed_data_file(generation)
  print(f'writing {generation} {pvalue_filename}')

  pvalue_path = LOCAL_OUTPUT_DATA_DIR / pvalue_filename
  pvalue_path.parent.mkdir(exist_ok=True, parents=True)
  with utils.open_file(pvalue_filename, 'w', LOCAL_OUTPUT_DATA_DIR) as out:
    encoded_pvalues = utils.encode_df(pvalue_df)
    encoded_pvalues.to_csv(out, index=False)

  return {
      'pvalue_df': pvalue_df,
      'fiducials_to_efdf': fiducials_to_efdf,
      'generation': generation,
  }

In [8]:
%%time
# Run preprocess_wrapper for every generation concurrently, using a thread
# pool with as many workers as there are generations.
all_generations = ['g4', 'g3', 'g2', 'g1']

with pool_lib.ThreadPool(len(all_generations)) as pool:
    # `map` blocks until every call has finished and returns the results in
    # the same order as `all_generations`, so dfs[i] is the result dict for
    # all_generations[i].
    dfs = pool.map(preprocess_wrapper, all_generations)
    # Shut the workers down explicitly before the context manager exits.
    pool.close()
    pool.join()

In [None]:
# Preview the first rows of one fiducial's frame. dfs[0] is the 'g4' result
# because pool.map returns results in the order of `all_generations`.
dfs[0]['fiducials_to_efdf']['a63p_a73r_d74h_i84y'].head()

Unnamed: 0,nuc_mutations,mutations,read_count_0_input_g4,read_count_1_70_g4,read_count_2_90_g4,read_count_3_98_g4,read_count_4_99.5_g4,ef_1_70_g4,ef_2_90_g4,ef_3_98_g4,ef_4_99.5_g4
2499,"((G, 187, C), (A, 189, G), (G, 217, A), (C, 21...","((A, 63, P), (A, 73, R), (D, 74, H), (I, 84, Y))",210.0,149.0,397.0,1428.0,1504.0,2.118487,4.076701,14.973083,22.973279
3063,"((A, 117, C), (G, 187, C), (A, 189, G), (G, 21...","((A, 63, P), (A, 73, R), (D, 74, H), (I, 84, Y))",160.0,182.0,374.0,3261.0,2072.0,3.396333,5.040682,44.877964,41.539782
3259,"((G, 187, C), (A, 189, G), (C, 198, T), (G, 21...","((A, 63, P), (A, 73, R), (D, 74, H), (I, 84, Y))",152.0,93.0,257.0,2562.0,2763.0,1.82683,3.646089,37.114009,58.308483
3261,"((G, 187, C), (A, 189, G), (C, 198, T), (G, 21...","((A, 63, P), (A, 73, R), (D, 74, H), (I, 84, Y))",130.0,47.0,217.0,694.0,1361.0,1.079477,3.599599,11.754887,33.582206
3328,"((T, 111, C), (G, 187, C), (A, 189, G), (A, 21...","((A, 63, P), (A, 73, R), (D, 74, H), (I, 84, Y))",100.0,15.0,110.0,1422.0,561.0,0.447868,2.372086,31.31136,17.995226
