In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import json

# Location of the Excel workbook that the loading cell reads.
# NOTE(review): "/content/..." looks like a Colab session path - confirm and
# adjust when the notebook is run in another environment.
file_path = "/content/V2_data.xlsx"

In [8]:
# Worksheet to load from the workbook.
sheet_name = "Results"

# header=7 is zero-based: the column names sit on spreadsheet row 8.
df = pd.read_excel(file_path, sheet_name=sheet_name, header=7, engine="openpyxl")

print("Raw shape:", df.shape)

Raw shape: (4329, 13)


  warn(msg)


In [9]:
# Normalize the header labels: cast each to str, trim surrounding whitespace,
# then delete embedded line feeds and carriage returns.
df.columns = [
    label.strip().replace("\n", "").replace("\r", "")
    for label in df.columns.astype(str)
]

# Show each label through repr() so stray whitespace would be visible.
print("Cleaned columns:")
for col in df.columns:
    print(f"{col!r}")

Cleaned columns:
'Construct Barcode'
'Extended Spacer(NNNN[20nt]NGGNNN)'
'Strand'
'Transcript'
'Gene Symbol'
'Amino Acid Cut position'
'Percent Peptide'
'Annotation'
'sgRNA Score'
'Low Flag'
'AZD_200nM'
'6TG_2ug/mL'
'PLX_2uM'


In [10]:
# Columns the later cells read. The notebook checks that all of them exist and
# drops any row that has a missing value in one of them.
required_cols = [
    "Construct Barcode",                    # sequence passed to one_hot_20mer
    "Extended Spacer(NNNN[20nt]NGGNNN)",    # source string for the PAM
    "Gene Symbol",                          # mapped to integer gene ids
    "Amino Acid Cut position",              # scalar feature, min-max scaled
    "Percent Peptide",                      # target, min-max scaled
]

In [11]:
# Verification step: fail fast when the sheet lacks a column listed in
# required_cols, reporting both what is missing and what is available.
missing_cols = [name for name in required_cols if name not in df.columns]

if not missing_cols:
    print("All required columns found.")
else:
    raise ValueError(
        f"Missing required columns: {missing_cols}\n"
        f"Available columns: {df.columns.tolist()}"
    )


All required columns found.


In [12]:
# Discard rows that lack a value in any required column, then renumber the
# index so positional and label indexing agree in the following cells.
before = len(df)
df = df.dropna(subset=required_cols).reset_index(drop=True)
after = len(df)

print(f"Rows before dropna: {before}")
print(f"Rows after dropna : {after}")

Rows before dropna: 4329
Rows after dropna : 4328


In [13]:
# One-hot encoding of a fixed-length nucleotide sequence.
# Row of the encoded matrix assigned to each base.
BASE2IDX = {"A": 0, "C": 1, "G": 2, "T": 3}

def one_hot_20mer(seq, length = 20):
  """Encode a nucleotide string as a (4, length) float32 one-hot matrix.

  The input is converted with str(), upper-cased and stripped of surrounding
  whitespace. Column i carries a single 1.0 in the row given by BASE2IDX for
  the base at position i.

  Args:
    seq: sequence to encode (any object; str() is applied).
    length: required number of bases after normalization (default 20).

  Returns:
    numpy array of shape (4, length), dtype float32.

  Raises:
    ValueError: if the normalized sequence does not have exactly `length`
      characters, or contains a character other than A, C, G or T.
  """
  normalized = str(seq).upper().strip()
  if len(normalized) != length:
    raise ValueError(f"Sequence length {len(normalized)} != {length} for: {normalized}")

  encoded = np.zeros((4, length), dtype = np.float32)
  for position, base in enumerate(normalized):
    if base not in BASE2IDX:
      raise ValueError(f"Invalid base '{base}' in sequence: {normalized}")
    encoded[BASE2IDX[base], position] = 1.0

  return encoded




In [14]:
# One-hot encode every construct barcode, then stack the per-row (4, 20)
# matrices along a new leading axis.
seq_list = [one_hot_20mer(barcode) for barcode in df["Construct Barcode"]]

X_seq = np.stack(seq_list, axis = 0)
print("X_seq shape:", X_seq.shape)



X_seq shape: (4328, 4, 20)


In [15]:
# Extract and encode PAM from Extended Spacer

def extracct_pam(ext_seq):
  """
  Return the 3-base PAM of an extended spacer laid out as NNNN[20nt]NGGNNN.

  The value is converted with str(), upper-cased, and stripped of ALL
  whitespace (spaces, tabs, line breaks) before slicing. Removing only spaces
  would let a leading tab shift the positional window, or a trailing newline
  end up inside the fallback slice.

  Default: positions 24:27 (4 leading bases + 20-nt protospacer, then the PAM).
  If the cleaned string is shorter than 27 characters, the last 3 characters
  are returned as a fallback.

  Args:
    ext_seq: extended spacer value (any object; str() is applied).

  Returns:
    str of up to 3 characters (shorter only when the cleaned input itself has
    fewer than 3 characters).
  """

  # str.split() with no argument splits on any whitespace run, so joining the
  # pieces removes spaces, tabs and line breaks anywhere in the string.
  s = "".join(str(ext_seq).upper().split())
  if len(s) >= 27:
    pam = s[24:27]
  else:
    pam = s[-3:]
  return pam

# Derive the PAM for every row and list the distinct values that occur.
df["PAM"] = df["Extended Spacer(NNNN[20nt]NGGNNN)"].map(extracct_pam)
print("Unique PAMs:", df["PAM"].unique())


Unique PAMs: ['GGG' 'CGG' 'TGG' 'AGG']


In [17]:
# PAM vocabulary: sorted so the ids do not depend on row order.
unique_pams = sorted(df["PAM"].unique())
pam2id = dict(zip(unique_pams, range(len(unique_pams))))
print("pam2id mapping:", pam2id)

pam_ids = df["PAM"].map(pam2id).to_numpy()

# Row k of the identity matrix is the one-hot vector for id k, so indexing it
# with the per-row ids yields the (n_rows, n_pams) one-hot matrix.
X_pam = np.eye(len(unique_pams), dtype=np.float32)[pam_ids]

print("X_pam shape:", X_pam.shape)

pam2id mapping: {'AGG': 0, 'CGG': 1, 'GGG': 2, 'TGG': 3}
X_pam shape: (4328, 4)


In [18]:
# Encode Gene Symbol as integer IDs: sorted symbols get consecutive ids from 0.

unique_genes = sorted(df["Gene Symbol"].astype(str).unique())
gene2id = dict(zip(unique_genes, range(len(unique_genes))))
print("Number of unique genes:", len(unique_genes))

# Per-row id vector (int64).
gene_ids = df["Gene Symbol"].astype(str).map(gene2id).to_numpy(dtype=np.int64)
print("gene_ids shape:", gene_ids.shape, "min:", gene_ids.min(), "max:", gene_ids.max())

Number of unique genes: 15
gene_ids shape: (4328,) min: 0 max: 14


In [19]:
# Amino Acid Cut position as a float32 vector (raw values; scaling is done in
# the scaler cell).

cut_raw = df["Amino Acid Cut position"].to_numpy(dtype=np.float32)
print("Cut position eaw stats: min", cut_raw.min(), "max:", cut_raw.max())

Cut position eaw stats: min 1.0 max: 2826.0


In [20]:
# Target column (Percent Peptide) as a float32 vector, still unscaled.
y_raw = df["Percent Peptide"].to_numpy(dtype=np.float32)
print("Percent Peptide eaw stats: min", y_raw.min(), "max:", y_raw.max())

Percent Peptide eaw stats: min 0.05 max: 100.0


In [24]:
# Split row positions rather than the arrays themselves, so every feature
# array can be sliced with the same index sets.
N = df.shape[0]
indices = np.arange(N)

# Stage 1: hold out 40% of the rows; the remaining 60% form the training set.
train_idx, temp_idx = train_test_split(indices, test_size=0.4, random_state=42, shuffle=True)

# Stage 2: halve the hold-out into validation and test (20% of all rows each).
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42, shuffle=True)

print("Train size:", len(train_idx), "Val size:", len(val_idx), "Test size", len(test_idx))

Train size: 2596 Val size: 866 Test size 866


In [28]:
# Fit scalers on train only and transform all

# sklearn expects 2-D input, so view each vector as a single-column matrix.
cut_raw_2d = cut_raw[:, np.newaxis]
y_raw_2d = y_raw[:, np.newaxis]

# The min/max statistics come from the training rows only, which keeps
# validation and test values out of the scaling parameters.
cut_scaler = MinMaxScaler().fit(cut_raw_2d[train_idx])
y_scaler = MinMaxScaler().fit(y_raw_2d[train_idx])

# Apply the train-fitted scaling to every row. Rows whose raw value lies
# outside the training range can land outside [0, 1].
cut_scaled_full = cut_scaler.transform(cut_raw_2d).astype(np.float32)
y_scaled_full = y_scaler.transform(y_raw_2d).astype(np.float32)

print("Cut scaled range (full):", cut_scaled_full.min(), cut_scaled_full.max())
print("y scaled range (full):", y_scaled_full.min(), y_scaled_full.max())

Cut scaled range (full): 0.0 0.99999994
y scaled range (full): 0.0 1.0018042


In [29]:
# Helper for cutting one array into its train / val / test parts.

def split_by_index(arr, train_idx, val_idx, test_idx):
  """Index `arr` with each of the three index sets.

  Args:
    arr: any object supporting `arr[index]`.
    train_idx, val_idx, test_idx: indexers passed to `arr[...]` unchanged.

  Returns:
    3-tuple `(arr[train_idx], arr[val_idx], arr[test_idx])`.
  """
  return tuple(arr[part] for part in (train_idx, val_idx, test_idx))

In [30]:
# Slice every feature array and the target with the same three index sets so
# that rows stay aligned across the arrays within each split.

# Sequence branch: one-hot encoded barcodes
X_seq_train, X_seq_val, X_seq_test = split_by_index(X_seq, train_idx, val_idx, test_idx)

# PAM branch (one-hot)
X_pam_train, X_pam_val, X_pam_test = split_by_index(X_pam, train_idx, val_idx, test_idx)

# Gene IDs (integer ids, for embedding)
X_gene_train, X_gene_val, X_gene_test = split_by_index(gene_ids, train_idx, val_idx, test_idx)

# Cut position scalar (min-max scaled with the train-fitted scaler)
X_cut_train, X_cut_val, X_cut_test = split_by_index(cut_scaled_full, train_idx, val_idx, test_idx)

# Percent Peptide scalar (targets, min-max scaled with the train-fitted scaler)
y_train, y_val, y_test = split_by_index(y_scaled_full, train_idx, val_idx, test_idx)

# Report the training-split shapes as a quick sanity check.
print("X_seq_train shape:", X_seq_train.shape)
print("X_pam_train shape:", X_pam_train.shape)
print("X_gene_train shape:", X_gene_train.shape)
print("X_cut_train shape:", X_cut_train.shape)
print("y_train shape:", y_train.shape)

X_seq_train shape: (2596, 4, 20)
X_pam_train shape: (2596, 4)
X_gene_train shape: (2596,)
X_cut_train shape: (2596, 1)
y_train shape: (2596, 1)


In [31]:
# Write every split to the current working directory as .npy files. Each array
# is coerced to an explicit dtype first, so the on-disk dtype is fixed
# (float32 for features and targets, int64 for gene ids).

# One-hot sequences
np.save("X_seq_train.npy", np.asarray(X_seq_train, dtype=np.float32))
np.save("X_seq_val.npy", np.asarray(X_seq_val, dtype=np.float32))
np.save("X_seq_test.npy", np.asarray(X_seq_test, dtype=np.float32))

# One-hot PAMs
np.save("X_pam_train.npy", np.asarray(X_pam_train, dtype=np.float32))
np.save("X_pam_val.npy", np.asarray(X_pam_val, dtype=np.float32))
np.save("X_pam_test.npy", np.asarray(X_pam_test, dtype=np.float32))

# Gene ids
np.save("X_gene_train.npy", np.asarray(X_gene_train, dtype=np.int64))
np.save("X_gene_val.npy", np.asarray(X_gene_val, dtype=np.int64))
np.save("X_gene_test.npy", np.asarray(X_gene_test, dtype=np.int64))

# Scaled cut positions
np.save("X_cut_train.npy", np.asarray(X_cut_train, dtype=np.float32))
np.save("X_cut_val.npy", np.asarray(X_cut_val, dtype=np.float32))
np.save("X_cut_test.npy", np.asarray(X_cut_test, dtype=np.float32))

# Scaled targets
np.save("y_train.npy", np.asarray(y_train, dtype=np.float32))
np.save("y_val.npy", np.asarray(y_val, dtype=np.float32))
np.save("y_test.npy", np.asarray(y_test, dtype=np.float32))

In [32]:
# Persist the label -> id vocabularies as JSON.
with open("gene2id.json", "w") as f:
  f.write(json.dumps(gene2id))

with open("pam2id.json", "w") as f:
  f.write(json.dumps(pam2id))

# Save the per-feature min and max each scaler learned from the training
# rows, so scaled values can be inverse-transformed later.
np.save("cut_scaler_min.npy", cut_scaler.data_min_)
np.save("cut_scaler_max.npy", cut_scaler.data_max_)
np.save("y_scaler_min.npy", y_scaler.data_min_)
np.save("y_scaler_max.npy", y_scaler.data_max_)