In [1]:
import pandas as pd
import numpy as np
import torch
import os
from sklearn.model_selection import train_test_split

In [8]:
#Load the raw gRNA efficiency dataset
df_raw = pd.read_excel("V2_data.xlsx", sheet_name="Results", header=7)

print("Raw data loaded")
print(df_raw.columns.tolist())
print(df_raw.shape)

Raw data loaded
['Construct Barcode', 'Extended Spacer(NNNN[20nt]NGGNNN)', 'Strand', 'Transcript', 'Gene Symbol', 'Amino Acid Cut position', 'Percent Peptide', 'Annotation', 'sgRNA Score', 'Low Flag', 'AZD_200nM', '6TG_2ug/mL', 'PLX_2uM']
(4329, 13)


  warn(msg)


In [10]:
# Filtering out only biologically relevant columns and renaming
df = df_raw[["Construct Barcode", "Percent Peptide"]].copy()
df.columns = ["sequence", "efficiency"]

print("Selected biological columns:")
print(df.head())

Selected biological columns:
               sequence  efficiency
0  TCCGGGTTGGCCTTCCACTG       73.72
1  GGCTGCTTTACCCGCTGTGG       86.69
2  CTCCGAGTCATTGTAGAGAC       35.15
3  CAGCATCCTTCGGAAAGCTC       27.30
4  CGGTAGAAGCAGGTAGTCTG       82.25


In [11]:
# Clean DNA sequences (ACGTonly, 20bp only)
df["sequence"] = df["sequence"].astype(str)

valid_mask = (
    df["sequence"].str.len() == 20
) & (
    df["sequence"].str.match("^[ACGT]+$")
)

df = df[valid_mask].reset_index(drop=True)

print("Cleaned dataset")
print(df.shape)

Cleaned dataset
(4329, 2)


In [17]:
# Normalizing peptide percents/efficiency
df["efficiency"] = (df["efficiency"] - df["efficiency"].min()) / (
    df["efficiency"].max() - df["efficiency"].min()
)

df["efficiency"].describe()

Unnamed: 0,efficiency
count,4328.0
mean,0.488074
std,0.292975
min,0.0
25%,0.232416
50%,0.487844
75%,0.746273
max,1.0


In [18]:
df.to_csv("V2_data_clean.csv", index=False)
print("V2_data_clean.csv saved")

V2_data_clean.csv saved


In [20]:
df = pd.read_csv("V2_data_clean.csv")
print(df.head(), df.shape)

               sequence  efficiency
0  TCCGGGTTGGCCTTCCACTG    0.737069
1  GGCTGCTTTACCCGCTGTGG    0.866833
2  CTCCGAGTCATTGTAGAGAC    0.351176
3  CAGCATCCTTCGGAAAGCTC    0.272636
4  CGGTAGAAGCAGGTAGTCTG    0.822411 (4329, 2)


In [21]:
# one hot encode real crispr guides

mapping = {"A":0, "C":1, "G":2, "T":3}

def one_hot_encode(seq):
  arr = np.zeros((20, 4))
  for i, base in enumerate(seq):
    arr[i, mapping[base]] = 1
  return arr

X_real = np.stack(df["sequence"].apply(one_hot_encode).values)
y_real = df["efficiency"].values

print(X_real.shape, y_real.shape)

(4329, 20, 4) (4329,)


In [23]:
# Creating a 605, 205, 205 split

X_train, X_temp, y_train, y_temp = train_test_split(
    X_real, y_real, test_size = 0.40, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size = 0.50, random_state=42
)

print("✅60/20/20 split shapes:")
print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

✅60/20/20 split shapes:
Train: (2597, 20, 4)
Val: (866, 20, 4)
Test: (866, 20, 4)


In [25]:
# Convert into tensors

X_train_t = torch.tensor(X_train, dtype = torch.float32)
y_train_t = torch.tensor(y_train, dtype = torch.float32)

X_val_t = torch.tensor(X_val, dtype = torch.float32)
y_val_t = torch.tensor(y_val, dtype = torch.float32)

X_test_t = torch.tensor(X_test, dtype = torch.float32)
y_test_t = torch.tensor(y_test, dtype = torch.float32)


In [26]:
# Saving tensors for later use

os.makedirs("real_data", exist_ok=True)

np.save("real_data/X_train_real.npy", X_train_t.numpy())
np.save("real_data/y_train_real.npy", y_train_t.numpy())

np.save("real_data/X_val_real.npy", X_val_t.numpy())
np.save("real_data/y_val_real.npy", y_val_t.numpy())

np.save("real_data/X_test_real.npy", X_test_t.numpy())
np.save("real_data/y_test_real.npy", y_test_t.numpy())

print("Real CRISPR tensors saved with 60/20/20 split.")

Real CRISPR tensors saved with 60/20/20 split.
