In [None]:
# Switch the working directory so the relative paths used in later cells
# (e.g. "tests/seqfit/...") resolve.
# NOTE(review): assumes the LevSeq checkout lives at ~/LevSeq — adjust if not.
%cd ~/LevSeq

In [None]:
# Re-import edited modules automatically before each cell runs
# (autoreload mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2
# Load the blackcellmagic extension (code-formatting cell magic).
%load_ext blackcellmagic

In [None]:
# Display the Python interpreter version this notebook is running on
# (the bare last expression becomes the cell output).
import sys
sys.version

## Preprocess data

In [None]:
from levseq.seqfit import process_plate_files, gen_seqfitvis

In [None]:
# Combine the sequencing results with the fitness measurements for the
# "pdt" product. The call returns the merged table together with a path
# (seqfit_path) that later cells pass on to append_xy / gen_seqfitvis.
processed_plate_df, seqfit_path = process_plate_files(
    seq_csv="tests/seqfit/HMC0225_HMC0226.csv",
    products=["pdt"],
)

## Perform statistical analysis

In [None]:
from levseq.seqfit import normalise_calculate_stats, calculate_mutation_combinations

# Analysis settings. These are the single source of truth for the call below;
# previously the call repeated the same values as literals, so editing a
# setting here silently had no effect.
parent = "#PARENT#"  # label passed as parent_label
value_columns = ["pdt"]  # measurement column(s) to analyse
normalise = "standard"  # one of parent, standard, minmax, none
stats_method = "mannwhitneyu"

stats_df = normalise_calculate_stats(
    processed_plate_df,
    value_columns,
    normalise=normalise,
    stats_method=stats_method,
    parent_label=parent,
)

# Largest "amount greater than parent mean" first, then persist the table to
# the current working directory.
stats_df = stats_df.sort_values(by="amount greater than parent mean", ascending=False)
stats_df.to_csv("stats.csv", index=False)

In [None]:
# Display the statistics table (sorted in the previous cell by
# "amount greater than parent mean", descending).
stats_df

In [None]:
# Feed the statistics table into calculate_mutation_combinations; the result
# is displayed in the next cell.
mutation_df = calculate_mutation_combinations(stats_df)

In [None]:
# Display the table returned by calculate_mutation_combinations.
mutation_df

## Calculate the embedding space of the variants

In [None]:
from levseq.seqfit import append_xy

# Run append_xy on the merged seq/fitness file for the "pdt" product,
# processing in batches of 32.
# NOTE(review): presumably this adds embedding x/y coordinates to the file at
# seqfit_path — confirm against levseq.seqfit.append_xy.
append_xy(
    input_file=seqfit_path,
    products=["pdt"],
    batch_size=32,
)

## Visualize 

In [None]:
# Build the sequence/fitness visualisation from the file at seqfit_path for
# the "pdt" product.
# NOTE(review): presumably relies on the coordinates added by append_xy in the
# previous section — confirm.
gen_seqfitvis(seqfit_path=seqfit_path, products=["pdt"])

## Perform variant analysis

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The wildcard import is kept so any other levseq.utils names used elsewhere in
# the notebook stay available; the names this cell depends on are imported
# explicitly below so their origin is visible.
from levseq.utils import *
from levseq.utils import translate
import numpy as np

# Maps each one-letter amino-acid code (plus "*", mapped to the stop codon
# TAA) to a single codon. Only the keys are used in this cell: they define the
# alphabet of the amino-acid one-hot encoder.
amino_acid_to_codon = {
    "A": "GCT",
    "R": "CGT",
    "N": "AAT",
    "D": "GAT",
    "C": "TGT",
    "Q": "CAA",
    "E": "GAA",
    "G": "GGT",
    "H": "CAT",
    "I": "ATT",
    "L": "CTT",
    "K": "AAA",
    "M": "ATG",
    "F": "TTT",
    "P": "CCT",
    "S": "TCT",
    "T": "ACT",
    "W": "TGG",
    "Y": "TAT",
    "V": "GTT",
    "*": "TAA",
}

aas = list(amino_acid_to_codon.keys())


def _flat_one_hot(fitted_encoder, symbols):
    """One-hot encode a string character by character and flatten the result.

    Parameters
    ----------
    fitted_encoder : OneHotEncoder
        Encoder already fitted on the alphabet of single characters.
    symbols : str
        Sequence to encode; each character becomes one row for the encoder.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(symbols) * n_categories``.

    Raises
    ------
    ValueError
        If a character is outside the fitted alphabet (OneHotEncoder's default
        ``handle_unknown="error"``).
    """
    column = np.array(list(symbols)).reshape(-1, 1)
    return fitted_encoder.transform(column).toarray().flatten()


seqs = []  # translated sequences, one per non-deletion row
one_hots_nc = []  # flattened nucleotide one-hot vectors
one_hots_aa = []  # flattened amino-acid one-hot vectors

# Nucleotide alphabet, including "-" and "*".
encoder = OneHotEncoder()
encoder.fit(np.array(["A", "T", "G", "C", "-", "*"]).reshape(-1, 1))

# Amino-acid alphabet taken from the keys of the table above.
encoder_aa = OneHotEncoder()
encoder_aa.fit(np.array(aas).reshape(-1, 1))

for nc in processed_plate_df["nt_sequence"].values:
    # Rows marked "Deletion" carry no sequence to encode; they are skipped here
    # and must be filtered the same way wherever the vectors are joined back
    # onto the table.
    if nc == "Deletion":
        print("Deletion")
        continue

    seq = translate(nc)
    # Previously `seqs` was initialised but never filled.
    seqs.append(seq)
    one_hots_nc.append(_flat_one_hot(encoder, nc))
    one_hots_aa.append(_flat_one_hot(encoder_aa, seq))

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Fit a 20-component PCA on the flattened nucleotide one-hot vectors.
# random_state is fixed because scikit-learn may choose a randomized SVD solver
# depending on the data shape, which would otherwise make the projection vary
# between runs.
X = np.array(one_hots_nc)
pca = PCA(n_components=20, random_state=0)
pca = pca.fit(X)
pcs = pca.transform(X)

# Same "Deletion" filter as the encoding loop, so rows line up with `pcs`.
# .copy() gives an independent frame: adding columns to a filtered slice of
# processed_plate_df triggers pandas' SettingWithCopyWarning.
non_deletions_df = processed_plate_df[
    processed_plate_df["nt_sequence"] != "Deletion"
].copy()
non_deletions_df["PC 1"] = pcs[:, 0]
non_deletions_df["PC 2"] = pcs[:, 1]

# Pass the frame via `data=` so the call also works on seaborn versions where
# scatterplot's arguments are keyword-only.
sns.scatterplot(data=non_deletions_df, x="PC 1", y="PC 2", hue="pdt")

In [None]:
# Scree plot: share of variance captured by each principal component.
# Component numbers are 1-based for display.
PC_values = np.arange(1, pca.n_components_ + 1)

fig, ax = plt.subplots()
ax.plot(
    PC_values,
    pca.explained_variance_ratio_ * 100,  # ratio -> percent
    "o-",
    linewidth=2,
    color="blue",
)
ax.set_title("Scree Plot")
ax.set_xlabel("Principal Component")
# The plotted values are scaled by 100, so the axis label states the unit.
ax.set_ylabel("Variance Explained (%)")
plt.show()