In [None]:
# Switch the kernel's working directory to the LevSeq folder in the home directory;
# relative outputs written later in this notebook (e.g. 'stats.csv') are created there.
%cd ~/LevSeq

In [None]:
# Print the current working directory via the shell.
! pwd

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs and source edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Display the version string of the Python interpreter backing this kernel.
import sys
sys.version

In [None]:
! pip install biopandas

In [None]:
from levseq.seqfit import process_plate_files, gen_seqfitvis

In [None]:
# Process the sequence and fitness data and merge the two files.
# The call yields the processed plate dataframe and a seq-fit path; both are used by later cells.
# NOTE(review): input_csv is an absolute path on one specific machine - adjust before running elsewhere.
processed_plate_df, seqfit_path = process_plate_files(
    product=["pdt"],
    input_csv="/Users/arianemora/Documents/code/MinION/example/HMC0225_HMC0226.csv",
)

In [None]:
# Generate the seq-fit visualisation from the path returned by process_plate_files
# (what is rendered or written is defined in levseq.seqfit, not in this notebook).
gen_seqfitvis(seqfit_path=seqfit_path)

In [None]:
# Per-mutation statistics relative to the parent wells.
# For every distinct value in the 'Mutations' column (except the parent) this cell collects
# the number of wells, mean/std/median of the measured value, the difference to the parent
# mean, whether the mean exceeds the parent mean by `sd_cutoff` parent standard deviations,
# and (when there are at least 3 wells) a Mann-Whitney U test against the parent wells.
from scipy.stats import mannwhitneyu
from tqdm import tqdm
import pandas as pd
import numpy as np

parent = '#PARENT#'
value_column = 'pdt'
# 'none' (or None / empty) skips normalisation. Any other value applies the standard
# (z-score against the parent wells) normalisation - 'parent' and 'minmax' are listed as
# options but are not implemented separately in this cell.
normalise = 'standard' # one of parent, standard, minmax, none
stats_method = 'mannwhitneyu' # not read in this cell; the test below is always mannwhitneyu

is_parent = processed_plate_df['Mutations'] == parent
parent_values = processed_plate_df.loc[is_parent, value_column].values
if len(parent_values) == 0:
    # Without parent wells every statistic below would silently become NaN.
    raise ValueError(f"No rows with Mutations == {parent!r}; cannot compare against the parent.")
parent_mean = np.mean(parent_values)
parent_sd = np.std(parent_values)

# If normalisation is requested, z-score the values against the parent wells.
# The explicit 'none' check matters: the string 'none' is truthy, so a bare
# `if normalise:` would normalise even when the user asked for no normalisation.
if normalise and str(normalise).lower() != 'none':
    if parent_sd == 0:
        # e.g. a single parent well: dividing by zero would fill the column with inf/NaN.
        raise ValueError("Parent standard deviation is 0; cannot apply standard normalisation.")
    processed_plate_df[f'{value_column} standard norm'] = (processed_plate_df[value_column].values - parent_mean)/parent_sd
    # From here on all statistics are computed on the normalised column.
    value_column = f'{value_column} standard norm'

# Recompute the parent reference on the column that is actually analysed.
parent_values = list(processed_plate_df.loc[is_parent, value_column].values)
parent_mean = np.mean(parent_values)
parent_sd = np.std(parent_values)
sd_cutoff = 1.5 # The number of standard deviations we want above the parent values
# Now for all the other mutations we want to look if they are significant, first we'll look at combinations and then individually
grouped_by_mutations = processed_plate_df.groupby('Mutations')

rows = []
for mutation, grp in tqdm(grouped_by_mutations):
    if mutation == parent:
        continue
    vals = list(grp[value_column].values)
    U1, p = None, None
    # Only test groups with at least 3 wells; smaller groups are judged by the S.D. cutoff alone.
    if len(grp) > 2:
        # parent_values is the first sample, so U1 is the U statistic of the parent sample;
        # the test is two-sided (scipy default).
        U1, p = mannwhitneyu(parent_values, vals, method="exact")
    mean_vals = np.mean(vals)
    std_vals = np.std(vals)
    median_vals = np.median(vals)
    # Flag groups whose mean lies more than sd_cutoff parent S.D. above the parent mean.
    sig = mean_vals > ((sd_cutoff*parent_sd) + parent_mean)
    rows.append([mutation, len(grp), mean_vals, std_vals, median_vals, mean_vals - parent_mean, sig, U1, p])
stats_df = pd.DataFrame(rows, columns=['mutation', 'number of wells with mutation', 'mean', 'std', 'median', 'amount greater than parent mean', f'greater than > {sd_cutoff} parent', 'man whitney U stat', 'p-value'])
stats_df

In [None]:
# Order the table by ascending p-value (rows without a p-value go last) and
# write it to 'stats.csv' in the working directory without the index column.
stats_df = stats_df.sort_values('p-value', ascending=True, na_position='last')
stats_df.to_csv('stats.csv', index=False)

In [None]:
! pip install xlsxwriter

In [None]:
from collections import defaultdict

# Index every variant in stats_df by each of its '_'-separated mutation tokens, so that
# all variants (single or combined) containing a given token can be looked up together.
mutation_dict = defaultdict(list)
for mutation in stats_df['mutation'].values:
    for m in mutation.split('_'):
        mutation_dict[m].append(mutation)

# Write one sheet per token to 'mutations.xlsx' and collect a one-row summary per token.
rows = []
with pd.ExcelWriter('mutations.xlsx', engine='xlsxwriter') as writer:
    for mutation, mutations in mutation_dict.items():
        # Stats rows of every variant that carries this token.
        df1 = stats_df[stats_df['mutation'].isin(mutations)]
        # '*' is not allowed in Excel sheet names, so it is swapped for '.';
        # the same substituted label is used in the summary table below.
        mutation = mutation.replace('*', '.')
        df1.to_excel(writer, sheet_name=mutation)
        # Summary: total wells, the variants involved, mean of means, median of medians,
        # and mean / max difference to the parent mean.
        # sorted() makes the joined variant list reproducible: plain set iteration order
        # for strings changes between kernel sessions.
        rows.append([mutation, np.sum(df1['number of wells with mutation'].values), '|'.join(sorted(set(mutations))), np.mean(df1['mean'].values),
                     np.median(df1['median'].values), np.mean(df1['amount greater than parent mean'].values),
                     np.max(df1['amount greater than parent mean'].values)])

df = pd.DataFrame(rows, columns=['mutation', 'number of wells with mutation',
                                 'other-mutations', 'mean', 'median',
                                 'mean amount greater than parent', 'max amount greater than parent'])
# Display the summary with the largest mean improvement over the parent first.
df.sort_values(by='mean amount greater than parent', ascending=False)

In [None]:
# One codon per amino acid plus the stop symbol '*'. Only the keys are used in this
# cell: they form the alphabet of the amino-acid encoder.
amino_acid_to_codon = {
    'A': 'GCT', 'R': 'CGT', 'N': 'AAT', 'D': 'GAT', 'C': 'TGT', 'Q': 'CAA', 'E': 'GAA',
    'G': 'GGT', 'H': 'CAT', 'I': 'ATT', 'L': 'CTT', 'K': 'AAA', 'M': 'ATG', 'F': 'TTT',
    'P': 'CCT', 'S': 'TCT', 'T': 'ACT', 'W': 'TGG', 'Y': 'TAT', 'V': 'GTT', '*': 'TAA',
}

aas = list(amino_acid_to_codon)
# NOTE(review): `translate` used below is presumably provided by this star import - confirm.
from levseq.utils import *
from sklearn.preprocessing import OneHotEncoder


seqs = []
one_hots_nc = []
one_hots_aa = []

# Nucleotide-level encoder over the symbols A, T, G, C, '-' and '*'.
encoder = OneHotEncoder()
encoder.fit(np.array(['A', 'T', 'G', 'C', '-', '*']).reshape(-1, 1))

# Amino-acid-level encoder over the alphabet defined above.
encoder_aa = OneHotEncoder()
encoder_aa.fit(np.array(aas).reshape(-1, 1))

for nc in processed_plate_df['nc_variant'].values:
    # Rows marked 'Deletion' carry no sequence to encode; report and skip them.
    if nc == 'Deletion':
        print('Deletion')
        continue

    seq = translate(nc)

    # One column vector of single characters -> one-hot matrix -> flat feature vector.
    nc_symbols = np.array(list(nc)).reshape(-1, 1)
    one_hots_nc.append(encoder.transform(nc_symbols).toarray().flatten())

    aa_symbols = np.array(list(seq)).reshape(-1, 1)
    one_hots_aa.append(encoder_aa.transform(aa_symbols).toarray().flatten())



In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# PCA on the flattened nucleotide one-hot vectors (one row per non-deletion variant).
X = np.array(one_hots_nc)
# PCA cannot extract more components than min(n_samples, n_features), so cap the
# requested 20 components to keep the cell working on small plates / short sequences.
pca = PCA(n_components=min(20, *X.shape))
pca = pca.fit(X)
pcs = pca.transform(X)

# The encoding loop skipped 'Deletion' rows, so apply the same filter here to keep the
# rows aligned with `pcs`. `.copy()` gives an independent frame: adding columns to a
# filtered slice of processed_plate_df would otherwise raise SettingWithCopyWarning.
non_deletions_df = processed_plate_df[processed_plate_df['nc_variant'] != 'Deletion'].copy()
non_deletions_df['PC 1'] = pcs[:, 0]
non_deletions_df['PC 2'] = pcs[:, 1]

# Pass the frame by keyword: older seaborn releases treat the first positional argument as x.
sns.scatterplot(data=non_deletions_df, x='PC 1', y='PC 2', hue='pdt standard norm')

In [None]:
# Scree plot: percentage of variance explained by each fitted principal component,
# with components numbered from 1.
PC_values = np.arange(1, pca.n_components_ + 1)
explained_pct = 100 * pca.explained_variance_ratio_

ax = plt.gca()
ax.plot(PC_values, explained_pct, 'o-', linewidth=2, color='blue')
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
plt.show()