In [None]:
from scripts import helpers, mlogo

In [None]:
# Call the project's config reader; as the last expression of the cell its
# return value is shown as the cell output.
# NOTE(review): presumably this returns the project configuration mapping --
# create_folders() in this notebook reads "input_folder" / "output_folder" from it.
helpers.read_json()

In [None]:
# Modules import
import os 
import pandas as pd
from scripts import helpers

# Class that generate motif-logo and perform data analysis on DNA sequences
class mlogo():
    """
    Load a DNA sequencing dataset and prepare it for motif-logo analysis.

    On construction the dataset is loaded (from a CSV path or an existing
    DataFrame) and two amino-acid columns, `sequence_aa` and `germline_aa`,
    are derived from the somatic and germline DNA columns by applying
    `helpers.nt_transalte_104` to every row.

    The grouping options (`group_by`, `divide_subject`, `subject_col`) are
    stored on the instance so that plotting methods can use them.
    """

    def __init__(self,
                 seq_file,
                 group_by:str = None,
                 divide_subject:bool = True,
                 subject_col:str = "subject_id",
                 sequence_col:str = "sequence",
                 germline_col:str = "germline"):
        """
        seq_file : str / os.PathLike / pandas.DataFrame -> file path of a CSV file (read with the
                          first column as index) or a DataFrame of sequencing data. A DataFrame is
                          copied, so the caller's object is never modified.
        group_by : str -> group the dataset by this column. Will create a number of subplots
                          according to the number of unique values in the grouped-by column.
        divide_subject : bool -> generate a different plot for each subject under the subject column.
        subject_col : str -> name of the subject column, default is `subject_id`.
        sequence_col : str -> name of the DNA sequence column, default is `sequence`.
        germline_col : str -> name of the germline DNA sequence column, default is `germline`.

        Raises
        ------
        TypeError : `seq_file` is neither a path nor a DataFrame.
        KeyError  : `sequence_col` or `germline_col` is missing from the dataset.
        Any exception raised by `pandas.read_csv` is re-raised unchanged.
        """

        # Keep the configuration on the instance; previously these arguments were dropped.
        self.group_by = group_by
        self.divide_subject = divide_subject
        self.subject_col = subject_col
        self.sequence_col = sequence_col
        self.germline_col = germline_col

        # Loading sequences dataset
        self.seq_df = self._load_dataset(seq_file)

        # Fail early, with a message naming the offending argument, instead of a bare pandas KeyError.
        missing_cols = [col for col in (sequence_col, germline_col) if col not in self.seq_df.columns]
        if missing_cols:
            raise KeyError(f"Column(s) {missing_cols} not found in the dataset; "
                           "check the `sequence_col` / `germline_col` arguments.")

        # Translating germline and sequence DNA into amino acid sequence,
        # honouring the user-supplied column names (they used to be hard-coded).
        self.seq_df["sequence_aa"] = self.seq_df[sequence_col].apply(helpers.nt_transalte_104) # Somatic sequence
        self.seq_df["germline_aa"] = self.seq_df[germline_col].apply(helpers.nt_transalte_104) # Germline sequence

    @staticmethod
    def _load_dataset(seq_file):
        """Return the dataset as a fresh DataFrame, read from a path or copied from a DataFrame."""
        if isinstance(seq_file, (str, os.PathLike)):
            try:
                seq_df = pd.read_csv(seq_file, index_col=0)
            except Exception:
                # Keep the user-facing hint, but re-raise so the real cause is not
                # hidden behind a later AttributeError on `self.seq_df`.
                print(f"> Invalid input, please make sure seq_file argument entred correctly.\n  (Invalid: seq_file = '{seq_file}')")
                raise
            print(f"> Dataset loaded (seq_file = '{seq_file}')")
            return seq_df

        if isinstance(seq_file, pd.DataFrame):
            print("> Dataset loaded")
            # Copy so that adding the *_aa columns does not mutate the caller's frame.
            return seq_file.copy()

        raise TypeError("seq_file must be a file path (str / os.PathLike) or a pandas DataFrame, "
                        f"got {type(seq_file).__name__}.")

    def motif_logo(self):
        """Generate the motif logo (not implemented yet)."""
        # This used to be nested inside __init__ and was therefore unreachable as a method.
        raise NotImplementedError("motif_logo is not implemented yet.")


In [39]:
# Smoke-test the class on the first 1000 rows of the example dataset.
# os.path.join keeps the path portable (a literal "\\" separator only works on Windows),
# and nrows=1000 stops parsing after 1000 rows instead of reading the whole CSV and
# then keeping its head.
test_df = pd.read_csv(os.path.join("input", "cl_seqs_motif.csv"), index_col=0, nrows=1000)
test = mlogo(seq_file=test_df)
test.seq_df

> Dataset loaded


Unnamed: 0,seq_id,ai,sample_id,subject_id,clone_id,functional,copy_number,cdr3_aa,sequence,germline,ab_target,time_point,label,sequence_aa
17877,M03592:154:000000000-JCY9V:1:2117:16836:3379,257367,8,subj_3,651989,1,4,CATDFNSNKAHW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sn,4,sn.4.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
17880,M03592:154:000000000-JCY9V:1:2119:17338:11554,310211,10,subj_3,651991,1,9,CTQGAVAGKPEYFHNX,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,1,sp.1.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
17881,M03592:154:000000000-JCY9V:1:2119:16221:19845,310212,10,subj_3,651991,1,3,CTQGAVAGKPEYFHNX,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,1,sp.1.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
17882,M03592:154:000000000-JCY9V:1:2117:14781:11724,310217,10,subj_3,651991,1,3,CTQGAVAGKPEYFXNW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,1,sp.1.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
17883,M03592:154:000000000-JCY9V:1:2114:22937:3453,310221,10,subj_3,651991,1,4,CTQGAVAGKPEXFXNW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,1,sp.1.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19031,M03592:154:000000000-JCY9V:1:2112:12328:4442,339103,12,subj_3,652294,1,2,CARAGXGSWSLIXXW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,2,sp.2.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
19032,M03592:154:000000000-JCY9V:1:2110:21993:10357,339106,12,subj_3,652294,1,3,CXRADFGSWSLIDNW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,2,sp.2.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
19033,M03592:154:000000000-JCY9V:1:2110:15095:22503,339110,12,subj_3,652294,1,2,CARAXFGSXSLIDNW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,2,sp.2.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
19034,M03592:154:000000000-JCY9V:1:2110:12209:13507,339111,12,subj_3,652294,1,3,CARADFGXRSLIDNW,.................................................,CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCGTGGTCCAGCCTG...,sp,2,sp.2.subj_3,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [10]:
from scripts.helpers import nt_transalte_104
# Sanity check of the translation helper on a short 21-nt sequence; the helper's
# return value is joined into one string so the cell displays it compactly.
# NOTE(review): 21 nt is 7 codons (ATG CGT GAG ATC GAT AAT AAT), yet the recorded
# output 'MREIDN' has only 6 residues -- confirm whether nt_transalte_104 drops the
# final codon or the saved output is stale.
seq = "ATGCGTGAGATCGATAATAAT"
"".join(nt_transalte_104(seq))


'MREIDN'

In [42]:
import logomaker
# Display logomaker's bundled "nn_saliency_matrix" example: a table indexed by
# position (`pos`) with one column per nucleotide (A, C, G, T).
# NOTE(review): presumably kept as a reference for the matrix layout logomaker expects.
logomaker.get_example_matrix("nn_saliency_matrix")

Description of example matrix "nn_saliency_matrix":
# 
# Saliency matrix illustrated in Figure 1F.
# Data are from Figure 1D of Janganathan et al. (2019),
# and were kindly provided by Kyle Farh and Kishore Jaganathan.
# 
# References:
# 
# Jaganathan K et al. (2019) Predicting Splicing from Primary Sequence with
# Deep Learning. Cell. 176(3):535–548.e24.
# 



Unnamed: 0_level_0,A,C,G,T
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-0.0,-0.000000,-0.001725,-0.000000
1,0.0,0.000000,0.033557,0.000000
2,0.0,0.000000,0.030026,0.000000
3,0.0,0.000000,0.012748,0.000000
4,0.0,0.000000,0.000337,0.000000
...,...,...,...,...
124,0.0,0.000000,0.000000,0.021006
125,0.0,0.000000,0.000000,0.019015
126,0.0,0.000000,0.000000,0.010700
127,0.0,0.000000,0.000000,0.010441


In [None]:
from scripts.helpers import read_json

def create_folders():
    """
    Create the project's input/output folder tree if it does not exist yet.

    Folder locations come from the mapping returned by `read_json()` (keys
    `input_folder` and `output_folder`). Two sub-folders, `motif_figure` and
    `motif_data`, are created inside the output folder. A message is printed
    for every folder that is newly created; existing paths are left untouched.
    """
    config_dict = read_json()
    output_folder = config_dict["output_folder"]

    # Build the paths with os.path.join instead of concatenating "//" by hand.
    # The data sub-folder is "motif_data" -- a stray backtick in the old literal
    # produced a folder literally named "motif_data`".
    folders = [
        config_dict["input_folder"],
        output_folder,
        os.path.join(output_folder, "motif_figure"),
        os.path.join(output_folder, "motif_data"),
    ]

    for folder in folders:
        if not os.path.exists(folder):
            # makedirs also creates missing parents and tolerates a concurrent creation.
            os.makedirs(folder, exist_ok=True)
            print(f"> folder `{folder}` was created.")


# Run the folder setup so the configured input/output folders exist on disk.
create_folders()