# Analyze guides

Since the guides can be given to us in a variety of orders and we need to order different ranges of sequence, this becomes rather tricky & I am having trouble working directly with R.

# imports & globals 🌎

In [21]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import regex as re
from pickle import dump,load
import sys

# Grab data & example

In [22]:
gene_data = pd.read_csv("../data/gene_data.csv")
print(gene_data.shape)

(47, 13)


In [23]:
guide_data = pd.read_excel("../inputs/47 genes and its guide sequence- synthego.xlsx")
print(guide_data.shape)

(162, 2)


In [24]:
guide_data[guide_data["Gene Name"] == "LYPLAL1"]

Unnamed: 0,Gene Name,Guides
129,LYPLAL1,A*U*A*CUGGGGUAACCGAGGGG
130,LYPLAL1,G*C*C*AUGCAGGAAGAUCAGAG
131,LYPLAL1,A*U*C*AGCGAUGGCGGCUGCGU


In [25]:
gene_data[gene_data.gene == "LYPLAL1"]

Unnamed: 0.1,Unnamed: 0,gene,organism,gene_link,gene_bank_url,ncbi_id,start,stop,strand,ncbi_phid,genbank_jquery,CDS,seq
40,40,LYPLAL1,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/127018,https://www.ncbi.nlm.nih.gov//nuccore/NC_00000...,568815597,219173878,219445496,off,CE8C9E9A32CEF9910000000001B900A1.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"14..104,5270..5369,5523..5549,19205..19374,366...",agtggcatcagcgatggcggctgcgtcggggtcggttctgcagcgc...


In [26]:
guides = set(guide_data[guide_data["Gene Name"] == "LYPLAL1"].Guides)
guides

{'A*U*A*CUGGGGUAACCGAGGGG',
 'A*U*C*AGCGAUGGCGGCUGCGU',
 'G*C*C*AUGCAGGAAGAUCAGAG'}

In [27]:
seq = gene_data[gene_data.gene == "LYPLAL1"].seq.iloc[0]
cds = gene_data[gene_data.gene == "LYPLAL1"].CDS.iloc[0]
print(cds)
print(seq)

14..104,5270..5369,5523..5549,19205..19374,36655..36770,37615..37851
agtggcatcagcgatggcggctgcgtcggggtcggttctgcagcgctgtatcgtgtcgccggcagggaggcatagcgcctctctgatcttcctgcatggctcaggtggatttcaattttacgtcctggttttctacagctcgggaaacatcctcccctcggttaccccagtattgcccaagtggaggtgtcgccgggcccaaatagggcccagtgggtggctcccccgtcgccagccccggctgtagatgcacgggcagcgccacctgcccccagcctcccccgttagtcttcctctttctatcgggcggtcactggtccaccaccctcgctttggggccttgtgtcccgccgctcagccagccctccatccccacaacacacctccccattcctcgccccaagtttaaacagagcaagttagtaaggtaagtcttctgctagagggaaattatttagtgtttttcattatcagctactagtgatttttcagtcgtaattgcccattgctttttccaggctcccatactaatgtagacaattgagattcacagttaacttagatactgtttacagattatctcttttctgtaagccactgtgccatggctggagtgctttctctcactttcctttttgctgcttctctttccttcctttccctttcccgcctgtgtcattaaacttccaagatagactacaggggcaggacccccagggacaggcatcttccactacctgcccttagtaacgtgtgcttaatgctgtatcgttacttaacactcatctgtaatcacttttgtctgtcttccccactggattgtgaacttcccactggattatgagagtaagggcccttgtgtcaacctttgcacccccagtgcttggcccagggtaagctctcaggaaatgttttttaaagaaatgagatg

# Perform the matching & the cuts

In [28]:
preprocess = lambda s: "".join(s.lower().split("*"))
def rna_2_dna(s):
    s = preprocess(s)
    s = s.replace("u","t")
    return s
[rna_2_dna(g) for g in guides]

['gccatgcaggaagatcagag', 'atcagcgatggcggctgcgt', 'atactggggtaaccgagggg']

In [29]:
def dna_to_complement(s):
    s = preprocess(s)
    s = s.replace("a","+")
    s = s.replace("t","a")
    s = s.replace("+","t")
    s = s.replace("g","+")
    s = s.replace("c","g")
    s = s.replace("+","c")
    return s
[dna_to_complement(rna_2_dna(g)) for g in guides]

['cggtacgtccttctagtctc', 'tagtcgctaccgccgacgca', 'tatgaccccattggctcccc']

In [30]:
def dna_to_rev(s):
    s = preprocess(s)
    return s[::-1]
[dna_to_rev(rna_2_dna(g)) for g in guides]

['gagactagaaggacgtaccg', 'tgcgtcggcggtagcgacta', 'ggggagccaatggggtcata']

In [31]:
def find_guide_loc(seq,guide,primer="NGG",cut_offset = 3):
    """
    @param seq: sequence to look through (str)
    @param guide: rna guide to search for
    @param primer: "NGG" or "G" defines the primer to look for
    @param cut_offset: int the number of bases past the primer (into RNA guide) to cut at
    """
    beg = "c"
    if primer == "NGG":
        beg = "gg."
    # turn guide into dna
    dna_guide = rna_2_dna(guide)
    
    poss_patts = [beg+dna_guide[::-1], # reverse orientation
                  dna_to_complement(beg+dna_guide[::-1]) # reverse complement
                 ]
    offset = cut_offset + (3 if primer == "NGG" else 1)
    breaks = []
    for patt in poss_patts:
        res = re.search(re.compile(patt),seq)
        if res:
            #print(patt)
            breaks.append(res.span()[0]+offset)
        # try the opp order
        res = re.search(re.compile(patt[::-1]),seq)
        if res:
            #print(patt,"rev")
            breaks.append(res.span()[1]-offset)
    if len(breaks) == 0:
        raise ValueError("Could not find guide '{}' with primer '{}'".format(guide,primer))
    elif len(breaks) == 1:
        return breaks[0]
    else:
        raise ValueError("Could not find unique guide '{}' with primer '{}'\ncuts ({})".format(guide,primer,",".join([str(b) for b in breaks])))
    
    

In [32]:
cut1 = find_guide_loc(seq,list(guides)[0])

In [33]:
cut1

83

In [34]:
print("LYPLAL1")
print(cds)
print(seq[:9]+"|"+seq[9:50])
print(seq[:23]+"|"+seq[23:50])

LYPLAL1
14..104,5270..5369,5523..5549,19205..19374,36655..36770,37615..37851
agtggcatc|agcgatggcggctgcgtcggggtcggttctgcagcgctgta
agtggcatcagcgatggcggctg|cgtcggggtcggttctgcagcgctgta


In [35]:
cut_seq = seq[:cut1]+"|"+seq[cut1:]
print(cut_seq[cut1-10:cut1+10])

agcgcctctc|tgatcttcc


In [36]:
cut1

83

In [37]:
for guide in guides:
    cut = find_guide_loc(seq,guide)
    cut_seq = seq[:cut]+"|"+seq[cut:]
    print("{:3d} {:20s}".format(cut,cut_seq[cut-8:cut+8]))

 83 cgcctctc|tgatctt    
 23 ggcggctg|cgtcggg    
156 atcctccc|ctcggtt    


In [38]:
for gene in set(guide_data["Gene Name"]):
    gene = gene.split()[0]
    guides = set(guide_data[guide_data["Gene Name"] == gene].Guides)
    seq = gene_data[gene_data.gene == gene].seq.iloc[0]
    cds = gene_data[gene_data.gene == gene].CDS.iloc[0]
    print(gene)
    for guide in guides:
        cut = find_guide_loc(seq,guide)
        cut_seq = seq[:cut]+"|"+seq[cut:]
        print("{:5d} {:20s}".format(cut,cut_seq[cut-8:cut+8]))

SERPINA1
 7413 cttctgtc|tcgtggg    
 7558 ccccaacc|tggctga    
 7519 ccaccatg|atcagga    
PNPLA2
  933 acgcacat|ctacggc    
  823 gcgagaag|acgtgga    
  875 ctacgtcg|gcgtggc    
SUN1
23385 ggtgccga|cagcggc    
23333 tagtttgc|gcctggc    
23424 aaccgagc|ggccagg    
LYPLAL1
   83 cgcctctc|tgatctt    
   23 ggcggctg|cgtcggg    
  156 atcctccc|ctcggtt    
ADH1B
 5499 taacgttg|ccaaggt    
 5351 agcccatt|caccact    
LMNA
48055 taccaaga|aggaggg    
48118 gaactcca|aggaggc    
48190 gcggggcc|aggtggc    
ALDH2
  212 gagtcccc|cgcaggc    
   70 gccgcccg|cttcggg    
  120 cacccagg|ccgtgcc    
ZMPSTE24
   93 agcgtatc|ttcgggg    
   42 ccatgggg|atgtggg    
  154 agcacagc|ggcaggt    
MBOAT7
 5989 accctggg|gctgctg    
 6055 ggaatcat|gacaggt    
 5907 gctggtga|gcctggc    
ERLIN1
10080 attgaatt|gtttggt    
 9970 ttgatatc|gtgagga    
10007 gaccttaa|tcttcaa    
TOR1B
  223 ctcaacgc|ttcgggt    
  150 ggccatca|ccggcta    
   91 gcccgagt|ggtggcg    
MTTP
17890 taccgcat|ttcctcc    
17782 ggtcacac|aactggt    
17

In [39]:
test_guide = 'G*G*G*AUCUCUAGCCUACAGGA'

In [40]:
test_dna = rna_2_dna(test_guide)
test_dna

'gggatctctagcctacagga'