# Chapter 21: Bioinformatic Knick-knacks and regular expressions

## Knick-knacks

#### Changing a char in a string

In [1]:
# indexing a string: positions are zero-based, so index 2 is the third character

seq = "ACTCG"
seq[2]

'T'

In [2]:
# try to change a char in a string?
# Strings do not support item assignment, so this raises a TypeError.
# The error is caught and displayed so the notebook still runs top to bottom.

try:
    seq[2] = "A"
except TypeError as err:
    print("TypeError: " + str(err))

TypeError: 'str' object does not support item assignment

In [3]:
# a string cannot be edited in place, so copy its characters into a list first

seql = list(seq)
seql[2] = "A"  # overwrite the character at index 2
print(seql)

['A', 'C', 'A', 'C', 'G']


In [5]:
# joining it back: "".join glues the characters together with no separator

seqj = "".join(seql)
print(seqj)

ACACG


#### Reversing a string

> similar principle: one way is to make it a list first, then reverse the list, then join it back into a string (the slice `seq[::-1]` gives the same result in one step)

In [6]:
# build the list of characters already in reverse order
seq = "ACTAG"
seql = list(reversed(seq))
print(seql)

['G', 'A', 'T', 'C', 'A']


In [7]:
# join the reversed characters back into a single string
seqr = "".join(seql)
print(seqr)

GATCA


#### Find and replace (All)

> No need to make it a list: `str.replace` returns a new string with every occurrence replaced

In [9]:
# replace every "T" with "U"; the result is stored in seqr
seq = "GAGAGAGAGATATGAGA"
seqr = seq.replace("T", "U")
print(seqr)

GAGAGAGAGAUAUGAGA


#### The library "re"

In [10]:
import re

In [26]:
# searching for a pattern in a string;
# the result is a Match object carrying the span and the matched text

seq = "GAGAGAGAGATATGAGA"
re.search(r"GAG", seq)

<re.Match object; span=(0, 3), match='GAG'>

In [25]:
# replacing a pattern with another pattern

seq = "GAGAGAGAGATATGAGA"
result = re.subn(r"GAG",   # pattern to be found
                 "X",      # replacement
                 seq,      # sequence to search
                 count=0)  # maximum number of replacements; 0 means replace all
# count is passed by keyword: passing it positionally is deprecated in recent Python versions.
# subn returns a tuple: result[0] is the modified string,
# result[1] is the number of replacements made
print(result)

('XAXAGATATXA', 3)


> sidetrack

In [23]:
# a way to list the parameters (and their default values) that a function accepts

import inspect
inspect.signature(re.subn)

<Signature (pattern, repl, string, count=0, flags=0)>

In [22]:
# the same information as a FullArgSpec tuple (args, defaults, keyword-only args, annotations)
inspect.getfullargspec(re.subn)

FullArgSpec(args=['pattern', 'repl', 'string', 'count', 'flags'], varargs=None, varkw=None, defaults=(0, 0), kwonlyargs=[], kwonlydefaults=None, annotations={})

## Q1: Function to reverse complement

In [3]:
def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence.

    Each base is swapped for its partner (A<->T, C<->G); any other
    character, including lowercase letters, becomes "X". The complemented
    characters are then returned in reverse order as a single string.
    """
    partner = {"A": "T", "T": "A", "C": "G", "G": "C"}
    complemented = [partner.get(base, "X") for base in seq]
    return "".join(reversed(complemented))

In [4]:
# quick check of reverse_complement on a short example sequence
seq = "GAGAGAGAGATATGAGA"
print(reverse_complement(seq))

TCTCATATCTCTCTCTC


## Q2: Function to get 6 reading frames; first three from the normal seq, next three from the rev comp seq

In [6]:
def get_windows(seq, win_size, step_size):
    """Return the full-length windows obtained by sliding along ``seq``.

    Parameters
    ----------
    seq : str (or any sliceable sequence)
        The sequence to cut into windows.
    win_size : int
        Length of each window.
    step_size : int
        Distance between the starts of consecutive windows; must be >= 1.

    Returns
    -------
    list
        Windows in order of their start position. A trailing window that
        would be shorter than ``win_size`` is dropped.

    Raises
    ------
    ValueError
        If ``step_size`` is smaller than 1 (the scan could never advance
        past the end of the sequence and would loop forever).
    """
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")
    seq_len = len(seq)
    return [seq[start:start + win_size]
            for start in range(0, seq_len, step_size)
            if start + win_size <= seq_len]  # keep only complete windows

In [9]:
def seq_to_six(seq):
    """Return the six reading frames of a DNA sequence.

    Frames 1-3 start at offsets 0, 1 and 2 of ``seq``; frames 4-6 start at
    the same offsets of its reverse complement. Each frame runs to the end
    of the strand and is trimmed to a whole number of codons, so its length
    is a multiple of three.

    A fixed window of six bases would only give the right answer for an
    8-base input, so the frame length is derived from the sequence length.

    Returns
    -------
    list of str
        Always six strings (possibly empty for very short input), forward
        frames first.
    """
    def three_frames(strand):
        # One frame per starting offset, cut back to complete codons.
        frames = []
        for offset in range(3):
            frame = strand[offset:]
            usable = len(frame) - len(frame) % 3
            frames.append(frame[:usable])
        return frames

    return three_frames(seq) + three_frames(reverse_complement(seq))

In [10]:
# example: for this 8-base sequence every frame is 6 bases long
seq = "ACTAGACG"
print(seq_to_six(seq))

['ACTAGA', 'CTAGAC', 'TAGACG', 'CGTCTA', 'GTCTAG', 'TCTAGT']


## Q3: Function to get the longest aa sequence from a DNA seq

In [12]:
import io

# Build the codon lookup table: first column of each line -> third column.
# source: https://github.com/zhanxw/anno/blob/master/codon.txt
dc_codon = dict()
no = 0  # number of lines read from the file

# Read the file once, inside a context manager, so the handle is always closed.
with io.open("data_20/codon.txt") as fh:
    for line in fh:
        no += 1
        ls = line.strip().split()
        if len(ls) < 3:
            # blank or incomplete line: nothing to store
            continue
        dc_codon[ls[0]] = ls[2]

In [13]:
def codon_to_aa(codon, codon_table=None):
    """Translate one codon into the symbol stored for it in the codon table.

    Parameters
    ----------
    codon : str
        A three-character codon.
    codon_table : dict, optional
        Mapping from codon to symbol. Defaults to the notebook-level
        ``dc_codon`` table.

    Returns
    -------
    str
        The table entry for ``codon``, or "X" when the codon is not in the
        table.

    Raises
    ------
    ValueError
        If ``codon`` is not exactly three characters long.
    """
    if len(codon) != 3:
        raise ValueError("codon must be exactly 3 characters long, got " + repr(codon))
    if codon_table is None:
        codon_table = dc_codon
    # Return (rather than print) the placeholder so callers never receive None.
    return codon_table.get(codon, "X")

In [15]:
def dna_to_aa_stop(seq):
    """Translate ``seq`` codon by codon, stopping at the first "O" symbol.

    The sequence is read in consecutive, non-overlapping groups of three
    characters; an incomplete trailing group is ignored. Translation ends
    (without including it) at the first codon that translates to "O",
    which this notebook treats as the stop symbol.

    Returns
    -------
    str
        The translated symbols up to the first stop, or up to the end of
        the sequence if no stop is found. Codons with no translation are
        recorded as "X".
    """
    residues = []
    for codon in get_windows(seq, 3, 3):
        residue = codon_to_aa(codon)  # look each codon up only once
        if residue is None:
            # No translation was returned; record the placeholder instead
            # of letting the text "None" end up in the protein string.
            residue = "X"
        residue = str(residue)
        if residue == "O":
            break
        residues.append(residue)
    return "".join(residues)

In [20]:
def longest_non_stop(seq):
    """Return the longest translation found among the six reading frames.

    The forward strand is translated from offsets 0, 1 and 2 with
    ``dna_to_aa_stop``, then the reverse complement is handled the same
    way. Every translation is printed as it is produced. On ties the
    earliest translation is kept.
    """
    best = ""
    # Forward strand first, then the reverse complement (computed only
    # once the forward frames are done).
    for to_strand in (lambda strand: strand, reverse_complement):
        strand = to_strand(seq)
        for offset in range(3):
            candidate = dna_to_aa_stop(strand[offset:])
            print(candidate) # to print every iterations
            if len(candidate) > len(best):
                best = candidate
    return best

In [21]:
# example run: the six per-frame translations are printed first, then the longest one
seq = "AGCTACTAGGAAGATAGACGATTAGAC"
print(longest_non_stop(seq))

SY
ATRKIDD
LLGR
V
SNRLSS
LIVYLPSS
LIVYLPSS


## Q4: Modifying the grape_count_gata.py

In [24]:
import re
import io

def count_motifs(seq, motif):
    """Count the non-overlapping matches of ``motif`` in ``seq``.

    Parameters
    ----------
    seq : str
        Sequence to search.
    motif : str
        Regular expression describing the motif.

    Returns
    -------
    int
        Number of matches found scanning left to right.

    Matches are counted directly rather than by splitting the sequence:
    ``re.split`` also returns the text of capturing groups, which would
    inflate the count for motifs written with parentheses.
    """
    return sum(1 for _ in re.finditer(motif, seq))

In [29]:
motif_ls = [r"[AT]GATA[GA]",r"[CGT]ACGTG[GT][AC]",r"TTGAC"]

# Report, for each of the first five lines of the file, how often each motif occurs.
with (io.open("data_21/grape_promoters.txt")) as fh:
    # zip with range(5) stops after five lines, or earlier if the file is shorter
    for _, line in zip(range(5), fh):
        linestripped = line.strip()
        line_list = re.split(r"\s+", linestripped)
        if len(line_list) < 2:
            # blank or incomplete line: no id/sequence pair to analyse
            continue
        gid = line_list[0]
        seq = line_list[1]

        for motif in motif_ls:
            num_motifs = count_motifs(seq, motif)
            print(gid + "\t" + str(num_motifs) + "\t" + str(motif))

GSVIVT01034325001_1	3	[AT]GATA[GA]
GSVIVT01034325001_1	0	[CGT]ACGTG[GT][AC]
GSVIVT01034325001_1	2	TTGAC
GSVIVT01034326001_2	2	[AT]GATA[GA]
GSVIVT01034326001_2	0	[CGT]ACGTG[GT][AC]
GSVIVT01034326001_2	4	TTGAC
GSVIVT01034329001_3	2	[AT]GATA[GA]
GSVIVT01034329001_3	0	[CGT]ACGTG[GT][AC]
GSVIVT01034329001_3	0	TTGAC
GSVIVT01034331001_4	2	[AT]GATA[GA]
GSVIVT01034331001_4	0	[CGT]ACGTG[GT][AC]
GSVIVT01034331001_4	0	TTGAC
GSVIVT01034332001_5	1	[AT]GATA[GA]
GSVIVT01034332001_5	0	[CGT]ACGTG[GT][AC]
GSVIVT01034332001_5	2	TTGAC


## Q5: Restriction enzymes and subsequent bin sizes

In [31]:
def count_pieces(seq, motif):
    """Split ``seq`` at every match of ``motif`` and return the fragments.

    Despite its name, this returns the list of pieces produced by
    ``re.split``, not a count.
    """
    return re.split(motif, seq)

In [34]:
def gbs_cut(seq, motif, bin):
    """Cut ``seq`` at ``motif`` and tally the fragments by length bin.

    Each fragment returned by ``count_pieces`` is assigned to the bin
    ``len(fragment) // bin``. The fragments are visited in sorted order,
    which fixes the order in which bins first appear in the result.

    Returns
    -------
    dict
        Maps bin index to the number of fragments that fall in it.
    """
    size_counts = {}
    for fragment in sorted(count_pieces(seq, motif)):
        size_bin = len(fragment) // bin
        size_counts[size_bin] = size_counts.get(size_bin, 0) + 1
    return size_counts

In [35]:
# example: cut at GC[AT]GC sites and group the fragment lengths into bins of width 3
seq = "AAAAGCAGCAAAAAAGCTGCAAGCAGCAAAAA"
gbs_cut(seq, "GC[AT]GC", 3)

{0: 1, 1: 2, 2: 1}