# Regex Tutorial

In [3]:
import re

### 1. Match a word that contains 4 or more consecutive vowels (a, e, i, o and u). For example, it should match aeie and daoioaidas; but not aebee or auu.

In [1]:
patt1 = r"[aeiou]{4,}"

In [6]:
re.findall(patt1, "aeie daoioaidas aebee auu")

['aeie', 'aoioai']

### 2. Match a string with at least 12 characters with “z” as the last one. That is, the total length of the string should be at least 12, including the last “z” character.

In [24]:
patt2 = r"\w{11,}z$"

In [27]:
strings = ["sfaleifjaleiz", "fsleaijfl", "faleijz", "flseiafjlesiajf"]
for s in strings:
    print(re.findall(patt2, s))

['sfaleifjaleiz']
[]
[]
[]


### 3. Match a string with at least 8 characters but no vowels.

In [28]:
patt3 = r"[^aeiou]{8,}"

In [29]:
strings = ["qqqqqqqqqq", "qqqqqqaqqqqq", "qqqq", "aeiou"]
for s in strings:
    print(re.findall(patt3, s))

['qqqqqqqqqq']
[]
[]
[]


### 4. Match a number which is only composed of even digits, including 0. But don't allow 0 to be the first digit. For example, it should match: 248, 4200, and 6; but not 0, 020, 5 or 123.

In [98]:
patt4_1 = r"^[2468]"
patt4_2 = r"^[02468]*$"
patt4_3 = r"^[2468][02468]*$"

In [94]:
strings = ["0246", "2468", "3131846", "4200", "6", "020", "5", "123", "654"]
for s in strings:
    if re.search(patt4_1, s):
        if re.search(patt4_2, s):
            print(s)

2468
4200
6


In [100]:
for s in strings:
    if re.search(patt4_3,s):
        print(s)

2468
4200
6


### 5. Match an RNA sequence that begins with "AUG" and ends with either of "UAA","UAG", or "UGA". An RNA sequence should be composed of only A, C, G and U.

In [111]:
patt5 = r"^(AUG)[ACGU]*(UAA|UAG|UGA)$"

In [112]:
strings = ["AUGACGUUAA", "AUGXYZUAA", "AUGUGA", "AAAUUUUGA"]

In [113]:
for s in strings:
    if re.search(patt5, s):
        print(s)

AUGACGUUAA
AUGUGA


### Part 2

Complete “grape_count_gata.not_finished.py” first, and then modify it so that it can process a different 
input file and a list of motifs specified in the command line. Rename the new script as 
“count_motifs.py”. When run the following command in the command line:

$ python count_motifs.py grape_promoters.subset.txt [AT]GATA[GA],[CGT]ACGTG[GT][AC],TTGAC

In [135]:
import re
import sys


def main():
    
    input_file = input("input_file")
    motifs = input("motifs sep by comma")
    motifs = motifs.split(",")
    
    with open(input_file, "r") as fh:
        for line in fh:
            line = line.strip()
            gid, seq = re.split(r"\s+", line)
            
            for motif in motifs:

                num_motifs = count_motifs(seq, motif)
                print(f"{gid}\t{num_motifs}\t{motif}")
        

def count_motifs(seq, motif):
    
    return len(re.findall(motif, seq))

In [136]:
main()

input_file grape_promoters.subset.txt

motifs sep by comma [AT]GATA[GA],[CGT]ACGTG[GT][AC],TTGAC

GSVIVT01034325001_1	3	[AT]GATA[GA]
GSVIVT01034325001_1	0	[CGT]ACGTG[GT][AC]
GSVIVT01034325001_1	2	TTGAC
GSVIVT01034326001_2	2	[AT]GATA[GA]
GSVIVT01034326001_2	0	[CGT]ACGTG[GT][AC]
GSVIVT01034326001_2	4	TTGAC
GSVIVT01034329001_3	2	[AT]GATA[GA]
GSVIVT01034329001_3	0	[CGT]ACGTG[GT][AC]
GSVIVT01034329001_3	0	TTGAC
GSVIVT01034331001_4	2	[AT]GATA[GA]
GSVIVT01034331001_4	0	[CGT]ACGTG[GT][AC]
GSVIVT01034331001_4	0	TTGAC
GSVIVT01034332001_5	1	[AT]GATA[GA]
GSVIVT01034332001_5	0	[CGT]ACGTG[GT][AC]
GSVIVT01034332001_5	2	TTGAC
GSVIVT01034334001_6	2	[AT]GATA[GA]
GSVIVT01034334001_6	1	[CGT]ACGTG[GT][AC]
GSVIVT01034334001_6	0	TTGAC
GSVIVT01034337001_7	4	[AT]GATA[GA]
GSVIVT01034337001_7	1	[CGT]ACGTG[GT][AC]
GSVIVT01034337001_7	0	TTGAC
GSVIVT01034340001_8	0	[AT]GATA[GA]
GSVIVT01034340001_8	0	[CGT]ACGTG[GT][AC]
GSVIVT01034340001_8	0	TTGAC
GSVIVT01034341001_9	0	[AT]GATA[GA]
GSVIVT01034341001_9	0	[CGT]ACGTG[GT][AC]
GSVIVT01034341001_9	1	TTGAC
GSVIVT01034344001_10	1	[AT]GATA[GA]
GSVIVT01034344001_10	0	[CGT]