# Chapter 20 Dictionaries

## Some random trying

##### setting up a dict

In [1]:
dc = dict()
ls1 = ["CYP6B", "AGP2", "CATB"] 
ls2 = [0.56, 0.24, 0.73]

In [2]:
for i in range(len(ls1)):
    dc[ls1[i]] = ls2[i]
print(dc)

{'CYP6B': 0.56, 'AGP2': 0.24, 'CATB': 0.73}


##### sorting a dict by keys

In [3]:
dc_sort = dict()
sorted_keys = sorted(dc.keys())
for i in sorted_keys:
    dc_sort[i] = dc[i]
print(dc_sort) # sorted by keys

{'AGP2': 0.24, 'CATB': 0.73, 'CYP6B': 0.56}


##### get the values and items

In [4]:
dc.values()

dict_values([0.56, 0.24, 0.73])

In [5]:
dc.items()

dict_items([('CYP6B', 0.56), ('AGP2', 0.24), ('CATB', 0.73)])

##### sorting a dict by values

In [6]:
dc_val = dict(sorted(dc.items(), key = lambda i:i[1]))

In [7]:
print(dc_val)

{'AGP2': 0.24, 'CYP6B': 0.56, 'CATB': 0.73}


## The gene ontology problem

##### first problem being finding the number of appearance for each id

In [8]:
import io

no = sum(1 for i in io.open("data_20/PZ.annot.txt"))
print(no)

25106


In [9]:
dc_id = dict()
with io.open("data_20/PZ.annot.txt") as fh:
    for i in range(no):
        lineid = fh.readline().strip().split("\t")[0]
        if lineid in dc_id.keys():
            dc_id[lineid] += 1
        else:
            dc_id[lineid] = 1

In [10]:
# printing the first 10 pairs
dc_id_keys = list(dc_id.keys())
for i in range(10):
    print(dc_id_keys[i] + "\t" + str(dc_id[dc_id_keys[i]]))

PZ7180000020811_DVU	1
PZ7180000020752_DVU	1
PZ7180000034678_DWY	1
PZ7180000024883_EZN	4
PZ7180000023260_APN	1
PZ7180000035568_APN	1
PZ7180000020052_APQ	4
PZ547337_APR	1
PZ7180000033253_APS	5
PZ7180000033254_APS	7


##### Create a function for subsequent problems

In [11]:
# combining all together
def key_counter(file):
    no = sum(1 for i in io.open(file))
    dc_id = dict()
    with io.open(file) as fh:
        for i in range(no):
            lineid = fh.readline().strip().split("\t")[0]
            if lineid in dc_id.keys():
                dc_id[lineid] += 1
            else:
                dc_id[lineid] = 1
    return(dc_id)

In [12]:
dc_id = key_counter("data_20/PZ.annot.txt")

##### grep transcriptase in the third column (without using the command tools)

In [13]:
# the grep function

import re

def grepper(file, column, word):
    no = sum(1 for i in io.open(file))
    ls = list()
    with io.open("data_20/PZ.annot.txt") as fh:
        for i in range(no):
            line = fh.readline().strip()
            col = line.split("\t")[column]
            if re.search(word, col):
                ls.append(line)
    return(ls)

In [14]:
# grepping the "transcriptase"

ls_transcriptase = grepper("data_20/PZ.annot.txt", 2, "transcriptase")

In [15]:
# writing the filtered file

with io.open("data_20/PZ.annot2.txt", "w") as fh:
    for i in range(len(ls_transcriptase)):
        fh.write(ls_transcriptase[i] + "\n")

In [16]:
dc_trans = key_counter("data_20/PZ.annot2.txt")

In [17]:
dc_trans # these are the keys with "transcriptase" in the third column

{'PZ7180000000003_PI': 4,
 'PZ840833_BZS': 1,
 'PZ858982_CAA': 2,
 'PZ7180000029134_AHQ': 3,
 'PZ7180000000012_IL': 3,
 'PZ7180000000017_IL': 3,
 'PZ7180000000006_HO': 1,
 'PZ7180000000009_HO': 1,
 'PZ59_HO': 4,
 'PZ7180000000012_DC': 5,
 'PZ32722_B': 7,
 'PZ7180000000108_N': 1,
 'PZ7180000000070_N': 1,
 'PZ7180000000089_N': 4,
 'PZ7180000000107_N': 1,
 'PZ924_N': 4,
 'PZ7180000000590_B': 2,
 'PZ578878': 3,
 'PZ492962': 1,
 'PZ7180000025781': 2,
 'PZCAP37180000034572_A': 1}

##### finding the id with the highest count

In [18]:
dc_trans_s = dict(sorted(dc_trans.items(), key = lambda i:i[1], reverse = True))

In [19]:
print(str(list(dc_trans_s.keys())[0]) + "\t" + str(dc_trans_s[list(dc_trans_s.keys())[0]]))

PZ32722_B	7


## Question 1: Create a codon dictionary / function

In [20]:
dc_codon = dict()
no = sum(1 for i in io.open("data_20/codon.txt")) # source: https://github.com/zhanxw/anno/blob/master/codon.txt

with io.open("data_20/codon.txt") as fh:
    for i in range(no):
        ls = fh.readline().strip().split()
        dc_codon[ls[0]] = ls[2]

In [21]:
print(dc_codon)

{'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S', 'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'TAA': 'O', 'TAC': 'Y', 'TAG': 'O', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TGA': 'O', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C', 'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'}


In [22]:
# creating a function

def codon_to_aa(codon):
    if len(codon) != 3:
        print("error")
    elif codon not in dc_codon.keys():
        print("X")
    else:
        return(dc_codon[codon])

In [23]:
codon_to_aa("TGG")

'W'

In [24]:
codon_to_aa("TAA")

'O'

In [25]:
codon_to_aa("BOB")

X


## Question 2: DNA to aa

In [26]:
# from Chapter 18

def get_windows (seq, win_size, step_size):
    ls = list()
    pos = 0
    while pos < len(seq):
        if len(seq[pos:(pos+win_size)]) < win_size:
            break
        ls.append(seq[pos:(pos+win_size)])
        pos = pos + step_size
    return(ls)

In [27]:
def dna_to_aa(seq):
    ls = get_windows(seq,3,3)
    aa = ""
    for i in ls:
        aa = aa + str(codon_to_aa(i))
    return(aa)

In [28]:
dna_to_aa("AAACTGTCTCTA")

'KLSL'

## Question 3: kmer counter

In [29]:
def count_kmer(seq):
    ls = get_windows(seq,3,1)
    kmer = dict()
    for i in ls:
        if i in kmer.keys():
            kmer[i] += 1
        else:
            kmer[i] = 1
    return(kmer)

In [30]:
count_kmer("AAACTGTCTCTA")

{'AAA': 1,
 'AAC': 1,
 'ACT': 1,
 'CTG': 1,
 'TGT': 1,
 'GTC': 1,
 'TCT': 2,
 'CTC': 1,
 'CTA': 1}

## Question 4: Combining two dictionaries

> if same key with diff values -> higher value

In [32]:
def union_dict(d1,d2):
    d = d1
    for k in d2.keys():
        if k in d.keys():
            if d2[k] > d[k]:
                d[k] = d2[k]
        else:
            d[k] = d2[k]
    return(d)

In [34]:
d1 = count_kmer("AAACTGTCTCTA")
d2 = count_kmer("AAACTGTCTCTACTA")

In [35]:
d = union_dict(d1,d2)

In [36]:
d1

{'AAA': 1,
 'AAC': 1,
 'ACT': 2,
 'CTG': 1,
 'TGT': 1,
 'GTC': 1,
 'TCT': 2,
 'CTC': 1,
 'CTA': 2,
 'TAC': 1}

In [37]:
d2

{'AAA': 1,
 'AAC': 1,
 'ACT': 2,
 'CTG': 1,
 'TGT': 1,
 'GTC': 1,
 'TCT': 2,
 'CTC': 1,
 'CTA': 2,
 'TAC': 1}

In [38]:
d

{'AAA': 1,
 'AAC': 1,
 'ACT': 2,
 'CTG': 1,
 'TGT': 1,
 'GTC': 1,
 'TCT': 2,
 'CTC': 1,
 'CTA': 2,
 'TAC': 1}