# Chapter 23: Objects and classes

## Trying on the SNP class

> To check if the SNP is "transition" or "transversion"

In [143]:
class SNP:
    
    def __init__(self, chrom, pos, id, ref, alt):
        assert ref != alt, "Error: ref == alt"
        self.chrom = chrom
        self.pos = pos
        self.id = id
        self.ref = ref
        self.alt = alt
        
    def is_transition(self):
        if self.ref == "A" or self.ref == "G":
            if self.alt == "A" or self.alt == "G":
                return(True)
        if self.ref == "C" or self.ref == "T":
            if self.alt == "C" or self.alt == "T":
                return(True)
        return(False)
    
    def is_transversion(self):
        if self.is_transition():
            return(False)
        return(True)

In [148]:
import io

with io.open("data_23/trio.sample.vcf.txt") as fh:
    for i in range(15):
        ln = fh.readline().strip().split("\t")
        if i > 9:
            snp = SNP(ln[0],ln[1],ln[2],ln[3],ln[4])
            print(snp.id)
            print(snp.is_transition())
            print(snp.is_transversion())

rs57181708
True
False
.
False
True
rs4970461
False
True
rs6689107
False
True
rs13302914
True
False


In [146]:
# another better method

import re

with io.open("data_23/trio.sample.vcf.txt") as fh:
    for i in range(15):
        ln = fh.readline().strip()
        if not (re.search(r"^#", ln)):
            ln = ln.split("\t")
            snp = SNP(ln[0],ln[1],ln[2],ln[3],ln[4])
            print(snp.id)
            print(snp.is_transition())
            print(snp.is_transversion())

rs57181708
True
False
.
False
True
rs4970461
False
True
rs6689107
False
True
rs13302914
True
False


## Now with the chromosome class

In [45]:
class chromosome:
    
    def __init__(self, chrom):
        self.chrom = chrom
        self.snploc = dict()
        
    def add_snp(self, chrom, pos, id, ref, alt):
        assert pos not in self.snploc.keys(), "Error: duplicated SNP"
        assert chrom == self.chrom, "Error: wrong chromosome"
        newsnp = SNP(chrom, pos, id, ref, alt)
        self.snploc[pos] = newsnp
        
    def count_transition(self):
        count = 0
        locs = self.snploc.keys()
        for loc in locs:
            if self.snploc[loc].is_transition():
                count += 1
        return(count)
    
    def count_transversion(self):
        count = 0
        locs = self.snploc.keys()
        for loc in locs:
            if self.snploc[loc].is_transversion():
                count += 1
        return(count)

In [69]:
import io

no = sum(1 for line in io.open("data_23/trio.sample.vcf.txt"))

with io.open("data_23/trio.sample.vcf.txt") as fh:
    chr = dict()
    for i in range(no):
        ln = fh.readline().strip().split("\t")
        if i > 10:
            if ln[0] not in chr.keys():
                chr[ln[0]] = chromosome(ln[0])
            chr[ln[0]].add_snp(ln[0],ln[1],ln[2],ln[3],ln[4])

In [66]:
for i in range(1,23):
    print(chr[str(i)].count_transition())
    print(chr[str(i)].count_transversion())
# literally counted all chromosomes

9344
4262
10309
5130
8708
4261
9050
4372
7586
3874
7874
3697
6784
3274
6520
3419
5102
2653
6165
2952
5944
2908
5876
2700
4926
2368
4016
1891
3397
1676
3449
1891
3024
1357
3791
1738
2198
962
2656
1187
1773
848
1539
639


## Extending the "chromosome" class: searching for the SNP dense region

In [87]:
class chromosome:
    
    def __init__(self, chrom):
        self.chrom = chrom
        self.snploc = dict()
        
    def add_snp(self, chrom, pos, id, ref, alt):
        assert pos not in self.snploc.keys(), "Error: duplicated SNP"
        assert chrom == self.chrom, "Error: wrong chromosome"
        newsnp = SNP(chrom, pos, id, ref, alt)
        self.snploc[pos] = newsnp
        
    def count_transition(self):
        count = 0
        locs = self.snploc.keys()
        for loc in locs:
            if self.snploc[loc].is_transition():
                count += 1
        return(count)
    
    def count_transversion(self):
        count = 0
        locs = self.snploc.keys()
        for loc in locs:
            if self.snploc[loc].is_transversion():
                count += 1
        return(count)
    
### below being new functions
    
    def density(self, l, m):
        count = 0
        for loc in self.snploc.keys():
            loc = int(loc)
            if loc >= l and loc <= m:
                count += 1
        den = count / (m - l + 1) * 1000
        return(den)
    
    def best_region(self, region_size):
        
        lastsnp = int(sorted(self.snploc.keys(), reverse = True)[0])
        best = [0.0, 1, region_size - 1] # density, start, end
        
        for loc in range(1, lastsnp, region_size):
            loc = int(loc)
            den = self.density(loc, loc + region_size - 1)
            if den > best[0]:
                best = [den, loc, loc + region_size - 1]
        return(best)

In [88]:
# need to update the dictionary

no = sum(1 for line in io.open("data_23/trio.sample.vcf.txt"))

with io.open("data_23/trio.sample.vcf.txt") as fh:
    chr = dict()
    for i in range(no):
        ln = fh.readline().strip().split("\t")
        if i > 10:
            if ln[0] not in chr.keys():
                chr[ln[0]] = chromosome(ln[0])
            chr[ln[0]].add_snp(ln[0],ln[1],ln[2],ln[3],ln[4])

In [89]:
chr["1"].best_region(100000)

[0.21000000000000002, 5200001, 5300000]

## Q1: Calculating the density for each region (and the number of transitions, the number of snps)

In [110]:
class chromosome:
    
    def __init__(self, chrom):
        self.chrom = chrom
        self.snploc = dict()
        
    def add_snp(self, chrom, pos, id, ref, alt):
        assert pos not in self.snploc.keys(), "Error: duplicated SNP"
        assert chrom == self.chrom, "Error: wrong chromosome"
        newsnp = SNP(chrom, pos, id, ref, alt)
        self.snploc[pos] = newsnp
        
    def count_transition(self):
        count = 0
        locs = self.snploc.keys()
        for loc in locs:
            if self.snploc[loc].is_transition():
                count += 1
        return(count)
    
    def count_transversion(self):
        count = 0
        locs = self.snploc.keys()
        for loc in locs:
            if self.snploc[loc].is_transversion():
                count += 1
        return(count)
    
    def density(self, l, m):
        count = 0
        for loc in self.snploc.keys():
            loc = int(loc)
            if loc >= l and loc <= m:
                count += 1
        den = count / (m - l + 1) * 1000
        return(den)
    
    def best_region(self, region_size):
        
        lastsnp = int(sorted(self.snploc.keys(), reverse = True)[0])
        best = [0.0, 1, region_size - 1] # density, start, end
        
        for loc in range(1, lastsnp, region_size):
            loc = int(loc)
            den = self.density(loc, loc + region_size - 1)
            if den > best[0]:
                best = [den, loc, loc + region_size - 1]
        return(best)
    
### Below are the new functions in the class "chromosome"

    def region_trans(self, l, m):
        count_snp = 0
        count_transition = 0
        count_transversion = 0
        for loc in self.snploc.keys():
            loc = int(loc)
            if loc >= l and loc <= m:
                count_snp += 1
                if self.snploc[str(loc)].is_transition():
                    count_transition += 1
                if self.snploc[str(loc)].is_transversion():
                    count_transversion += 1
        try:
            return [count_snp, round(count_transition / count_snp,2), round(count_transversion / count_snp,2)]
        except:
            return [count_snp, 0, 0]

    def region_den(self, region_size):
        
        lastsnp = int(sorted(self.snploc.keys(), reverse = True)[0])
        rden = dict()
        
        for loc in range(1, lastsnp, region_size):
            loc = int(loc)
            den = round(self.density(loc, loc + region_size - 1),2)
            reg_trans = self.region_trans(loc, loc + region_size - 1)
            no_snp = reg_trans[0]
            percent_transition = reg_trans[1]
            key = str(loc) + ".." + str(loc + region_size - 1)
            rden[key] = [key, den, percent_transition, no_snp]
        return(rden)

In [111]:
# need to update the dictionary

no = sum(1 for line in io.open("data_23/trio.sample.vcf.txt"))

with io.open("data_23/trio.sample.vcf.txt") as fh:
    chr = dict()
    for i in range(no):
        ln = fh.readline().strip().split("\t")
        if i > 10:
            if ln[0] not in chr.keys():
                chr[ln[0]] = chromosome(ln[0])
            chr[ln[0]].add_snp(ln[0],ln[1],ln[2],ln[3],ln[4])

In [114]:
dc1 = chr["1"].region_den(1000000)

In [118]:
print("chr" + "\t" + "loc" + "\t" + "dens" + "\t" + "%transition" + "\t" + "no_snp")
for i in range(10):
    ls = dc1[list(dc1.keys())[i]]
    print("1" + "\t" + str(ls[0]) + "\t" + str(ls[1]) + "\t" + str(ls[2]) + "\t" + str(ls[3]))

chr	loc	dens	%transition	no_snp
1	1..1000000	0.02	0.71	17
1	1000001..2000000	0.03	0.79	34
1	2000001..3000000	0.07	0.68	72
1	3000001..4000000	0.06	0.66	65
1	4000001..5000000	0.08	0.75	83
1	5000001..6000000	0.07	0.68	71
1	6000001..7000000	0.06	0.71	58
1	7000001..8000000	0.07	0.74	73
1	8000001..9000000	0.06	0.72	57
1	9000001..10000000	0.06	0.8	55


## Q2: Creating the "bug" class

In [5]:
import random

In [4]:
class bug:
    
    def __init__(self):
        self.bases = ["A", "T", "C", "G"]
        self.genome = [random.choice(self.bases) for i in range(100)]
        
    def mutate_base(self):
        base_no = random.randint(0,99)
        self.genome[base_no] = random.choice(self.bases)
        
    def set_base(self, index, base):
        self.genome[index] = base
        
    def get_fitness(self):
        self.fitness = 0
        fitness = 0
        for i in range(100):
            if self.genome[i] == "G" or self.genome[i] == "C":
                fitness += 1
        genome_str = "".join(self.genome)
        if "AAA" in genome_str:
            fitness += 5
        self.fitness = fitness

In [265]:
a = bug()
a.genome[:5]

['G', 'C', 'A', 'T', 'G']

In [266]:
a.mutate_base()

In [267]:
a.set_base(0,"T")

In [268]:
a.genome[:5]

['T', 'C', 'A', 'T', 'G']

In [270]:
a.get_fitness()
a.fitness

57

In [185]:
# Creating 10 bugs; have them mutated; and calculated their fitness

for i in range(10):
    newbug = bug()
    print(newbug.get_fitness())
    for j in range(10):
        newbug.mutate_base()
    print(newbug.get_fitness())

54
54
44
46
56
57
44
42
56
60
51
51
49
50
43
47
59
58
56
55


## Q3: Adding on the class "population"

In [138]:
class population:
    
    def __init__(self):
        self.bug_list = [bug() for i in range(50)]
    
    def create_offspring(self):
        new_pop = list()
        oldbug = [i for i in self.bug_list]
        for i in self.bug_list:
            i.mutate_base()
        newbug = [i for i in self.bug_list]
        new_pop = oldbug + newbug
        self.bug_list = new_pop
    
    def cull(self):
        new_pop = list()
        fitness = list()
        for i in self.bug_list:
            i.get_fitness()
            fitness.append(i.fitness)
        halfnumber = sorted(fitness, reverse = True)[len(fitness)//2]
        for i in range(len(fitness)):
            if fitness[i] > halfnumber:
                new_pop.append(self.bug_list[i])
        for i in range(len(fitness)):
            if fitness[i] == halfnumber:
                if len(new_pop) < len(fitness)//2:
                    new_pop.append(self.bug_list[i])
                else:
                    break
        self.bug_list = new_pop
        
    def mean_fitness(self):
        
        import statistics
        
        fitness = list()
        for i in self.bug_list:
            i.get_fitness()
            fitness.append(i.fitness)
        return(statistics.mean(fitness))

In [140]:
popa = population()
print(len(popa.bug_list))
print(popa.mean_fitness())

50
54.08


In [141]:
popa.create_offspring()
print(len(popa.bug_list))
print(popa.mean_fitness())

100
54


In [142]:
popa.cull()
print(len(popa.bug_list))
print(popa.mean_fitness())

50
58.28
