In [3]:
from Bio import SeqIO
import Bio
from collections import defaultdict
from graphviz import Digraph, Graph

class Vertex:
    """A graph node holding one sequence and its occurrence count.

    Attributes:
        seq: the sequence this vertex was created with.
        coverage: occurrence counter; a fresh vertex starts at 1.
        in_edges: mapping for incoming edges (initially empty).
        out_edges: mapping for outgoing edges (initially empty).
    """

    def __init__(self, seq):
        # Adjacency containers start empty and are filled by the owner.
        self.out_edges = dict()
        self.in_edges = dict()
        # Creating the vertex counts as its first observation.
        self.coverage = 1
        self.seq = seq

    def increase_coverage(self):
        """Record one more observation of this vertex."""
        self.coverage = self.coverage + 1

class Edge:
    """A link between two overlapping sequences.

    Attributes:
        seq: ``k1`` extended by the last element of ``k2``.
        n: initialised to 2.
        coverage: 0 until ``calc_coverage`` is called.
    """

    def __init__(self, k1, k2):
        self.coverage = 0
        self.n = 2
        # Only the final symbol of the second sequence is appended.
        last_symbol = k2[-1]
        self.seq = k1 + last_symbol

    def calc_coverage(self, c1, c2):
        """Store the arithmetic mean of ``c1`` and ``c2`` as the coverage."""
        total = c1 + c2
        self.coverage = total / 2


class Graph:
    """Graph of overlapping k-mers built from sequencing reads.

    Every distinct k-mer seen in a read becomes a ``Vertex`` stored in
    ``self.vertices`` under the k-mer string. Consecutive k-mers of a read
    are linked by an ``Edge`` that is registered both in the source vertex's
    ``out_edges`` and in the target vertex's ``in_edges`` (as a one-element
    list, keyed by the neighbouring k-mer).

    NOTE: this class has the same name as ``graphviz.Graph`` imported at the
    top of the file and shadows that import from this point on.
    """

    def __init__(self, k):
        """Create an empty graph for k-mers of length ``k``."""
        self.vertices = {}
        self.k = k

    def _count_kmer(self, kmer):
        """Create the vertex for ``kmer`` or bump its coverage if it exists."""
        vertex = self.vertices.get(kmer)
        if vertex is None:
            self.vertices[kmer] = Vertex(kmer)
        else:
            vertex.increase_coverage()

    def add_read(self, read):
        """Add every k-mer of ``read`` and the edges between neighbours.

        Reads shorter than ``self.k`` are ignored.
        """
        # Use the graph's own k; do not rely on a module-level variable.
        k = self.k
        read_lng = len(read)
        if read_lng < k:
            return

        kmer = read[:k]
        self._count_kmer(kmer)

        for start in range(1, read_lng - k + 1):
            next_kmer = read[start:start + k]
            self._count_kmer(next_kmer)

            out_edges = self.vertices[kmer].out_edges
            if next_kmer not in out_edges:
                # Both directions share the same Edge object; an edge that
                # already exists is reused rather than re-allocated.
                new_edge = Edge(kmer, next_kmer)
                self.vertices[next_kmer].in_edges[kmer] = [new_edge]
                out_edges[next_kmer] = [new_edge]

            kmer = next_kmer

    def calc_init_edge_coverage(self):
        """Set each edge's coverage from the coverage of its two end vertices."""
        for vertex in self.vertices.values():
            for next_kmer, edges in vertex.out_edges.items():
                edges[0].calc_coverage(vertex.coverage,
                                       self.vertices[next_kmer].coverage)

    def visualize(self, path, full):
        """Write the graph in DOT format to ``path``.

        Args:
            path: destination file for the DOT source.
            full: when true, label vertices and edges with their sequences;
                otherwise label them with coverage (and, for edges, the
                sequence length).
        """
        self.graph = Digraph(comment='assembly')

        for kmer, vertex in self.vertices.items():
            label = str(kmer) if full else 'coverage={}'.format(vertex.coverage)
            self.graph.node(kmer, label=label)

            for child_kmer, child_edges in vertex.out_edges.items():
                edge = child_edges[0]
                if full:
                    label = str(edge.seq)
                else:
                    label = 'coverage={} size={}'.format(edge.coverage,
                                                         len(edge.seq))
                self.graph.edge(kmer, child_kmer, label=label)

        with open(path, 'w') as handle:
            handle.write(self.graph.source)

    def merge(self):
        """Return the sequences of vertices with exactly one in- and one out-edge.

        No merging of the graph is performed yet; only the candidate
        vertices are collected.
        """
        return [
            vertex.seq
            for vertex in self.vertices.values()
            if len(vertex.in_edges) == 1 and len(vertex.out_edges) == 1
        ]
    

if __name__ == '__main__':

    # Input FASTA file and the k-mer length handed to the graph.
    dataset = './hw_4_5_dataset.fasta'
    k = 3

    my_graph = Graph(k)

    # Feed every FASTA record into the graph as a plain string.
    with open(dataset, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            my_graph.add_read(str(record.seq))

    my_graph.calc_init_edge_coverage()

    # Dump each vertex with the keys of its outgoing and incoming edges.
    for v, vertex in my_graph.vertices.items():
        print('Vertex: {}, coverage: {}'.format(v, vertex.coverage))
        for e in vertex.out_edges:
            print('-> Out edge: {}'.format(e))
        for e in vertex.in_edges:
            print('-> In edge: {}'.format(e))

    # The resulting .dot file can be opened with a viewer such as Xdot.
    my_graph.visualize('graph.dot', full=False)

Vertex: CAG, coverage: 4875
-> Out edge: AGA
-> Out edge: AGG
-> Out edge: AGC
-> Out edge: AGT
-> In edge: ACA
-> In edge: TCA
-> In edge: CCA
-> In edge: GCA
Vertex: GTG, coverage: 3489
-> Out edge: TGC
-> Out edge: TGT
-> Out edge: TGG
-> Out edge: TGA
-> In edge: AGT
-> In edge: TGT
-> In edge: CGT
-> In edge: GGT
Vertex: CGA, coverage: 1453
-> Out edge: GAG
-> Out edge: GAC
-> Out edge: GAA
-> Out edge: GAT
-> In edge: TCG
-> In edge: CCG
-> In edge: ACG
-> In edge: GCG
Vertex: TTG, coverage: 6554
-> Out edge: TGC
-> Out edge: TGT
-> Out edge: TGG
-> Out edge: TGA
-> In edge: TTT
-> In edge: GTT
-> In edge: CTT
-> In edge: ATT
Vertex: CCC, coverage: 4726
-> Out edge: CCG
-> Out edge: CCT
-> Out edge: CCC
-> Out edge: CCA
-> In edge: ACC
-> In edge: CCC
-> In edge: TCC
-> In edge: GCC
Vertex: ACG, coverage: 1941
-> Out edge: CGC
-> Out edge: CGG
-> Out edge: CGT
-> Out edge: CGA
-> In edge: GAC
-> In edge: CAC
-> In edge: AAC
-> In edge: TAC
Vertex: GTT, coverage: 5531
-> Out edge: