# Assess the consensus genome coverage per sample

In [1]:
import os
import sys
import pandas as pd
import numpy as np

from Bio import SeqIO

In [2]:
# Root folder of the coverage analysis, and the samples FASTA that lives inside it
dir_base = os.path.expanduser("~/WGS/analyses/Coverage/")
fa_path = os.path.join(dir_base, "Samples.fasta")

In [3]:
# NOTE: this re-binds fa_path, replacing the Samples.fasta path assigned in the previous cell
fa_path = os.path.expanduser("~/Desktop/121240.fasta")
# FASTA passed to SeqIO.parse / fasta_stats in the later cells
famulti_path = os.path.expanduser("~/Desktop/allsequences.fasta")

# Manually calculate from Fasta

In [7]:
# Manual calculation without using a library.
# Read the FASTA file into two parallel lists: one header and one
# concatenated sequence string per record.
headers = []
sequences = []
with open(fa_path, "r") as fa:
    for line in fa:
        line = line.strip()  # drop the newline (and a trailing "\r" on CRLF files)
        if not line:
            continue  # blank lines carry no sequence data
        if line.startswith(">"):
            # New record: keep the header without the ">" marker (quote
            # characters at either end are stripped too, as before) and
            # start an empty sequence for it.
            headers.append(line.strip(">'"))
            sequences.append("")
        elif sequences:
            # Sequence data always belongs to the most recent header, so
            # no manual index bookkeeping is needed. Lines that appear
            # before the first header are ignored.
            sequences[-1] += line

# Report the length, number of N's and fraction of non-N bases per record
print("Sample\tLength\t#N\tCoverage")
for header, seq in zip(headers, sequences):
    length = len(seq)    # total number of nucleotides
    N = seq.count('N')   # number of N's
    # A header-only record has length 0; report 0 instead of dividing by zero
    coverage = (length - N) / length if length else 0.0
    print(header+"\t"+str(length)+"\t"+str(N)+"\t"+str(coverage))

Sample	Length	#N	Coverage
121240	29897	5205	0.8259022644412483


# Use SeqIO for single entry FASTA

In [30]:
# Let Biopython parse the file instead of doing it by hand.
# The section is about a single-entry FASTA, hence SeqIO.read rather than SeqIO.parse.
record = SeqIO.read(fa_path, "fasta")

fa_length = len(record.seq)          # total number of bases in the record
fa_num_N = record.seq.count('N')     # bases reported as 'N'
fa_called = fa_length - fa_num_N     # bases that are not 'N'
fa_coverage = (fa_called / fa_length) *100

# %d truncates the percentage to a whole number in the message
summary = "Sample %s has %d bp (%d N's) therefore %d%% coverage." % (
    record.id,
    fa_length,
    fa_num_N,
    fa_coverage,
)
print(summary)


Sample 121240 has 29897 bp (5205 N's) therefore 82% coverage.
Coverage is therefore 82 %.


In [33]:
def fasta_single_breadth (fasta_fp):
    """
    Calculate the breadth of coverage of a single-entry fasta as a percentage.

    Breadth of coverage is the share of positions in the sequence that are
    not the character 'N' (the count is case-sensitive).

    Parameters
        fasta_fp : filepath to fasta file, read with SeqIO.read

    Returns
        dt : dict, containing statistics on fasta file
            "coverage_breadth"        : percentage of non-'N' positions
                                        (0.0 when the sequence is empty)
            "number_Ns"               : number of 'N' characters
            "consensus_genome_length" : total sequence length
    """
    #Load the record
    record = SeqIO.read(fasta_fp, "fasta")

    #Calculate the statistics
    fa_length = len(record.seq)
    fa_num_N = record.seq.count('N')
    if fa_length:
        fa_coverage = ((fa_length - fa_num_N) / fa_length) *100
    else:
        # A header-only record has no positions at all; report 0 % rather
        # than failing with ZeroDivisionError
        fa_coverage = 0.0

    # Package key statistics into a dictionary
    dt = {
        "coverage_breadth": fa_coverage,
        "number_Ns": fa_num_N,
        "consensus_genome_length": fa_length
    }

    return dt

In [34]:
# Run the single-record helper on fa_path; the returned dict is displayed as the cell output
fasta_single_breadth(fa_path)

{'coverage_breadth': 82.59022644412482,
 'number_Ns': 5205,
 'consensus_genome_length': 29897}

# Calculate for multi-record FASTA

In [66]:
# Walk every record in the multi-record FASTA, but only print a preview of
# the first five. `count` ends up holding the total number of records seen.
count = 0
for count, record in enumerate(SeqIO.parse(famulti_path, "fasta"), start=1):
    fa_length = len(record.seq)
    fa_num_N = record.seq.count('N')
    fa_coverage = ((fa_length - fa_num_N) / fa_length) *100
    if count <= 5:
        message = "Sample %s has %d bp (%d N's) therefore %d%% coverage." % (
            record.id,
            fa_length,
            fa_num_N,
            fa_coverage,
        )
        print(message)

Sample 67a has 29903 bp (344 N's) therefore 98% coverage.
Sample 67c has 29903 bp (421 N's) therefore 98% coverage.
Sample 29a has 29903 bp (167 N's) therefore 99% coverage.
Sample 29b has 29903 bp (21473 N's) therefore 28% coverage.
Sample 67b has 29901 bp (346 N's) therefore 98% coverage.


In [52]:
# Start from an empty dictionary so the cell also runs on a fresh kernel:
# in the visible notebook `dt` is otherwise only created inside the helper
# functions, never at notebook level.
dt = {}

#Load the fasta file and iterate through
for record in SeqIO.parse(famulti_path, "fasta") :
    #Calculate the statistics
    fa_length = len(record.seq)
    fa_num_N = record.seq.count('N')
    if fa_length:
        fa_coverage = ((fa_length - fa_num_N) / fa_length) *100
    else:
        # Header-only record: report 0 % instead of dividing by zero
        fa_coverage = 0.0

    # Package key statistics into a dictionary with the record.id as the key
    # (a repeated record.id overwrites the earlier entry)
    dt[record.id] = {"coverage_breadth": fa_coverage,
                     "number_Ns": fa_num_N,
                     "consensus_genome_length": fa_length}

In [20]:
def fasta_stats (fasta_fp):
    """
    Calculate per-record statistics from a fasta file (single- or multi-record).

    Parameters
        fasta_fp : filepath to fasta file, read with SeqIO.parse

    Returns
        dt : dict, keyed by record.id; each value is a dict with
            "coverage_breadth_fasta"        : percentage of positions that are
                                              not 'N' (0.0 for an empty sequence)
            "number_Ns_fasta"               : number of 'N' characters
                                              (case-sensitive count)
            "consensus_genome_length_fasta" : total sequence length
        If two records share the same id, the later one replaces the earlier.
    """
    #Define the dictionary
    dt = {}

    #Load the fasta file and iterate through
    for record in SeqIO.parse(fasta_fp, "fasta") :

        #Calculate the statistics
        fa_length = len(record.seq)
        fa_num_N = record.seq.count('N')
        if fa_length:
            fa_coverage = ((fa_length - fa_num_N) / fa_length) *100
        else:
            # Header-only record: report 0 % instead of letting one empty
            # entry abort the whole file with ZeroDivisionError
            fa_coverage = 0.0

        # Package key statistics into a dictionary with the record.id as the key
        dt[record.id] = {"coverage_breadth_fasta": fa_coverage,
                         "number_Ns_fasta": fa_num_N,
                         "consensus_genome_length_fasta": fa_length}

    return dt

In [21]:
# Analyse the fasta file: fasta_dt maps each record id to its dict of statistics
fasta_dt = fasta_stats(famulti_path)

In [18]:
# Turn the dictionary into a dataframe in the correct orientation:
# each outer key of fasta_dt becomes a row label, each inner statistics key a column
fasta_df = pd.DataFrame.from_dict(fasta_dt, orient ='index') 

In [22]:
# Move the row labels (the fasta_dt keys) out of the index into a regular
# "index" column. Guarded and re-bound rather than mutated in place, so
# re-running the cell does not stack an extra "level_0" column.
if "index" not in fasta_df.columns:
    fasta_df = fasta_df.reset_index()

In [24]:
# Report the number of rows in fasta_df
print("   Stats generated for %d sequences" % len(fasta_df))

   Stats generated for 1248 sequences


In [25]:
# Progress message only: this cell prints the notice, it does not perform the merge itself
print("Merging fasta stats with bam / fastq stats (gisaid_df)")

Merging fasta stats with bam / fastq stats (gisaid_df)
