# General genomic information
Author: Thomas Cokelaer, Institut Pasteur, ParSig team, 2025

In [2]:
from sequana import GFF3, FastA

## Infantum genome size and  number of coding genes. 

In [5]:
g = GFF3("references/infantum.gff")
N = len(g.df.query("genetic_type=='gene'"))
print(f"Number of genes in infantum={N}")
L = g.df.query("genetic_type=='region'")[['start', 'stop', 'Name']].dropna().stop.sum()/1e6
print(f"Total length={L}Mb")
assert N == 8683

Number of genes in infantum=8683
Total length=32.803248Mb


## Tarentolae genome size and  number of coding genes. 

In [8]:
g = GFF3("references/tarentolae.gff")
L = g.df.query("genetic_type=='region'")[['start', 'stop', 'Name']].stop.sum()/1e6
print(f"Total length={L}Mb")

N = len(g.df.query("genetic_type=='gene'"))
print(f"Number of genes in tarentolae={N}")

L = g.df.query("genetic_type=='region'")[['start', 'stop', 'Name']].dropna().stop.sum()/1e6
nL = len(g.df.query("genetic_type=='region'")[['start', 'stop', 'Name']].dropna().stop)
print(f"Total length ignoring unassigned contigs={L}Mb, ncontigs={nL}")

valid_contigs = g.df.loc[g.df.query("genetic_type=='region'")[['start', 'stop', 'Name']].dropna().index].seqid.values

N2 = len(g.df.query("genetic_type=='gene' and seqid in @valid_contigs"))
print(f"Number of genes in tarentolae ignoring unassigned contigs={N}")


Total length=35.416496Mb
Number of genes in tarentolae=8703
Total length ignoring unassigned contigs=32.283028Mb, ncontigs=57
Number of genes in tarentolae ignoring unassigned contigs=8703


In [9]:
# ratio of genome within assigned contigs 
32.28/35.41 *100

91.16068907088393

In [10]:
# ratio of coding genes within assigned contigs (larger)
N2/N *100

97.2538205216592

## GC content

In [11]:
f = FastA("references/infantum.fa")
100 * sum([seq.upper().count('G') + seq.upper().count('C') for seq in f.sequences]) / f.get_stats()['total_length']

59.744678331852995

In [12]:
f = FastA("references/tarentolae.fa")
100 * sum([seq.upper().count('G') + seq.upper().count('C') for seq in f.sequences]) / f.get_stats()['total_length']

57.41399431496555