# Orthogroup construction between L. tarentolae and L. infantum
Author: Dr Thomas Cokelaer, 2025 

- reference infantum: GCA_900500625.2   -- 36 contigs 
- reference tarentolae: GCA_009731335.1    -- 179 contigs

In [18]:
from sequana import *
from pylab import *
from bioconvert import fasta2faa # depends only on python
import pandas as pd

# First, we need to build the orthogroups using the GFF files and OrthoFinder.

We need to extract the amino-acid sequences (instead of the DNA sequences).

In [9]:

# For each species: keep the 'gene' features of the annotation, write the
# corresponding sequences to a temporary FASTA file and convert that file to
# an amino-acid FASTA (<species>.faa). The temporary files temp.gff / temp.fa
# are overwritten at each iteration, exactly as in a manual two-step run.
for species in ("tarentolae", "infantum"):
    gff = GFF3(f"{species}.gff")
    gff.save_gff_filtered("temp.gff", features=['gene'])
    # identifier="ID": presumably names each sequence after the GFF ID
    # attribute -- TODO confirm against the sequana documentation
    gff.to_fasta(f"{species}.fa", "temp.fa", features=['gene'], identifier="ID")
    # fasta2faa is imported once, in the import cell at the top of the notebook
    conv = fasta2faa.FASTA2FAA("temp.fa", f"{species}.faa")
    conv()





UsageError: Line magic function `%%bash` not found.


In [14]:
%%bash 
# run only if you rerun previous cell
# Moves the .faa files into orthodata/ and removes the temporary files.
# The steps are chained with `&&` so that each one runs only when the previous
# one succeeded (with `||`, nothing after a successful mkdir would be executed).
#mkdir -p orthodata && mv *.faa orthodata && rm -f temp.gff temp.fa


In [15]:
%%bash
# run only once
# Runs OrthoFinder on the files in orthodata/ and, if it succeeds, copies the
# resulting Orthogroups.tsv into the current directory.
# NOTE(review): Results_Jan07 looks like a run-specific (date-stamped)
# directory name -- adjust it to the directory created by your own run; confirm.
#orthofinder -f orthodata/ && cp orthodata/OrthoFinder/Results_Jan07/Orthogroups/Orthogroups.tsv .


# Reading orthogroups and cleanup

In [82]:
# Tab-separated orthogroup table copied from the OrthoFinder results; the
# cells below use its 'infantum' and 'tarentolae' columns.
ORTHOGROUPS_TSV = "Orthogroups.tsv"
ortho_groups = pd.read_csv(ORTHOGROUPS_TSV, sep="\t")

In [83]:
# how many orthogroups have an entry for both species (no missing value)
ortho_groups.dropna().shape[0]

7457

In [84]:
# A missing value in a species column means that the group only contains
# genes from the other species.

# groups without infantum genes (tarentolae only)
n_without_infantum = ortho_groups['infantum'].isna().sum()
print(f"Number of groups where infantum is missing : {n_without_infantum}")

# groups without tarentolae genes (infantum only)
n_without_tarentolae = ortho_groups['tarentolae'].isna().sum()
print(f"Number of groups where tarentolae is missing : {n_without_tarentolae}")



Number of groups where infantum is missing : 94
Number of groups where tarentolae is missing : 76


In [85]:
# Gene lists are comma-separated, so the number of genes in a group is the
# number of comma-separated items.

# genes of the tarentolae-only groups
tarentolae_only = ortho_groups.loc[ortho_groups['infantum'].isna(), 'tarentolae']
S = sum([len(genes.split(",")) for genes in tarentolae_only])
print(f"Number of tarentolae genes where infantum is missing : {S}")

# genes of the infantum-only groups
infantum_only = ortho_groups.loc[ortho_groups['tarentolae'].isna(), 'infantum']
S = sum([len(genes.split(",")) for genes in infantum_only])
print(f"Number of infantum genes where tarentolae is missing : {S}")


Number of tarentolae genes where infantum is missing : 342
Number of infantum genes where tarentolae is missing : 353


In [86]:
# Keep only the orthogroups that have genes for both species: any row with a
# missing value is discarded.
ortho_groups = ortho_groups.dropna(axis=0, how="any")

# Some groups are not homogeneous.
- case 1: for a given group, one species has several genes that are not on the same chromosome
- case 2: the chromosome for species 1 is different from that of species 2

Those cases may be biological, but we want to group orthogroups by chromosome, so let us filter these cases out.

## cleaning 1: non homogenous chromosomal gene location within a species

In [87]:
# Collect, for each species, the orthogroups whose genes do not all carry the
# same chromosome label. The label is contained in the gene name: it is read
# as the first two characters of the last "_"-separated field.
indices_inf = []
indices_tar = []
for species, flagged in (("infantum", indices_inf), ("tarentolae", indices_tar)):
    for index, genes in ortho_groups[species].items():
        chromosomes = {gene.split("_")[-1][0:2] for gene in genes.split(",")}
        if len(chromosomes) != 1:
            flagged.append(index)
    

In [88]:
# 47 incoherent groups in infantum, 80 in tarentolae and 115 in total
# (a group flagged for both species is counted once in the union)
S1, S2 = set(indices_inf), set(indices_tar)
S3 = S1 | S2
print(len(S1), len(S2), len(S3))

47 80 115


In [89]:
# Discard every orthogroup flagged for at least one species (labels in S3),
# i.e. groups whose genes are spread over several chromosomes.
is_flagged = ortho_groups.index.isin(S3)
ortho_groups = ortho_groups[~is_flagged]


In [90]:
# Add, for each orthogroup, the chromosome label of its genes in each species
# (columns 'chr_name_inf' and 'chr_name_tar'). The label is read from the gene
# names: first two characters of the last "_"-separated field.
chr_names = {}
for species, column in (("infantum", "chr_name_inf"), ("tarentolae", "chr_name_tar")):
    labels = []
    for index, genes in ortho_groups[species].items():
        chromosomes = {gene.split("_")[-1][0:2] for gene in genes.split(",")}
        # Groups spanning several chromosomes must have been removed before
        # this cell; otherwise the label picked from the set would be arbitrary.
        if len(chromosomes) != 1:
            raise ValueError(
                f"orthogroup {index}: {species} genes span several chromosomes "
                f"{sorted(chromosomes)}; run the 'cleaning 1' cells first"
            )
        labels.append(next(iter(chromosomes)))
    chr_names[column] = labels

chr_name_inf = chr_names["chr_name_inf"]
chr_name_tar = chr_names["chr_name_tar"]

# assign() returns a new frame instead of writing into a frame that was
# obtained by filtering (avoids pandas' chained-assignment warning).
ortho_groups = ortho_groups.assign(chr_name_inf=chr_name_inf, chr_name_tar=chr_name_tar)



# Cleaning 2: remove incoherent chromosome location between the 2 species

In [92]:
# Flag the orthogroups whose chromosome label differs between the two
# species, report how many there are, then keep only the coherent groups.
mismatch = ortho_groups['chr_name_inf'] != ortho_groups['chr_name_tar']
indices = ortho_groups.index[mismatch].tolist()
print(f"Found {len(indices)} incoherent chrom name")
ortho_groups = ortho_groups[~mismatch]

Found 25 incoherent chrom name


In [94]:
# number of orthogroups left after the two cleaning steps
ortho_groups.shape[0]

7317

In [114]:
# total number of genes (both species) in the remaining orthogroups;
# each cell holds a comma-separated list of genes
Nfinal = 0
for species in ('infantum', 'tarentolae'):
    Nfinal += sum([len(genes.split(',')) for genes in ortho_groups[species].values])

In [115]:
# N: total number of 'gene' entries in the two annotation files
N = len(GFF3("tarentolae.gff").df.query("genetic_type=='gene'")) + len(GFF3("infantum.gff").df.query("genetic_type=='gene'")) 

# NOTE(review): hard-coded value; presumably the number of genes that
# OrthoFinder assigned to an orthogroup before any cleanup -- confirm against
# the OrthoFinder statistics of the run and update if the run changes.
N_ASSIGNED_ORTHOFINDER = 16413
print(f"Original orthofinder assignement is {N_ASSIGNED_ORTHOFINDER / N * 100}%")

# share of genes still in an orthogroup after the cleaning steps
final_assignation = Nfinal / N * 100
print(f"Final assignement after cleanup is {final_assignation}%")

Original orthofinder assignement is 94.4035430806396%
Original orthofinder assignement is 87.36339583572989%


In [118]:
# Save the cleaned orthogroups (with the chr_name_inf / chr_name_tar columns)
# as a tab-separated file; index=False is the documented way to omit the
# row index from the output.
ortho_groups.to_csv("Orthogroups_cleaned.tsv", sep="\t", index=False)