In [1]:
##### Imports
import os, subprocess
import itertools
import math

import pandas as pd
from Bio import SeqIO, Seq, SeqFeature, SeqRecord, AlignIO, Align, Entrez, Graphics

from scripts import functions

In [2]:
##### Constants
# Path to a reference annotation GTF (GENCODE v41 / hg38, going by the file name).
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
REF_ANNOTATIONS_FILE = 'data/hg38.gencode.v41.primary_annotation.gtf'

# Nucleotide sequence passed as the reference ("ubq consensus") to align_subunits.
# It contains 'N' characters in addition to A/T/G/C; presumably these mark positions
# where the consensus is ambiguous — TODO confirm how the sequence was derived.
UBQ_CONSENSUS = "ATGCAGATCTTCGTGAAGACCCTGACTGGTAAGACCATCACCCTCGAGGTGGAGCCCAGTGACACCATCGAGAATGTCAAGGCAAAGATCCAAGANAAGGAAGGCATCCCTCCTGACCAGCAGAGGTTGATCTTTGCNGGNAAACAGCTGGAAGATGGNCGCACCCTGTCTGACTACAACATCCAGAAAGAGTCCACCCTGCACCTGGTGCTCCGTCTNAGAGGTGGG"
# Maps the record name of each ortholog FASTA entry (the key looked up with `cds.name`
# in the loading cell) to a human-readable species label.
# Where a species contributes several records, the label also carries a gene ID suffix
# so the entries stay distinguishable; the commented-out lines are the same keys with
# the plain, suffix-free label.
# A record name missing from this dict makes the loading cell fail with a KeyError.
TRANSLATE_CDS_ID = {"ENSPMAP00000006225": "Lamprey (Petromyzon marinus)",
				"ENSCSAVP00000012392": "Ciona savignyi",
				"ENSLACP00000022259": "Coelacanth (Latimeria chalumnae)",
				"ENSPFOP00000022587": "Amazon molly (Poecilia formosa)",
				"ENSCINP00000027318": "Ciona intestinalis - ENSCING00000015387.2",
				# "ENSCINP00000027318": "Ciona intestinalis",
				"ENSGACP00000026768": "Stickleback (Gasterosteus aculeatus)",
				"ENSCINP00000030891": "Ciona intestinalis - ENSCING00000021662.1",
				# "ENSCINP00000030891": "Ciona intestinalis",
				"ENSPCAP00000008325": "Hyrax (Procavia capensis)",
				"ENSCHIP00000014262": "Goat (Capra hircus)",
				"ENSDARP00000139647": "Zebrafish (Danio rerio)", 	# DNA
				"ENSDARP00000140603": "Zebrafish (Danio rerio)",	# Protein
				"ENSORLP00000015065": "Japanese medaka HdrR (Oryzias latipes)",
				"ENSACLP00000009605": "Eastern happy (Astatotilapia calliptera) - ENSACLG00000006563.1",
				"ENSACLP00000014166": "Eastern happy (Astatotilapia calliptera) - ENSACLG00000009674.1",
				# "ENSACLP00000009605": "Eastern happy (Astatotilapia calliptera)",
				# "ENSACLP00000014166": "Eastern happy (Astatotilapia calliptera)",
				"ENSXMAP00000023177": "Platyfish (Xiphophorus maculatus)",
				"ENSPCIP00000043051": "Koala (Phascolarctos cinereus)",
				"ENSBTAP00000053003": "Cow (Bos taurus)",
				"ENSPLAP00000001439": "Sailfin molly (Poecilia latipinna) - ENSPLAG00000002494.1",
				"ENSPLAP00000019951": "Sailfin molly (Poecilia latipinna) - ENSPLAG00000002479.1",
				# "ENSPLAP00000001439": "Sailfin molly (Poecilia latipinna)",
				# "ENSPLAP00000019951": "Sailfin molly (Poecilia latipinna)",
				"ENSNGAP00000021386": "Upper Galilee mountains blind mole rat (Nannospalax galili)",
				"ENSHCOP00000009639": "Tiger tail seahorse (Hippocampus comes)",
				"ENSLBEP00000021236": "Ballan wrasse (Labrus bergylta)",
				"ENSSLDP00000024700": "Yellowtail amberjack (Seriola lalandi dorsalis)",
				"ENSHBUP00000016144": "Burton's mouthbrooder (Haplochromis burtoni) - ENSHBUG00000018116.1",
				"ENSHBUP00000034276": "Burton's mouthbrooder (Haplochromis burtoni) - ENSHBUG00000021698.1",
				# "ENSHBUP00000016144": "Burton's mouthbrooder (Haplochromis burtoni)",
				# "ENSHBUP00000034276": "Burton's mouthbrooder (Haplochromis burtoni)",
				"ENSSDUP00000023286": "Greater amberjack (Seriola dumerili)",
				"ENSPKIP00000009939": "Paramormyrops kingsleyae (Paramormyrops kingsleyae)",
				"ENSPTRP00000050925": "Chimpanzee (Pan troglodytes)",
				"ENSSGRP00000030473": "Golden-line barbel (Sinocyclocheilus grahami)",
				"ENSCLMP00005010920": "Lumpfish (Cyclopterus lumpus)",
				"ENSSLUP00000021246": "Pike-perch (Sander lucioperca)",
				"ENSOARP00020005495": "Sheep (Ovis aries)",
				"ENSSVLP00005023449": "Eurasian red squirrel (Sciurus vulgaris)",
				"ENSLCRP00005042299": "Large yellow croaker (Larimichthys crocea) - ENSLCRG00005016225.1",
				"ENSLCRP00005042302": "Large yellow croaker (Larimichthys crocea) - ENSLCRG00005016226.1",
				"ENSLCRP00005042306": "Large yellow croaker (Larimichthys crocea) - ENSLCRG00005016228.1",
				# "ENSLCRP00005042299": "Large yellow croaker (Larimichthys crocea)",
				# "ENSLCRP00005042302": "Large yellow croaker (Larimichthys crocea)",
				# "ENSLCRP00005042306": "Large yellow croaker (Larimichthys crocea)",
				"ENSTRUP00000057853": "Fugu (Takifugu rubripes)",
				"ENSRFEP00010028825": "Greater horseshoe bat (Rhinolophus ferrumequinum)",
				"ENSSAUP00010062031": "Gilthead seabream (Sparus aurata)",
				"ENSMODP00000020484": "Opossum (Monodelphis domestica)",
				"ENSONIP00000041861": "Nile tilapia (Oreochromis niloticus)",
				"ENSBSLP00000037601": "Siamese fighting fish (Betta splendens)",
				"ENSMMDP00005044403": "Pinecone soldierfish (Myripristis murdjan)",
				"ENSCARP00000108608": "Goldfish (Carassius auratus)",
				"ENSGEVP00005012653": "Goodes thornscrub tortoise (Gopherus evgoodei)",
				"FBpp0073035": "Drosophila melanogaster",
				"ENSGMOP00000050001": "Atlantic cod (Gadus morhua)",
				"ENSMAMP00000011057": "Zig-zag eel (Mastacembelus armatus)",
				"ENSATEP00000008835": "Climbing perch (Anabas testudineus)",
				"ENSSMAP00000000429": "Turbot (Scophthalmus maximus)",
				"ENSACAP00000041232": "Green anole (Anolis carolinensis)",
				"ENSPPYP00000005811": "Sumatran orangutan (Pongo abelii)",
				"ENSRNOP00000074688": "Rat (Rattus norvegicus)",
				"ENSDLAP00005067172": "European seabass (Dicentrarchus labrax) - ENSDLAG00005018286.2",
				"ENSDLAP00005066837": "European seabass (Dicentrarchus labrax) - ENSDLAG00005015725.2",
				"ENSDLAP00005083630": "European seabass (Dicentrarchus labrax) - ENSDLAG00005028514.1",
				# "ENSDLAP00005067172": "European seabass (Dicentrarchus labrax)",
				# "ENSDLAP00005066837": "European seabass (Dicentrarchus labrax)",
				# "ENSDLAP00005083630": "European seabass (Dicentrarchus labrax)",
				"ENSOMYP00000115309": "Rainbow trout (Oncorhynchus mykiss)",
				"ENSSSAP00000008699": "Atlantic salmon (Salmo salar)",
				"ENSXETP00000104973": "Tropical clawed frog (Xenopus tropicalis)",		# DNA
				"ENSXETP00000102146": "Tropical clawed frog (Xenopus tropicalis)",		# Protein
				"ENSSSCP00000062692": "Pig (Sus scrofa)",
				"F25B5.4.2": "Caenorhabditis elegans",
				"ENSMUSP00000114180": "Mouse (Mus musculus)",
				"ENSP00000344818": "Human"}

# Labels of ortholog records that the loading cell drops before alignment.
# The loading cell compares full record labels by exact string match, so every entry
# has to be spelled exactly as that label is built there:
#     TRANSLATE_CDS_ID[record ID] + " - " + record ID
# NOTE(review): the reason each record is excluded is not recorded here — consider adding it.
IGNORED_ORTHOLOGS = ("Ciona intestinalis - ENSCING00000015387.2 - ENSCINP00000027318",
					"Ciona intestinalis - ENSCING00000021662.1 - ENSCINP00000030891",
					"Nile tilapia (Oreochromis niloticus) - ENSONIP00000041861",
					"European seabass (Dicentrarchus labrax) - ENSDLAG00005015725.2 - ENSDLAP00005066837",
					"Paramormyrops kingsleyae (Paramormyrops kingsleyae) - ENSPKIP00000009939",
					"Large yellow croaker (Larimichthys crocea) - ENSLCRG00005016226.1 - ENSLCRP00005042302",
					"Large yellow croaker (Larimichthys crocea) - ENSLCRG00005016225.1 - ENSLCRP00005042299",
					"Large yellow croaker (Larimichthys crocea) - ENSLCRG00005016228.1 - ENSLCRP00005042306",
					"Sailfin molly (Poecilia latipinna) - ENSPLAG00000002494.1 - ENSPLAP00000001439",
					"Sailfin molly (Poecilia latipinna) - ENSPLAG00000002479.1 - ENSPLAP00000019951",
					"Amazon molly (Poecilia formosa) - ENSPFOP00000022587",
					"European seabass (Dicentrarchus labrax) - ENSDLAG00005028514.1 - ENSDLAP00005083630",
					"Lamprey (Petromyzon marinus) - ENSPMAP00000006225",
					"Burton's mouthbrooder (Haplochromis burtoni) - ENSHBUG00000021698.1 - ENSHBUP00000034276",
					"Burton's mouthbrooder (Haplochromis burtoni) - ENSHBUG00000018116.1 - ENSHBUP00000016144",
					"Coelacanth (Latimeria chalumnae) - ENSLACP00000022259")

# The four unambiguous DNA base letters (the 'N' that occurs in UBQ_CONSENSUS is not included).
# NOTE(review): not referenced by any of the cells visible here.
DNA_BASES = ('A', 'T', 'G', 'C')

In [3]:
##### Functions
def lines_overlap(A_start:int|float, A_end:int|float, B_start:int|float, B_end:int|float) -> bool:
	'''
	This is only applicable if A_start <= A_end and B_start <= B_end
	'''
	for arg in A_start, A_end, B_start, B_end:
		assert isinstance(arg, int) or isinstance(arg, float), f'Argument "{arg}" is type {type(arg)}. Type must be int or float.'
	
	lefthand_overlap = A_start <= B_start and A_end >= B_start
	righthand_overlap = A_start >= B_start and A_start <= B_end
	return lefthand_overlap or righthand_overlap






In [4]:
##### Settings
# Let pandas show up to 100 columns and 200 rows before truncating a displayed frame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

In [5]:
##### Loading Data
# Read every ortholog CDS record and label it "<species label> - <record name>".
# A record name that is missing from TRANSLATE_CDS_ID raises KeyError here.
ortho_records = list(SeqIO.parse("data/OG_Human_UBC_orthologs_CDS.fa", "fasta"))
ortho_labels = [TRANSLATE_CDS_ID[rec.name] + " - " + rec.name for rec in ortho_records]

# Keep (label, sequence string) pairs for every label not listed in IGNORED_ORTHOLOGS.
ortho_seqs = [
	(label, str(rec.seq))
	for label, rec in zip(ortho_labels, ortho_records)
	if label not in IGNORED_ORTHOLOGS
]

In [9]:
##### 
def align_subunits(ref_seq:str, 
				ref_name:str, 
				query_tuples: list[tuple],
				protein_seqtype : bool = False,
				gap_open_cost : float = None,
				gap_extend_cost : float = None,
				min_identity : int|float = None,
				min_similarity : int|float = None,
				min_length : int|float = None,
				keep_temp_files : bool = True,
				return_alignments_plus_counts : bool = False):
	'''
	
	'''
	alignments = []
	for query_name, query_seq in query_tuples:
		align_num = math.ceil( len(query_seq)/len(ref_seq) )
		print(query_name, len(query_seq), align_num)
		q_aligns = functions.emboss_matcher(ref_seq,
							query_seq,
							ref_name,
							query_name,
							align_num,
							protein_seqtype=protein_seqtype,
							gap_open=gap_open_cost,
							gap_extend=gap_extend_cost,
							stop_temp_overwrite=False,
							keep_temp_files=keep_temp_files)
		alignments.extend(q_aligns)
	if min_identity is not None:
		alignments = [ali for ali in alignments if ali["Identity"] >= min_identity]
	if min_similarity is not None:
		alignments = [ali for ali in alignments if ali["Similarity"] >= min_similarity]
	if min_length is not None:
		alignments = [ali for ali in alignments if ali["Length"] >= min_length]
	alignments.sort(key=lambda ali: (ali["Query_name"], ali["Query_align_start"]))

	# Check for overlaps
	for query_name, _ in query_tuples:
		aligns = [ali for ali in alignments if ali["Query_name"] == query_name]
		for ali_A, ali_B in itertools.combinations(aligns, 2):
			assert not lines_overlap(ali_A["Query_align_start"], 
									ali_A["Query_align_end"], 
									ali_B["Query_align_start"], 
									ali_B["Query_align_end"]), f'{ali_A["Alignment_number"]} and {ali_B["Alignment_number"]} in {ali_A["Query_name"]}'

	if return_alignments_plus_counts:
		ali_counts = []
		for query_name, _ in query_tuples:
			count = len([ali for ali in alignments if ali["Query_name"] == query_name])
			ali_counts.append((query_name, count))
		ali_counts.sort()
		return alignments, ali_counts

	else:
		return alignments

# Align the consensus against every retained ortholog CDS. A hit is kept when its
# identity and similarity are both at least 50 and its length is at least three
# quarters of the consensus length.
ortho_aligns, ortho_counts = align_subunits(
	ref_seq=UBQ_CONSENSUS,
	ref_name="ubq consensus",
	query_tuples=ortho_seqs,
	min_identity=50,
	min_similarity=50,
	min_length=len(UBQ_CONSENSUS)*0.75,
	keep_temp_files=False,
	return_alignments_plus_counts=True,
)

Ciona savignyi - ENSCSAVP00000012392 2181 10


NameError: name 'emboss_matcher' is not defined

In [None]:
##### 
# For each ortholog, print its label followed by one indented line per kept
# alignment: position on the query, length, identity and similarity.
for gene, _ in ortho_seqs:
	print(gene)
	gene_hits = (hit for hit in ortho_aligns if hit["Query_name"] == gene)
	for i, ali in enumerate(gene_hits):
		print(f'\t{i}: {ali["Query_align_start"]} to {ali["Query_align_end"]}   ({ali["Length"]} bp, {ali["Identity"]}% identity, {ali["Similarity"]}% similarity)')

Ciona savignyi - ENSCSAVP00000012392
	0: 121 to 347   (227 bp, 81.1% identity, 81.1% similarity)
	1: 349 to 575   (227 bp, 80.2% identity, 80.2% similarity)
	2: 577 to 803   (227 bp, 80.2% identity, 80.2% similarity)
	3: 805 to 1031   (227 bp, 81.9% identity, 81.9% similarity)
	4: 1033 to 1259   (227 bp, 81.5% identity, 81.5% similarity)
	5: 1261 to 1487   (227 bp, 81.1% identity, 81.1% similarity)
	6: 1489 to 1715   (227 bp, 80.2% identity, 80.2% similarity)
	7: 1717 to 1949   (233 bp, 79.0% identity, 79.0% similarity)
	8: 1951 to 2177   (227 bp, 80.2% identity, 80.2% similarity)
Stickleback (Gasterosteus aculeatus) - ENSGACP00000026768
	0: 1 to 227   (227 bp, 86.3% identity, 86.3% similarity)
	1: 229 to 455   (227 bp, 86.3% identity, 86.3% similarity)
	2: 457 to 683   (227 bp, 86.3% identity, 86.3% similarity)
	3: 685 to 911   (227 bp, 86.8% identity, 86.8% similarity)
	4: 913 to 1139   (227 bp, 86.3% identity, 86.3% similarity)
	5: 1147 to 1373   (227 bp, 85.9% identity, 85.9% simil