# Observed Taylor law: variance vs mean 
**in protein coding gene length distributions for the different species**

## Import python modules

In [1]:
# Numerical / statistical stack and table handling
import numpy as np
#import matplotlib.pyplot as plt
from scipy import stats
#import os 
import pandas as pd
#import math
#
# NOTE(review): the star import puts every public plotnine name into the notebook namespace.
from plotnine import *
import warnings
# Silences every warning for the rest of the session (no category or module
# restriction is given), so library warnings will not be shown in any cell.
warnings.filterwarnings('ignore')

## Retrieving the statistical descriptions of the gene length distributions for the different genomes

In [2]:
# Location of the per-species summary table of protein coding gene lengths
working_on_extra_path = "../../../"
stat_file = f"{working_on_extra_path}main_tables/stat_protCodGenes.tsv"
print(
    "The statistical descriptions of the protein coding gene distributions for the different species is in:\n",
    stat_file,
    "\n",
)

# Read the tab-separated table and keep only the taxonomy / identifier columns
kept_columns = ["division_7", "division_8", "species", "assembly", "taxonomy_id", "Lineage"]
stat_df = pd.read_csv(stat_file, low_memory=False, sep="\t")
stat_df = stat_df[kept_columns]

# Quick look at the table; no limit on the number of displayed columns
pd.set_option('display.max_columns', None)
print(stat_df.shape)
print(stat_df.value_counts('division_7', dropna=False))
display(stat_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 ../../../main_tables/stat_protCodGenes.tsv 

(33627, 6)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      222
metazoa          115
plants            96
Name: count, dtype: int64


Unnamed: 0,division_7,division_8,species,assembly,taxonomy_id,Lineage
0,bacteria,archaea,methanobacterium_bryantii_gca_002287175,ASM228717v1,2161,Archaea; Euryarchaeota; Methanomada group; Met...
1,bacteria,archaea,methanobacterium_formicicum_gca_000762265,ASM76226v1,2162,Archaea; Euryarchaeota; Methanomada group; Met...


## Python functions

### All protein coding genes from the paper (33,627)
This includes 168 `division_7` entries that are lost when the `division_8` label is assigned.

## Genome quality filter: using NCBI genome annotations

#### Get good genomes
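Genome status: "Complete Genome" or "Chromosome" (from the NCBI genome annotation). Note: the status filter in the next code cell is currently disabled, so no genomes are removed at that step.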

In [3]:
from pathlib import Path

# Home directory of the current user, stored as a plain string
home = f"{Path.home()}"
print(home)

/home/emuro


In [4]:
# NCBI genome reports for prokaryotes and eukaryotes. Their "Status" column
# is what the (optional) quality filter below looks at.
ncbi_genome_home = f"{home}/Desktop/goingOn/geneLength/NCBI_genomeReports/"
col1, col2 = "TaxID", "Assembly Accession"

# Status values accepted by the quality filter
# (a wider alternative would be: "Complete Genome", "Chromosome", "Scaffold", "Contig")
good_status = ["Complete Genome", "Chromosome"]

# Read both tab-separated report tables
g_prok_df = pd.read_csv(f"{ncbi_genome_home}prokaryotes.txt", low_memory=False, sep="\t")
g_euk_df = pd.read_csv(f"{ncbi_genome_home}eukaryotes.txt", sep="\t")
for label, report in (("prokaryotes", g_prok_df), ("eukaryotes", g_euk_df)):
    print(label, report.shape)

# Quality filter on "Status". The condition is a constant false value, so the
# filter is switched off and both tables pass through unchanged.
if 0:
    g_prok_df = g_prok_df[g_prok_df["Status"].isin(good_status)]
    g_euk_df = g_euk_df[g_euk_df["Status"].isin(good_status)]
for label, report in (("prokaryotes", g_prok_df), ("eukaryotes", g_euk_df)):
    print(label, report.shape)

prokaryotes (405092, 23)
eukaryotes (22897, 19)
prokaryotes (405092, 23)
eukaryotes (22897, 19)


### From NCBI Assemblies files 
- dict: GC\% of Assembly_accession of species
- dict: size (Mbp) of Assembly_accession
- dict: replicons of Assembly_accession
- dict: status of Assembly_accession

In [5]:
print("Only high quality genomes!..If previously filtered")

# Every dictionary below maps an "Assembly Accession" to one column of the
# corresponding NCBI report table.

# Eukaryotes
euk_accessions = g_euk_df["Assembly Accession"]
dict_euk_groupOfAssembly = dict(zip(euk_accessions, g_euk_df["Group"]))
dict_euk_subgroupOfAssembly = dict(zip(euk_accessions, g_euk_df["SubGroup"]))
dict_euk_taxidOfAssembly = dict(zip(euk_accessions, g_euk_df["TaxID"]))
dict_euk_gcOfAssembly = dict(zip(euk_accessions, g_euk_df["GC%"]))
dict_euk_sizeOfAssembly = dict(zip(euk_accessions, g_euk_df["Size (Mb)"]))
dict_euk_repliconsOfAssembly = dict(zip(euk_accessions, g_euk_df["Replicons"]))
dict_euk_statusOfAssembly = dict(zip(euk_accessions, g_euk_df["Status"]))
print(len(dict_euk_statusOfAssembly), "Eukaryotes")

# Prokaryotes
prok_accessions = g_prok_df["Assembly Accession"]
dict_groupOfAssembly = dict(zip(prok_accessions, g_prok_df["Group"]))
dict_subgroupOfAssembly = dict(zip(prok_accessions, g_prok_df["SubGroup"]))
dict_taxidOfAssembly = dict(zip(prok_accessions, g_prok_df["TaxID"]))
dict_gcOfAssembly = dict(zip(prok_accessions, g_prok_df["GC%"]))
dict_sizeOfAssembly = dict(zip(prok_accessions, g_prok_df["Size (Mb)"]))
dict_repliconsOfAssembly = dict(zip(prok_accessions, g_prok_df["Replicons"]))
dict_statusOfAssembly = dict(zip(prok_accessions, g_prok_df["Status"]))
print(len(dict_sizeOfAssembly), "Prokaryotes")

# Merge: the eukaryote entries are added into the prokaryote dictionaries,
# which from here on hold both domains (a shared key would take the eukaryote value).
for combined, euk_only in (
    (dict_groupOfAssembly, dict_euk_groupOfAssembly),
    (dict_subgroupOfAssembly, dict_euk_subgroupOfAssembly),
    (dict_taxidOfAssembly, dict_euk_taxidOfAssembly),
    (dict_gcOfAssembly, dict_euk_gcOfAssembly),
    (dict_sizeOfAssembly, dict_euk_sizeOfAssembly),
    (dict_repliconsOfAssembly, dict_euk_repliconsOfAssembly),
    (dict_statusOfAssembly, dict_euk_statusOfAssembly),
):
    combined.update(euk_only)
print(len(dict_sizeOfAssembly), "Eukaryotes & Prokaryotes")

Only high quality genomes!..If previously filtered
22897 Eukaryotes
405092 Prokaryotes
427989 Eukaryotes & Prokaryotes


#### Taxid/assemblies for high quality genome assemblies

In [6]:
# Split the assembly -> taxid mapping into two lists (values and keys, same order)
taxid_highqu_genome_l = [*dict_taxidOfAssembly.values()]
assembly_highqu_genome_l = [*dict_taxidOfAssembly]

### From Ensembl Tax_id file 
dict: Assembly_accession of species (the species name serves as a unique identifier in this case)

In [7]:
# ENSEMBL assembly accession of every species
import os
import platform

# The data volume is mounted at a different place on Linux and on macOS.
# platform.system() is used instead of os.uname(), which is unavailable on Windows.
system = platform.system()
if system == 'Linux':
    base_dir = "/media/emuro/Nubya/"
elif system == 'Darwin':
    base_dir = "/Volumes/Nubya/"
else:
    # Stop with a clear message instead of a NameError on base_dir further down
    raise RuntimeError(f"No data mount point is configured for platform {system!r}")
ensembl_Id_file = base_dir + "results/geneLength/outputInputFiles/" + "some_tables/species_Ensembl_taxid/" + "species_Ensembl_EMv2.0.tsv" # with covid
print(ensembl_Id_file)

# Read the species table and keep only the two columns needed for the mapping
id_df = pd.read_csv(ensembl_Id_file, sep="\t")
id_df = id_df[["species", "assembly_accession"]]

# species -> Ensembl assembly accession
dict_assemblyOfSpecies = dict(zip(id_df["species"], id_df["assembly_accession"]))

# Optional debugging output (switched off)
if 0:
    print(dict_assemblyOfSpecies)  # was misspelled dict_assemblyOfspecies -> NameError when enabled
pd.set_option('display.max_columns', None)
if 0:
    display(id_df.head(2))
    print(id_df.shape)

# Add the Ensembl assembly accession of each species; species missing from the
# mapping get NaN.
stat_df["ensembl_assembly"] = stat_df["species"].map(dict_assemblyOfSpecies)
display(stat_df.head(2))
print(stat_df.shape)

/media/emuro/Nubya/results/geneLength/outputInputFiles/some_tables/species_Ensembl_taxid/species_Ensembl_EMv2.0.tsv


Unnamed: 0,division_7,division_8,species,assembly,taxonomy_id,Lineage,ensembl_assembly
0,bacteria,archaea,methanobacterium_bryantii_gca_002287175,ASM228717v1,2161,Archaea; Euryarchaeota; Methanomada group; Met...,GCA_002287175.1
1,bacteria,archaea,methanobacterium_formicicum_gca_000762265,ASM76226v1,2162,Archaea; Euryarchaeota; Methanomada group; Met...,GCA_000762265.1


(33627, 7)


#### Taxid and assembly accession with high quality genome

In [8]:
# Taxids and assembly accessions present in the merged NCBI mapping
taxid_highqu_genome_l = [*dict_taxidOfAssembly.values()]
assembly_highqu_genome_l = [*dict_taxidOfAssembly]

# First pass: keep the species whose taxonomy id appears in the NCBI mapping
has_listed_taxid = stat_df["taxonomy_id"].isin(taxid_highqu_genome_l)
stat_df = stat_df[has_listed_taxid]
print(stat_df.shape)

# Second pass: additionally require the Ensembl assembly accession itself to be
# listed; this match on the exact assembly is the stricter of the two passes.
has_listed_assembly = stat_df["ensembl_assembly"].isin(assembly_highqu_genome_l)
stat_df = stat_df[has_listed_assembly]
print(stat_df.shape)

(32526, 7)
(31736, 7)


#### For each filtered species add: genome size, gc content, chromosomes,... 

In [11]:
# Re-derive the Ensembl assembly accession of each species, then look up the
# NCBI genome attributes through that accession. Columns are added in the
# order listed; accessions missing from a dictionary yield NaN.
stat_df = stat_df.assign(
    ensembl_assembly=lambda df: df["species"].map(dict_assemblyOfSpecies),
    group=lambda df: df["ensembl_assembly"].map(dict_groupOfAssembly),
    subgroup=lambda df: df["ensembl_assembly"].map(dict_subgroupOfAssembly),
    gc_percent=lambda df: df["ensembl_assembly"].map(dict_gcOfAssembly),
    size_Mbp=lambda df: df["ensembl_assembly"].map(dict_sizeOfAssembly),
    assembly_status=lambda df: df["ensembl_assembly"].map(dict_statusOfAssembly),
    chromosomes=lambda df: df["ensembl_assembly"].map(dict_repliconsOfAssembly),
)

# Selection: drop the "bacteria" and "archaea" divisions and keep only the
# "Birds" subgroup. Replace "Birds" with another subgroup value (e.g. "Fishes")
# to look at a different clade.
not_prokaryote = ~stat_df["division_8"].isin(["bacteria", "archaea"])
is_bird = stat_df["subgroup"] == "Birds"
stat_df = stat_df[not_prokaryote & is_bird]

display(stat_df.head(40))
print(stat_df.shape)
print(stat_df['division_8'].value_counts(dropna=False))

Unnamed: 0,division_7,division_8,species,assembly,taxonomy_id,Lineage,ensembl_assembly,group,subgroup,gc_percent,size_Mbp,assembly_status,chromosomes
33266,vertebrates,vertebrates,dromaius_novaehollandiae,droNov1,8790,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_003342905.1,Animals,Birds,41.7,1192.25,Scaffold,mitochondrion MT:NC_002784.1/
33267,vertebrates,vertebrates,apteryx_haastii,aptHaa1,8823,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_003342985.1,Animals,Birds,40.8,1221.44,Scaffold,-
33268,vertebrates,vertebrates,apteryx_owenii,aptOwe1,8824,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_003342965.1,Animals,Birds,40.9,1231.28,Scaffold,-
33269,vertebrates,vertebrates,anas_platyrhynchos_platyrhynchos,CAU_duck1.0,8840,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_002743455.1,Animals,Birds,41.7073,1136.42,Chromosome,chromosome 1:CM008538.1; chromosome 2:CM008539...
33270,vertebrates,vertebrates,numida_meleagris,NumMel1.0,8996,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_002078875.2,Animals,Birds,41.8975,1043.26,Chromosome,chromosome 1:NC_034409.1/CM007814.2; chromosom...
33271,vertebrates,vertebrates,gallus_gallus,GRCg6a,9031,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_000002315.5,Animals,Birds,51.8,1065.37,Chromosome,-
33273,vertebrates,vertebrates,serinus_canaria,SCA1,9135,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_000534875.1,Animals,Birds,42.6,1152.1,Scaffold,-
33356,vertebrates,vertebrates,melopsittacus_undulatus,Melopsittacus_undulatus_6.3,13146,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_000238935.1,Animals,Birds,41.4,1117.37,Scaffold,-
33364,vertebrates,vertebrates,nothoprocta_perdicaria,notPer1,30464,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_003342845.1,Animals,Birds,42.1,965.904,Scaffold,-
33384,vertebrates,vertebrates,junco_hyemalis,ASM382977v1,40217,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,GCA_003829775.1,Animals,Birds,40.7,958.315,Scaffold,-


(18, 13)
division_8
vertebrates    18
Name: count, dtype: int64
