# Observed Taylor law: variance vs mean 
**in protein coding gene length distributions for the different species**

## Import python modules

In [18]:
import numpy as np
import pandas as pd
from pathlib import Path

## Retrieving the statistical descriptions of the gene length's distributions for the different genomes

In [32]:
# statistics on length distribution for different species
stat_file = "/home/emuro/stat_protCodGenes_with_ncbiGenomeData.tsv"
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_file, "\n")

# retrieve data and diminish the number of columns
stat_df = pd.read_csv(stat_file, low_memory=False, sep="\t")
#stat_df = stat_df[["division_7", "division_8", "species", "taxonomy_id", "Lineage", "ensembl_assembly_accession", "group", "subgroup", "assembly_status"]] 
stat_df = stat_df[["division_7", "species", "taxonomy_id", "Lineage", "group", "subgroup"]] 

# visualize the data frame
if 1:
    print(stat_df.shape)
    print(stat_df.value_counts('division_7', dropna=False))
    pd.set_option('display.max_columns', stat_df.shape[1])
    display(stat_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/stat_protCodGenes_with_ncbiGenomeData.tsv 

(33627, 6)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      222
metazoa          115
plants            96
Name: count, dtype: int64


Unnamed: 0,division_7,species,taxonomy_id,Lineage,group,subgroup
0,bacteria,methanobacterium_bryantii_gca_002287175,2161,Archaea; Euryarchaeota; Methanomada group; Met...,Euryarchaeota,Methanomada group
1,bacteria,methanobacterium_formicicum_gca_000762265,2162,Archaea; Euryarchaeota; Methanomada group; Met...,Euryarchaeota,Methanomada group


In [33]:
##stat_df["Fer_group"] = np.nan
# visualize the data frame
if 1:
    print(stat_df.shape)
    print(stat_df.value_counts('division_7', dropna=False))
    pd.set_option('display.max_columns', stat_df.shape[1])
    display(stat_df.head(2))

(33627, 6)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      222
metazoa          115
plants            96
Name: count, dtype: int64


Unnamed: 0,division_7,species,taxonomy_id,Lineage,group,subgroup
0,bacteria,methanobacterium_bryantii_gca_002287175,2161,Archaea; Euryarchaeota; Methanomada group; Met...,Euryarchaeota,Methanomada group
1,bacteria,methanobacterium_formicicum_gca_000762265,2162,Archaea; Euryarchaeota; Methanomada group; Met...,Euryarchaeota,Methanomada group


## Retrieving the data from Fer

In [34]:
# statistics on length distribution for different species
fer_file = "/home/emuro/git/github/EM_geneLength_nature/main_tables/extra_tables/genes.xlsx"
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", fer_file, "\n")

# retrieve data and diminish the number of columns
fer_df = pd.read_excel(fer_file, sheet_name="Vertebrata")
#fer_df = fer_df[["division_7", "division_8", "species", "taxonomy_id", "Lineage", "ensembl_assembly_accession", "group", "subgroup", "assembly_status"]] 

# visualize the data frame
if 1:
    print(fer_df.shape)
    #print(fer_df.value_counts('division_7', dropna=False))
    pd.set_option('display.max_columns', stat_df.shape[1])
    display(fer_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/main_tables/extra_tables/genes.xlsx 

(222, 4)


Unnamed: 0,species,genes_mean,genes_var,group
0,acanthochromis_polyacanthus,17577.741964,763691400.0,fishes
1,amphilophus_citrinellus,13673.609124,586451100.0,fishes


In [39]:
df = pd.merge(stat_df, fer_df, on="species", how="left")
df = df[df.division_7=="vertebrates"]
# visualize the data frame
if 1:
    print(df.shape)
    #print(fer_df.value_counts('division_7', dropna=False))
    pd.set_option('display.max_columns', df.shape[1])
    pd.set_option('display.max_rows', 222)
    pd.set_option('display.max_colwidth', None)
    df = df.drop(['division_7', 'genes_mean', 'genes_var'], axis=1)
    display(df)

(222, 9)


Unnamed: 0,species,taxonomy_id,Lineage,group_x,subgroup,group_y
33238,petromyzon_marinus,7757,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Cyclostomata; Hyperoartia; Petromyzontiformes; Petromyzontidae; Petromyzon,,,fishes
33239,eptatretus_burgeri,7764,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Cyclostomata; Myxini; Myxiniformes; Myxinidae; Eptatretinae; Eptatretus,Animals,Fishes,fishes
33240,callorhinchus_milii,7868,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Chondrichthyes; Holocephali; Chimaeriformes; Callorhinchidae; Callorhinchus,Animals,Fishes,fishes
33241,latimeria_chalumnae,7897,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Coelacanthiformes; Coelacanthidae; Latimeria,Animals,Fishes,fishes
33242,lepisosteus_oculatus,7918,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Holostei; Semionotiformes; Lepisosteidae; Lepisosteus,Animals,Fishes,fishes
33243,clupea_harengus,7950,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Clupei; Clupeiformes; Clupeoidei; Clupeidae; Clupea,,,fishes
33244,danio_rerio,7955,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Ostariophysi; Cypriniformes; Danionidae; Danioninae; Danio,Animals,Fishes,fishes
33245,astyanax_mexicanus,7994,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Ostariophysi; Characiformes; Characoidei; Characidae; Astyanax,Animals,Fishes,fishes
33246,astyanax_mexicanus_pachon,7994,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Ostariophysi; Characiformes; Characoidei; Characidae; Astyanax,Animals,Fishes,fishes
33247,ictalurus_punctatus,7998,Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Ostariophysi; Siluriformes; Ictaluridae; Ictalurus,,,fishes


In [8]:
# Save the file if desired
if 0:
    stat_df.to_csv("/home/emuro/stat_protCodGenes_with_ncbiGenomeData_FerGroup.tsv", sep="\t", index=False, header=True)