# Observed Taylor law: variance vs mean 
**in protein-coding gene length distributions across species**

## Import Python modules

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

## Retrieving the statistical descriptions of the gene length distributions for the different genomes

In [2]:
# Tab-separated table with one row per species/assembly: statistics on the
# protein-coding gene length distribution plus taxonomy and genome metadata.
stat_file = "/home/emuro/stat_protCodGenes_with_ncbiGenomeData.tsv"
print(f"The statistical descriptions of the protein coding gene distributions for the different species is in:\n {stat_file} \n")

# Load the whole table. low_memory=False turns off pandas' chunked parsing,
# so column dtypes are inferred from the complete file.
stat_df = pd.read_csv(stat_file, sep="\t", low_memory=False)

# Optional column subsets -- both are currently disabled, so every column of
# the file is kept:
#stat_df = stat_df[["division_7", "division_8", "species", "taxonomy_id", "Lineage", "ensembl_assembly_accession", "group", "subgroup", "assembly_status"]]
##stat_df = stat_df[["division_7", "division_8", "species", "taxonomy_id", "Lineage", "group", "subgroup"]]

# Sanity-check preview: dimensions, number of rows per division_7 and per
# group value (NaN counted as its own category), and the first two rows.
# Flip the guard to False to silence the preview.
if True:
    print(stat_df.shape)
    print(stat_df.value_counts("division_7", dropna=False))
    print(stat_df.value_counts("group", dropna=False))
    # Widen the display limit so that no column of stat_df is elided.
    pd.set_option("display.max_columns", stat_df.shape[1])
    display(stat_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/stat_protCodGenes_with_ncbiGenomeData.tsv 

(33627, 13)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      222
metazoa          115
plants            96
Name: count, dtype: int64
group
Proteobacteria                     14417
Terrabacteria group                10997
FCB group                           2601
NaN                                 1605
Fungi                                980
Euryarchaeota                        686
PVC group                            308
TACK group                           291
Animals                              274
Spirochaetes                         221
Protists                             211
Acidobacteria                        190
DPANN group                          122
Bacteria incertae sedis              118
Fusobacteria                         110
Plants                                70
The

Unnamed: 0,division_7,division_8,species,assembly,taxonomy_id,Lineage,ensembl_assembly_accession,group,subgroup,gc_percent,size_Mbp,assembly_status,chromosomes
0,bacteria,archaea,methanobacterium_bryantii_gca_002287175,ASM228717v1,2161,Archaea; Euryarchaeota; Methanomada group; Met...,GCA_002287175.1,Euryarchaeota,Methanomada group,33.2,3.46637,Contig,-
1,bacteria,archaea,methanobacterium_formicicum_gca_000762265,ASM76226v1,2162,Archaea; Euryarchaeota; Methanomada group; Met...,GCA_000762265.1,Euryarchaeota,Methanomada group,41.3,2.44999,Complete Genome,chromosome:NZ_CP006933.1/CP006933.1


In [3]:
# Disabled placeholder: an empty "Fer_group" column is not created here.
##stat_df["Fer_group"] = np.nan

# Preview of stat_df: dimensions, rows per division_7 value (NaN included)
# and the first two rows. Flip the guard to False to silence it.
if True:
    # Let pandas show every column of stat_df in the rich display below.
    pd.set_option("display.max_columns", stat_df.shape[1])
    print(stat_df.shape)
    print(stat_df.value_counts("division_7", dropna=False))
    display(stat_df.head(2))

(33627, 13)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      222
metazoa          115
plants            96
Name: count, dtype: int64


Unnamed: 0,division_7,division_8,species,assembly,taxonomy_id,Lineage,ensembl_assembly_accession,group,subgroup,gc_percent,size_Mbp,assembly_status,chromosomes
0,bacteria,archaea,methanobacterium_bryantii_gca_002287175,ASM228717v1,2161,Archaea; Euryarchaeota; Methanomada group; Met...,GCA_002287175.1,Euryarchaeota,Methanomada group,33.2,3.46637,Contig,-
1,bacteria,archaea,methanobacterium_formicicum_gca_000762265,ASM76226v1,2162,Archaea; Euryarchaeota; Methanomada group; Met...,GCA_000762265.1,Euryarchaeota,Methanomada group,41.3,2.44999,Complete Genome,chromosome:NZ_CP006933.1/CP006933.1


## Retrieving Fer's per-species gene-length statistics (mean and variance, Vertebrata sheet)

In [4]:
# Fer's Excel workbook of per-species gene-length statistics; only the
# "Vertebrata" sheet is read in this cell.
fer_file = "/home/emuro/git/github/EM_geneLength_nature/main_tables/extra_tables/genes.xlsx"
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", fer_file, "\n")

# Load the sheet in full; no column subsetting is applied.
fer_df = pd.read_excel(fer_file, sheet_name="Vertebrata")

# Preview: dimensions and first two rows.
# The column limit is derived from fer_df itself (not from an unrelated
# frame), so this cell only depends on what it loads. option_context applies
# the limit to this display only and restores the previous global setting.
if 1:
    print(fer_df.shape)
    with pd.option_context("display.max_columns", fer_df.shape[1]):
        display(fer_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/main_tables/extra_tables/genes.xlsx 

(222, 4)


Unnamed: 0,species,genes_mean,genes_var,group
0,acanthochromis_polyacanthus,17577.741964,763691400.0,fishes
1,amphilophus_citrinellus,13673.609124,586451100.0,fishes


In [5]:
# Attach Fer's per-species columns to the statistics table. The left join
# keeps every row of stat_df; species without a match in fer_df get NaN in
# the columns coming from fer_df.
merged_df = pd.merge(stat_df, fer_df, on="species", how="left")

# "group" exists in both inputs, so pandas suffixes the two copies with
# _x (from stat_df) and _y (from fer_df); give them explicit names.
merged_df = merged_df.rename(columns={"group_x": "group", "group_y": "group_Fer"})

# A species listed more than once in fer_df would silently multiply the
# matching stat_df rows; fail loudly instead of carrying duplicates forward.
assert len(merged_df) == len(stat_df), (
    "left merge changed the number of rows: 'species' is not unique in fer_df"
)

# Compact, vertebrates-only view for comparing the two group annotations
# ("group" from the statistics table vs. "group_Fer" from Fer's sheet).
df_cp = merged_df[["division_7", "division_8", "species", "taxonomy_id", "Lineage", "group", "subgroup", "group_Fer"]]
df_cp = df_cp[df_cp.division_7 == "vertebrates"]

# Preview: shape of the full merged table and first two vertebrate rows
# (df_cp is already restricted to vertebrates, so no second filter is needed).
if 1:
    print(merged_df.shape)
    pd.set_option("display.max_columns", merged_df.shape[1])
    # Uncomment to inspect more rows / untruncated cell contents:
    #pd.set_option('display.max_rows', 222)
    #pd.set_option('display.max_colwidth', None)
    display(df_cp.head(2))

(33627, 16)


Unnamed: 0,division_7,division_8,species,taxonomy_id,Lineage,group,subgroup,group_Fer
33238,vertebrates,vertebrates,petromyzon_marinus,7757,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,,,fishes
33239,vertebrates,vertebrates,eptatretus_burgeri,7764,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,Animals,Fishes,fishes


In [6]:
# Optional export of the merged table as a tab-separated file with a header
# row and without the index column. Disabled by default: switch the guard to
# True to write the file.
if False:
    merged_df.to_csv(
        "/home/emuro/stat_protCodGenes_with_ncbiGenomeData_FerGroup.tsv",
        sep="\t",
        header=True,
        index=False,
    )