# Protein coding gene length distributions

## Import python modules

In [1]:
import pandas as pd
import numpy as np
import re

# Extend the module search path with the project's lib directory so that the
# project-local `constants` module (imported as `c`) can be found.
# NOTE(review): the path is relative, so this presumably relies on the notebook
# being run from its own directory -- confirm.
import sys
sys.path.append('../../../lib/')
import constants as c

## The protein coding gene length distributions for the different species

### Retrieving the statistical descriptions of the gene length distributions

#### Statistical description 

In [2]:
# statistics on length distributions
stat_file = c.STAT_G_FILE
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_file, "\n")

# retrieve data, keeping only the columns used in this notebook
# (trunk_genes_path is used as the unique id of each entry)
stat_cols = ["division_both_dbs", "species", "count", "mean", "var", "Lineage", "trunk_genes_path"]
# usecols skips the unused columns while parsing; it ignores the order of the
# list, so the frame is re-indexed with stat_cols to fix the column order
stat_df = pd.read_csv(stat_file, low_memory=False, sep="\t", usecols=stat_cols)[stat_cols]

# visualize data (set the guard to 0 to silence the overview)
if 1:
    pd.set_option('display.max_columns', None)
    display(stat_df.head(2))
    print(stat_df.shape)
    print(stat_df["division_both_dbs"].value_counts())
    print(stat_df.index)
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly: wrapping it in print() appended a stray "None" line
    stat_df.info()

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/working_on/analysis/groups_of_organisms/../../../lib/../main_tables/stat_protCodGenes.tsv 



Unnamed: 0,division_both_dbs,species,count,mean,var,Lineage,trunk_genes_path
0,archaea,methanobacterium_bryantii_gca_002287175,3168,840.40404,422343.848699,Archaea; Euryarchaeota; Methanomada group; Met...,ftp.ensemblgenomes.org/pub/bacteria/release-49...
1,archaea,methanobacterium_formicicum_gca_000762265,2352,862.427296,347239.612747,Archaea; Euryarchaeota; Methanomada group; Met...,ftp.ensemblgenomes.org/pub/bacteria/release-49...


(33627, 7)
bacteria         30714
archaea           1229
fungi             1014
protists           237
vertebrates        222
invertebrates      115
plants              96
Name: division_both_dbs, dtype: int64
RangeIndex(start=0, stop=33627, step=1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33627 entries, 0 to 33626
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   division_both_dbs  33627 non-null  object 
 1   species            33627 non-null  object 
 2   count              33627 non-null  int64  
 3   mean               33627 non-null  float64
 4   var                33627 non-null  float64
 5   Lineage            33459 non-null  object 
 6   trunk_genes_path   33627 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 1.8+ MB
None


In [3]:
# statistics on length distributions with genome length
if 1:
    # Located relative to the notebook directory (same convention as the
    # '../../../lib/' search-path entry) instead of a user-specific absolute
    # path, so the cell also runs on other checkouts of the repository.
    stat_genome_file = "../../../working_on_tables/stat_protCodGenes_with_ncbiGenomeData.tsv"
    print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_genome_file, "\n")

    # retrieve data, keeping only the columns used in this notebook
    # (trunk_genes_path is used as the unique id of each entry)
    genome_cols = ["trunk_genes_path", "size_Mbp", "gc_percent", "group", "subgroup"]
    stat_genome_df = (
        # usecols skips unused columns while parsing; it ignores the order of
        # the list, so the frame is re-indexed with genome_cols afterwards
        pd.read_csv(stat_genome_file, low_memory=False, sep="\t", usecols=genome_cols)[genome_cols]
        # entries of gc_percent that cannot be parsed as numbers become NaN
        .assign(gc_percent=lambda df: pd.to_numeric(df['gc_percent'], errors='coerce'))
    )
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly: wrapping it in print() appended a stray "None" line
    stat_genome_df.info()

    # visualize data
    if 1:
        pd.set_option('display.max_columns', None)
        display(stat_genome_df.head(2))
        print(stat_genome_df.shape)
        #print(stat_genome_df["trunk_genes_path"].value_counts())

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/working_on_tables/stat_protCodGenes_with_ncbiGenomeData.tsv 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33627 entries, 0 to 33626
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   trunk_genes_path  33627 non-null  object 
 1   size_Mbp          32022 non-null  float64
 2   gc_percent        32016 non-null  float64
 3   group             32022 non-null  object 
 4   subgroup          32022 non-null  object 
dtypes: float64(2), object(3)
memory usage: 1.3+ MB
None


Unnamed: 0,trunk_genes_path,size_Mbp,gc_percent,group,subgroup
0,ftp.ensemblgenomes.org/pub/bacteria/release-49...,3.46637,33.2,Euryarchaeota,Methanomada group
1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,2.44999,41.3,Euryarchaeota,Methanomada group


(33627, 5)


In [4]:
# merge to obtain genome length data
if 1:
    # Columns contributed by the genome table (everything but the join key).
    # Any copies already present in stat_df from an earlier run of this cell are
    # dropped first, which keeps the cell idempotent: merging twice would
    # otherwise produce suffixed duplicates such as size_Mbp_x / size_Mbp_y.
    genome_only_cols = [col for col in stat_genome_df.columns if col != "trunk_genes_path"]
    merged_df = pd.merge(
        stat_df.drop(columns=genome_only_cols, errors="ignore"),
        stat_genome_df,
        on="trunk_genes_path",
        how="left",
    )
    # A left join only adds rows when the key is duplicated on the right side;
    # fail loudly instead of silently counting some species more than once.
    assert len(merged_df) == len(stat_df), (
        "merge changed the number of rows: trunk_genes_path is not unique in stat_genome_df"
    )
    stat_df = merged_df

    # visualize data
    if 1:
        pd.set_option('display.max_columns', None)
        print(stat_df.shape)
        display(stat_df.head(2))
        print(stat_df["division_both_dbs"].value_counts())
        # DataFrame.info() writes its report itself and returns None, so it is
        # called directly: wrapping it in print() appended a stray "None" line
        stat_df.info()
        #print(stat_df[ stat_df['species']=='homo_sapiens'])

(33627, 11)


Unnamed: 0,division_both_dbs,species,count,mean,var,Lineage,trunk_genes_path,size_Mbp,gc_percent,group,subgroup
0,archaea,methanobacterium_bryantii_gca_002287175,3168,840.40404,422343.848699,Archaea; Euryarchaeota; Methanomada group; Met...,ftp.ensemblgenomes.org/pub/bacteria/release-49...,3.46637,33.2,Euryarchaeota,Methanomada group
1,archaea,methanobacterium_formicicum_gca_000762265,2352,862.427296,347239.612747,Archaea; Euryarchaeota; Methanomada group; Met...,ftp.ensemblgenomes.org/pub/bacteria/release-49...,2.44999,41.3,Euryarchaeota,Methanomada group


bacteria         30714
archaea           1229
fungi             1014
protists           237
vertebrates        222
invertebrates      115
plants              96
Name: division_both_dbs, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33627 entries, 0 to 33626
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   division_both_dbs  33627 non-null  object 
 1   species            33627 non-null  object 
 2   count              33627 non-null  int64  
 3   mean               33627 non-null  float64
 4   var                33627 non-null  float64
 5   Lineage            33459 non-null  object 
 6   trunk_genes_path   33627 non-null  object 
 7   size_Mbp           32022 non-null  float64
 8   gc_percent         32016 non-null  float64
 9   group              32022 non-null  object 
 10  subgroup           32022 non-null  object 
dtypes: float64(4), int64(1), object(6)
memory usage: 3.1+ MB
None


## Get a column with the groups of organisms

In [5]:
# Groups of organisms to study, as names to look for in a species' lineage.
# The order of the list is kept exactly as originally defined.
g_orgs_l = [
    'Archaea', 'Bacteria',
    'Ascomycota', 'Basidiomycota', 'Microsporidia', 'Mucoromycota',
    'Primates', 'Mammalia', 'Aves', 'Actinopterygii',
    'Arthropoda',
    'Viridiplantae',
]

In [6]:
# Start with every row unlabelled; re-running the cell resets the labels.
stat_df['g_orgs'] = None

# (column to search, label to assign) pairs, applied in this order.
# "protists" is looked up in the division column; every other group is
# looked up in the lineage string.
labelling_rules = [('division_both_dbs', "protists")]
labelling_rules.extend(('Lineage', taxon) for taxon in g_orgs_l)

for searched_col, label in labelling_rules:
    is_match = stat_df[searched_col].str.contains(label, case=True, na=False)
    still_unlabelled = stat_df['g_orgs'].isna()
    # Only rows without a label are filled, so a row keeps the first label
    # whose rule matches it.
    stat_df.loc[still_unlabelled & is_match, 'g_orgs'] = label

# visualize data
if True:
    pd.set_option('display.max_columns', None)
    display(stat_df.head(2))
    print(stat_df.columns)
    print(stat_df.shape)
    # dropna=False also reports how many rows stayed unlabelled
    print(stat_df["g_orgs"].value_counts(dropna=False))
    #print(stat_df.index)
    #print(stat_df.info())

Unnamed: 0,division_both_dbs,species,count,mean,var,Lineage,trunk_genes_path,size_Mbp,gc_percent,group,subgroup,g_orgs
0,archaea,methanobacterium_bryantii_gca_002287175,3168,840.40404,422343.848699,Archaea; Euryarchaeota; Methanomada group; Met...,ftp.ensemblgenomes.org/pub/bacteria/release-49...,3.46637,33.2,Euryarchaeota,Methanomada group,Archaea
1,archaea,methanobacterium_formicicum_gca_000762265,2352,862.427296,347239.612747,Archaea; Euryarchaeota; Methanomada group; Met...,ftp.ensemblgenomes.org/pub/bacteria/release-49...,2.44999,41.3,Euryarchaeota,Methanomada group,Archaea


Index(['division_both_dbs', 'species', 'count', 'mean', 'var', 'Lineage',
       'trunk_genes_path', 'size_Mbp', 'gc_percent', 'group', 'subgroup',
       'g_orgs'],
      dtype='object')
(33627, 12)
Bacteria          30555
Archaea            1228
Ascomycota          720
protists            237
None                231
Basidiomycota       213
Mammalia             96
Viridiplantae        93
Arthropoda           86
Actinopterygii       60
Microsporidia        30
Mucoromycota         28
Primates             26
Aves                 24
Name: g_orgs, dtype: int64


## Obtain the average value for each group

In [7]:
# Per group: average of the `mean` column and number of entries, ordered by that average
print(
    stat_df.groupby('g_orgs')['mean']
    .describe()[['mean', 'count']]
    .sort_values(by='mean')
)

                        mean    count
g_orgs                               
Archaea           834.934335   1228.0
Bacteria          943.020605  30555.0
Microsporidia     974.837340     30.0
Mucoromycota     1556.128325     28.0
Ascomycota       1640.509244    720.0
Basidiomycota    1884.245002    213.0
protists         2059.139453    237.0
Viridiplantae    3681.998556     93.0
Arthropoda       7314.754165     86.0
Actinopterygii  16991.892429     60.0
Aves            27877.229133     24.0
Mammalia        44982.428558     96.0
Primates        48594.583334     26.0


In [8]:
# Per group: average of the `count` column and number of entries, ordered by that average
print(
    stat_df.groupby('g_orgs')['count']
    .describe()[['mean', 'count']]
    .sort_values(by='mean')
)

                        mean    count
g_orgs                               
Archaea          2327.409609   1228.0
Microsporidia    2862.700000     30.0
Bacteria         4009.489085  30555.0
Ascomycota       9237.661111    720.0
Basidiomycota   10202.807512    213.0
protists        10709.616034    237.0
Arthropoda      15485.058140     86.0
Aves            15678.666667     24.0
Mucoromycota    16979.607143     28.0
Mammalia        19577.020833     96.0
Primates        20730.730769     26.0
Actinopterygii  23582.466667     60.0
Viridiplantae   41638.064516     93.0


In [9]:
# Per group: average of the `size_Mbp` column and number of non-missing entries, ordered by that average
print(
    stat_df.groupby('g_orgs')['size_Mbp']
    .describe()[['mean', 'count']]
    .sort_values(by='mean')
)

                       mean    count
g_orgs                              
Archaea            2.376664   1206.0
Bacteria           4.370578  29119.0
Microsporidia      7.174138     29.0
Ascomycota        28.776631    698.0
Basidiomycota     33.596340    203.0
protists          48.341418    219.0
Mucoromycota      60.994219     27.0
Arthropoda       367.595529     75.0
Actinopterygii   901.278362     47.0
Aves            1123.132722     18.0
Viridiplantae   2086.469470     70.0
Mammalia        2634.399481     77.0
Primates        2904.005217     23.0
