# Distribution of the lengths of the protein coding genes 
**for the different species**

## Import python modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats 
import os
import sys
sys.path.append('../../lib/')
import constants as c


# Switch for the save cell at the end of the notebook: a truthy value makes it
# write `df` to c.STAT_G_FILE; 0 skips the write.
BOOL_SAVE_FILE = 0

## The protein coding gene length distributions for the different species

### Retrieving the statistical descriptions of the gene length distributions

#### Statistical description 

In [2]:
# Location of the table with the statistics of the length distributions
stat_file = c.STAT_G_FILE
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_file, "\n")

# Load the tab-separated table; every column is kept
stat_df = pd.read_csv(stat_file, sep="\t", low_memory=False)

# Quick look at the loaded table: first rows, dimensions, column names,
# and the number of rows per category in each division column
pd.set_option('display.max_columns', None)  # do not truncate wide frames
display(stat_df.head(2))
print(stat_df.shape)
print(stat_df.columns)
for division_col in ("division_7", "division_8", "division_both_dbs"):
    print(stat_df[division_col].value_counts(), "\n")

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/working_on/preprocessing_data/../../lib/../main_tables/stat_protCodGenes.tsv 



Unnamed: 0,index_original,division_7,species,assembly,trunk_genes_path,genes_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,taxonomy_id,genebuild,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,division_8,bool_Rank,sameTax_order,division_both_dbs
0,7068,bacteria,methanobacterium_bryantii_gca_002287175,ASM228717v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_bryantii...,3168,840.40404,649.879873,422343.848699,131,416.0,686.0,1079.0,7910,2.827684,0.287565,0.082694,2.117271,2.619093,2.836324,3.033021,3.898176,6.510982,0.662143,0.438433,4.875197,6.030685,6.530878,6.98379,8.975883,2161,2017.0,Methanobacterium bryantii,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea
1,26156,bacteria,methanobacterium_formicicum_gca_000762265,ASM76226v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_formicic...,2352,862.427296,589.270407,347239.612747,107,458.0,728.0,1115.0,6779,2.849723,0.276255,0.076317,2.029384,2.660865,2.862131,3.047275,3.831166,6.56173,0.6361,0.404623,4.672829,6.126869,6.590301,7.01661,8.821585,2162,2014.0,Methanobacterium formicicum,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea


(33627, 42)
Index(['index_original', 'division_7', 'species', 'assembly',
       'trunk_genes_path', 'genes_file', 'count', 'mean', 'std', 'var', 'min',
       '25perc', '50perc', '75perc', 'max', 'log10_mean', 'log10_std',
       'log10_var', 'log10_min', 'log10_25perc', 'log10_50perc',
       'log10_75perc', 'log10_max', 'log_mean', 'log_std', 'log_var',
       'log_min', 'log_25perc', 'log_50perc', 'log_75perc', 'log_max',
       'taxonomy_id', 'genebuild', 'Scientific_name', 'Rank', 'Lineage',
       'division_4_byLineage', 'Virus_host', 'division_8', 'bool_Rank',
       'sameTax_order', 'division_both_dbs'],
      dtype='object')
division_7
bacteria         31943
fungi             1014
protists           237
vertebrates        222
invertebrates      115
plants              96
Name: count, dtype: int64 

division_8
bacteria         30555
archaea           1228
fungi             1007
protists           237
vertebrates        221
invertebrates      115
plants              96
Name: co

## Modify the data frame

In [3]:
# Work on a copy so the loaded table `stat_df` stays untouched
df = stat_df.copy()

# Relabel "metazoa" as "invertebrates" in each of the division columns
for division_col in ("division_7", "division_8", "division_both_dbs"):
    is_metazoa = df[division_col] == "metazoa"
    df.loc[is_metazoa, division_col] = "invertebrates"

In [4]:
# Inspect the modified copy `df`: first rows, dimensions, column names,
# and the number of rows per category in each division column.
# The preview must show `df` (not the original `stat_df`) so that it matches
# the shape, columns and counts printed below.
pd.set_option('display.max_columns', None)  # do not truncate wide frames
display(df.head(2))
print(df.shape)
print(df.columns)
for division_col in ("division_7", "division_8", "division_both_dbs"):
    print(df[division_col].value_counts(), "\n")

Unnamed: 0,index_original,division_7,species,assembly,trunk_genes_path,genes_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,taxonomy_id,genebuild,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,division_8,bool_Rank,sameTax_order,division_both_dbs
0,7068,bacteria,methanobacterium_bryantii_gca_002287175,ASM228717v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_bryantii...,3168,840.40404,649.879873,422343.848699,131,416.0,686.0,1079.0,7910,2.827684,0.287565,0.082694,2.117271,2.619093,2.836324,3.033021,3.898176,6.510982,0.662143,0.438433,4.875197,6.030685,6.530878,6.98379,8.975883,2161,2017.0,Methanobacterium bryantii,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea
1,26156,bacteria,methanobacterium_formicicum_gca_000762265,ASM76226v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_formicic...,2352,862.427296,589.270407,347239.612747,107,458.0,728.0,1115.0,6779,2.849723,0.276255,0.076317,2.029384,2.660865,2.862131,3.047275,3.831166,6.56173,0.6361,0.404623,4.672829,6.126869,6.590301,7.01661,8.821585,2162,2014.0,Methanobacterium formicicum,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea


(33627, 42)
Index(['index_original', 'division_7', 'species', 'assembly',
       'trunk_genes_path', 'genes_file', 'count', 'mean', 'std', 'var', 'min',
       '25perc', '50perc', '75perc', 'max', 'log10_mean', 'log10_std',
       'log10_var', 'log10_min', 'log10_25perc', 'log10_50perc',
       'log10_75perc', 'log10_max', 'log_mean', 'log_std', 'log_var',
       'log_min', 'log_25perc', 'log_50perc', 'log_75perc', 'log_max',
       'taxonomy_id', 'genebuild', 'Scientific_name', 'Rank', 'Lineage',
       'division_4_byLineage', 'Virus_host', 'division_8', 'bool_Rank',
       'sameTax_order', 'division_both_dbs'],
      dtype='object')
division_7
bacteria         31943
fungi             1014
protists           237
vertebrates        222
invertebrates      115
plants              96
Name: count, dtype: int64 

division_8
bacteria         30555
archaea           1228
fungi             1007
protists           237
vertebrates        221
invertebrates      115
plants              96
Name: co

## Save file if desired
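**Be careful:** the output path is the same file the statistics were read from (`c.STAT_G_FILE`), so saving overwrites the input table.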

In [5]:
# Destination path; this is the same constant the table was read from,
# so a save replaces that file.
out_file = c.STAT_G_FILE
out_file_exists = os.path.isfile(out_file)
print(out_file, out_file_exists)

# Write only when the notebook-level switch BOOL_SAVE_FILE is truthy
if BOOL_SAVE_FILE:
    print("saving in: ", out_file)
    df.to_csv(out_file, index=False, header=True, sep="\t")
    print("...file saved. Done!")

/home/emuro/git/github/EM_geneLength_nature/working_on/preprocessing_data/../../lib/../main_tables/stat_protCodGenes.tsv True
