# Distribution of the lengths of the proteins
**for the different species (reference proteomes)**

## Import python modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats 
import os
import sys
sys.path.append('../../lib/')
import constants as c


# Safety switch for the save cell at the end of the notebook: the table is
# written back to c.STAT_P_FILE only when this value is truthy (0 = do not save).
BOOL_SAVE_FILE = 0

## The protein length distributions for the different species

### Retrieving the statistical descriptions of the length distributions

In [2]:
# Path of the table holding the per-species statistics of the protein lengths
stat_file = c.STAT_P_FILE
print("The statistical descriptions of the protein distributions for the different species is in:\n", stat_file, "\n")

# Load the tab-separated table; all columns are kept
stat_df = pd.read_csv(stat_file, sep="\t", low_memory=False)

# Quick inspection of what was loaded: first rows (with every column shown),
# dimensions, column names and number of rows per division_both_dbs value
pd.set_option('display.max_columns', None)
display(stat_df.head(2))
print(stat_df.shape)
print(stat_df.columns)
print(stat_df["division_both_dbs"].value_counts(), "\n")

The statistical descriptions of the protein distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/working_on/preprocessing_data/../../lib/../main_tables/stat_proteins.tsv 



Unnamed: 0,species,proteome_id,tax_id,superregnum,num_prot_cod_genes,uniprot_fasta_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,superregnum_lineage,bool_Rank,division_both_dbs
0,Halorubrum saccharovorum,UP000053331,2248,archaea,2334,/ftp.uniprot.org/pub/databases/uniprot/current...,2334.0,250.641817,169.730987,28808.608041,29.0,127.0,208.5,330.0,1869.0,2.310023,0.280604,0.078739,1.462398,2.103804,2.319105,2.518514,3.271609,5.319025,0.646115,0.417465,3.367296,4.844187,5.339936,5.799093,7.533159,Halorubrum saccharovorum,Species,Archaea; Euryarchaeota; Stenosarchaea group; H...,Archaea,,archaea,1,archaea
1,Pyrodictium occultum,UP000053352,2309,archaea,1602,/ftp.uniprot.org/pub/databases/uniprot/current...,1602.0,285.092385,186.591395,34816.348736,48.0,151.0,248.0,370.0,1504.0,2.372852,0.270438,0.073136,1.681241,2.178977,2.394452,2.568202,3.177248,5.463693,0.622705,0.387762,3.871201,5.01728,5.513429,5.913503,7.315884,Pyrodictium occultum,Species,Archaea; Crenarchaeota; Thermoprotei; Desulfur...,Archaea,,archaea,1,archaea


(9913, 39)
Index(['species', 'proteome_id', 'tax_id', 'superregnum', 'num_prot_cod_genes',
       'uniprot_fasta_file', 'count', 'mean', 'std', 'var', 'min', '25perc',
       '50perc', '75perc', 'max', 'log10_mean', 'log10_std', 'log10_var',
       'log10_min', 'log10_25perc', 'log10_50perc', 'log10_75perc',
       'log10_max', 'log_mean', 'log_std', 'log_var', 'log_min', 'log_25perc',
       'log_50perc', 'log_75perc', 'log_max', 'Scientific_name', 'Rank',
       'Lineage', 'division_4_byLineage', 'Virus_host', 'superregnum_lineage',
       'bool_Rank', 'division_both_dbs'],
      dtype='object')
division_both_dbs
bacteria         7997
fungi             772
archaea           330
vertebrates       248
invertebrates     226
plants            184
protists          156
Name: count, dtype: int64 



## Modify the data frame

In [3]:
# Work on a copy so the table as loaded (stat_df) is left unchanged
df = stat_df.copy()
# Rows labelled "metazoa" in division_both_dbs are relabelled as "invertebrates"
df.loc[df["division_both_dbs"].eq("metazoa"), "division_both_dbs"] = "invertebrates"

In [4]:
# Inspect the modified table. Every line here must look at `df` (the relabelled
# copy), not at `stat_df` (the table as loaded); otherwise the check does not
# reflect the modification made in the previous cell.
pd.set_option('display.max_columns', None)
display(df.head(2))
print(df.shape)
print(df.columns)
# Number of rows per division after the relabelling
print(df["division_both_dbs"].value_counts(), "\n")

Unnamed: 0,species,proteome_id,tax_id,superregnum,num_prot_cod_genes,uniprot_fasta_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,superregnum_lineage,bool_Rank,division_both_dbs
0,Halorubrum saccharovorum,UP000053331,2248,archaea,2334,/ftp.uniprot.org/pub/databases/uniprot/current...,2334.0,250.641817,169.730987,28808.608041,29.0,127.0,208.5,330.0,1869.0,2.310023,0.280604,0.078739,1.462398,2.103804,2.319105,2.518514,3.271609,5.319025,0.646115,0.417465,3.367296,4.844187,5.339936,5.799093,7.533159,Halorubrum saccharovorum,Species,Archaea; Euryarchaeota; Stenosarchaea group; H...,Archaea,,archaea,1,archaea
1,Pyrodictium occultum,UP000053352,2309,archaea,1602,/ftp.uniprot.org/pub/databases/uniprot/current...,1602.0,285.092385,186.591395,34816.348736,48.0,151.0,248.0,370.0,1504.0,2.372852,0.270438,0.073136,1.681241,2.178977,2.394452,2.568202,3.177248,5.463693,0.622705,0.387762,3.871201,5.01728,5.513429,5.913503,7.315884,Pyrodictium occultum,Species,Archaea; Crenarchaeota; Thermoprotei; Desulfur...,Archaea,,archaea,1,archaea


(9913, 39)
Index(['species', 'proteome_id', 'tax_id', 'superregnum', 'num_prot_cod_genes',
       'uniprot_fasta_file', 'count', 'mean', 'std', 'var', 'min', '25perc',
       '50perc', '75perc', 'max', 'log10_mean', 'log10_std', 'log10_var',
       'log10_min', 'log10_25perc', 'log10_50perc', 'log10_75perc',
       'log10_max', 'log_mean', 'log_std', 'log_var', 'log_min', 'log_25perc',
       'log_50perc', 'log_75perc', 'log_max', 'Scientific_name', 'Rank',
       'Lineage', 'division_4_byLineage', 'Virus_host', 'superregnum_lineage',
       'bool_Rank', 'division_both_dbs'],
      dtype='object')
division_both_dbs
bacteria         7997
fungi             772
archaea           330
vertebrates       248
invertebrates     226
plants            184
protists          156
Name: count, dtype: int64 



## Save file if desired
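**Be careful:** the output path is the same file the table was read from (`c.STAT_P_FILE`), so saving overwrites the input table in place. Nothing is written unless `BOOL_SAVE_FILE` is set to a true value.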

In [5]:
# Destination is the same path the table was originally read from
out_file = c.STAT_P_FILE
# Report the destination and whether a file already exists there
print(out_file, os.path.isfile(out_file))
if BOOL_SAVE_FILE:
    # Writing is opt-in: tab-separated, with the header row and without the index
    print("saving in: ", out_file)
    df.to_csv(out_file, index=False, sep="\t")
    print("...file saved. Done!")

/home/emuro/git/github/EM_geneLength_nature/working_on/preprocessing_data/../../lib/../main_tables/stat_proteins.tsv True
