# Add a new column (of division) to the statistical descriptions
in the protein-length statistical descriptions of the distributions for the different species. The column will be based on Ensembl (superregnum: bacteria, archaea, viruses). For superregnum eukaryota, the taxonomy provided by UniProt (tax_id) is used to obtain the group of organisms from Ensembl.

## Import python modules

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys

#
# init some data paths
# Input side: directory tree holding the statistical-description tables.
stat_base_path = "/media/emuro/Nubya/"
stat_file_path = stat_base_path + "results/geneLength/outputInputFiles/analysis/some_statistics/stat_description/"
stat_prot_file_path = stat_file_path + "proteins/"
#
# Output side: tables kept inside the project, addressed by a relative path
# (resolved against the current working directory).
gitHubProject_base_path = "../../"
#main_tables_path       = gitHubProject_base_path + "main_tables/"
#extra_tables_path      = main_tables_path + "extra_tables/"
working_on_tables_path  = gitHubProject_base_path + "working_on_tables/"
if not os.path.isdir(working_on_tables_path):
    print("The path (working_on_tables_path) is not defined! Check.")
    print(working_on_tables_path, os.path.isdir(working_on_tables_path))
    # A missing output directory is an error, so stop with a non-zero
    # status; an exit status of 0 would report success to the caller.
    sys.exit(1)
#
# Initialization:
stat_file_ensembl = "all__stat_description.ensembl_v3.0__withLineage.tsv" # file to be examined (ensembl)
stat_file_uniprot = "stat_description.protein.uniprot_reference_proteome_v3.0__withLineage.tsv" # file (uniprot)
# Any truthy value makes the save cell write stat_p_df to
# working_on_tables_path + stat_file_uniprot; 0 keeps the run read-only.
BOOL_SAVE_FILE = 0

---
## Retrieving the statistical descriptions  

---
### Of the gene length's distributions

In [2]:
# Ensembl table with the statistics of the length distributions per species:
# build its full path and report where the data comes from.
stat_file = f"{stat_file_path}{stat_file_ensembl}"
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_file, "\n")

# Load the whole tab-separated table (every column is kept at this point).
stat_df = pd.read_csv(stat_file, sep="\t", low_memory=False)

# Quick look at what was loaded; switch the flag off to silence the preview.
if True:
    print(stat_df.shape)
    print(stat_df.columns)
    print(stat_df.value_counts(subset='division_both_dbs', dropna=False))
    # Let the preview show every column instead of truncating with "..."
    pd.options.display.max_columns = stat_df.shape[1]
    display(stat_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /media/emuro/Nubya/results/geneLength/outputInputFiles/analysis/some_statistics/stat_description/all__stat_description.ensembl_v3.0__withLineage.tsv 

(33630, 42)
Index(['index_original', 'division_7', 'species', 'assembly',
       'trunk_genes_path', 'genes_file', 'count', 'mean', 'std', 'var', 'min',
       '25perc', '50perc', '75perc', 'max', 'log10_mean', 'log10_std',
       'log10_var', 'log10_min', 'log10_25perc', 'log10_50perc',
       'log10_75perc', 'log10_max', 'log_mean', 'log_std', 'log_var',
       'log_min', 'log_25perc', 'log_50perc', 'log_75perc', 'log_max',
       'taxonomy_id', 'genebuild', 'Scientific_name', 'Rank', 'Lineage',
       'division_4_byLineage', 'Virus_host', 'division_8', 'bool_Rank',
       'sameTax_order', 'division_both_dbs'],
      dtype='object')
division_both_dbs
bacteria       30714
archaea         1229
fungi           1014
protists         237


Unnamed: 0,index_original,division_7,species,assembly,trunk_genes_path,genes_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,taxonomy_id,genebuild,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,division_8,bool_Rank,sameTax_order,division_both_dbs
0,7068,bacteria,methanobacterium_bryantii_gca_002287175,ASM228717v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_bryantii...,3168,840.40404,649.879873,422343.848699,131,416.0,686.0,1079.0,7910,2.827684,0.287565,0.082694,2.117271,2.619093,2.836324,3.033021,3.898176,6.510982,0.662143,0.438433,4.875197,6.030685,6.530878,6.98379,8.975883,2161,2017.0,Methanobacterium bryantii,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea
1,26156,bacteria,methanobacterium_formicicum_gca_000762265,ASM76226v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_formicic...,2352,862.427296,589.270407,347239.612747,107,458.0,728.0,1115.0,6779,2.849723,0.276255,0.076317,2.029384,2.660865,2.862131,3.047275,3.831166,6.56173,0.6361,0.404623,4.672829,6.126869,6.590301,7.01661,8.821585,2162,2014.0,Methanobacterium formicicum,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea



---
### Of the protein length's distributions

In [3]:
# UniProt table with the statistics of the proteome length distributions per
# species: build its full path and report where the data comes from.
stat_prot_file = f"{stat_prot_file_path}{stat_file_uniprot}"
print("The statistical descriptions of the proteome distributions for the different species is in:\n", stat_prot_file, "\n")

# Load the whole tab-separated table (every column is kept at this point).
stat_p_df = pd.read_csv(stat_prot_file, sep="\t", low_memory=False)

# Quick look at what was loaded; switch the flag off to silence the preview.
if True:
    print(stat_p_df.shape)
    print(stat_p_df.columns)
    print(stat_p_df.value_counts(subset='superregnum', dropna=False))
    # Let the preview show every column instead of truncating with "..."
    pd.options.display.max_columns = stat_p_df.shape[1]
    display(stat_p_df.head(2))

The statistical descriptions of the proteome distributions for the different species is in:
 /media/emuro/Nubya/results/geneLength/outputInputFiles/analysis/some_statistics/stat_description/proteins/stat_description.protein.uniprot_reference_proteome_v3.0__withLineage.tsv 

(19854, 38)
Index(['species', 'proteome_id', 'tax_id', 'superregnum', 'num_prot_cod_genes',
       'uniprot_fasta_file', 'count', 'mean', 'std', 'var', 'min', '25perc',
       '50perc', '75perc', 'max', 'log10_mean', 'log10_std', 'log10_var',
       'log10_min', 'log10_25perc', 'log10_50perc', 'log10_75perc',
       'log10_max', 'log_mean', 'log_std', 'log_var', 'log_min', 'log_25perc',
       'log_50perc', 'log_75perc', 'log_max', 'Scientific_name', 'Rank',
       'Lineage', 'division_4_byLineage', 'Virus_host', 'superregnum_lineage',
       'bool_Rank'],
      dtype='object')
superregnum
viruses      9939
bacteria     7997
eukaryota    1588
archaea       330
Name: count, dtype: int64


Unnamed: 0,species,proteome_id,tax_id,superregnum,num_prot_cod_genes,uniprot_fasta_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,superregnum_lineage,bool_Rank
0,Halorubrum saccharovorum,UP000053331,2248,archaea,2334,/ftp.uniprot.org/pub/databases/uniprot/current...,2334.0,250.641817,169.730987,28808.608041,29.0,127.0,208.5,330.0,1869.0,2.310023,0.280604,0.078739,1.462398,2.103804,2.319105,2.518514,3.271609,5.319025,0.646115,0.417465,3.367296,4.844187,5.339936,5.799093,7.533159,Halorubrum saccharovorum,Species,Archaea; Euryarchaeota; Stenosarchaea group; H...,Archaea,,archaea,1
1,Pyrodictium occultum,UP000053352,2309,archaea,1602,/ftp.uniprot.org/pub/databases/uniprot/current...,1602.0,285.092385,186.591395,34816.348736,48.0,151.0,248.0,370.0,1504.0,2.372852,0.270438,0.073136,1.681241,2.178977,2.394452,2.568202,3.177248,5.463693,0.622705,0.387762,3.871201,5.01728,5.513429,5.913503,7.315884,Pyrodictium occultum,Species,Archaea; Crenarchaeota; Thermoprotei; Desulfur...,Archaea,,archaea,1


## Add the column  
to the protein length's distributions

In [4]:
# Start from the superregnum label; bacteria, archaea and viruses keep it
# unchanged, only rows labelled 'eukaryota' are refined below.
lineage = stat_p_df["Lineage"]
division = stat_p_df["superregnum"]

# Ordered refinement rules: (label to refine, lineage fragments, new label).
# A row is relabelled when it currently carries the first label and its
# lineage contains at least one of the fragments. Order matters: vertebrates
# are carved out of the rows that were just labelled 'metazoa'.
refinements = [
    ('eukaryota', ['Eukaryota; Fungi'], 'fungi'),
    ('eukaryota', ['Eukaryota; Metazoa;'], 'metazoa'),
    ('metazoa', ['Vertebrata;'], 'vertebrates'),
    ('eukaryota', ['Eukaryota; Viridiplantae;', 'Eukaryota; Rhodophyta;'], 'plants'),
]
for source_label, fragments, target_label in refinements:
    # str.find gives -1 when absent (and NaN for a missing lineage), so the
    # ">= 0" test is False in both cases.
    in_lineage = lineage.str.find(fragments[0]) >= 0
    for fragment in fragments[1:]:
        in_lineage = in_lineage | (lineage.str.find(fragment) >= 0)
    division = division.mask((division == source_label) & in_lineage, target_label)

# Whatever is still 'eukaryota' is treated as protists (per the original note:
# Amoebozoa, Apusozoa, Choanoflagellata, Cryptophyceae, Discoba, Filasterea,
# Haptista, Ichthyosporea, Metamonada, Rotosphaerida, Sar).
division = division.mask(division == 'eukaryota', 'protists')
stat_p_df["division_both_dbs"] = division

# Preview of the result; switch the flag off to silence it.
if True:
    print(stat_p_df.shape)
    print(stat_p_df.value_counts(subset='division_both_dbs', dropna=False))
    # Let the preview show every column instead of truncating with "..."
    pd.options.display.max_columns = stat_p_df.shape[1]
    display(stat_p_df.head(2))

(19854, 39)
division_both_dbs
viruses        9939
bacteria       7997
fungi           772
archaea         330
vertebrates     248
metazoa         228
plants          184
protists        156
Name: count, dtype: int64


Unnamed: 0,species,proteome_id,tax_id,superregnum,num_prot_cod_genes,uniprot_fasta_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,superregnum_lineage,bool_Rank,division_both_dbs
0,Halorubrum saccharovorum,UP000053331,2248,archaea,2334,/ftp.uniprot.org/pub/databases/uniprot/current...,2334.0,250.641817,169.730987,28808.608041,29.0,127.0,208.5,330.0,1869.0,2.310023,0.280604,0.078739,1.462398,2.103804,2.319105,2.518514,3.271609,5.319025,0.646115,0.417465,3.367296,4.844187,5.339936,5.799093,7.533159,Halorubrum saccharovorum,Species,Archaea; Euryarchaeota; Stenosarchaea group; H...,Archaea,,archaea,1,archaea
1,Pyrodictium occultum,UP000053352,2309,archaea,1602,/ftp.uniprot.org/pub/databases/uniprot/current...,1602.0,285.092385,186.591395,34816.348736,48.0,151.0,248.0,370.0,1504.0,2.372852,0.270438,0.073136,1.681241,2.178977,2.394452,2.568202,3.177248,5.463693,0.622705,0.387762,3.871201,5.01728,5.513429,5.913503,7.315884,Pyrodictium occultum,Species,Archaea; Crenarchaeota; Thermoprotei; Desulfur...,Archaea,,archaea,1,archaea


### Save the file
**if desired** (be careful not to overwrite any file!)

In [5]:
# Show the destination directory and whether it currently exists.
print(working_on_tables_path, os.path.isdir(working_on_tables_path))

# The output keeps the name of the UniProt input table.
out_file = f"{working_on_tables_path}{stat_file_uniprot}"
print("saving in: ", out_file)

# Writing is opt-in through BOOL_SAVE_FILE; to_csv replaces any file that
# already exists at out_file.
if BOOL_SAVE_FILE:
    stat_p_df.to_csv(out_file, sep="\t", header=True, index=False)
    print("...file saved. Done!")

../../working_on_tables/ True
saving in:  ../../working_on_tables/stat_description.protein.uniprot_reference_proteome_v3.0__withLineage.tsv
