# Add a new column (of division) to the statistical descriptions
in the statistical descriptions of the protein-coding gene length distributions for the different species. The column will be based on Ensembl (division_7) and on the taxonomy provided by UniProt (division_4_byLineage).

## Import python modules

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys


gitHubProject_base_path = "../../../"

# Directory holding the input tables, and the working directory that receives
# the files written by this notebook. Both keep their trailing "/" because
# file names are appended to them by plain string concatenation.
main_tables_path = gitHubProject_base_path + "main_tables/"
working_on_tables_path = gitHubProject_base_path + "working_on_tables/noisy/"

# Stop early when a required directory is missing.
for _label, _dir_path in (
    ("main_tables_path", main_tables_path),
    ("working_on_tables_path", working_on_tables_path),
):
    if not os.path.isdir(_dir_path):
        print("The path (" + _label + ") is not defined! Check.")
        print(_dir_path, os.path.isdir(_dir_path))
        # A missing directory is a failure: exit status 0 would tell a
        # calling script that everything went fine.
        sys.exit(1)

# Initialization:
original_merged_file_name = "stat_merged.tsv"
noisy_merged_7671_file_name = "stat_merged_7671.tsv"

# Truthy value -> the saving steps guarded by this flag write their files.
BOOL_SAVE_FILE = 1

## Retrieving the statistical descriptions 
**of the length distributions for the different merged species**

**Original:**

In [2]:
# Per-species statistics of the length distributions (original merged set).
merged_file = main_tables_path + original_merged_file_name
print("The statistical descriptions of the length distributions for the different species (merged set) is in:\n", merged_file, "\n")

# Load the table and remember the tax_ids it contains.
merged_df = pd.read_csv(merged_file, sep="\t", low_memory=False)
merged_taxid_l = merged_df["tax_id"].to_list()

if True:  # visualize the data frame
    print(merged_df.shape)
    display(merged_df.columns)
    print(merged_df.value_counts(subset='merged_division_superregnum', dropna=False))
    # widen the display so that head() shows every column
    pd.set_option('display.max_columns', merged_df.shape[1])
    display(merged_df.head(2))


# Columns that the noisy species table (7671) does not have: remove them.
columns_absent_in_noisy = [
    'ratio_mean_prots_genes',
    'diff_prots_genes',
    'abs_diff',
    'ratio_prots_genes',
]
merged_df = merged_df.drop(columns=columns_absent_in_noisy)

print(working_on_tables_path, os.path.isdir(working_on_tables_path))
out_original_file = working_on_tables_path + "noisy_preprocessed_" + original_merged_file_name
print("to be saved in (not yet): ", out_original_file)
if BOOL_SAVE_FILE:  # write the reduced table only when saving is enabled
    print("saving...")
    merged_df.to_csv(out_original_file, sep="\t", header=True, index=False)
    print("...file saved. Done!")

The statistical descriptions of the length distributions for the different species (merged set) is in:
 ../../../main_tables/stat_merged.tsv 

(6519, 66)


Index(['genes_species', 'genes_assembly', 'genes_trunk_genes_path',
       'genes_genes_file', 'genes_count', 'genes_mean', 'genes_std',
       'genes_var', 'genes_min', 'genes_25perc', 'genes_50perc',
       'genes_75perc', 'genes_max', 'genes_log10_mean', 'genes_log10_std',
       'genes_log10_var', 'genes_log10_min', 'genes_log10_25perc',
       'genes_log10_50perc', 'genes_log10_75perc', 'genes_log10_max',
       'genes_log_mean', 'genes_log_std', 'genes_log_var', 'genes_log_min',
       'genes_log_25perc', 'genes_log_50perc', 'genes_log_75perc',
       'genes_log_max', 'genes_division', 'tax_id', 'prots_species',
       'prots_proteome_id', 'prots_superregnum', 'prots_num_prot_cod_genes',
       'prots_uniprot_fasta_file', 'prots_count', 'prots_mean', 'prots_std',
       'prots_var', 'prots_min', 'prots_25perc', 'prots_50perc',
       'prots_75perc', 'prots_max', 'prots_log10_mean', 'prots_log10_std',
       'prots_log10_var', 'prots_log10_min', 'prots_log10_25perc',
       'prots

merged_division_superregnum
bacteria       5468
fungi           533
archaea         227
vertebrates      92
protist          91
plants           59
metazoa          49
Name: count, dtype: int64


Unnamed: 0,genes_species,genes_assembly,genes_trunk_genes_path,genes_genes_file,genes_count,genes_mean,genes_std,genes_var,genes_min,genes_25perc,genes_50perc,genes_75perc,genes_max,genes_log10_mean,genes_log10_std,genes_log10_var,genes_log10_min,genes_log10_25perc,genes_log10_50perc,genes_log10_75perc,genes_log10_max,genes_log_mean,genes_log_std,genes_log_var,genes_log_min,genes_log_25perc,genes_log_50perc,genes_log_75perc,genes_log_max,genes_division,tax_id,prots_species,prots_proteome_id,prots_superregnum,prots_num_prot_cod_genes,prots_uniprot_fasta_file,prots_count,prots_mean,prots_std,prots_var,prots_min,prots_25perc,prots_50perc,prots_75perc,prots_max,prots_log10_mean,prots_log10_std,prots_log10_var,prots_log10_min,prots_log10_25perc,prots_log10_50perc,prots_log10_75perc,prots_log10_max,prots_log_mean,prots_log_std,prots_log_var,prots_log_min,prots_log_25perc,prots_log_50perc,prots_log_75perc,prots_log_max,merged_division_superregnum,ratio_mean_prots_genes,diff_prots_genes,abs_diff,ratio_prots_genes
0,archangium_gephyra_gca_001027285,ASM102728v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.archangium_gephyra_gca_00...,10119.0,1103.871035,1127.066255,1270278.0,113.0,524.0,887.0,1388.0,36653.0,2.919254,0.333869,0.111468,2.053078,2.719331,2.947924,3.142389,4.56411,6.721831,0.768762,0.590995,4.727388,6.261492,6.787845,7.235619,10.509251,bacteria,48,Archangium gephyra,UP000035579,bacteria,10110,/ftp.uniprot.org/pub/databases/uniprot/current...,10110.0,367.499703,375.772244,141204.779058,37.0,174.0,295.0,462.0,12217.0,2.441094,0.334836,0.112115,1.568202,2.240549,2.469822,2.664642,4.086965,5.620827,0.770987,0.594422,3.610918,5.159055,5.686975,6.135565,9.410584,bacteria,0.998757,-9.0,9.0,0.999111
1,chondromyces_crocatus_gca_001189295,ASM118929v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.chondromyces_crocatus_gca...,8339.0,1211.51445,1051.408605,1105460.0,89.0,629.0,977.0,1460.0,24476.0,2.978963,0.30025,0.09015,1.94939,2.798651,2.989895,3.164353,4.38874,6.859315,0.691352,0.477968,4.488636,6.444131,6.884487,7.286192,10.105448,bacteria,52,Chondromyces crocatus,UP000067626,bacteria,8327,/ftp.uniprot.org/pub/databases/uniprot/current...,8327.0,403.466555,350.57956,122906.027917,29.0,209.0,326.0,486.0,8158.0,2.501095,0.300984,0.090591,1.462398,2.320146,2.513218,2.686636,3.911584,5.758984,0.693041,0.480306,3.367296,5.342334,5.786897,6.186209,9.006754,bacteria,0.99908,-12.0,12.0,0.998561


../../../working_on_tables/ True
to be saved in (not yet):  ../../../working_on_tables/noisy_preprocessed_stat_merged.tsv
saving...
...file saved. Done!


## Get noisy species
and eliminate the ones that are not noisy, i.e., drop every species whose `tax_id` is already present in the original merged set.

In [3]:
# Per-species statistics of the length distributions (7671-species table).
noisy_7671_file = working_on_tables_path + noisy_merged_7671_file_name
print("The statistical descriptions of the length distributions for the different species (merged set) is in:\n", noisy_7671_file, "\n")

# Load the table, then keep only the rows whose tax_id is NOT in the original
# merged set (row labels from the file are preserved).
noisy_df = pd.read_csv(noisy_7671_file, sep="\t", low_memory=False)
already_in_merged = noisy_df["tax_id"].isin(merged_taxid_l)
noisy_df = noisy_df[~already_in_merged]

if True:  # visualize the data frame
    print(noisy_df.shape)
    display(noisy_df.columns)
    print(noisy_df.value_counts(subset='merged_division_superregnum', dropna=False))
    # widen the display so that head() shows every column
    pd.set_option('display.max_columns', noisy_df.shape[1])
    display(noisy_df.head(2))

The statistical descriptions of the length distributions for the different species (merged set) is in:
 ../../../working_on_tables/stat_merged_7671.tsv 

(1152, 62)


Index(['genes_species', 'genes_assembly', 'genes_trunk_genes_path',
       'genes_genes_file', 'genes_count', 'genes_mean', 'genes_std',
       'genes_var', 'genes_min', 'genes_25perc', 'genes_50perc',
       'genes_75perc', 'genes_max', 'genes_log10_mean', 'genes_log10_std',
       'genes_log10_var', 'genes_log10_min', 'genes_log10_25perc',
       'genes_log10_50perc', 'genes_log10_75perc', 'genes_log10_max',
       'genes_log_mean', 'genes_log_std', 'genes_log_var', 'genes_log_min',
       'genes_log_25perc', 'genes_log_50perc', 'genes_log_75perc',
       'genes_log_max', 'genes_division', 'tax_id', 'prots_species',
       'prots_proteome_id', 'prots_superregnum', 'prots_num_prot_cod_genes',
       'prots_uniprot_fasta_file', 'prots_count', 'prots_mean', 'prots_std',
       'prots_var', 'prots_min', 'prots_25perc', 'prots_50perc',
       'prots_75perc', 'prots_max', 'prots_log10_mean', 'prots_log10_std',
       'prots_log10_var', 'prots_log10_min', 'prots_log10_25perc',
       'prots

merged_division_superregnum
bacteria       991
archaea         56
fungi           33
protist         23
vertebrates     22
metazoa         14
plants          13
Name: count, dtype: int64


Unnamed: 0,genes_species,genes_assembly,genes_trunk_genes_path,genes_genes_file,genes_count,genes_mean,genes_std,genes_var,genes_min,genes_25perc,genes_50perc,genes_75perc,genes_max,genes_log10_mean,genes_log10_std,genes_log10_var,genes_log10_min,genes_log10_25perc,genes_log10_50perc,genes_log10_75perc,genes_log10_max,genes_log_mean,genes_log_std,genes_log_var,genes_log_min,genes_log_25perc,genes_log_50perc,genes_log_75perc,genes_log_max,genes_division,tax_id,prots_species,prots_proteome_id,prots_superregnum,prots_num_prot_cod_genes,prots_uniprot_fasta_file,prots_count,prots_mean,prots_std,prots_var,prots_min,prots_25perc,prots_50perc,prots_75perc,prots_max,prots_log10_mean,prots_log10_std,prots_log10_var,prots_log10_min,prots_log10_25perc,prots_log10_50perc,prots_log10_75perc,prots_log10_max,prots_log_mean,prots_log_std,prots_log_var,prots_log_min,prots_log_25perc,prots_log_50perc,prots_log_75perc,prots_log_max,merged_division_superregnum
3,vitreoscilla_filiformis_gca_002222655,ASM222265v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.vitreoscilla_filiformis_g...,3557.0,976.152376,716.082928,512774.759677,122.0,488.0,818.0,1241.0,7925.0,2.887739,0.305841,0.093539,2.08636,2.68842,2.912753,3.093772,3.898999,6.649266,0.704226,0.495934,4.804021,6.190315,6.706862,7.123673,8.977778,bacteria,63,Vitreoscilla filiformis,UP000199729,bacteria,3396,/ftp.uniprot.org/pub/databases/uniprot/current...,3396.0,327.099529,239.207207,57220.087882,40.0,166.0,275.5,414.0,2641.0,2.413182,0.306324,0.093835,1.60206,2.220108,2.440121,2.617,3.421768,5.556556,0.705337,0.497501,3.688879,5.111988,5.618586,6.025866,7.878913,bacteria
7,gemmata_obscuriglobus_gca_003149495,ASM314949v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.gemmata_obscuriglobus_gca...,6994.0,1032.628253,851.139209,724437.95359,80.0,458.0,845.0,1313.0,19658.0,2.90065,0.313935,0.098555,1.90309,2.660865,2.926857,3.118265,4.293539,6.678994,0.722863,0.522531,4.382027,6.126869,6.739337,7.18007,9.88624,bacteria,114,Gemmata obscuriglobus,UP000245802,bacteria,6809,/ftp.uniprot.org/pub/databases/uniprot/current...,6809.0,345.550007,286.37333,82009.684081,26.0,154.0,282.0,438.0,6552.0,2.423783,0.316089,0.099912,1.414973,2.187521,2.450249,2.641474,3.816374,5.580967,0.727822,0.529726,3.258097,5.036953,5.641907,6.082219,8.787526,bacteria


### Save the file
**if desired** (be careful not to overwrite any file!).
Use **BOOL_SAVE_FILE variable** above for that.

In [4]:
# Report the target directory (and whether it exists) and the output file.
print(working_on_tables_path, os.path.isdir(working_on_tables_path))
out_file = f"{working_on_tables_path}diff_6519_and_{noisy_merged_7671_file_name}"
print("to be saved in (not yet): ", out_file)

if BOOL_SAVE_FILE:  # write the filtered table only when saving is enabled
    print("saving...")
    # NOTE(review): written WITHOUT a header row, unlike the save of
    # merged_df (header=True) -- presumably so the two files can be
    # concatenated; confirm before reading this file on its own.
    noisy_df.to_csv(out_file, sep="\t", header=False, index=False)
    print("...file saved. Done!")

../../../working_on_tables/ True
to be saved in (not yet):  ../../../working_on_tables/diff_6519_and_stat_merged_7671.tsv
saving...
...file saved. Done!
